diff --git a/sys/vm/swap_pager.c b/sys/vm/swap_pager.c
index c01b9e45a32b..676e585a6b53 100644
--- a/sys/vm/swap_pager.c
+++ b/sys/vm/swap_pager.c
@@ -1,3461 +1,3461 @@
 /*-
  * SPDX-License-Identifier: BSD-4-Clause
  *
  * Copyright (c) 1998 Matthew Dillon,
  * Copyright (c) 1994 John S. Dyson
  * Copyright (c) 1990 University of Utah.
  * Copyright (c) 1982, 1986, 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * the Systems Programming Group of the University of Utah Computer
  * Science Department.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by the University of
  *	California, Berkeley and its contributors.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *				New Swap System
  *				Matthew Dillon
  *
  * Radix Bitmap 'blists'.
  *
  *	- The new swapper uses the new radix bitmap code.  This should scale
  *	  to arbitrarily small or arbitrarily large swap spaces and an almost
  *	  arbitrary degree of fragmentation.
  *
  * Features:
  *
  *	- on the fly reallocation of swap during putpages.  The new system
  *	  does not try to keep previously allocated swap blocks for dirty
  *	  pages.
  *
  *	- on the fly deallocation of swap
  *
  *	- No more garbage collection required.  Unnecessarily allocated swap
  *	  blocks only exist for dirty vm_page_t's now and these are already
  *	  cycled (in a high-load system) by the pager.  We also do on-the-fly
  *	  removal of invalidated swap blocks when a page is destroyed
  *	  or renamed.
  *
  * from: Utah $Hdr: swap_pager.c 1.4 91/04/30$
  */
 
 #include "opt_vm.h"
 
 #define	EXTERR_CATEGORY		EXTERR_CAT_SWAP
 #include <sys/param.h>
 #include <sys/bio.h>
 #include <sys/blist.h>
 #include <sys/buf.h>
 #include <sys/conf.h>
 #include <sys/disk.h>
 #include <sys/disklabel.h>
 #include <sys/eventhandler.h>
 #include <sys/exterrvar.h>
 #include <sys/fcntl.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/kernel.h>
 #include <sys/mount.h>
 #include <sys/namei.h>
 #include <sys/malloc.h>
 #include <sys/pctrie.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/racct.h>
 #include <sys/resource.h>
 #include <sys/resourcevar.h>
 #include <sys/rwlock.h>
 #include <sys/sbuf.h>
 #include <sys/sysctl.h>
 #include <sys/sysproto.h>
 #include <sys/systm.h>
 #include <sys/sx.h>
 #include <sys/unistd.h>
 #include <sys/user.h>
 #include <sys/vmmeter.h>
 #include <sys/vnode.h>
 
 #include <security/mac/mac_framework.h>
 
 #include <vm/vm.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pager.h>
 #include <vm/vm_pageout.h>
 #include <vm/vm_param.h>
 #include <vm/vm_radix.h>
 #include <vm/swap_pager.h>
 #include <vm/vm_extern.h>
 #include <vm/uma.h>
 
 #include <geom/geom.h>
 
 /*
  * MAX_PAGEOUT_CLUSTER must be a power of 2 between 1 and 64.
  * The 64-page limit is due to the radix code (kern/subr_blist.c).
  */
 #ifndef MAX_PAGEOUT_CLUSTER
 #define	MAX_PAGEOUT_CLUSTER	32
 #endif
 
 #if !defined(SWB_NPAGES)
 #define SWB_NPAGES	MAX_PAGEOUT_CLUSTER
 #endif
 
 #define	SWAP_META_PAGES		PCTRIE_COUNT
 
 /*
  * A swblk structure maps each page index within a
  * SWAP_META_PAGES-aligned and sized range to the address of an
  * on-disk swap block (or SWAPBLK_NONE). The collection of these
  * mappings for an entire vm object is implemented as a pc-trie.
  */
 struct swblk {
 	/* First page index covered by this block; the pc-trie key. */
 	vm_pindex_t	p;
 	/* Swap block for each covered page index, or SWAPBLK_NONE. */
 	daddr_t		d[SWAP_META_PAGES];
 };
 
 /*
  * A page_range structure records the start address and length of a sequence of
  * mapped page addresses.
  */
 struct page_range {
 	/* First swap block of the run; SWAPBLK_NONE while the run is empty. */
 	daddr_t start;
 	/* Number of consecutive swap blocks in the run. */
 	daddr_t num;
 };
 
 static MALLOC_DEFINE(M_VMPGDATA, "vm_pgdata", "swap pager private data");
 /* Held while walking swtailq and while updating per-device usage. */
 static struct mtx sw_dev_mtx;
 /* List of configured swap devices. */
 static TAILQ_HEAD(, swdevt) swtailq = TAILQ_HEAD_INITIALIZER(swtailq);
 static struct swdevt *swdevhd;	/* Allocate from here next */
 static int nswapdev;		/* Number of swap devices */
 /* Free swap space, in pages; adjusted as swap blocks are allocated/freed. */
 int swap_pager_avail;
 static struct sx swdev_syscall_lock;	/* serialize swap(on|off) */
 
 /* Swap reservation counter, in pages; updated with atomics. */
 static __exclusive_cache_line u_long swap_reserved;
 /* Total swap, in pages; compared against swap_reserved when reserving. */
 static u_long swap_total;
 static int sysctl_page_shift(SYSCTL_HANDLER_ARGS);
 
 static SYSCTL_NODE(_vm_stats, OID_AUTO, swap, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
     "VM swap stats");
 
 /* Both counters below are kept in pages and exported in bytes. */
 SYSCTL_PROC(_vm, OID_AUTO, swap_reserved, CTLTYPE_U64 | CTLFLAG_RD | CTLFLAG_MPSAFE,
     &swap_reserved, 0, sysctl_page_shift, "QU",
     "Amount of swap storage needed to back all allocated anonymous memory.");
 SYSCTL_PROC(_vm, OID_AUTO, swap_total, CTLTYPE_U64 | CTLFLAG_RD | CTLFLAG_MPSAFE,
     &swap_total, 0, sysctl_page_shift, "QU",
     "Total amount of available swap storage.");
 
 /* Bit mask of SWAP_RESERVE_* policy flags consulted by swap_reserve_by_cred(). */
 int vm_overcommit __read_mostly = 0;
 SYSCTL_INT(_vm, VM_OVERCOMMIT, overcommit, CTLFLAG_RW, &vm_overcommit, 0,
     "Configure virtual memory overcommit behavior. See tuning(7) "
     "for details.");
 /* Size in bytes of the swblk zone, computed in swap_pager_swap_init(). */
 static unsigned long swzone;
 SYSCTL_ULONG(_vm, OID_AUTO, swzone, CTLFLAG_RD, &swzone, 0,
     "Actual size of swap metadata zone");
 /* Upper bound on swap pages the swblk zone can describe. */
 static unsigned long swap_maxpages;
 SYSCTL_ULONG(_vm, OID_AUTO, swap_maxpages, CTLFLAG_RD, &swap_maxpages, 0,
     "Maximum amount of swap supported");
 
 static COUNTER_U64_DEFINE_EARLY(swap_free_deferred);
 SYSCTL_COUNTER_U64(_vm_stats_swap, OID_AUTO, free_deferred,
     CTLFLAG_RD, &swap_free_deferred,
     "Number of pages that deferred freeing swap space");
 
 static COUNTER_U64_DEFINE_EARLY(swap_free_completed);
 SYSCTL_COUNTER_U64(_vm_stats_swap, OID_AUTO, free_completed,
     CTLFLAG_RD, &swap_free_completed,
     "Number of deferred frees completed");
 
 /*
  * Sysctl handler: read the u_long page count that arg1 points to and
  * report it shifted left by PAGE_SHIFT as a 64-bit value.
  */
 static int
 sysctl_page_shift(SYSCTL_HANDLER_ARGS)
 {
 	uint64_t bytes;
 
 	bytes = (uint64_t)*(u_long *)arg1 << PAGE_SHIFT;
 	return (sysctl_handle_64(oidp, &bytes, 0, req));
 }
 
 /*
  * Charge pincr pages to the real-uid accounting of cred.  The charge is
  * always applied first; it is rolled back and false is returned only when
  * RLIMIT_SWAP enforcement is enabled in oc, the new total exceeds the
  * current limit, and the thread lacks PRIV_VM_SWAP_NORLIMIT.
  */
 static bool
 swap_reserve_by_cred_rlimit(u_long pincr, struct ucred *cred, int oc)
 {
 	struct uidinfo *uip;
 	u_long old;
 
 	uip = cred->cr_ruidinfo;
 	old = atomic_fetchadd_long(&uip->ui_vmsize, pincr);
 
 	if ((oc & SWAP_RESERVE_RLIMIT_ON) == 0)
 		return (true);
 	if (old + pincr <= lim_cur(curthread, RLIMIT_SWAP))
 		return (true);
 	if (priv_check(curthread, PRIV_VM_SWAP_NORLIMIT) == 0)
 		return (true);
 
 	/* Over the limit and unprivileged: undo the charge. */
 	old = atomic_fetchadd_long(&uip->ui_vmsize, -pincr);
 	KASSERT(old >= pincr,
 	    ("negative vmsize for uid %d\n", uip->ui_uid));
 	return (false);
 }
 
 /*
  * Drop pdecr pages from the real-uid accounting of cred.  With INVARIANTS
  * the previous value is checked so that the counter cannot go negative.
  */
 static void
 swap_release_by_cred_rlimit(u_long pdecr, struct ucred *cred)
 {
 	struct uidinfo *uip = cred->cr_ruidinfo;
 #ifdef INVARIANTS
 	u_long before;
 
 	before = atomic_fetchadd_long(&uip->ui_vmsize, -pdecr);
 	KASSERT(before >= pdecr,
 	    ("negative vmsize for uid %d\n", uip->ui_uid));
 #else
 
 	atomic_subtract_long(&uip->ui_vmsize, pdecr);
 #endif
 }
 
 /*
  * Unconditionally charge pincr pages to the real-uid accounting of cred;
  * no limit is consulted.
  */
 static void
 swap_reserve_force_rlimit(u_long pincr, struct ucred *cred)
 {
 
 	atomic_add_long(&cred->cr_ruidinfo->ui_vmsize, pincr);
 }
 
 /*
  * Reserve incr bytes of swap against the current thread's credentials.
  * Returns what swap_reserve_by_cred() returns.
  */
 bool
 swap_reserve(vm_ooffset_t incr)
 {
 	struct ucred *cred;
 
 	cred = curthread->td_ucred;
 	return (swap_reserve_by_cred(incr, cred));
 }
 
 /*
  * Reserve incr bytes of swap on behalf of cred.
  *
  * incr must be a multiple of the page size.  The reservation is charged,
  * in order, to RACCT (when enabled), to the global swap_reserved counter,
  * and to the per-uid counter of cred.  If the overcommit policy in
  * vm_overcommit or the per-uid RLIMIT_SWAP check rejects the request,
  * every charge already applied is rolled back, a rate-limited message is
  * printed, and false is returned.  Returns true on success.
  */
 bool
 swap_reserve_by_cred(vm_ooffset_t incr, struct ucred *cred)
 {
 	u_long r, s, prev, pincr;
 #ifdef RACCT
 	int error;
 #endif
 	int oc;
 	static int curfail;
 	static struct timeval lastfail;
 
 	KASSERT((incr & PAGE_MASK) == 0, ("%s: incr: %ju & PAGE_MASK",
 	    __func__, (uintmax_t)incr));
 
 #ifdef RACCT
 	if (RACCT_ENABLED()) {
 		PROC_LOCK(curproc);
 		error = racct_add(curproc, RACCT_SWAP, incr);
 		PROC_UNLOCK(curproc);
 		if (error != 0)
 			return (false);
 	}
 #endif
 
 	pincr = atop(incr);
 	prev = atomic_fetchadd_long(&swap_reserved, pincr);
 	r = prev + pincr;
 	s = swap_total;
 	oc = atomic_load_int(&vm_overcommit);
 	if (r > s && (oc & SWAP_RESERVE_ALLOW_NONWIRED) != 0) {
 		/* Policy allows counting non-wired physical pages as backing. */
 		s += vm_cnt.v_page_count - vm_cnt.v_free_reserved -
 		    vm_wire_count();
 	}
 	if ((oc & SWAP_RESERVE_FORCE_ON) != 0 && r > s &&
 	    priv_check(curthread, PRIV_VM_SWAP_NOQUOTA) != 0) {
 		prev = atomic_fetchadd_long(&swap_reserved, -pincr);
 		KASSERT(prev >= pincr,
 		    ("swap_reserved < incr on overcommit fail"));
 		goto out_error;
 	}
 
 	if (!swap_reserve_by_cred_rlimit(pincr, cred, oc)) {
 		prev = atomic_fetchadd_long(&swap_reserved, -pincr);
 		KASSERT(prev >= pincr,
 		    ("swap_reserved < incr on rlimit fail"));
 		goto out_error;
 	}
 
 	return (true);
 
 out_error:
 	if (ppsratecheck(&lastfail, &curfail, 1)) {
 		/* %jd takes an intmax_t; cast so the argument always matches. */
 		printf("uid %d, pid %d: swap reservation "
 		    "for %jd bytes failed\n",
 		    cred->cr_ruidinfo->ui_uid, curproc->p_pid,
 		    (intmax_t)incr);
 	}
 #ifdef RACCT
 	if (RACCT_ENABLED()) {
 		PROC_LOCK(curproc);
 		racct_sub(curproc, RACCT_SWAP, incr);
 		PROC_UNLOCK(curproc);
 	}
 #endif
 
 	return (false);
 }
 
 /*
  * Reserve incr bytes of swap for the current thread without applying any
  * policy or limit check.  incr must be a multiple of the page size.
  */
 void
 swap_reserve_force(vm_ooffset_t incr)
 {
 	u_long pages;
 
 	KASSERT((incr & PAGE_MASK) == 0, ("%s: incr: %ju & PAGE_MASK",
 	    __func__, (uintmax_t)incr));
 
 #ifdef RACCT
 	if (RACCT_ENABLED()) {
 		PROC_LOCK(curproc);
 		racct_add_force(curproc, RACCT_SWAP, incr);
 		PROC_UNLOCK(curproc);
 	}
 #endif
 
 	pages = atop(incr);
 	atomic_add_long(&swap_reserved, pages);
 	swap_reserve_force_rlimit(pages, curthread->td_ucred);
 }
 
 /*
  * Release decr bytes of reserved swap, crediting the current process's
  * credentials.  The process lock is held across the release.
  */
 void
 swap_release(vm_ooffset_t decr)
 {
 
 	PROC_LOCK(curproc);
 	swap_release_by_cred(decr, curproc->p_ucred);
 	PROC_UNLOCK(curproc);
 }
 
 /*
  * Release decr bytes of swap previously reserved on behalf of cred.
  *
  * decr must be a multiple of the page size.  The global swap_reserved
  * counter, the per-uid counter and, when RACCT is enabled, the RACCT_SWAP
  * resource are all reduced.  With INVARIANTS, underflow of the global
  * counter is asserted against.
  */
 void
 swap_release_by_cred(vm_ooffset_t decr, struct ucred *cred)
 {
 	u_long pdecr;
 #ifdef INVARIANTS
 	u_long prev;
 #endif
 
 	KASSERT((decr & PAGE_MASK) == 0, ("%s: decr: %ju & PAGE_MASK",
 	    __func__, (uintmax_t)decr));
 
 	pdecr = atop(decr);
 #ifdef INVARIANTS
 	prev = atomic_fetchadd_long(&swap_reserved, -pdecr);
 	KASSERT(prev >= pdecr, ("swap_reserved < decr"));
 #else
 	atomic_subtract_long(&swap_reserved, pdecr);
 #endif
 
 	swap_release_by_cred_rlimit(pdecr, cred);
 #ifdef RACCT
 	/* Use the same enablement test as the reservation paths. */
 	if (RACCT_ENABLED())
 		racct_sub_cred(cred, RACCT_SWAP, decr);
 #endif
 }
 
 static bool swap_pager_full = true; /* swap space exhaustion (task killing) */
-static bool swap_pager_almost_full = true; /* swap space exhaustion (w/hysteresis) */
+bool swap_pager_almost_full = true; /* swap space exhaustion (w/hysteresis) */
 static struct mtx swbuf_mtx;	/* to sync nsw_wcount_async */
 static int nsw_wcount_async;	/* limit async write buffers */
 static int nsw_wcount_async_max;/* assigned maximum			*/
 int nsw_cluster_max; 		/* maximum VOP I/O allowed		*/
 
 /* Handler for vm.swap_async_max; the definition is not in this part of the file. */
 static int sysctl_swap_async_max(SYSCTL_HANDLER_ARGS);
 SYSCTL_PROC(_vm, OID_AUTO, swap_async_max, CTLTYPE_INT | CTLFLAG_RW |
     CTLFLAG_MPSAFE, NULL, 0, sysctl_swap_async_max, "I",
     "Maximum running async swap ops");
 /* Handler for vm.swap_fragmentation; emits blist statistics per device. */
 static int sysctl_swap_fragmentation(SYSCTL_HANDLER_ARGS);
 SYSCTL_PROC(_vm, OID_AUTO, swap_fragmentation, CTLTYPE_STRING | CTLFLAG_RD |
     CTLFLAG_MPSAFE, NULL, 0, sysctl_swap_fragmentation, "A",
     "Swap Fragmentation Info");
 
 /*
  * Serializes lookup, insertion and removal of named objects on
  * swap_pager_object_list[].
  */
 static struct sx sw_alloc_sx;
 
 /*
  * "named" and "unnamed" anon region objects.  Try to reduce the overhead
  * of searching a named list by hashing it just a little.
  */
 
 /* Number of hash buckets; must be a power of 2 because it is used as a mask. */
 #define NOBJLISTS		8
 
 /*
  * Select the hash bucket for a handle.  The argument is parenthesized so
  * that the cast applies to the whole expression passed in.
  */
 #define NOBJLIST(handle)	\
 	(&swap_pager_object_list[((int)(intptr_t)(handle) >> 4) & (NOBJLISTS-1)])
 
 static struct pagerlst	swap_pager_object_list[NOBJLISTS];
 /* All four zones are created in swap_pager_swap_init(). */
 static uma_zone_t swwbuf_zone;		/* "swwbuf" pbuf zone */
 static uma_zone_t swrbuf_zone;		/* "swrbuf" pbuf zone */
 static uma_zone_t swblk_zone;		/* struct swblk allocations */
 static uma_zone_t swpctrie_zone;	/* pc-trie nodes for the swblk tries */
 
 /*
  * pagerops for OBJT_SWAP - "swap pager".  Some ops are also global procedure
  * calls hooked from other parts of the VM system and do not appear here.
  * (see vm/swap_pager.h).
  */
 static vm_object_t
 		swap_pager_alloc(void *handle, vm_ooffset_t size,
 		    vm_prot_t prot, vm_ooffset_t offset, struct ucred *);
 static void	swap_pager_dealloc(vm_object_t object);
 static int	swap_pager_getpages(vm_object_t, vm_page_t *, int, int *,
     int *);
 static int	swap_pager_getpages_async(vm_object_t, vm_page_t *, int, int *,
     int *, pgo_getpages_iodone_t, void *);
 static void	swap_pager_putpages(vm_object_t, vm_page_t *, int, int, int *);
 static boolean_t
 		swap_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before, int *after);
 static void	swap_pager_init(void);
 static void	swap_pager_unswapped(vm_page_t);
 static void	swap_pager_swapoff(struct swdevt *sp);
 static void	swap_pager_update_writecount(vm_object_t object,
     vm_offset_t start, vm_offset_t end);
 static void	swap_pager_release_writecount(vm_object_t object,
     vm_offset_t start, vm_offset_t end);
 static void	swap_pager_freespace_pgo(vm_object_t object, vm_pindex_t start,
     vm_size_t size);
 
 const struct pagerops swappagerops = {
 	.pgo_kvme_type = KVME_TYPE_SWAP,
 	.pgo_init =	swap_pager_init,	/* early system initialization of pager	*/
 	.pgo_alloc =	swap_pager_alloc,	/* allocate an OBJT_SWAP object */
 	.pgo_dealloc =	swap_pager_dealloc,	/* deallocate an OBJT_SWAP object */
 	.pgo_getpages =	swap_pager_getpages,	/* pagein */
 	.pgo_getpages_async = swap_pager_getpages_async, /* pagein (async) */
 	.pgo_putpages =	swap_pager_putpages,	/* pageout */
 	.pgo_haspage =	swap_pager_haspage,	/* get backing store status for page */
 	.pgo_pageunswapped = swap_pager_unswapped, /* remove swap related to page */
 	.pgo_update_writecount = swap_pager_update_writecount,
 	.pgo_release_writecount = swap_pager_release_writecount,
 	.pgo_freespace = swap_pager_freespace_pgo, /* free swap for a page range */
 };
 
 /*
  * swap_*() routines are externally accessible.  swp_*() routines are
  * internal.
  */
 /*
  * Hysteresis thresholds used by swp_sizecheck(): swap_pager_almost_full is
  * set when swap_pager_avail drops below nswap_lowat and cleared only once
  * it rises above nswap_hiwat.
  */
 static int nswap_lowat = 128;	/* in pages, swap_pager_almost_full warn */
 static int nswap_hiwat = 512;	/* in pages, swap_pager_almost_full warn */
 
 /* vm.dmmax exports nsw_cluster_max read-only. */
 SYSCTL_INT(_vm, OID_AUTO, dmmax, CTLFLAG_RD, &nsw_cluster_max, 0,
     "Maximum size of a swap block in pages");
 
 static void	swp_sizecheck(void);
 static void	swp_pager_async_iodone(struct buf *bp);
 static bool	swp_pager_swblk_empty(struct swblk *sb, int start, int limit);
 static void	swp_pager_free_empty_swblk(vm_object_t, struct swblk *sb);
 static int	swapongeom(struct vnode *);
 static int	swaponvp(struct thread *, struct vnode *, u_long);
 static int	swapoff_one(struct swdevt *sp, struct ucred *cred,
 		    u_int flags);
 
 /*
  * Swap bitmap functions
  */
 static void	swp_pager_freeswapspace(const struct page_range *range);
 static daddr_t	swp_pager_getswapspace(int *npages);
 
 /*
  * Metadata functions
  */
 static daddr_t swp_pager_meta_build(struct pctrie_iter *, vm_object_t object,
 	vm_pindex_t, daddr_t, bool);
 static void swp_pager_meta_free(vm_object_t, vm_pindex_t, vm_pindex_t,
     vm_size_t *);
 static void swp_pager_meta_transfer(vm_object_t src, vm_object_t dst,
     vm_pindex_t pindex, vm_pindex_t count);
 static void swp_pager_meta_free_all(vm_object_t);
 static daddr_t swp_pager_meta_lookup(struct pctrie_iter *, vm_pindex_t);
 
 static void
 swp_pager_init_freerange(struct page_range *range)
 {
 	range->start = SWAPBLK_NONE;
 	range->num = 0;
 }
 
 /*
  * Add swap block addr to the pending free range.  When addr does not
  * directly follow the current run, the run is freed first and a new
  * one-block run is started at addr.
  */
 static void
 swp_pager_update_freerange(struct page_range *range, daddr_t addr)
 {
 	if (range->start + range->num != addr) {
 		/* Not contiguous: flush what we have and restart at addr. */
 		swp_pager_freeswapspace(range);
 		range->start = addr;
 		range->num = 0;
 	}
 	range->num++;
 }
 
 /*
  * pc-trie node allocator.  Allocation never sleeps; when the caller is the
  * pageout process it may also use the zone's reserve.
  */
 static void *
 swblk_trie_alloc(struct pctrie *ptree)
 {
 	int flags;
 
 	flags = M_NOWAIT;
 	if (curproc == pageproc)
 		flags |= M_USE_RESERVE;
 	return (uma_zalloc(swpctrie_zone, flags));
 }
 
 /*
  * pc-trie node deallocator: return node to swpctrie_zone.  ptree is not
  * used.
  */
 static void
 swblk_trie_free(struct pctrie *ptree, void *node)
 {
 
 	uma_zfree(swpctrie_zone, node);
 }
 
 /*
  * Return the index within sb at which a scan that begins at pindex should
  * start: 0 when sb is NULL or begins at or after pindex, otherwise the
  * offset of pindex within its SWAP_META_PAGES-sized block.
  */
 static int
 swblk_start(struct swblk *sb, vm_pindex_t pindex)
 {
 	if (sb == NULL || sb->p >= pindex)
 		return (0);
 	return (pindex % SWAP_META_PAGES);
 }
 
 /*
  * Instantiate the SWAP_PCTRIE_* accessors used below for struct swblk,
  * keyed on the "p" field, with swblk_trie_alloc()/swblk_trie_free() as the
  * node allocator and deallocator.
  */
 PCTRIE_DEFINE(SWAP, swblk, p, swblk_trie_alloc, swblk_trie_free);
 
 /*
  * Look up the swblk of object whose key is pindex rounded down to a
  * SWAP_META_PAGES boundary.
  */
 static struct swblk *
 swblk_lookup(vm_object_t object, vm_pindex_t pindex)
 {
 	vm_pindex_t base;
 
 	base = rounddown(pindex, SWAP_META_PAGES);
 	return (SWAP_PCTRIE_LOOKUP(&object->un_pager.swp.swp_blks, base));
 }
 
 /*
  * Remove the entry keyed by sb->p from the swblk trie of object.
  */
 static void
 swblk_lookup_remove(vm_object_t object, struct swblk *sb)
 {
 	struct pctrie *trie;
 
 	trie = &object->un_pager.swp.swp_blks;
 	SWAP_PCTRIE_REMOVE(trie, sb->p);
 }
 
 /*
  * Report whether the swblk trie of object holds no entries.
  */
 static bool
 swblk_is_empty(vm_object_t object)
 {
 	struct pctrie *trie;
 
 	trie = &object->un_pager.swp.swp_blks;
 	return (pctrie_is_empty(trie));
 }
 
 /*
  * Using iterator blks, perform a greater-or-equal lookup starting from
  * pindex rounded down to a SWAP_META_PAGES boundary.
  */
 static struct swblk *
 swblk_iter_lookup_ge(struct pctrie_iter *blks, vm_pindex_t pindex)
 {
 	vm_pindex_t base;
 
 	base = rounddown(pindex, SWAP_META_PAGES);
 	return (SWAP_PCTRIE_ITER_LOOKUP_GE(blks, base));
 }
 
 /*
  * Bind iterator blks to the swblk trie of object without performing a
  * lookup.  The object must be locked and must have OBJ_SWAP set.
  */
 static void
 swblk_iter_init_only(struct pctrie_iter *blks, vm_object_t object)
 {
 	struct pctrie *trie;
 
 	VM_OBJECT_ASSERT_LOCKED(object);
 	MPASS((object->flags & OBJ_SWAP) != 0);
 	trie = &object->un_pager.swp.swp_blks;
 	pctrie_iter_init(blks, trie);
 }
 
 
 /*
  * Bind iterator blks to object and return the result of a
  * greater-or-equal lookup for pindex.
  */
 static struct swblk *
 swblk_iter_init(struct pctrie_iter *blks, vm_object_t object,
     vm_pindex_t pindex)
 {
 	struct swblk *sb;
 
 	swblk_iter_init_only(blks, object);
 	sb = swblk_iter_lookup_ge(blks, pindex);
 	return (sb);
 }
 
 /*
  * Re-bind iterator blks to object and look up the swblk keyed by pindex
  * rounded down to a SWAP_META_PAGES boundary.
  */
 static struct swblk *
 swblk_iter_reinit(struct pctrie_iter *blks, vm_object_t object,
     vm_pindex_t pindex)
 {
 	vm_pindex_t base;
 
 	swblk_iter_init_only(blks, object);
 	base = rounddown(pindex, SWAP_META_PAGES);
 	return (SWAP_PCTRIE_ITER_LOOKUP(blks, base));
 }
 
 /*
  * Bind iterator blks to the swblk trie of object with the given limit and
  * return the result of a greater-or-equal lookup for pindex.  The object
  * must be locked and must have OBJ_SWAP set.
  */
 static struct swblk *
 swblk_iter_limit_init(struct pctrie_iter *blks, vm_object_t object,
     vm_pindex_t pindex, vm_pindex_t limit)
 {
 	struct pctrie *trie;
 
 	VM_OBJECT_ASSERT_LOCKED(object);
 	MPASS((object->flags & OBJ_SWAP) != 0);
 	trie = &object->un_pager.swp.swp_blks;
 	pctrie_iter_limit_init(blks, trie, limit);
 	return (swblk_iter_lookup_ge(blks, pindex));
 }
 
 /*
  * Step iterator blks forward by SWAP_META_PAGES page indices and return
  * what SWAP_PCTRIE_ITER_JUMP_GE yields; presumably the next swblk at or
  * beyond that index, or NULL when there is none (confirm against the
  * pctrie iterator API).
  */
 static struct swblk *
 swblk_iter_next(struct pctrie_iter *blks)
 {
 	return (SWAP_PCTRIE_ITER_JUMP_GE(blks, SWAP_META_PAGES));
 }
 
 /*
  * Using iterator blks, look up the swblk keyed by pindex rounded down to
  * a SWAP_META_PAGES boundary.
  */
 static struct swblk *
 swblk_iter_lookup(struct pctrie_iter *blks, vm_pindex_t pindex)
 {
 	vm_pindex_t base;
 
 	base = rounddown(pindex, SWAP_META_PAGES);
 	return (SWAP_PCTRIE_ITER_LOOKUP(blks, base));
 }
 
 /*
  * Insert sb into the trie through iterator blks and return the status from
  * SWAP_PCTRIE_ITER_INSERT (presumably 0 on success and an errno value when
  * a trie node cannot be allocated; confirm against the pctrie API).
  */
 static int
 swblk_iter_insert(struct pctrie_iter *blks, struct swblk *sb)
 {
 	return (SWAP_PCTRIE_ITER_INSERT(blks, sb));
 }
 
 /*
  * Remove from the trie the entry at the current position of iterator blks.
  */
 static void
 swblk_iter_remove(struct pctrie_iter *blks)
 {
 	SWAP_PCTRIE_ITER_REMOVE(blks);
 }
 
 /*
  * SWP_SIZECHECK() -	update swap_pager_full indication
  *
  *	update the swap_pager_almost_full indication and warn when we are
  *	about to run out of swap space, using lowat/hiwat hysteresis.
  *
  *	Clear swap_pager_full ( task killing ) indication when lowat is met.
  *
  *	No restrictions on call
  *	This routine may not block.
  */
 /*
  * Re-evaluate the swap exhaustion flags from swap_pager_avail.  At or
  * above nswap_lowat the "full" flag is cleared, and the "almost full"
  * flag is cleared only above nswap_hiwat.  Below nswap_lowat the "almost
  * full" flag is set, with a one-time console message on the transition.
  */
 static void
 swp_sizecheck(void)
 {
 
 	if (swap_pager_avail >= nswap_lowat) {
 		swap_pager_full = false;
 		if (swap_pager_avail > nswap_hiwat)
 			swap_pager_almost_full = false;
 		return;
 	}
 
 	/* Below the low watermark; only report the transition once. */
 	if (swap_pager_almost_full)
 		return;
 	printf("swap_pager: out of swap space\n");
 	swap_pager_almost_full = true;
 }
 
 /*
  * SWAP_PAGER_INIT() -	initialize the swap pager!
  *
  *	Expected to be started from system init.  NOTE:  This code is run
  *	before much else so be careful what you depend on.  Most of the VM
  *	system has yet to be initialized at this point.
  */
 /*
  * Early pager initialization: empty the named-object hash buckets, set up
  * the pager locks, and compute nsw_cluster_max.
  */
 static void
 swap_pager_init(void)
 {
 	int bucket;
 
 	/* Every named-object hash bucket starts out empty. */
 	for (bucket = 0; bucket < NOBJLISTS; bucket++)
 		TAILQ_INIT(&swap_pager_object_list[bucket]);
 
 	mtx_init(&sw_dev_mtx, "swapdev", NULL, MTX_DEF);
 	sx_init(&sw_alloc_sx, "swspsx");
 	sx_init(&swdev_syscall_lock, "swsysc");
 
 	/*
 	 * nsw_cluster_max is bounded both by the bp->b_pages[] array
 	 * (maxphys / PAGE_SIZE entries) and by MAX_PAGEOUT_CLUSTER.  Swap
 	 * operations are additionally constrained by the swap device
 	 * interleave stripe size.
 	 *
 	 * This is set here, early, so that GEOM_ELI can see it.
 	 */
 	nsw_cluster_max = min(maxphys / PAGE_SIZE, MAX_PAGEOUT_CLUSTER);
 }
 
 /*
  * SWAP_PAGER_SWAP_INIT() - swap pager initialization from pageout process
  *
  *	Expected to be started from pageout process once, prior to entering
  *	its main loop.
  */
 void
 swap_pager_swap_init(void)
 {
 	unsigned long n, n2;
 
 	/*
 	 * Number of in-transit swap bp operations.  Don't
 	 * exhaust the pbufs completely.  Make sure we
 	 * initialize workable values (0 will work for hysteresis
 	 * but it isn't very efficient).
 	 *
 	 * Currently we hardwire nsw_wcount_async to 4.  This limit is
 	 * designed to prevent other I/O from having high latencies due to
 	 * our pageout I/O.  The value 4 works well for one or two active swap
 	 * devices but is probably a little low if you have more.  Even so,
 	 * a higher value would probably generate only a limited improvement
 	 * with three or four active swap devices since the system does not
 	 * typically have to pageout at extreme bandwidths.   We will want
 	 * at least 2 per swap device, and 4 is a pretty good value if you
 	 * have one NFS swap device due to the command/ack latency over NFS.
 	 * So it all works out pretty well.
 	 *
 	 * nsw_cluster_max is initialized in swap_pager_init().
 	 */
 
 	nsw_wcount_async = 4;
 	nsw_wcount_async_max = nsw_wcount_async;
 	mtx_init(&swbuf_mtx, "async swbuf mutex", NULL, MTX_DEF);
 
 	/* Write and read pbuf zones, sized as fractions of nswbuf. */
 	swwbuf_zone = pbuf_zsecond_create("swwbuf", nswbuf / 4);
 	swrbuf_zone = pbuf_zsecond_create("swrbuf", nswbuf / 2);
 
 	/*
 	 * Initialize our zone, taking the user's requested size or
 	 * estimating the number we need based on the number of pages
 	 * in the system.
 	 */
 	n = maxswzone != 0 ? maxswzone / sizeof(struct swblk) :
 	    vm_cnt.v_page_count / 2;
 	swpctrie_zone = uma_zcreate("swpctrie", pctrie_node_size(), NULL, NULL,
 	    pctrie_zone_init, NULL, UMA_ALIGN_PTR, 0);
 	swblk_zone = uma_zcreate("swblk", sizeof(struct swblk), NULL, NULL,
 	    NULL, NULL, _Alignof(struct swblk) - 1, 0);
 	/* Remember the initial request so a shortfall can be reported below. */
 	n2 = n;
 	do {
 		if (uma_zone_reserve_kva(swblk_zone, n))
 			break;
 		/*
 		 * if the allocation failed, try a zone two thirds the
 		 * size of the previous attempt.
 		 */
 		n -= ((n + 2) / 3);
 	} while (n > 0);
 
 	/*
 	 * Often uma_zone_reserve_kva() cannot reserve exactly the
 	 * requested size.  Account for the difference when
 	 * calculating swap_maxpages.
 	 */
 	n = uma_zone_get_max(swblk_zone);
 
 	if (n < n2)
 		printf("Swap blk zone entries changed from %lu to %lu.\n",
 		    n2, n);
 	/* absolute maximum we can handle assuming 100% efficiency */
 	swap_maxpages = n * SWAP_META_PAGES;
 	swzone = n * sizeof(struct swblk);
 	/* Reserve as many trie nodes as swblk entries; failure is only reported. */
 	if (!uma_zone_reserve_kva(swpctrie_zone, n))
 		printf("Cannot reserve swap pctrie zone, "
 		    "reduce kern.maxswzone.\n");
 }
 
 /*
  * Initialize the swap pager state of object.  When cred is non-NULL, size
  * bytes of swap are reserved against it first; if that fails the object
  * is left untouched and false is returned.  On success the object records
  * handle and, when cred is non-NULL, a held reference to cred together
  * with the charge.  The offset argument is not used.
  */
 bool
 swap_pager_init_object(vm_object_t object, void *handle, struct ucred *cred,
     vm_ooffset_t size, vm_ooffset_t offset)
 {
 	if (cred != NULL && !swap_reserve_by_cred(size, cred))
 		return (false);
 
 	object->un_pager.swp.writemappings = 0;
 	object->handle = handle;
 	if (cred != NULL) {
 		crhold(cred);
 		object->cred = cred;
 		object->charge = size;
 	}
 	return (true);
 }
 
 /*
  * Allocate a VM object of type otype large enough to cover offset + size
  * (rounded up to whole pages) and initialize its swap pager state.
  * Returns NULL, after dropping the new object, when the swap reservation
  * fails.
  */
 static vm_object_t
 swap_pager_alloc_init(objtype_t otype, void *handle, struct ucred *cred,
     vm_ooffset_t size, vm_ooffset_t offset)
 {
 	vm_object_t object;
 	vm_pindex_t npages;
 
 	/*
 	 * vm_object_allocate() initializes the un_pager.swp.swp_blks trie
 	 * so that other threads observe it in the correct order.
 	 */
 	npages = OFF_TO_IDX(offset + PAGE_MASK + size);
 	object = vm_object_allocate(otype, npages);
 
 	if (swap_pager_init_object(object, handle, cred, size, offset))
 		return (object);
 
 	vm_object_deallocate(object);
 	return (NULL);
 }
 
 /*
  * SWAP_PAGER_ALLOC() -	allocate a new OBJT_SWAP VM object and instantiate
  *			its metadata structures.
  *
  *	This routine is called from the mmap and fork code to create a new
  *	OBJT_SWAP object.
  *
  *	This routine must ensure that no live duplicate is created for
  *	the named object request, which is protected against by
  *	holding the sw_alloc_sx lock in case handle != NULL.
  */
 /*
  * pgo_alloc method.  An anonymous request (handle == NULL) always creates
  * a fresh object.  A named request returns the existing object found for
  * handle, or creates one and links it onto the handle's hash bucket; the
  * lookup and insertion happen under sw_alloc_sx.  The prot argument is not
  * used.
  */
 static vm_object_t
 swap_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot,
     vm_ooffset_t offset, struct ucred *cred)
 {
 	vm_object_t object;
 
 	if (handle == NULL) {
 		return (swap_pager_alloc_init(OBJT_SWAP, handle, cred,
 		    size, offset));
 	}
 
 	/*
 	 * Reference the existing named region or allocate a new one.
 	 * There should be no race against swp_pager_meta_build(), as
 	 * called from vm_page_remove(), with respect to the handle lookup.
 	 */
 	sx_xlock(&sw_alloc_sx);
 	object = vm_pager_object_lookup(NOBJLIST(handle), handle);
 	if (object == NULL) {
 		object = swap_pager_alloc_init(OBJT_SWAP, handle, cred,
 		    size, offset);
 		if (object != NULL) {
 			TAILQ_INSERT_TAIL(NOBJLIST(object->handle),
 			    object, pager_object_list);
 		}
 	}
 	sx_xunlock(&sw_alloc_sx);
 	return (object);
 }
 
 /*
  * SWAP_PAGER_DEALLOC() -	remove swap metadata from object
  *
  *	The swap backing for the object is destroyed.  The code is
  *	designed such that we can reinstantiate it later, but this
  *	routine is typically called only when the entire object is
  *	about to be destroyed.
  *
  *	The object must be locked.
  */
 static void
 swap_pager_dealloc(vm_object_t object)
 {
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	KASSERT((object->flags & OBJ_DEAD) != 0, ("dealloc of reachable obj"));
 
 	/*
 	 * Remove from list right away so lookups will fail if we block for
 	 * pageout completion.
 	 */
 	if ((object->flags & OBJ_ANON) == 0 && object->handle != NULL) {
 		/*
 		 * The object lock is dropped while sw_alloc_sx is taken and
 		 * reacquired afterwards (presumably for lock ordering;
 		 * confirm).
 		 */
 		VM_OBJECT_WUNLOCK(object);
 		sx_xlock(&sw_alloc_sx);
 		TAILQ_REMOVE(NOBJLIST(object->handle), object,
 		    pager_object_list);
 		sx_xunlock(&sw_alloc_sx);
 		VM_OBJECT_WLOCK(object);
 	}
 
 	/* Wait for paging in progress on the object before tearing down. */
 	vm_object_pip_wait(object, "swpdea");
 
 	/*
 	 * Free all remaining metadata.  We only bother to free it from
 	 * the swap meta data.  We do not attempt to free swapblk's still
 	 * associated with vm_page_t's for this object.  We do not care
 	 * if paging is still in progress on some objects.
 	 */
 	swp_pager_meta_free_all(object);
 	object->handle = NULL;
 	object->type = OBJT_DEAD;
 
 	/*
 	 * Release the allocation charge.
 	 */
 	if (object->cred != NULL) {
 		swap_release_by_cred(object->charge, object->cred);
 		object->charge = 0;
 		/* Drop the reference taken when the charge was recorded. */
 		crfree(object->cred);
 		object->cred = NULL;
 	}
 
 	/*
 	 * Hide the object from swap_pager_swapoff().
 	 */
 	vm_object_clear_flag(object, OBJ_SWAP);
 }
 
 /************************************************************************
  *			SWAP PAGER BITMAP ROUTINES			*
  ************************************************************************/
 
 /*
  * SWP_PAGER_GETSWAPSPACE() -	allocate raw swap space
  *
  *	Allocate swap for up to the requested number of pages.  The
  *	starting swap block number (a page index) is returned or
  *	SWAPBLK_NONE if the allocation failed.
  *
  *	Also has the side effect of advising that somebody made a mistake
  *	when they configured swap and didn't configure enough.
  *
  *	This routine may not sleep.
  *
  *	We allocate in round-robin fashion from the configured devices.
  */
 static daddr_t
 swp_pager_getswapspace(int *io_npages)
 {
 	daddr_t blk;
 	struct swdevt *sp;
 	int mpages, npages;
 
 	KASSERT(*io_npages >= 1,
 	    ("%s: npages not positive", __func__));
 	blk = SWAPBLK_NONE;
 	/*
 	 * mpages and npages are the two size arguments handed to
 	 * blist_alloc(); npages is passed by pointer and is what gets
 	 * reported back to the caller on success.
 	 */
 	mpages = *io_npages;
 	npages = imin(BLIST_MAX_ALLOC, mpages);
 	mtx_lock(&sw_dev_mtx);
 	/* Start at the device after the one used by the last allocation. */
 	sp = swdevhd;
 	while (!TAILQ_EMPTY(&swtailq)) {
 		if (sp == NULL)
 			sp = TAILQ_FIRST(&swtailq);
 		/* Devices being closed are never allocated from. */
 		if ((sp->sw_flags & SW_CLOSING) == 0)
 			blk = blist_alloc(sp->sw_blist, &npages, mpages);
 		if (blk != SWAPBLK_NONE)
 			break;
 		sp = TAILQ_NEXT(sp, sw_list);
 		if (swdevhd == sp) {
 			/*
 			 * Back at the starting device with nothing found:
 			 * give up if already down to one page, otherwise
 			 * retry the whole list with a smaller request.
 			 */
 			if (npages == 1)
 				break;
 			mpages = npages - 1;
 			npages >>= 1;
 		}
 	}
 	if (blk != SWAPBLK_NONE) {
 		*io_npages = npages;
 		/* Convert the per-device block number to a global one. */
 		blk += sp->sw_first;
 		sp->sw_used += npages;
 		swap_pager_avail -= npages;
 		swp_sizecheck();
 		swdevhd = TAILQ_NEXT(sp, sw_list);
 	} else {
 		/* Report the failure only on the transition to "full". */
 		if (!swap_pager_full) {
 			printf("swp_pager_getswapspace(%d): failed\n",
 			    *io_npages);
 			swap_pager_full = swap_pager_almost_full = true;
 		}
 		swdevhd = NULL;
 	}
 	mtx_unlock(&sw_dev_mtx);
 	return (blk);
 }
 
 /*
  * Report whether swap block blk lies in the half-open range
  * [sw_first, sw_end) of device sp.
  */
 static bool
 swp_pager_isondev(daddr_t blk, struct swdevt *sp)
 {
 
 	if (blk < sp->sw_first)
 		return (false);
 	return (blk < sp->sw_end);
 }
 
 static void
 swp_pager_strategy(struct buf *bp)
 {
 	struct swdevt *sp;
 
 	mtx_lock(&sw_dev_mtx);
 	TAILQ_FOREACH(sp, &swtailq, sw_list) {
 		if (swp_pager_isondev(bp->b_blkno, sp)) {
 			mtx_unlock(&sw_dev_mtx);
 			if ((sp->sw_flags & SW_UNMAPPED) != 0 &&
 			    unmapped_buf_allowed) {
 				bp->b_data = unmapped_buf;
 				bp->b_offset = 0;
 			} else {
 				pmap_qenter((vm_offset_t)bp->b_data,
 				    &bp->b_pages[0], bp->b_bcount / PAGE_SIZE);
 			}
 			sp->sw_strategy(bp, sp);
 			return;
 		}
 	}
 	panic("Swapdev not found");
 }
 
 /*
  * SWP_PAGER_FREESWAPSPACE() -	free raw swap space
  *
  *	This routine returns the specified swap blocks back to the bitmap.
  *
  *	This routine may not sleep.
  */
 static void
 swp_pager_freeswapspace(const struct page_range *range)
 {
 	daddr_t blk, npages;
 	struct swdevt *sp;
 
 	blk = range->start;
 	npages = range->num;
 	if (npages == 0)
 		return;
 	mtx_lock(&sw_dev_mtx);
 	TAILQ_FOREACH(sp, &swtailq, sw_list) {
 		if (swp_pager_isondev(blk, sp)) {
 			sp->sw_used -= npages;
 			/*
 			 * If we are attempting to stop swapping on
 			 * this device, we don't want to mark any
 			 * blocks free lest they be reused.
 			 */
 			if ((sp->sw_flags & SW_CLOSING) == 0) {
 				blist_free(sp->sw_blist, blk - sp->sw_first,
 				    npages);
 				swap_pager_avail += npages;
 				swp_sizecheck();
 			}
 			mtx_unlock(&sw_dev_mtx);
 			return;
 		}
 	}
 	panic("Swapdev not found");
 }
 
 /*
  * SYSCTL_SWAP_FRAGMENTATION() -	produce raw swap space stats
  */
 static int
 sysctl_swap_fragmentation(SYSCTL_HANDLER_ARGS)
 {
 	struct sbuf sbuf;
 	struct swdevt *sp;
 	const char *devname;
 	int error;
 
 	error = sysctl_wire_old_buffer(req, 0);
 	if (error != 0)
 		return (error);
 	sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
 	mtx_lock(&sw_dev_mtx);
 	TAILQ_FOREACH(sp, &swtailq, sw_list) {
 		if (vn_isdisk(sp->sw_vp))
 			devname = devtoname(sp->sw_vp->v_rdev);
 		else
 			devname = "[file]";
 		sbuf_printf(&sbuf, "\nFree space on device %s:\n", devname);
 		blist_stats(sp->sw_blist, &sbuf);
 	}
 	mtx_unlock(&sw_dev_mtx);
 	error = sbuf_finish(&sbuf);
 	sbuf_delete(&sbuf);
 	return (error);
 }
 
 /*
  * SWAP_PAGER_FREESPACE() -	frees swap blocks associated with a page
  *				range within an object.
  *
  *	This routine removes swapblk assignments from swap metadata.
  *
  *	The external callers of this routine typically have already destroyed
  *	or renamed vm_page_t's associated with this range in the object so
  *	we should be ok.
  *
  *	The object must be locked.
  */
 void
 swap_pager_freespace(vm_object_t object, vm_pindex_t start, vm_size_t size,
     vm_size_t *freed)
 {
 	/* The caller must pass an object that has OBJ_SWAP set. */
 	MPASS((object->flags & OBJ_SWAP) != 0);
 
 	/* "freed" is passed through to swp_pager_meta_free() unchanged. */
 	swp_pager_meta_free(object, start, size, freed);
 }
 
 /*
  * Variant of swap_pager_freespace() without the "freed" out-parameter:
  * it passes NULL for it to swp_pager_meta_free().
  * NOTE(review): presumably installed as a pager-ops method, judging by the
  * "_pgo" suffix -- confirm against the pagerops table.
  */
 static void
 swap_pager_freespace_pgo(vm_object_t object, vm_pindex_t start, vm_size_t size)
 {
 	/* The caller must pass an object that has OBJ_SWAP set. */
 	MPASS((object->flags & OBJ_SWAP) != 0);
 
 	swp_pager_meta_free(object, start, size, NULL);
 }
 
 /*
  * SWAP_PAGER_RESERVE() - reserve swap blocks in object
  *
  *	Assigns swap blocks to the specified range within the object.  The
  *	swap blocks are not zeroed.  Any previous swap assignment is destroyed.
  *
  *	Returns 0 on success, -1 on failure.
  */
 int
 swap_pager_reserve(vm_object_t object, vm_pindex_t start, vm_pindex_t size)
 {
 	struct pctrie_iter blks;
 	struct page_range range;
 	daddr_t addr, blk;
 	vm_pindex_t i, j;
 	int n, rv;
 
 	rv = 0;
 	/* "range" batches displaced swap blocks so they are freed in runs. */
 	swp_pager_init_freerange(&range);
 	VM_OBJECT_WLOCK(object);
 	swblk_iter_init_only(&blks, object);
 	for (i = 0; i < size; i += n) {
 		/* Ask for the rest of the range; the allocator may shrink n. */
 		n = MIN(size - i, INT_MAX);
 		blk = swp_pager_getswapspace(&n);
 		if (blk == SWAPBLK_NONE) {
 			/*
 			 * Out of swap: drop the assignments made so far for
 			 * the first i pages and report failure.
 			 */
 			swp_pager_meta_free(object, start, i, NULL);
 			rv = -1;
 			break;
 		}
 		for (j = 0; j < n; ++j) {
 			/*
 			 * Install the new block; a previously assigned block
 			 * is returned and queued for freeing.
 			 */
 			addr = swp_pager_meta_build(&blks, object,
 			    start + i + j, blk + j, false);
 			if (addr != SWAPBLK_NONE)
 				swp_pager_update_freerange(&range, addr);
 		}
 	}
 	/*
 	 * Flush the blocks still pending in "range".  This is done on the
 	 * failure path as well: those blocks have already been removed from
 	 * the object's metadata and would otherwise never be returned to the
 	 * swap device.
 	 */
 	swp_pager_freeswapspace(&range);
 	VM_OBJECT_WUNLOCK(object);
 	return (rv);
 }
 
 /*
  * SWAP_PAGER_COPY() -  copy blocks from source pager to destination pager
  *			and destroy the source.
  *
  *	Copy any valid swapblks from the source to the destination.  In
  *	cases where both the source and destination have a valid swapblk,
  *	we keep the destination's.
  *
  *	This routine is allowed to sleep.  It may sleep allocating metadata
  *	indirectly through swp_pager_meta_build().
  *
  *	The source object contains no vm_page_t's (which is just as well)
  *
  *	The source and destination objects must be locked.
  *	Both object locks may temporarily be released.
  */
 void
 swap_pager_copy(vm_object_t srcobject, vm_object_t dstobject,
     vm_pindex_t offset, int destroysource)
 {
 	bool unlist;
 
 	VM_OBJECT_ASSERT_WLOCKED(srcobject);
 	VM_OBJECT_ASSERT_WLOCKED(dstobject);
 
 	/*
 	 * A source that is being destroyed, is not anonymous and has a
 	 * handle is taken off the swap pager's object list up front.
 	 */
 	unlist = destroysource != 0 && (srcobject->flags & OBJ_ANON) == 0 &&
 	    srcobject->handle != NULL;
 	if (unlist) {
 		/*
 		 * Both object locks are dropped around the sw_alloc_sx
 		 * section and retaken afterwards, destination first.
 		 */
 		VM_OBJECT_WUNLOCK(srcobject);
 		VM_OBJECT_WUNLOCK(dstobject);
 		sx_xlock(&sw_alloc_sx);
 		TAILQ_REMOVE(NOBJLIST(srcobject->handle), srcobject,
 		    pager_object_list);
 		sx_xunlock(&sw_alloc_sx);
 		VM_OBJECT_WLOCK(dstobject);
 		VM_OBJECT_WLOCK(srcobject);
 	}
 
 	/* Move the source's swap blocks over to the destination. */
 	swp_pager_meta_transfer(srcobject, dstobject, offset, dstobject->size);
 
 	/* Whatever is still assigned in a doomed source is released. */
 	if (destroysource != 0)
 		swp_pager_meta_free_all(srcobject);
 }
 
 /*
  * SWP_PAGER_HASPAGE_ITER() -	determine if we have good backing store for
  *				the requested page, accessed with the given
  *				iterator.
  *
  *	We determine whether good backing store exists for the requested
  *	page and return TRUE if it does, FALSE if it doesn't.
  *
  *	If TRUE, we also try to determine how much valid, contiguous backing
  *	store exists before and after the requested page.
  */
 static boolean_t
 swp_pager_haspage_iter(vm_pindex_t pindex, int *before, int *after,
     struct pctrie_iter *blks)
 {
 	daddr_t base, cur;
 	int run;
 
 	/* Without a swap block at pindex itself there is nothing to report. */
 	base = swp_pager_meta_lookup(blks, pindex);
 	if (base == SWAPBLK_NONE) {
 		if (before != NULL)
 			*before = 0;
 		if (after != NULL)
 			*after = 0;
 		return (FALSE);
 	}
 
 	/*
 	 * Count how many preceding pages map to the swap blocks directly
 	 * below "base", without stepping below page index 0.
 	 */
 	if (before != NULL) {
 		run = 1;
 		while (run < SWB_NPAGES && run <= pindex) {
 			cur = swp_pager_meta_lookup(blks, pindex - run);
 			if (cur != base - run)
 				break;
 			run++;
 		}
 		*before = run - 1;
 	}
 
 	/*
 	 * Count how many following pages map to the swap blocks directly
 	 * above "base".
 	 */
 	if (after != NULL) {
 		run = 1;
 		while (run < SWB_NPAGES) {
 			cur = swp_pager_meta_lookup(blks, pindex + run);
 			if (cur != base + run)
 				break;
 			run++;
 		}
 		*after = run - 1;
 	}
 	return (TRUE);
 }
 
 /*
  * SWAP_PAGER_HASPAGE() -	determine if we have good backing store for
  *				the requested page, in the given object.
  *
  *	We determine whether good backing store exists for the requested
  *	page and return TRUE if it does, FALSE if it doesn't.
  *
  *	If TRUE, we also try to determine how much valid, contiguous backing
  *	store exists before and after the requested page.
  */
 static boolean_t
 swap_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before,
     int *after)
 {
 	struct pctrie_iter blks;
 
 	/* Set up a fresh swblk iterator on the object and delegate. */
 	swblk_iter_init_only(&blks, object);
 	return (swp_pager_haspage_iter(pindex, before, after, &blks));
 }
 
 /*
  * Per-page accounting for dropping a page's swap backing: count a completed
  * deferred free if PGA_SWAP_FREE was pending, then clear both PGA_SWAP_FREE
  * and PGA_SWAP_SPACE on the page.  The swap metadata itself is not touched
  * here.
  */
 static void
 swap_pager_unswapped_acct(vm_page_t m)
 {
 	KASSERT((m->object->flags & OBJ_SWAP) != 0,
 	    ("Free object not swappable"));
 	/* A set PGA_SWAP_FREE means a deferred free is now being completed. */
 	if ((m->a.flags & PGA_SWAP_FREE) != 0)
 		counter_u64_add(swap_free_completed, 1);
 	vm_page_aflag_clear(m, PGA_SWAP_FREE | PGA_SWAP_SPACE);
 
 	/*
 	 * The meta data only exists if the object is OBJT_SWAP
 	 * and even then might not be allocated yet.
 	 */
 }
 
 /*
  * SWAP_PAGER_PAGE_UNSWAPPED() - remove swap backing store related to page
  *
  *	This removes any associated swap backing store, whether valid or
  *	not, from the page.
  *
  *	This routine is typically called when a page is made dirty, at
  *	which point any associated swap can be freed.  MADV_FREE also
  *	calls us in a special-case situation
  *
  *	NOTE!!!  If the page is clean and the swap was valid, the caller
  *	should make the page dirty before calling this routine.  This routine
  *	does NOT change the m->dirty status of the page.  Also: MADV_FREE
  *	depends on it.
  *
  *	This routine may not sleep.
  *
  *	The object containing the page may be locked.
  */
 static void
 swap_pager_unswapped(vm_page_t m)
 {
 	struct page_range range;
 	struct swblk *sb;
 	daddr_t *slot;
 	vm_object_t obj;
 
 	obj = m->object;
 
 	/*
 	 * Handle enqueueing deferred frees first.  Without the object
 	 * write lock the metadata cannot be edited here, so the page is
 	 * only flagged and the page daemon clears the space later.
 	 */
 	if (!VM_OBJECT_WOWNED(obj)) {
 		VM_PAGE_OBJECT_BUSY_ASSERT(m);
 		/*
 		 * Synchronization is the caller's job, typically achieved
 		 * by calling unswapped() only on a clean-to-dirty
 		 * transition; races are nevertheless handled harmlessly.
 		 */
 		if ((m->a.flags & (PGA_SWAP_SPACE | PGA_SWAP_FREE)) ==
 		    PGA_SWAP_SPACE) {
 			vm_page_aflag_set(m, PGA_SWAP_FREE);
 			counter_u64_add(swap_free_deferred, 1);
 		}
 		return;
 	}
 
 	/* Object lock held: account for the free and release the block now. */
 	swap_pager_unswapped_acct(m);
 
 	sb = swblk_lookup(obj, m->pindex);
 	if (sb == NULL)
 		return;
 	slot = &sb->d[m->pindex % SWAP_META_PAGES];
 	if (*slot == SWAPBLK_NONE)
 		return;
 
 	/* Return the single block, then clear the slot and prune the swblk. */
 	range.start = *slot;
 	range.num = 1;
 	swp_pager_freeswapspace(&range);
 	*slot = SWAPBLK_NONE;
 	swp_pager_free_empty_swblk(obj, sb);
 }
 
 /*
  * swap_pager_getpages_locked() - bring pages in from swap
  *
  *	Attempt to page in the pages in array "ma" of length "count".  The
  *	caller may optionally specify that additional pages preceding and
  *	succeeding the specified range be paged in.  The number of such pages
  *	is returned in the "a_rbehind" and "a_rahead" parameters, and they will
  *	be in the inactive queue upon return.
  *
  *	The pages in "ma" must be busied and will remain busied upon return.
  */
 static int
 swap_pager_getpages_locked(struct pctrie_iter *blks, vm_object_t object,
     vm_page_t *ma, int count, int *a_rbehind, int *a_rahead, struct buf *bp)
 {
 	vm_pindex_t pindex;
 	int rahead, rbehind;
 
 	/*
 	 * Lock and buffer ownership: the object is write-locked on entry
 	 * and is unlocked on every return path.  "bp" is freed here on the
 	 * VM_PAGER_FAIL path; otherwise it is handed to the I/O and freed by
 	 * the completion routine, swp_pager_async_iodone().
 	 */
 	VM_OBJECT_ASSERT_WLOCKED(object);
 
 	KASSERT((object->flags & OBJ_SWAP) != 0,
 	    ("%s: object not swappable", __func__));
 	pindex = ma[0]->pindex;
 	/* No swap block for the first requested page: nothing to read. */
 	if (!swp_pager_haspage_iter(pindex, &rbehind, &rahead, blks)) {
 		VM_OBJECT_WUNLOCK(object);
 		uma_zfree(swrbuf_zone, bp);
 		return (VM_PAGER_FAIL);
 	}
 
 	/* All of ma[] must lie within the contiguous run found above. */
 	KASSERT(count - 1 <= rahead,
 	    ("page count %d extends beyond swap block", count));
 
 	/*
 	 * Do not transfer any pages other than those that are xbusied
 	 * when running during a split or collapse operation.  This
 	 * prevents clustering from re-creating pages which are being
 	 * moved into another object.
 	 */
 	if ((object->flags & (OBJ_SPLIT | OBJ_DEAD)) != 0) {
 		rahead = count - 1;
 		rbehind = 0;
 	}
 	/* Clip readbehind/ahead ranges to exclude already resident pages. */
 	rbehind = a_rbehind != NULL ? imin(*a_rbehind, rbehind) : 0;
 	/*
 	 * "rahead" so far counts pages after ma[0]; subtracting count - 1
 	 * leaves the pages beyond the end of ma[].
 	 */
 	rahead = a_rahead != NULL ? imin(*a_rahead, rahead - count + 1) : 0;
 	/* Allocate pages. */
 	vm_object_prepare_buf_pages(object, bp->b_pages, count, &rbehind,
 	    &rahead, ma);
 	bp->b_npages = rbehind + count + rahead;
 	/* Mark every page of the request as having swap I/O in progress. */
 	for (int i = 0; i < bp->b_npages; i++)
 		bp->b_pages[i]->oflags |= VPO_SWAPINPROG;
 	/* The I/O starts at the swap block of the first read-behind page. */
 	bp->b_blkno = swp_pager_meta_lookup(blks, pindex - rbehind);
 	KASSERT(bp->b_blkno != SWAPBLK_NONE,
 	    ("no swap blocking containing %p(%jx)", object, (uintmax_t)pindex));
 
 	vm_object_pip_add(object, bp->b_npages);
 	VM_OBJECT_WUNLOCK(object);
 	MPASS((bp->b_flags & B_MAXPHYS) != 0);
 
 	/* Report back actual behind/ahead read. */
 	if (a_rbehind != NULL)
 		*a_rbehind = rbehind;
 	if (a_rahead != NULL)
 		*a_rahead = rahead;
 
 	bp->b_flags |= B_PAGING;
 	bp->b_iocmd = BIO_READ;
 	bp->b_iodone = swp_pager_async_iodone;
 	bp->b_rcred = crhold(thread0.td_ucred);
 	bp->b_wcred = crhold(thread0.td_ucred);
 	bp->b_bufsize = bp->b_bcount = ptoa(bp->b_npages);
 	/*
 	 * The completion routine uses these two counts to tell the
 	 * read-behind/read-ahead pages apart from the requested ones.
 	 */
 	bp->b_pgbefore = rbehind;
 	bp->b_pgafter = rahead;
 
 	VM_CNT_INC(v_swapin);
 	VM_CNT_ADD(v_swappgsin, bp->b_npages);
 
 	/*
 	 * perform the I/O.  NOTE!!!  bp cannot be considered valid after
 	 * this point because we automatically release it on completion.
 	 * Instead, we look at the one page we are interested in which we
 	 * still hold a lock on even through the I/O completion.
 	 *
 	 * The other pages in our ma[] array are also released on completion,
 	 * so we cannot assume they are valid anymore either.
 	 *
 	 * NOTE: b_blkno is destroyed by the call to swapdev_strategy
 	 */
 	BUF_KERNPROC(bp);
 	swp_pager_strategy(bp);
 
 	/*
 	 * Wait for the pages we want to complete.  VPO_SWAPINPROG is always
 	 * cleared on completion.  If an I/O error occurs, SWAPBLK_NONE
 	 * is set in the metadata for each page in the request.
 	 */
 	VM_OBJECT_WLOCK(object);
 	/* This could be implemented more efficiently with aflags */
 	while ((ma[0]->oflags & VPO_SWAPINPROG) != 0) {
 		/*
 		 * VPO_SWAPSLEEP asks the completion routine to issue a
 		 * wakeup on &object->handle, the channel slept on below.
 		 */
 		ma[0]->oflags |= VPO_SWAPSLEEP;
 		VM_CNT_INC(v_intrans);
 		if (VM_OBJECT_SLEEP(object, &object->handle, PSWP,
 		    "swread", hz * 20)) {
 			/*
 			 * NOTE(review): this reads fields of bp although the
 			 * comment above says bp cannot be considered valid
 			 * once the I/O has been started -- confirm that this
 			 * diagnostic cannot race with the buffer being freed.
 			 */
 			printf(
 "swap_pager: indefinite wait buffer: bufobj: %p, blkno: %jd, size: %ld\n",
 			    bp->b_bufobj, (intmax_t)bp->b_blkno, bp->b_bcount);
 		}
 	}
 	VM_OBJECT_WUNLOCK(object);
 
 	/*
 	 * If we had an unrecoverable read error pages will not be valid.
 	 */
 	for (int i = 0; i < count; i++)
 		if (ma[i]->valid != VM_PAGE_BITS_ALL)
 			return (VM_PAGER_ERROR);
 
 	return (VM_PAGER_OK);
 
 	/*
 	 * A final note: in a low swap situation, we cannot deallocate swap
 	 * and mark a page dirty here because the caller is likely to mark
 	 * the page clean when we return, causing the page to possibly revert
 	 * to all-zero's later.
 	 */
 }
 
 /*
  * Bring pages in from swap.  This allocates a read buffer, write-locks the
  * object and delegates to swap_pager_getpages_locked(), which takes over
  * both: it unlocks the object on every return path and either frees the
  * buffer itself or hands it to the I/O.
  */
 static int
 swap_pager_getpages(vm_object_t object, vm_page_t *ma, int count,
     int *rbehind, int *rahead)
 {
 	struct buf *bp;
 	struct pctrie_iter blks;
 
 	/* Allocate the buffer before the object lock is taken. */
 	bp = uma_zalloc(swrbuf_zone, M_WAITOK);
 	VM_OBJECT_WLOCK(object);
 	swblk_iter_init_only(&blks, object);
 	return (swap_pager_getpages_locked(&blks, object, ma, count, rbehind,
 	    rahead, bp));
 }
 
 /*
  * 	swap_pager_getpages_async():
  *
  *	Right now this is emulation of asynchronous operation on top of
  *	swap_pager_getpages().
  */
 static int
 swap_pager_getpages_async(vm_object_t object, vm_page_t *ma, int count,
     int *rbehind, int *rahead, pgo_getpages_iodone_t iodone, void *arg)
 {
 	int error, rv;
 
 	/* The read itself is carried out synchronously. */
 	rv = swap_pager_getpages(object, ma, count, rbehind, rahead);
 
 	/* Translate the pager status into an errno for the callback. */
 	if (rv == VM_PAGER_OK)
 		error = 0;
 	else if (rv == VM_PAGER_ERROR)
 		error = EIO;
 	else if (rv == VM_PAGER_FAIL)
 		error = EINVAL;
 	else
 		panic("unhandled swap_pager_getpages() error %d", rv);
 
 	/* Invoke the completion callback before returning the status. */
 	iodone(arg, ma, count, error);
 	return (rv);
 }
 
 /*
  *	swap_pager_putpages:
  *
  *	Assign swap (if necessary) and initiate I/O on the specified pages.
  *
  *	In a low memory situation we may block in VOP_STRATEGY(), but the new
  *	vm_page reservation system coupled with properly written VFS devices
  *	should ensure that no low-memory deadlock occurs.  This is an area
  *	which needs work.
  *
  *	The parent has N vm_object_pip_add() references prior to
  *	calling us and will remove references for rtvals[] that are
  *	not set to VM_PAGER_PEND.  We need to remove the rest on I/O
  *	completion.
  *
  *	The parent has soft-busy'd the pages it passes us and will unbusy
  *	those whose rtvals[] entry is not set to VM_PAGER_PEND on return.
  *	We need to unbusy the rest on I/O completion.
  */
 static void
 swap_pager_putpages(vm_object_t object, vm_page_t *ma, int count,
     int flags, int *rtvals)
 {
 	struct pctrie_iter blks;
 	struct page_range range;
 	struct buf *bp;
 	daddr_t addr, blk;
 	vm_page_t mreq;
 	int i, j, n;
 	bool async;
 
 	KASSERT(count == 0 || ma[0]->object == object,
 	    ("%s: object mismatch %p/%p",
 	    __func__, object, ma[0]->object));
 
 	/*
 	 * The object lock is dropped for the duration of the pageout and
 	 * is retaken just before returning.
 	 */
 	VM_OBJECT_WUNLOCK(object);
 	/* Only the page daemon writes asynchronously, unless told otherwise. */
 	async = curproc == pageproc && (flags & VM_PAGER_PUT_SYNC) == 0;
 	swp_pager_init_freerange(&range);
 
 	/*
 	 * Assign swap blocks and issue I/O.  We reallocate swap on the fly.
 	 * The page is left dirty until the pageout operation completes
 	 * successfully.
 	 */
 	for (i = 0; i < count; i += n) {
 		/* Maximum I/O size is limited by maximum swap block size. */
 		n = min(count - i, nsw_cluster_max);
 
 		/* Asynchronous writes are throttled by nsw_wcount_async. */
 		if (async) {
 			mtx_lock(&swbuf_mtx);
 			while (nsw_wcount_async == 0)
 				msleep(&nsw_wcount_async, &swbuf_mtx, PVM,
 				    "swbufa", 0);
 			nsw_wcount_async--;
 			mtx_unlock(&swbuf_mtx);
 		}
 
 		/* Get a block of swap of size up to size n. */
 		blk = swp_pager_getswapspace(&n);
 		if (blk == SWAPBLK_NONE) {
 			/*
 			 * Give back the async write slot taken above.  A
 			 * synchronous request never took one, so it must
 			 * not bump the counter; doing so would raise the
 			 * limit on concurrent asynchronous writes.
 			 */
 			if (async) {
 				mtx_lock(&swbuf_mtx);
 				if (++nsw_wcount_async == 1)
 					wakeup(&nsw_wcount_async);
 				mtx_unlock(&swbuf_mtx);
 			}
 			for (j = 0; j < n; ++j)
 				rtvals[i + j] = VM_PAGER_FAIL;
 			continue;
 		}
 		VM_OBJECT_WLOCK(object);
 		swblk_iter_init_only(&blks, object);
 		for (j = 0; j < n; ++j) {
 			mreq = ma[i + j];
 			vm_page_aflag_clear(mreq, PGA_SWAP_FREE);
 			KASSERT(mreq->object == object,
 			    ("%s: object mismatch %p/%p",
 			    __func__, mreq->object, object));
 			/*
 			 * Record the new block; a block that was assigned
 			 * before is queued for freeing.
 			 */
 			addr = swp_pager_meta_build(&blks, object,
 			    mreq->pindex, blk + j, false);
 			if (addr != SWAPBLK_NONE)
 				swp_pager_update_freerange(&range, addr);
 			MPASS(mreq->dirty == VM_PAGE_BITS_ALL);
 			mreq->oflags |= VPO_SWAPINPROG;
 		}
 		VM_OBJECT_WUNLOCK(object);
 
 		bp = uma_zalloc(swwbuf_zone, M_WAITOK);
 		MPASS((bp->b_flags & B_MAXPHYS) != 0);
 		if (async)
 			bp->b_flags |= B_ASYNC;
 		bp->b_flags |= B_PAGING;
 		bp->b_iocmd = BIO_WRITE;
 
 		bp->b_rcred = crhold(thread0.td_ucred);
 		bp->b_wcred = crhold(thread0.td_ucred);
 		bp->b_bcount = PAGE_SIZE * n;
 		bp->b_bufsize = PAGE_SIZE * n;
 		bp->b_blkno = blk;
 		for (j = 0; j < n; j++)
 			bp->b_pages[j] = ma[i + j];
 		bp->b_npages = n;
 
 		/*
 		 * Must set dirty range for NFS to work.
 		 */
 		bp->b_dirtyoff = 0;
 		bp->b_dirtyend = bp->b_bcount;
 
 		VM_CNT_INC(v_swapout);
 		VM_CNT_ADD(v_swappgsout, bp->b_npages);
 
 		/*
 		 * We unconditionally set rtvals[] to VM_PAGER_PEND so that we
 		 * can call the async completion routine at the end of a
 		 * synchronous I/O operation.  Otherwise, our caller would
 		 * perform duplicate unbusy and wakeup operations on the page
 		 * and object, respectively.
 		 */
 		for (j = 0; j < n; j++)
 			rtvals[i + j] = VM_PAGER_PEND;
 
 		/*
 		 * asynchronous
 		 *
 		 * NOTE: b_blkno is destroyed by the call to swapdev_strategy.
 		 */
 		if (async) {
 			bp->b_iodone = swp_pager_async_iodone;
 			BUF_KERNPROC(bp);
 			swp_pager_strategy(bp);
 			continue;
 		}
 
 		/*
 		 * synchronous
 		 *
 		 * NOTE: b_blkno is destroyed by the call to swapdev_strategy.
 		 */
 		bp->b_iodone = bdone;
 		swp_pager_strategy(bp);
 
 		/*
 		 * Wait for the sync I/O to complete.
 		 */
 		bwait(bp, PVM, "swwrt");
 
 		/*
 		 * Now that we are through with the bp, we can call the
 		 * normal async completion, which frees everything up.
 		 */
 		swp_pager_async_iodone(bp);
 	}
 	/* Release whatever displaced blocks are still pending. */
 	swp_pager_freeswapspace(&range);
 	VM_OBJECT_WLOCK(object);
 }
 
 /*
  *	swp_pager_async_iodone:
  *
  *	Completion routine for asynchronous reads and writes from/to swap.
  *	Also called manually by synchronous code to finish up a bp.
  *
  *	This routine may not sleep.
  */
 static void
 swp_pager_async_iodone(struct buf *bp)
 {
 	int i;
 	vm_object_t object = NULL;
 
 	/*
 	 * Report error - unless we ran out of memory, in which case
 	 * we've already logged it in swapgeom_strategy().
 	 */
 	if (bp->b_ioflags & BIO_ERROR && bp->b_error != ENOMEM) {
 		printf(
 		    "swap_pager: I/O error - %s failed; blkno %ld,"
 			"size %ld, error %d\n",
 		    ((bp->b_iocmd == BIO_READ) ? "pagein" : "pageout"),
 		    (long)bp->b_blkno,
 		    (long)bp->b_bcount,
 		    bp->b_error
 		);
 	}
 
 	/*
 	 * remove the mapping for kernel virtual
 	 */
 	if (buf_mapped(bp))
 		pmap_qremove((vm_offset_t)bp->b_data, bp->b_npages);
 	else
 		bp->b_data = bp->b_kvabase;
 
 	if (bp->b_npages) {
 		object = bp->b_pages[0]->object;
 		VM_OBJECT_WLOCK(object);
 	}
 
 	/*
 	 * cleanup pages.  If an error occurs writing to swap, we are in
 	 * very serious trouble.  If it happens to be a disk error, though,
 	 * we may be able to recover by reassigning the swap later on.  So
 	 * in this case we remove the m->swapblk assignment for the page
 	 * but do not free it in the rlist.  The errornous block(s) are thus
 	 * never reallocated as swap.  Redirty the page and continue.
 	 */
 	for (i = 0; i < bp->b_npages; ++i) {
 		vm_page_t m = bp->b_pages[i];
 
 		m->oflags &= ~VPO_SWAPINPROG;
 		if (m->oflags & VPO_SWAPSLEEP) {
 			m->oflags &= ~VPO_SWAPSLEEP;
 			wakeup(&object->handle);
 		}
 
 		/* We always have space after I/O, successful or not. */
 		vm_page_aflag_set(m, PGA_SWAP_SPACE);
 
 		if (bp->b_ioflags & BIO_ERROR) {
 			/*
 			 * If an error occurs I'd love to throw the swapblk
 			 * away without freeing it back to swapspace, so it
 			 * can never be used again.  But I can't from an
 			 * interrupt.
 			 */
 			if (bp->b_iocmd == BIO_READ) {
 				/*
 				 * NOTE: for reads, m->dirty will probably
 				 * be overridden by the original caller of
 				 * getpages so don't play cute tricks here.
 				 */
 				vm_page_invalid(m);
 				if (i < bp->b_pgbefore ||
 				    i >= bp->b_npages - bp->b_pgafter)
 					vm_page_free_invalid(m);
 			} else {
 				/*
 				 * If a write error occurs, reactivate page
 				 * so it doesn't clog the inactive list,
 				 * then finish the I/O.
 				 */
 				MPASS(m->dirty == VM_PAGE_BITS_ALL);
 
 				/* PQ_UNSWAPPABLE? */
 				vm_page_activate(m);
 				vm_page_sunbusy(m);
 			}
 		} else if (bp->b_iocmd == BIO_READ) {
 			/*
 			 * NOTE: for reads, m->dirty will probably be
 			 * overridden by the original caller of getpages so
 			 * we cannot set them in order to free the underlying
 			 * swap in a low-swap situation.  I don't think we'd
 			 * want to do that anyway, but it was an optimization
 			 * that existed in the old swapper for a time before
 			 * it got ripped out due to precisely this problem.
 			 */
 			KASSERT(!pmap_page_is_mapped(m),
 			    ("swp_pager_async_iodone: page %p is mapped", m));
 			KASSERT(m->dirty == 0,
 			    ("swp_pager_async_iodone: page %p is dirty", m));
 
 			vm_page_valid(m);
 			if (i < bp->b_pgbefore ||
 			    i >= bp->b_npages - bp->b_pgafter)
 				vm_page_readahead_finish(m);
 		} else {
 			/*
 			 * For write success, clear the dirty
 			 * status, then finish the I/O ( which decrements the
 			 * busy count and possibly wakes waiter's up ).
 			 * A page is only written to swap after a period of
 			 * inactivity.  Therefore, we do not expect it to be
 			 * reused.
 			 */
 			KASSERT(!pmap_page_is_write_mapped(m),
 			    ("swp_pager_async_iodone: page %p is not write"
 			    " protected", m));
 			vm_page_undirty(m);
 			vm_page_deactivate_noreuse(m);
 			vm_page_sunbusy(m);
 		}
 	}
 
 	/*
 	 * adjust pip.  NOTE: the original parent may still have its own
 	 * pip refs on the object.
 	 */
 	if (object != NULL) {
 		vm_object_pip_wakeupn(object, bp->b_npages);
 		VM_OBJECT_WUNLOCK(object);
 	}
 
 	/*
 	 * swapdev_strategy() manually sets b_vp and b_bufobj before calling
 	 * bstrategy(). Set them back to NULL now we're done with it, or we'll
 	 * trigger a KASSERT in relpbuf().
 	 */
 	if (bp->b_vp) {
 		    bp->b_vp = NULL;
 		    bp->b_bufobj = NULL;
 	}
 	/*
 	 * release the physical I/O buffer
 	 */
 	if (bp->b_flags & B_ASYNC) {
 		mtx_lock(&swbuf_mtx);
 		if (++nsw_wcount_async == 1)
 			wakeup(&nsw_wcount_async);
 		mtx_unlock(&swbuf_mtx);
 	}
 	uma_zfree((bp->b_iocmd == BIO_READ) ? swrbuf_zone : swwbuf_zone, bp);
 }
 
 /*
  * Return the current value of the nswapdev counter.
  * NOTE(review): presumably the number of configured swap devices -- the
  * code that maintains the counter is not visible here; confirm.
  */
 int
 swap_pager_nswapdev(void)
 {
 
 	return (nswapdev);
 }
 
 /*
  * Detach page "m" from the swap block stored in "*blk": dirty the page,
  * update its swap accounting, queue the block in "range" for freeing,
  * reset the metadata slot to SWAPBLK_NONE and put the page on the
  * laundry queue.
  */
 static void
 swp_pager_force_dirty(struct page_range *range, vm_page_t m, daddr_t *blk)
 {
 	vm_page_dirty(m);
 	swap_pager_unswapped_acct(m);
 	/* Queue the block first; the slot is overwritten on the next line. */
 	swp_pager_update_freerange(range, *blk);
 	*blk = SWAPBLK_NONE;
 	vm_page_launder(m);
 }
 
 /*
  * Count the page indices of "object" that currently have a swap block
  * assigned.  The object must be locked.
  */
 u_long
 swap_pager_swapped_pages(vm_object_t object)
 {
 	struct pctrie_iter blks;
 	struct swblk *sb;
 	u_long total;
 	int slot;
 
 	VM_OBJECT_ASSERT_LOCKED(object);
 
 	total = 0;
 	/* No swap metadata at all: nothing is swapped out. */
 	if (swblk_is_empty(object))
 		return (total);
 
 	/* Visit every swblk and tally its assigned slots. */
 	sb = swblk_iter_init(&blks, object, 0);
 	while (sb != NULL) {
 		for (slot = 0; slot < SWAP_META_PAGES; slot++)
 			total += sb->d[slot] != SWAPBLK_NONE ? 1 : 0;
 		sb = swblk_iter_next(&blks);
 	}
 	return (total);
 }
 
 /*
  *	swap_pager_swapoff_object:
  *
  *	Page in all of the pages that have been paged out for an object
  *	to a swap device.
  */
 static void
 swap_pager_swapoff_object(struct swdevt *sp, vm_object_t object,
     struct buf **bp)
 {
 	struct pctrie_iter blks, pages;
 	struct page_range range;
 	struct swblk *sb;
 	vm_page_t m;
 	int i, rahead, rv;
 	bool sb_empty;
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	KASSERT((object->flags & OBJ_SWAP) != 0,
 	    ("%s: Object not swappable", __func__));
 	KASSERT((object->flags & OBJ_DEAD) == 0,
 	    ("%s: Object already dead", __func__));
 	KASSERT((sp->sw_flags & SW_CLOSING) != 0,
 	    ("%s: Device not blocking further allocations", __func__));
 
 	vm_page_iter_init(&pages, object);
 	swp_pager_init_freerange(&range);
 	sb = swblk_iter_init(&blks, object, 0);
 	while (sb != NULL) {
 		/*
 		 * Scan the slots of this swblk.  Every "break" below leaves
 		 * i < SWAP_META_PAGES, which makes the code after the loop
 		 * re-look up the swblk and scan it again; "continue" moves
 		 * on to the next slot.
 		 */
 		sb_empty = true;
 		for (i = 0; i < SWAP_META_PAGES; i++) {
 			/* Skip an invalid block. */
 			if (sb->d[i] == SWAPBLK_NONE)
 				continue;
 			/* Skip a block not of this device. */
 			if (!swp_pager_isondev(sb->d[i], sp)) {
 				sb_empty = false;
 				continue;
 			}
 
 			/*
 			 * Look for a page corresponding to this block. If the
 			 * found page has pending operations, sleep and restart
 			 * the scan.
 			 */
 			m = vm_radix_iter_lookup(&pages, blks.index + i);
 			if (m != NULL && (m->oflags & VPO_SWAPINPROG) != 0) {
 				m->oflags |= VPO_SWAPSLEEP;
 				VM_OBJECT_SLEEP(object, &object->handle, PSWP,
 				    "swpoff", 0);
 				break;
 			}
 
 			/*
 			 * If the found page is valid, mark it dirty and free
 			 * the swap block.
 			 */
 			if (m != NULL && vm_page_all_valid(m)) {
 				swp_pager_force_dirty(&range, m, &sb->d[i]);
 				continue;
 			}
 			/* Is there a page we can acquire or allocate? */
 			if (m != NULL) {
 				if (!vm_page_busy_acquire(m, VM_ALLOC_WAITFAIL))
 					break;
 			} else {
 				m = vm_page_alloc_iter(object, blks.index + i,
 				    VM_ALLOC_NORMAL | VM_ALLOC_WAITFAIL,
 				    &pages);
 				if (m == NULL)
 					break;
 			}
 
 			/* Get the page from swap, and restart the scan. */
 			vm_object_pip_add(object, 1);
 			rahead = SWAP_META_PAGES;
 			rv = swap_pager_getpages_locked(&blks, object, &m, 1,
 			    NULL, &rahead, *bp);
 			if (rv != VM_PAGER_OK)
 				panic("%s: read from swap failed: %d",
 				    __func__, rv);
 			/*
 			 * The read consumed *bp and returned with the object
 			 * unlocked: allocate a replacement buffer for the
 			 * next read and take the lock again.
 			 */
 			*bp = uma_zalloc(swrbuf_zone, M_WAITOK);
 			VM_OBJECT_WLOCK(object);
 			vm_object_pip_wakeupn(object, 1);
 			KASSERT(vm_page_all_valid(m),
 			    ("%s: Page %p not all valid", __func__, m));
 			vm_page_deactivate_noreuse(m);
 			vm_page_xunbusy(m);
 			break;
 		}
 		if (i < SWAP_META_PAGES) {
 			/*
 			 * The object lock has been released and regained.
 			 * Perhaps the object is now dead.
 			 */
 			if ((object->flags & OBJ_DEAD) != 0) {
 				/*
 				 * Make sure that pending writes finish before
 				 * returning.
 				 */
 				vm_object_pip_wait(object, "swpoff");
 				swp_pager_meta_free_all(object);
 				break;
 			}
 
 			/*
 			 * The swapblk could have been freed, so reset the pages
 			 * iterator and search again for the first swblk at or
 			 * after blks.index.
 			 */
 			pctrie_iter_reset(&pages);
 			sb = swblk_iter_init(&blks, object, blks.index);
 			continue;
 		}
 		/*
 		 * The whole swblk was scanned without interruption.  If no
 		 * slot belonging to another device was seen, remove and
 		 * free the swblk.
 		 */
 		if (sb_empty) {
 			swblk_iter_remove(&blks);
 			uma_zfree(swblk_zone, sb);
 		}
 
 		/*
 		 * It is safe to advance to the next block.  No allocations
 		 * before blks.index have happened, even with the lock released,
 		 * because allocations on this device are blocked.
 		 */
 		sb = swblk_iter_next(&blks);
 	}
 	/* Return the swap blocks collected by swp_pager_force_dirty(). */
 	swp_pager_freeswapspace(&range);
 }
 
 /*
  *	swap_pager_swapoff:
  *
  *	Page in all of the pages that have been paged out to the
  *	given device.  The corresponding blocks in the bitmap must be
  *	marked as allocated and the device must be flagged SW_CLOSING.
  *	There may be no processes swapped out to the device.
  *
  *	This routine may block.
  */
 static void
 swap_pager_swapoff(struct swdevt *sp)
 {
 	vm_object_t object;
 	struct buf *bp;
 	int retries;
 
 	sx_assert(&swdev_syscall_lock, SA_XLOCKED);
 
 	retries = 0;
 full_rescan:
 	/*
 	 * Pre-allocate a read buffer for swap_pager_swapoff_object(), which
 	 * replaces it through the pointer whenever it uses one up.
 	 */
 	bp = uma_zalloc(swrbuf_zone, M_WAITOK);
 	mtx_lock(&vm_object_list_mtx);
 	TAILQ_FOREACH(object, &vm_object_list, object_list) {
 		/* Unlocked pre-check; repeated below under the object lock. */
 		if ((object->flags & OBJ_SWAP) == 0)
 			continue;
 		/* The list mutex is dropped while the object is worked on. */
 		mtx_unlock(&vm_object_list_mtx);
 		/* Depends on type-stability. */
 		VM_OBJECT_WLOCK(object);
 
 		/*
 		 * Dead objects are eventually terminated on their own.
 		 */
 		if ((object->flags & OBJ_DEAD) != 0)
 			goto next_obj;
 
 		/*
 		 * Sync with fences placed after pctrie
 		 * initialization.  We must not access pctrie below
 		 * unless we checked that our object is swap and not
 		 * dead.
 		 */
 		atomic_thread_fence_acq();
 		if ((object->flags & OBJ_SWAP) == 0)
 			goto next_obj;
 
 		swap_pager_swapoff_object(sp, object, &bp);
 next_obj:
 		VM_OBJECT_WUNLOCK(object);
 		/* Retake the list mutex before stepping to the next object. */
 		mtx_lock(&vm_object_list_mtx);
 	}
 	mtx_unlock(&vm_object_list_mtx);
 	uma_zfree(swrbuf_zone, bp);
 
 	if (sp->sw_used) {
 		/*
 		 * Objects may be locked or paging to the device being
 		 * removed, so we will miss their pages and need to
 		 * make another pass.  We have marked this device as
 		 * SW_CLOSING, so the activity should finish soon.
 		 */
 		retries++;
 		/* Give up after more than 100 passes, pausing between them. */
 		if (retries > 100) {
 			panic("swapoff: failed to locate %d swap blocks",
 			    sp->sw_used);
 		}
 		pause("swpoff", hz / 20);
 		goto full_rescan;
 	}
 	/* No blocks remain in use on the device: notify the listeners. */
 	EVENTHANDLER_INVOKE(swapoff, sp);
 }
 
 /************************************************************************
  *				SWAP META DATA 				*
  ************************************************************************
  *
  *	These routines manipulate the swap metadata stored in the
  *	OBJT_SWAP object.
  *
  *	Swap metadata is kept per object in a pctrie of swblk structures.
  *	Each swblk covers SWAP_META_PAGES consecutive page indices and
  *	records the swap block assigned to each one, or SWAPBLK_NONE.
  */
 
 /*
  * SWP_PAGER_SWBLK_EMPTY() - is a range of blocks free?
  */
 static bool
 swp_pager_swblk_empty(struct swblk *sb, int start, int limit)
 {
 	int slot;
 
 	MPASS(0 <= start && start <= limit && limit <= SWAP_META_PAGES);
 	/* Advance past unassigned slots; stop at the first one in use. */
 	slot = start;
 	while (slot < limit && sb->d[slot] == SWAPBLK_NONE)
 		slot++;
 	/* Reaching the limit means that no slot in [start, limit) is in use. */
 	return (slot >= limit);
 }
 
 /*
  * SWP_PAGER_FREE_EMPTY_SWBLK() - frees if a block is free
  *
  *  Nothing is done if the block is still in use.
  */
 static void
 swp_pager_free_empty_swblk(vm_object_t object, struct swblk *sb)
 {
 	/* Leave the swblk alone while any slot still holds a swap block. */
 	if (!swp_pager_swblk_empty(sb, 0, SWAP_META_PAGES))
 		return;
 
 	/* Unlink it from the object's metadata and release its memory. */
 	swblk_lookup_remove(object, sb);
 	uma_zfree(swblk_zone, sb);
 }
 
 /*
  * SWP_PAGER_META_BUILD() -	add swap block to swap meta data for object
  *
  *	Try to add the specified swapblk to the object's swap metadata.  If
  *	nowait_noreplace is set, add the specified swapblk only if there is no
  *	previously assigned swapblk at pindex.  If the swapblk is invalid, and
  *	replaces a valid swapblk, empty swap metadata is freed.  If memory
  *	allocation fails, and nowait_noreplace is set, return the specified
  *	swapblk immediately to indicate failure; otherwise, wait and retry until
  *	memory allocation succeeds.  Return the previously assigned swapblk, if
  *	any.
  */
 static daddr_t
 swp_pager_meta_build(struct pctrie_iter *blks, vm_object_t object,
     vm_pindex_t pindex, daddr_t swapblk, bool nowait_noreplace)
 {
 	/*
 	 * Latches that make the "exhausted"/"ok" console messages print
 	 * once per transition rather than on every retry.
 	 */
 	static volatile int swblk_zone_exhausted, swpctrie_zone_exhausted;
 	struct swblk *sb, *sb1;
 	vm_pindex_t modpi;
 	daddr_t prev_swapblk;
 	int error, i;
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 
 	sb = swblk_iter_lookup(blks, pindex);
 	if (sb == NULL) {
 		/* Clearing a slot that has no swblk: nothing to do. */
 		if (swapblk == SWAPBLK_NONE)
 			return (SWAPBLK_NONE);
 		/*
 		 * Step 1: allocate a swblk without sleeping.  The page
 		 * daemon may additionally dip into the reserve.
 		 */
 		for (;;) {
 			sb = uma_zalloc(swblk_zone, M_NOWAIT | (curproc ==
 			    pageproc ? M_USE_RESERVE : 0));
 			if (sb != NULL) {
 				/* Fresh swblk: set its base, empty slots. */
 				sb->p = rounddown(pindex, SWAP_META_PAGES);
 				for (i = 0; i < SWAP_META_PAGES; i++)
 					sb->d[i] = SWAPBLK_NONE;
 				if (atomic_cmpset_int(&swblk_zone_exhausted,
 				    1, 0))
 					printf("swblk zone ok\n");
 				break;
 			}
 			/* Returning swapblk itself reports the failure. */
 			if (nowait_noreplace)
 				return (swapblk);
 			/* Drop the object lock while waiting for memory. */
 			VM_OBJECT_WUNLOCK(object);
 			if (uma_zone_exhausted(swblk_zone)) {
 				if (atomic_cmpset_int(&swblk_zone_exhausted,
 				    0, 1))
 					printf("swap blk zone exhausted, "
 					    "increase kern.maxswzone\n");
 				vm_pageout_oom(VM_OOM_SWAPZ);
 				pause("swzonxb", 10);
 			} else
 				uma_zwait(swblk_zone);
 			VM_OBJECT_WLOCK(object);
 			/* The lock was dropped, so look the swblk up again. */
 			sb = swblk_iter_reinit(blks, object, pindex);
 			if (sb != NULL)
 				/*
 				 * Somebody swapped out a nearby page,
 				 * allocating swblk at the pindex index,
 				 * while we dropped the object lock.
 				 */
 				goto allocated;
 		}
 		/*
 		 * Step 2: insert the new swblk, retrying in the same way
 		 * when the insertion fails.
 		 */
 		for (;;) {
 			error = swblk_iter_insert(blks, sb);
 			if (error == 0) {
 				if (atomic_cmpset_int(&swpctrie_zone_exhausted,
 				    1, 0))
 					printf("swpctrie zone ok\n");
 				break;
 			}
 			if (nowait_noreplace) {
 				/* Discard the unused swblk; report failure. */
 				uma_zfree(swblk_zone, sb);
 				return (swapblk);
 			}
 			VM_OBJECT_WUNLOCK(object);
 			if (uma_zone_exhausted(swpctrie_zone)) {
 				if (atomic_cmpset_int(&swpctrie_zone_exhausted,
 				    0, 1))
 					printf("swap pctrie zone exhausted, "
 					    "increase kern.maxswzone\n");
 				vm_pageout_oom(VM_OOM_SWAPZ);
 				pause("swzonxp", 10);
 			} else
 				uma_zwait(swpctrie_zone);
 			VM_OBJECT_WLOCK(object);
 			sb1 = swblk_iter_reinit(blks, object, pindex);
 			if (sb1 != NULL) {
 				/*
 				 * A swblk for this index appeared while the
 				 * lock was dropped: use it and discard ours.
 				 */
 				uma_zfree(swblk_zone, sb);
 				sb = sb1;
 				goto allocated;
 			}
 		}
 	}
 allocated:
 	MPASS(sb->p == rounddown(pindex, SWAP_META_PAGES));
 
 	modpi = pindex % SWAP_META_PAGES;
 	/* Return prior contents of metadata. */
 	prev_swapblk = sb->d[modpi];
 	/* With nowait_noreplace set, an occupied slot is left untouched. */
 	if (!nowait_noreplace || prev_swapblk == SWAPBLK_NONE) {
 		/* Enter block into metadata. */
 		sb->d[modpi] = swapblk;
 
 		/*
 		 * Free the swblk if we end up with the empty page run.
 		 */
 		if (swapblk == SWAPBLK_NONE &&
 		    swp_pager_swblk_empty(sb, 0, SWAP_META_PAGES)) {
 			swblk_iter_remove(blks);
 			uma_zfree(swblk_zone, sb);
 		}
 	}
 	return (prev_swapblk);
 }
 
 /*
  * SWP_PAGER_META_TRANSFER() - transfer a range of blocks in the srcobject's
  * swap metadata into dstobject.
  *
  *	Blocks in src that correspond to holes in dst are transferred.  Blocks
  *	in src that correspond to blocks in dst are freed.
  *
  *	The source range is [pindex, pindex + count); a block found at source
  *	index n is entered into dstobject at index n - pindex.  Both objects
  *	must be write-locked on entry.  The srcobject lock is dropped and
  *	retaken while transfers that failed on the first attempt are
  *	completed.
  */
 static void
 swp_pager_meta_transfer(vm_object_t srcobject, vm_object_t dstobject,
     vm_pindex_t pindex, vm_pindex_t count)
 {
 	struct pctrie_iter dstblks, srcblks;
 	struct page_range range;
 	struct swblk *sb;
 	daddr_t blk, d[SWAP_META_PAGES];
 	vm_pindex_t last;
 	int d_mask, i, limit, start;
 	/* d_mask carries one "deferred" bit for each slot of d[]. */
 	_Static_assert(8 * sizeof(d_mask) >= SWAP_META_PAGES,
 	    "d_mask not big enough");
 
 	VM_OBJECT_ASSERT_WLOCKED(srcobject);
 	VM_OBJECT_ASSERT_WLOCKED(dstobject);
 
 	/* Nothing to move for an empty range or a source without swblks. */
 	if (count == 0 || swblk_is_empty(srcobject))
 		return;
 
 	swp_pager_init_freerange(&range);
 	d_mask = 0;
 	last = pindex + count;
 	swblk_iter_init_only(&dstblks, dstobject);
 	for (sb = swblk_iter_limit_init(&srcblks, srcobject, pindex, last),
 	    start = swblk_start(sb, pindex);
 	    sb != NULL; sb = swblk_iter_next(&srcblks), start = 0) {
 		/* Only slots [start, limit) of this swblk are processed. */
 		limit = MIN(last - srcblks.index, SWAP_META_PAGES);
 		for (i = start; i < limit; i++) {
 			if (sb->d[i] == SWAPBLK_NONE)
 				continue;
 			/*
 			 * First attempt, made with the final argument set to
 			 * true; the return value tells the three outcomes
 			 * apart below.
 			 */
 			blk = swp_pager_meta_build(&dstblks, dstobject,
 			    srcblks.index + i - pindex, sb->d[i], true);
 			if (blk == sb->d[i]) {
 				/*
 				 * Failed memory allocation stopped transfer;
 				 * save this block for transfer with lock
 				 * released.
 				 */
 				d[i] = blk;
 				d_mask |= 1 << i;
 			} else if (blk != SWAPBLK_NONE) {
 				/* Dst has a block at pindex, so free block. */
 				swp_pager_update_freerange(&range, sb->d[i]);
 			}
 			/* In every case the source slot is cleared. */
 			sb->d[i] = SWAPBLK_NONE;
 		}
 		/*
 		 * Slots [start, limit) were just cleared; if the slots outside
 		 * that window are empty as well, the source swblk is unused
 		 * and can be released.
 		 */
 		if (swp_pager_swblk_empty(sb, 0, start) &&
 		    swp_pager_swblk_empty(sb, limit, SWAP_META_PAGES)) {
 			swblk_iter_remove(&srcblks);
 			uma_zfree(swblk_zone, sb);
 		}
 		if (d_mask != 0) {
 			/* Finish block transfer, with the lock released. */
 			VM_OBJECT_WUNLOCK(srcobject);
 			do {
 				/* Pick the lowest deferred slot. */
 				i = ffs(d_mask) - 1;
 				swp_pager_meta_build(&dstblks, dstobject,
 				    srcblks.index + i - pindex, d[i], false);
 				d_mask &= ~(1 << i);
 			} while (d_mask != 0);
 			VM_OBJECT_WLOCK(srcobject);
 
 			/*
 			 * While the lock was not held, the iterator path could
 			 * have become stale, so discard it.
 			 */
 			pctrie_iter_reset(&srcblks);
 		}
 	}
 	/* Release all blocks that collided with existing dst blocks. */
 	swp_pager_freeswapspace(&range);
 }
 
 /*
  * SWP_PAGER_META_FREE() - release the swap blocks assigned to a page range
  *
  *	Each swap block assigned to an index in [pindex, pindex + count) is
  *	returned to the swap bitmap, and every swblk left with no assignments
  *	is unlinked from the object's trie and freed.  If 'freed' is not NULL,
  *	it receives the number of released blocks whose page was either not
  *	resident or had no valid bits.  The object must be write-locked.
  */
 static void
 swp_pager_meta_free(vm_object_t object, vm_pindex_t pindex, vm_pindex_t count,
     vm_size_t *freed)
 {
 	struct pctrie_iter blks, pages;
 	struct page_range range;
 	struct swblk *sb;
 	vm_page_t m;
 	vm_pindex_t end;
 	vm_size_t nfreed;
 	int limit, slot, start;
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 
 	nfreed = 0;
 	if (count != 0 && !swblk_is_empty(object)) {
 		swp_pager_init_freerange(&range);
 		vm_page_iter_init(&pages, object);
 		end = pindex + count;
 		sb = swblk_iter_limit_init(&blks, object, pindex, end);
 		start = swblk_start(sb, pindex);
 		while (sb != NULL) {
 			/* Only slots [start, limit) belong to the range. */
 			limit = MIN(end - blks.index, SWAP_META_PAGES);
 			for (slot = start; slot < limit; slot++) {
 				if (sb->d[slot] == SWAPBLK_NONE)
 					continue;
 				swp_pager_update_freerange(&range,
 				    sb->d[slot]);
 				if (freed != NULL) {
 					m = vm_radix_iter_lookup(&pages,
 					    blks.index + slot);
 					if (m == NULL ||
 					    vm_page_none_valid(m))
 						nfreed++;
 				}
 				sb->d[slot] = SWAPBLK_NONE;
 			}
 
 			/* Drop the swblk once nothing outside remains. */
 			if (swp_pager_swblk_empty(sb, 0, start) &&
 			    swp_pager_swblk_empty(sb, limit,
 			    SWAP_META_PAGES)) {
 				swblk_iter_remove(&blks);
 				uma_zfree(swblk_zone, sb);
 			}
 
 			sb = swblk_iter_next(&blks);
 			start = 0;
 		}
 		swp_pager_freeswapspace(&range);
 	}
 	if (freed != NULL)
 		*freed = nfreed;
 }
 
 /*
  * Per-swblk callback used when tearing down a whole trie: every swap block
  * still assigned in 'sb' is added to the page_range passed through 'rangev',
  * and the swblk itself is then returned to its zone.
  */
 static void
 swp_pager_meta_free_block(struct swblk *sb, void *rangev)
 {
 	struct page_range *range;
 	int slot;
 
 	range = rangev;
 	for (slot = 0; slot < SWAP_META_PAGES; slot++) {
 		if (sb->d[slot] == SWAPBLK_NONE)
 			continue;
 		swp_pager_update_freerange(range, sb->d[slot]);
 	}
 	uma_zfree(swblk_zone, sb);
 }
 
 /*
  * SWP_PAGER_META_FREE_ALL() - discard every piece of swap metadata of object
  *
  *	All swblks in the object's trie are reclaimed through
  *	swp_pager_meta_free_block(), and the swap blocks they referenced are
  *	released.  The object must be write-locked.
  */
 static void
 swp_pager_meta_free_all(vm_object_t object)
 {
 	struct page_range freerange;
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 
 	/* Gather the blocks of every swblk, then release them together. */
 	swp_pager_init_freerange(&freerange);
 	SWAP_PCTRIE_RECLAIM_CALLBACK(&object->un_pager.swp.swp_blks,
 	    swp_pager_meta_free_block, &freerange);
 	swp_pager_freeswapspace(&freerange);
 }
 
 /*
  * SWP_PAGER_META_LOOKUP() - look up the swap block assigned to a page index
  *
  *	Uses the iterator 'blks' to find the swblk covering pindex and returns
  *	the swap block recorded for that index, or SWAPBLK_NONE when no swblk
  *	covers it.  The metadata is not modified.
  */
 static daddr_t
 swp_pager_meta_lookup(struct pctrie_iter *blks, vm_pindex_t pindex)
 {
 	struct swblk *sb;
 
 	sb = swblk_iter_lookup(blks, pindex);
 	return (sb != NULL ? sb->d[pindex % SWAP_META_PAGES] : SWAPBLK_NONE);
 }
 
 /*
  * Returns the least page index which is greater than or equal to the parameter
  * pindex and for which there is a swap block allocated.  Returns OBJ_MAX_SIZE
  * if there are no allocated swap blocks for the object at or after the
  * requested pindex.
  */
 static vm_pindex_t
 swap_pager_iter_find_least(struct pctrie_iter *blks, vm_pindex_t pindex)
 {
 	struct swblk *sb;
 	bool partial;
 	int i;
 
 	sb = swblk_iter_lookup_ge(blks, pindex);
 	if (sb == NULL)
 		return (OBJ_MAX_SIZE);
 
 	/*
 	 * When the swblk found starts below pindex, only its tail is eligible;
 	 * if the tail is empty, one further swblk is examined in full.
 	 */
 	partial = blks->index < pindex;
 	i = partial ? pindex % SWAP_META_PAGES : 0;
 	for (;;) {
 		for (; i < SWAP_META_PAGES; i++) {
 			if (sb->d[i] != SWAPBLK_NONE)
 				return (blks->index + i);
 		}
 		if (!partial)
 			break;
 		partial = false;
 		sb = swblk_iter_next(blks);
 		if (sb == NULL)
 			return (OBJ_MAX_SIZE);
 		i = 0;
 	}
 
 	/*
 	 * We get here if a swblk is present in the trie but it
 	 * doesn't map any blocks.
 	 */
 	MPASS(0);
 	return (OBJ_MAX_SIZE);
 }
 
 /*
  * Return the first index >= pindex at which the object has either a page
  * with valid contents or a swap block.  When neither exists at or beyond
  * pindex, the object's size is returned.  The object lock must be held.
  */
 vm_pindex_t
 swap_pager_seek_data(vm_object_t object, vm_pindex_t pindex)
 {
 	struct pctrie_iter blks, pages;
 	vm_page_t m;
 	vm_pindex_t swap_index;
 
 	VM_OBJECT_ASSERT_RLOCKED(object);
 
 	/* Fast path: a valid resident page right at pindex. */
 	vm_page_iter_init(&pages, object);
 	m = vm_radix_iter_lookup_ge(&pages, pindex);
 	if (m != NULL && pages.index == pindex && vm_page_any_valid(m))
 		return (pages.index);
 
 	/* Otherwise locate the nearest swap block. */
 	swblk_iter_init_only(&blks, object);
 	swap_index = swap_pager_iter_find_least(&blks, pindex);
 	if (swap_index == pindex)
 		return (swap_index);
 
 	/* A valid resident page below swap_index takes precedence. */
 	for (; m != NULL && pages.index < swap_index;
 	    m = vm_radix_iter_step(&pages)) {
 		if (vm_page_any_valid(m))
 			return (pages.index);
 	}
 	return (swap_index == OBJ_MAX_SIZE ? object->size : swap_index);
 }
 
 /*
  * Return the first index >= pindex at which the object has neither a page
  * with valid contents nor a swap block.  The object lock must be held.
  */
 vm_pindex_t
 swap_pager_seek_hole(vm_object_t object, vm_pindex_t pindex)
 {
 	struct pctrie_iter blks, pages;
 	struct swblk *sb;
 	vm_page_t m;
 
 	VM_OBJECT_ASSERT_RLOCKED(object);
 	vm_page_iter_init(&pages, object);
 	swblk_iter_init_only(&blks, object);
 	for (;; pindex++) {
 		/* A valid resident page means this index holds data. */
 		m = vm_radix_iter_lookup(&pages, pindex);
 		if (m != NULL && vm_page_any_valid(m))
 			continue;
 
 		/* No valid page: it is a hole unless swap backs it. */
 		sb = swblk_iter_lookup(&blks, pindex);
 		if (sb == NULL ||
 		    sb->d[pindex % SWAP_META_PAGES] == SWAPBLK_NONE)
 			break;
 	}
 	return (pindex);
 }
 
 /*
  * Is every page in the backing object or swap shadowed in the parent, and
  * unbusy and valid in swap?
  *
  * Walks, in increasing index order, every backing-object index that has a
  * resident page or a swap block and lies below pi_ubound.  Returns true only
  * if each of them is covered in 'object' by a page with some valid bits or
  * by swap.  Returns false if the backing object is not anonymous, if a
  * backing page cannot be exclusively busied or is not fully valid, or if an
  * index is found that the parent does not cover.  Both objects must be
  * write-locked.
  */
 bool
 swap_pager_scan_all_shadowed(vm_object_t object)
 {
 	struct pctrie_iter backing_blks, backing_pages, blks, pages;
 	vm_object_t backing_object;
 	vm_page_t p, pp;
 	vm_pindex_t backing_offset_index, new_pindex, pi, pi_ubound, ps, pv;
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	VM_OBJECT_ASSERT_WLOCKED(object->backing_object);
 
 	backing_object = object->backing_object;
 
 	if ((backing_object->flags & OBJ_ANON) == 0)
 		return (false);
 
 	KASSERT((object->flags & OBJ_ANON) != 0,
 	    ("Shadow object is not anonymous"));
 	backing_offset_index = OFF_TO_IDX(object->backing_object_offset);
 	/* Exclusive upper bound of the backing indices to examine. */
 	pi_ubound = MIN(backing_object->size,
 	    backing_offset_index + object->size);
 	vm_page_iter_init(&pages, object);
 	vm_page_iter_init(&backing_pages, backing_object);
 	swblk_iter_init_only(&blks, object);
 	swblk_iter_init_only(&backing_blks, backing_object);
 
 	/*
 	 * Only check pages inside the parent object's range and inside the
 	 * parent object's mapping of the backing object.
 	 *
 	 * pv tracks the index of the next resident backing page, ps the index
 	 * of the next backing swap block, and pi the smaller of the two, i.e.
 	 * the index being examined.  Whichever cursor equals pi is advanced
 	 * at the top of the next iteration.
 	 */
 	pv = ps = pi = backing_offset_index - 1;
 	for (;;) {
 		if (pi == pv) {
 			p = vm_radix_iter_lookup_ge(&backing_pages, pv + 1);
 			pv = p != NULL ? p->pindex : backing_object->size;
 		}
 		if (pi == ps)
 			ps = swap_pager_iter_find_least(&backing_blks, ps + 1);
 		pi = MIN(pv, ps);
 		if (pi >= pi_ubound)
 			break;
 
 		if (pi == pv) {
 			/*
 			 * If the backing object page is busy a grandparent or
 			 * older page may still be undergoing CoW.  It is not
 			 * safe to collapse the backing object until it is
 			 * quiesced.
 			 */
 			if (vm_page_tryxbusy(p) == 0)
 				return (false);
 
 			/*
 			 * We raced with the fault handler that left newly
 			 * allocated invalid page on the object queue and
 			 * retried.
 			 */
 			if (!vm_page_all_valid(p))
 				break;
 
 			/*
 			 * Busy of p disallows fault handler to validate parent
 			 * page (pp, below).
 			 */
 		}
 
 		/*
 		 * See if the parent has the page or if the parent's object
 		 * pager has the page.  If the parent has the page but the page
 		 * is not valid, the parent's object pager must have the page.
 		 *
 		 * If this fails, the parent does not completely shadow the
 		 * object and we might as well give up now.
 		 */
 		new_pindex = pi - backing_offset_index;
 		pp = vm_radix_iter_lookup(&pages, new_pindex);
 
 		/*
 		 * The valid check here is stable due to object lock being
 		 * required to clear valid and initiate paging.
 		 */
 		if ((pp == NULL || vm_page_none_valid(pp)) &&
 		    !swp_pager_haspage_iter(new_pindex, NULL, NULL, &blks))
 			break;
 		if (pi == pv)
 			vm_page_xunbusy(p);
 	}
 	/*
 	 * Leaving the loop below the bound means an index failed the check;
 	 * the backing page busied for that index, if any, is still busy here.
 	 */
 	if (pi < pi_ubound) {
 		if (pi == pv)
 			vm_page_xunbusy(p);
 		return (false);
 	}
 	return (true);
 }
 
 /*
  * System call swapon(name) enables swapping on device name,
  * which must be in the swdevsw.  Return EBUSY
  * if already swapping on this device.
  */
 #ifndef _SYS_SYSPROTO_H_
 /*
  * Argument structure for sys_swapon(); only defined here when the
  * _SYS_SYSPROTO_H_ guard shows that no other definition has been seen.
  */
 struct swapon_args {
 	char *name;	/* user-space path of the device or file to swap on */
 };
 #endif
 
 int
 sys_swapon(struct thread *td, struct swapon_args *uap)
 {
 	struct vattr attr;
 	struct vnode *vp;
 	struct nameidata nd;
 	int error;
 
 	error = priv_check(td, PRIV_SWAPON);
 	if (error)
 		return (error);
 
 	sx_xlock(&swdev_syscall_lock);
 
 	/*
 	 * Swap metadata may not fit in the KVM if we have physical
 	 * memory of >1GB.
 	 */
 	if (swblk_zone == NULL) {
 		error = ENOMEM;
 		goto done;
 	}
 
 	NDINIT(&nd, LOOKUP, ISOPEN | FOLLOW | LOCKLEAF | AUDITVNODE1,
 	    UIO_USERSPACE, uap->name);
 	error = namei(&nd);
 	if (error)
 		goto done;
 
 	NDFREE_PNBUF(&nd);
 	vp = nd.ni_vp;
 
 	if (vn_isdisk_error(vp, &error)) {
 		error = swapongeom(vp);
 	} else if (vp->v_type == VREG &&
 	    (vp->v_mount->mnt_vfc->vfc_flags & VFCF_NETWORK) != 0 &&
 	    (error = VOP_GETATTR(vp, &attr, td->td_ucred)) == 0) {
 		/*
 		 * Allow direct swapping to NFS regular files in the same
 		 * way that nfs_mountroot() sets up diskless swapping.
 		 */
 		error = swaponvp(td, vp, attr.va_size / DEV_BSIZE);
 	}
 
 	if (error != 0)
 		vput(vp);
 	else
 		VOP_UNLOCK(vp);
 done:
 	sx_xunlock(&swdev_syscall_lock);
 	return (error);
 }
 
 /*
  * Check that the total amount of swap currently configured does not
  * exceed half the theoretical maximum.  If it does, print a warning
  * message.
  */
 static void
 swapon_check_swzone(void)
 {
 
 	/* recommend using no more than half that amount */
 	if (swap_total > swap_maxpages / 2) {
 		printf("warning: total configured swap (%lu pages) "
 		    "exceeds maximum recommended amount (%lu pages).\n",
 		    swap_total, swap_maxpages / 2);
 		printf("warning: increase kern.maxswzone "
 		    "or reduce amount of swap.\n");
 	}
 }
 
 /*
  * Register a new swap device on behalf of one of the backends.
  *
  *	vp, id, dev	recorded in the new swdevt as sw_vp, sw_id and sw_dev
  *	nblks		device size in DEV_BSIZE units; rounded down to a whole
  *			number of pages
  *	strategy, close	backend I/O and close methods
  *	flags		initial sw_flags
  *
  *	The device is placed after all existing devices in the swap block
  *	address space, the global counters are updated and the swapon event
  *	handlers are invoked.  Returns 0 on success, or an EINVAL extended
  *	error when the device offers no space beyond the reserved label area.
  */
 static int
 swaponsomething(struct vnode *vp, void *id, u_long nblks,
     sw_strategy_t *strategy, sw_close_t *close, dev_t dev, int flags)
 {
 	struct swdevt *sp, *tsp;
 	daddr_t dvbase;
 	u_long reserved;
 
 	/*
 	 * nblks is in DEV_BSIZE'd chunks, convert to PAGE_SIZE'd chunks.
 	 * First chop nblks off to page-align it, then convert.
 	 *
 	 * sw->sw_nblks is in page-sized chunks now too.
 	 */
 	nblks &= ~(ctodb(1) - 1);
 	nblks = dbtoc(nblks);
 
 	/*
 	 * The leading 'reserved' pages are never handed out (see below).  A
 	 * device that is not larger than that area has no usable space, and
 	 * for a smaller one "nblks - reserved" would wrap around.
 	 */
 	reserved = howmany(BBSIZE, PAGE_SIZE);
 	if (nblks <= reserved)
 		return (EXTERROR(EINVAL, "swap device too small"));
 
 	sp = malloc(sizeof *sp, M_VMPGDATA, M_WAITOK | M_ZERO);
 	sp->sw_blist = blist_create(nblks, M_WAITOK);
 	sp->sw_vp = vp;
 	sp->sw_id = id;
 	sp->sw_dev = dev;
 	sp->sw_nblks = nblks;
 	sp->sw_used = 0;
 	sp->sw_strategy = strategy;
 	sp->sw_close = close;
 	sp->sw_flags = flags;
 
 	/*
 	 * Do not free the first blocks in order to avoid overwriting
 	 * any bsd label at the front of the partition
 	 */
 	blist_free(sp->sw_blist, reserved, nblks - reserved);
 
 	/* Find the end of the highest range already in use. */
 	dvbase = 0;
 	mtx_lock(&sw_dev_mtx);
 	TAILQ_FOREACH(tsp, &swtailq, sw_list) {
 		if (tsp->sw_end >= dvbase) {
 			/*
 			 * We put one uncovered page between the devices
 			 * in order to definitively prevent any cross-device
 			 * I/O requests
 			 */
 			dvbase = tsp->sw_end + 1;
 		}
 	}
 	sp->sw_first = dvbase;
 	sp->sw_end = dvbase + nblks;
 	TAILQ_INSERT_TAIL(&swtailq, sp, sw_list);
 	nswapdev++;
 	swap_pager_avail += nblks - reserved;
 	swap_total += nblks;
 	swapon_check_swzone();
 	swp_sizecheck();
 	mtx_unlock(&sw_dev_mtx);
 	EVENTHANDLER_INVOKE(swapon, sp);
 
 	return (0);
 }
 
 /*
  * SYSCALL: swapoff(devname)
  *
  * Stop swapping on the device whose sw_vp is the vnode that 'name'
  * resolves to.  Requires PRIV_SWAPOFF; SWAPOFF_FORCE is the only flag
  * accepted.  Returns EINVAL for unknown flags or when no configured swap
  * device matches.
  *
  * XXX: Badly designed system call: it should use a device index
  * rather than filename as specification.  We keep sw_vp around
  * only to make this work.
  *
  * NOTE(review): the vnode reference obtained from namei() does not appear
  * to be released anywhere in this function -- confirm that this is intended.
  */
 static int
 kern_swapoff(struct thread *td, const char *name, enum uio_seg name_seg,
     u_int flags)
 {
 	struct vnode *vp;
 	struct nameidata nd;
 	struct swdevt *sp;
 	int error;
 
 	error = priv_check(td, PRIV_SWAPOFF);
 	if (error != 0)
 		return (error);
 	if ((flags & ~(SWAPOFF_FORCE)) != 0)
 		return (EINVAL);
 
 	sx_xlock(&swdev_syscall_lock);
 
 	NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE1, name_seg, name);
 	error = namei(&nd);
 	if (error != 0)
 		goto done;
 	NDFREE_PNBUF(&nd);
 	vp = nd.ni_vp;
 
 	/* Find the swap device registered with this vnode, if any. */
 	mtx_lock(&sw_dev_mtx);
 	TAILQ_FOREACH(sp, &swtailq, sw_list) {
 		if (sp->sw_vp == vp)
 			break;
 	}
 	mtx_unlock(&sw_dev_mtx);
 
 	if (sp != NULL)
 		error = swapoff_one(sp, td->td_ucred, flags);
 	else
 		error = EINVAL;
 done:
 	sx_xunlock(&swdev_syscall_lock);
 	return (error);
 }
 
 
 #ifdef COMPAT_FREEBSD13
 /*
  * COMPAT_FREEBSD13 swapoff entry point: takes only a user-space path name
  * and always passes zero flags to kern_swapoff().
  */
 int
 freebsd13_swapoff(struct thread *td, struct freebsd13_swapoff_args *uap)
 {
 	int error;
 
 	error = kern_swapoff(td, uap->name, UIO_USERSPACE, 0);
 	return (error);
 }
 #endif
 
 int
 sys_swapoff(struct thread *td, struct swapoff_args *uap)
 {
 	return (kern_swapoff(td, uap->name, UIO_USERSPACE, uap->flags));
 }
 
 /*
  * Take a single swap device out of service: block new allocations on it,
  * page its contents back in, close it through its backend, unlink it from
  * the device list and free it.  The caller must hold swdev_syscall_lock
  * exclusively.  Returns 0 on success, ENOMEM when the swapped-out data is
  * not expected to fit (unless SWAPOFF_FORCE is given), or the MAC error.
  */
 static int
 swapoff_one(struct swdevt *sp, struct ucred *cred, u_int flags)
 {
 	u_long nblks;
 #ifdef MAC
 	int error;
 #endif
 
 	sx_assert(&swdev_syscall_lock, SA_XLOCKED);
 #ifdef MAC
 	(void) vn_lock(sp->sw_vp, LK_EXCLUSIVE | LK_RETRY);
 	error = mac_system_check_swapoff(cred, sp->sw_vp);
 	(void) VOP_UNLOCK(sp->sw_vp);
 	if (error != 0)
 		return (error);
 #endif
 	nblks = sp->sw_nblks;
 
 	/*
 	 * We can turn off this swap device safely only if the
 	 * available virtual memory in the system will fit the amount
 	 * of data we will have to page back in, plus an epsilon so
 	 * the system doesn't become critically low on swap space.
 	 * The vm_free_count() part does not account e.g. for clean
 	 * pages that can be immediately reclaimed without paging, so
 	 * this is a very rough estimation.
 	 *
 	 * On the other hand, not turning swap off on swapoff_all()
 	 * means that we can lose swap data when filesystems go away,
 	 * which is arguably worse.
 	 */
 	if ((flags & SWAPOFF_FORCE) == 0) {
 		if (vm_free_count() + swap_pager_avail < nblks + nswap_lowat)
 			return (ENOMEM);
 	}
 
 	/*
 	 * Prevent further allocations on this device.
 	 */
 	mtx_lock(&sw_dev_mtx);
 	sp->sw_flags |= SW_CLOSING;
 	swap_pager_avail -= blist_fill(sp->sw_blist, 0, nblks);
 	swap_total -= nblks;
 	mtx_unlock(&sw_dev_mtx);
 
 	/*
 	 * Page in the contents of the device and close it.
 	 */
 	swap_pager_swapoff(sp);
 	sp->sw_close(curthread, sp);
 
 	/* Unlink the device and update the global bookkeeping. */
 	mtx_lock(&sw_dev_mtx);
 	sp->sw_id = NULL;
 	TAILQ_REMOVE(&swtailq, sp, sw_list);
 	nswapdev--;
 	if (nswapdev == 0) {
 		swap_pager_almost_full = true;
 		swap_pager_full = true;
 	}
 	if (swdevhd == sp)
 		swdevhd = NULL;
 	mtx_unlock(&sw_dev_mtx);
 
 	blist_destroy(sp->sw_blist);
 	free(sp, M_VMPGDATA);
 	return (0);
 }
 
 /*
  * Forcibly turn off every configured swap device, using thread0's
  * credentials.  Failures are reported on the console and the device is
  * skipped; successful removals are reported only when bootverbose is set.
  */
 void
 swapoff_all(void)
 {
 	struct swdevt *sp, *spt;
 	const char *devname;
 	int error;
 
 	sx_xlock(&swdev_syscall_lock);
 
 	mtx_lock(&sw_dev_mtx);
 	TAILQ_FOREACH_SAFE(sp, &swtailq, sw_list, spt) {
 		/* sw_dev_mtx is dropped while the device is torn down. */
 		mtx_unlock(&sw_dev_mtx);
 		devname = vn_isdisk(sp->sw_vp) ?
 		    devtoname(sp->sw_vp->v_rdev) : "[file]";
 		error = swapoff_one(sp, thread0.td_ucred, SWAPOFF_FORCE);
 		if (error == 0) {
 			if (bootverbose)
 				printf("Swap device %s removed.\n", devname);
 		} else {
 			printf("Cannot remove swap device %s (error=%d), "
 			    "skipping.\n", devname, error);
 		}
 		mtx_lock(&sw_dev_mtx);
 	}
 	mtx_unlock(&sw_dev_mtx);
 
 	sx_xunlock(&swdev_syscall_lock);
 }
 
 /*
  * Report overall swap usage.  *total receives swap_total; *used receives
  * swap_total less swap_pager_avail and less howmany(BBSIZE, PAGE_SIZE) for
  * each configured device, i.e. the leading blocks that swaponsomething()
  * never makes available are not counted as used.  The globals are read
  * without taking sw_dev_mtx in this function.
  */
 void
 swap_pager_status(int *total, int *used)
 {
 
 	*total = swap_total;
 	*used = swap_total - swap_pager_avail -
 	    nswapdev * howmany(BBSIZE, PAGE_SIZE);
 }
 
 /*
  * Describe the swap device at position 'name' in the device list.
  *
  *	name	zero-based position of the device in swtailq
  *	xs	filled in with version, device number, flags, size and usage
  *	devname	optional buffer receiving the device name, or "[file]" for a
  *		device that is not a disk; may be NULL
  *	len	size of the devname buffer
  *
  *	Returns 0 on success and ENOENT when the list has no such position.
  *	When devname is supplied and len is non-zero, the result is always
  *	NUL-terminated, truncating the name if necessary.
  */
 int
 swap_dev_info(int name, struct xswdev *xs, char *devname, size_t len)
 {
 	struct swdevt *sp;
 	const char *tmp_devname;
 	int error, n;
 
 	n = 0;
 	error = ENOENT;
 	mtx_lock(&sw_dev_mtx);
 	TAILQ_FOREACH(sp, &swtailq, sw_list) {
 		/* Skip entries until the requested position is reached. */
 		if (n != name) {
 			n++;
 			continue;
 		}
 		xs->xsw_version = XSWDEV_VERSION;
 		xs->xsw_dev = sp->sw_dev;
 		xs->xsw_flags = sp->sw_flags;
 		xs->xsw_nblks = sp->sw_nblks;
 		xs->xsw_used = sp->sw_used;
 		if (devname != NULL) {
 			if (vn_isdisk(sp->sw_vp))
 				tmp_devname = devtoname(sp->sw_vp->v_rdev);
 			else
 				tmp_devname = "[file]";
 			strncpy(devname, tmp_devname, len);
 			/*
 			 * strncpy() leaves the buffer unterminated when the
 			 * name fills it completely; always hand back a valid
 			 * C string.
 			 */
 			if (len > 0)
 				devname[len - 1] = '\0';
 		}
 		error = 0;
 		break;
 	}
 	mtx_unlock(&sw_dev_mtx);
 	return (error);
 }
 
 #if defined(COMPAT_FREEBSD11)
 #define XSWDEV_VERSION_11	1
 /*
  * COMPAT_FREEBSD11 layout of the per-device swap information.  Here xsw_dev
  * is a 32-bit field.
  */
 struct xswdev11 {
 	u_int	xsw_version;
 	uint32_t xsw_dev;
 	int	xsw_flags;
 	int	xsw_nblks;
 	int     xsw_used;
 };
 #endif
 
 #if defined(__amd64__) && defined(COMPAT_FREEBSD32)
 /*
  * Layout of the per-device swap information used on amd64 kernels built with
  * COMPAT_FREEBSD32.  The device number is carried in two u_int fields,
  * xsw_dev1 and xsw_dev2.
  */
 struct xswdev32 {
 	u_int	xsw_version;
 	u_int	xsw_dev1, xsw_dev2;
 	int	xsw_flags;
 	int	xsw_nblks;
 	int     xsw_used;
 };
 #endif
 
 /*
  * Handler for the vm.swap_info sysctl node.  The single name component is
  * the index of the swap device to describe.  The reply format is chosen by
  * the size of the caller's buffer: struct xswdev32 or struct xswdev11 when
  * the respective compat option is built in and the size matches, and struct
  * xswdev otherwise.
  */
 static int
 sysctl_vm_swap_info(SYSCTL_HANDLER_ARGS)
 {
 	struct xswdev xs;
 #if defined(__amd64__) && defined(COMPAT_FREEBSD32)
 	struct xswdev32 xs32;
 #endif
 #if defined(COMPAT_FREEBSD11)
 	struct xswdev11 xs11;
 #endif
 	int error;
 
 	/* Exactly one name component is expected. */
 	if (arg2 != 1)
 		return (EINVAL);
 
 	memset(&xs, 0, sizeof(xs));
 	error = swap_dev_info(*(int *)arg1, &xs, NULL, 0);
 	if (error != 0)
 		return (error);
 
 #if defined(__amd64__) && defined(COMPAT_FREEBSD32)
 	if (req->oldlen == sizeof(xs32)) {
 		memset(&xs32, 0, sizeof(xs32));
 		xs32.xsw_version = XSWDEV_VERSION;
 		/* Split the device number into its two 32-bit halves. */
 		xs32.xsw_dev1 = xs.xsw_dev;
 		xs32.xsw_dev2 = xs.xsw_dev >> 32;
 		xs32.xsw_flags = xs.xsw_flags;
 		xs32.xsw_nblks = xs.xsw_nblks;
 		xs32.xsw_used = xs.xsw_used;
 		return (SYSCTL_OUT(req, &xs32, sizeof(xs32)));
 	}
 #endif
 #if defined(COMPAT_FREEBSD11)
 	if (req->oldlen == sizeof(xs11)) {
 		memset(&xs11, 0, sizeof(xs11));
 		xs11.xsw_version = XSWDEV_VERSION_11;
 		xs11.xsw_dev = xs.xsw_dev; /* truncation */
 		xs11.xsw_flags = xs.xsw_flags;
 		xs11.xsw_nblks = xs.xsw_nblks;
 		xs11.xsw_used = xs.xsw_used;
 		return (SYSCTL_OUT(req, &xs11, sizeof(xs11)));
 	}
 #endif
 	return (SYSCTL_OUT(req, &xs, sizeof(xs)));
 }
 
 /* vm.nswapdev: read-only export of the nswapdev counter. */
 SYSCTL_INT(_vm, OID_AUTO, nswapdev, CTLFLAG_RD, &nswapdev, 0,
     "Number of swap devices");
 /* vm.swap_info: read-only node served by sysctl_vm_swap_info(). */
 SYSCTL_NODE(_vm, OID_AUTO, swap_info, CTLFLAG_RD | CTLFLAG_MPSAFE,
     sysctl_vm_swap_info,
     "Swap statistics by device");
 
 /*
  * Estimate how many pages of swap a vmspace is using by counting the swap
  * blocks assigned within the mapped range of every swap-backed object that
  * is directly referenced by a map entry.  Swap blocks that are shadowed or
  * not yet copied on write are not accounted.  The map must be locked.
  */
 long
 vmspace_swap_count(struct vmspace *vmspace)
 {
 	struct pctrie_iter blks;
 	vm_map_t map;
 	vm_map_entry_t cur;
 	vm_object_t object;
 	struct swblk *sb;
 	vm_pindex_t first, last;
 	long count;
 	int limit, slot, start;
 
 	map = &vmspace->vm_map;
 	count = 0;
 
 	VM_MAP_ENTRY_FOREACH(cur, map) {
 		if ((cur->eflags & MAP_ENTRY_IS_SUB_MAP) != 0)
 			continue;
 		object = cur->object.vm_object;
 		if (object == NULL || (object->flags & OBJ_SWAP) == 0)
 			continue;
 		VM_OBJECT_RLOCK(object);
 		/* Check the flag again now that the object lock is held. */
 		if ((object->flags & OBJ_SWAP) != 0) {
 			first = OFF_TO_IDX(cur->offset);
 			last = first + OFF_TO_IDX(cur->end - cur->start);
 			sb = swblk_iter_limit_init(&blks, object, first, last);
 			start = swblk_start(sb, first);
 			while (sb != NULL) {
 				limit = MIN(last - blks.index,
 				    SWAP_META_PAGES);
 				for (slot = start; slot < limit; slot++) {
 					if (sb->d[slot] != SWAPBLK_NONE)
 						count++;
 				}
 				sb = swblk_iter_next(&blks);
 				start = 0;
 			}
 		}
 		VM_OBJECT_RUNLOCK(object);
 	}
 	return (count);
 }
 
 /*
  * GEOM backend
  *
  * Swapping onto disk devices.
  *
  */
 
 /* Forward declaration; the function is defined further below. */
 static g_orphan_t swapgeom_orphan;
 
 /*
  * GEOM class named "SWAP".  The only method it supplies is the orphan
  * handler.
  */
 static struct g_class g_swap_class = {
 	.name = "SWAP",
 	.version = G_VERSION,
 	.orphan = swapgeom_orphan,
 };
 
 DECLARE_GEOM_CLASS(g_swap_class, g_class);
 
 /*
  * Tear down the consumer passed as 'arg': drop the read and write access
  * counts taken at swapon time, detach it from its provider and destroy it.
  * The 'flags' argument is not used.
  */
 static void
 swapgeom_close_ev(void *arg, int flags)
 {
 	struct g_consumer *cp = arg;
 
 	g_access(cp, -1, -1, 0);
 	g_detach(cp);
 	g_destroy_consumer(cp);
 }
 
 /*
  * Take one more reference on the g_consumer for an inflight transaction.
  * The count lives in cp->index; sw_dev_mtx must be held.
  */
 static void
 swapgeom_acquire(struct g_consumer *cp)
 {
 
 	mtx_assert(&sw_dev_mtx, MA_OWNED);
 	cp->index += 1;
 }
 
 /*
  * Drop one reference on the g_consumer.  When the last reference goes away
  * a close event is posted rather than closing directly, since the function
  * might be called from the biodone context.  sp->sw_id is cleared only if
  * the event was posted successfully.  sw_dev_mtx must be held.
  */
 static void
 swapgeom_release(struct g_consumer *cp, struct swdevt *sp)
 {
 
 	mtx_assert(&sw_dev_mtx, MA_OWNED);
 	cp->index--;
 	if (cp->index != 0)
 		return;
 	if (g_post_event(swapgeom_close_ev, cp, M_NOWAIT, NULL) == 0)
 		sp->sw_id = NULL;
 }
 
 /*
  * Completion callback for bios issued by swapgeom_strategy().  Copies the
  * outcome of the bio into the originating buf, completes the buf, drops the
  * consumer reference taken for the request and destroys the bio.
  */
 static void
 swapgeom_done(struct bio *bp2)
 {
 	struct g_consumer *cp = bp2->bio_from;
 	struct buf *bp = bp2->bio_caller2;
 	struct swdevt *sp;
 
 	/* Propagate flags, error and residual count to the buf. */
 	bp->b_ioflags = bp2->bio_flags;
 	if (bp2->bio_error != 0)
 		bp->b_ioflags |= BIO_ERROR;
 	bp->b_error = bp2->bio_error;
 	bp->b_resid = bp->b_bcount - bp2->bio_completed;
 	bp->b_caller1 = NULL;
 	bufdone(bp);
 
 	sp = bp2->bio_caller1;
 	mtx_lock(&sw_dev_mtx);
 	swapgeom_release(cp, sp);
 	mtx_unlock(&sw_dev_mtx);
 	g_destroy_bio(bp2);
 }
 
 static void
 swapgeom_strategy(struct buf *bp, struct swdevt *sp)
 {
 	struct bio *bio;
 	struct g_consumer *cp;
 
 	mtx_lock(&sw_dev_mtx);
 	cp = sp->sw_id;
 	if (cp == NULL) {
 		mtx_unlock(&sw_dev_mtx);
 		bp->b_error = ENXIO;
 		bp->b_ioflags |= BIO_ERROR;
 		bufdone(bp);
 		return;
 	}
 	swapgeom_acquire(cp);
 	mtx_unlock(&sw_dev_mtx);
 	if (bp->b_iocmd == BIO_WRITE)
 		bio = g_new_bio();
 	else
 		bio = g_alloc_bio();
 	if (bio == NULL) {
 		mtx_lock(&sw_dev_mtx);
 		swapgeom_release(cp, sp);
 		mtx_unlock(&sw_dev_mtx);
 		bp->b_error = ENOMEM;
 		bp->b_ioflags |= BIO_ERROR;
 		printf("swap_pager: cannot allocate bio\n");
 		bufdone(bp);
 		return;
 	}
 
 	bp->b_caller1 = bio;
 	bio->bio_caller1 = sp;
 	bio->bio_caller2 = bp;
 	bio->bio_cmd = bp->b_iocmd;
 	bio->bio_offset = (bp->b_blkno - sp->sw_first) * PAGE_SIZE;
 	bio->bio_length = bp->b_bcount;
 	bio->bio_done = swapgeom_done;
 	bio->bio_flags |= BIO_SWAP;
 	if (!buf_mapped(bp)) {
 		bio->bio_ma = bp->b_pages;
 		bio->bio_data = unmapped_buf;
 		bio->bio_ma_offset = (vm_offset_t)bp->b_offset & PAGE_MASK;
 		bio->bio_ma_n = bp->b_npages;
 		bio->bio_flags |= BIO_UNMAPPED;
 	} else {
 		bio->bio_data = bp->b_data;
 		bio->bio_ma = NULL;
 	}
 	g_io_request(bio, cp);
 	return;
 }
 
 /*
  * Orphan method of the swap GEOM class.  The swap device using the consumer,
  * if there is one, is marked SW_CLOSING.  The consumer's initial reference
  * is dropped, and if that was the last one and a device was found, the
  * consumer is closed right away.
  */
 static void
 swapgeom_orphan(struct g_consumer *cp)
 {
 	struct swdevt *sp;
 	int destroy;
 
 	mtx_lock(&sw_dev_mtx);
 	TAILQ_FOREACH(sp, &swtailq, sw_list) {
 		if (sp->sw_id != cp)
 			continue;
 		sp->sw_flags |= SW_CLOSING;
 		break;
 	}
 	/*
 	 * Drop reference we were created with. Do directly since we're in a
 	 * special context where we don't have to queue the call to
 	 * swapgeom_close_ev().
 	 */
 	cp->index--;
 	destroy = 0;
 	if (sp != NULL && cp->index == 0) {
 		destroy = 1;
 		sp->sw_id = NULL;
 	}
 	mtx_unlock(&sw_dev_mtx);
 	if (destroy != 0)
 		swapgeom_close_ev(cp, 0);
 }
 
 /*
  * Close method of the GEOM backend: detach the consumer from the swap
  * device under sw_dev_mtx and, if there was one, have the GEOM event thread
  * close it, waiting for that to finish.
  */
 static void
 swapgeom_close(struct thread *td, struct swdevt *sw)
 {
 	struct g_consumer *cp;
 
 	mtx_lock(&sw_dev_mtx);
 	cp = sw->sw_id;
 	sw->sw_id = NULL;
 	mtx_unlock(&sw_dev_mtx);
 
 	if (cp == NULL)
 		return;
 
 	/*
 	 * swapgeom_close() may be called from the biodone context,
 	 * where we cannot perform topology changes.  Delegate the
 	 * work to the events thread.
 	 */
 	g_waitfor_event(swapgeom_close_ev, cp, M_WAITOK, NULL);
 }
 
 /*
  * Attach a new consumer to the GEOM provider behind 'dev' and register it
  * as a swap device.  Returns ENODEV when the device has no provider, EBUSY
  * when the provider is already used for swap, or the error from g_access()
  * or swaponsomething().  Called by swapongeom() with the topology lock held.
  */
 static int
 swapongeom_locked(struct cdev *dev, struct vnode *vp)
 {
 	struct g_provider *pp;
 	struct g_consumer *cp;
 	static struct g_geom *gp;
 	struct swdevt *sp;
 	u_long nblks;
 	int error;
 
 	pp = g_dev_getprovider(dev);
 	if (pp == NULL)
 		return (ENODEV);
 
 	/* Refuse a provider that already backs a swap device. */
 	mtx_lock(&sw_dev_mtx);
 	TAILQ_FOREACH(sp, &swtailq, sw_list) {
 		cp = sp->sw_id;
 		if (cp != NULL && cp->provider == pp) {
 			mtx_unlock(&sw_dev_mtx);
 			return (EBUSY);
 		}
 	}
 	mtx_unlock(&sw_dev_mtx);
 
 	/* The single "swap" geom is created on first use. */
 	if (gp == NULL)
 		gp = g_new_geomf(&g_swap_class, "swap");
 	cp = g_new_consumer(gp);
 	cp->index = 1;	/* Number of active I/Os, plus one for being active. */
 	cp->flags |=  G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE;
 	g_attach(cp, pp);
 
 	/*
 	 * XXX: Every time you think you can improve the margin for
 	 * footshooting, somebody depends on the ability to do so:
 	 * savecore(8) wants to write to our swapdev so we cannot
 	 * set an exclusive count :-(
 	 */
 	error = g_access(cp, 1, 1, 0);
 	if (error != 0)
 		goto detach;
 
 	nblks = pp->mediasize / DEV_BSIZE;
 	error = swaponsomething(vp, cp, nblks, swapgeom_strategy,
 	    swapgeom_close, dev2udev(dev),
 	    (pp->flags & G_PF_ACCEPT_UNMAPPED) != 0 ? SW_UNMAPPED : 0);
 	if (error == 0)
 		return (0);
 
 	/* Registration failed: undo the access counts taken above. */
 	g_access(cp, -1, -1, 0);
 detach:
 	g_detach(cp);
 	g_destroy_consumer(cp);
 	return (error);
 }
 
 /*
  * Enable swapping on the character device behind the exclusively locked
  * vnode 'vp'.  Returns ENOENT when the vnode is not a character device or
  * has been doomed; otherwise the work is done by swapongeom_locked() under
  * the GEOM topology lock.
  */
 static int
 swapongeom(struct vnode *vp)
 {
 	int error;
 
 	ASSERT_VOP_ELOCKED(vp, "swapongeom");
 	if (vp->v_type != VCHR || VN_IS_DOOMED(vp))
 		return (ENOENT);
 
 	g_topology_lock();
 	error = swapongeom_locked(vp->v_rdev, vp);
 	g_topology_unlock();
 	return (error);
 }
 
 /*
  * VNODE backend
  *
  * This is used mainly for network filesystem (read: probably only tested
  * with NFS) swapfiles.
  *
  */
 
 /*
  * Strategy method of the vnode backend: redirect the buf to the vnode stored
  * in sp->sw_id and issue it with bstrategy() while that vnode is locked
  * (exclusively for writes, shared otherwise).
  */
 static void
 swapdev_strategy(struct buf *bp, struct swdevt *sp)
 {
 	struct vnode *vp2;
 
 	/* Make the block number relative to this device, in disk blocks. */
 	bp->b_blkno = ctodb(bp->b_blkno - sp->sw_first);
 
 	vp2 = sp->sw_id;
 	/* NOTE(review): no matching vdrop() is visible in this function. */
 	vhold(vp2);
 	if (bp->b_iocmd == BIO_WRITE) {
 		vn_lock(vp2, LK_EXCLUSIVE | LK_RETRY);
 		/*
 		 * Move the write reference from the buf's current bufobj, if
 		 * it has one, to the swap vnode's bufobj.
 		 */
 		if (bp->b_bufobj)
 			bufobj_wdrop(bp->b_bufobj);
 		bufobj_wref(&vp2->v_bufobj);
 	} else {
 		vn_lock(vp2, LK_SHARED | LK_RETRY);
 	}
 	if (bp->b_bufobj != &vp2->v_bufobj)
 		bp->b_bufobj = &vp2->v_bufobj;
 	bp->b_vp = vp2;
 	bp->b_iooffset = dbtob(bp->b_blkno);
 	bstrategy(bp);
 	VOP_UNLOCK(vp2);
 }
 
 /*
  * Close method of the vnode backend: lock the device's vnode, close it for
  * reading and writing with the thread's credentials, then unlock it and
  * drop a reference with vput().
  */
 static void
 swapdev_close(struct thread *td, struct swdevt *sp)
 {
 	struct vnode *swapvp;
 
 	swapvp = sp->sw_vp;
 	vn_lock(swapvp, LK_EXCLUSIVE | LK_RETRY);
 	VOP_CLOSE(swapvp, FREAD | FWRITE, td->td_ucred, td);
 	vput(swapvp);
 }
 
 /*
  * Enable swapping on the exclusively locked vnode 'vp', whose size is nblks
  * DEV_BSIZE blocks.  Returns ENXIO for an empty file, EBUSY when the vnode
  * is already a swap device, or the error from the MAC check, VOP_OPEN() or
  * swaponsomething().  The vnode is closed again if registration fails.
  */
 static int
 swaponvp(struct thread *td, struct vnode *vp, u_long nblks)
 {
 	struct swdevt *sp;
 	int error;
 
 	ASSERT_VOP_ELOCKED(vp, "swaponvp");
 	if (nblks == 0)
 		return (ENXIO);
 
 	/* Refuse a vnode that is already registered. */
 	mtx_lock(&sw_dev_mtx);
 	TAILQ_FOREACH(sp, &swtailq, sw_list) {
 		if (sp->sw_id == vp) {
 			mtx_unlock(&sw_dev_mtx);
 			return (EBUSY);
 		}
 	}
 	mtx_unlock(&sw_dev_mtx);
 
 #ifdef MAC
 	error = mac_system_check_swapon(td->td_ucred, vp);
 	if (error != 0)
 		return (error);
 #endif
 	error = VOP_OPEN(vp, FREAD | FWRITE, td->td_ucred, td, NULL);
 	if (error != 0)
 		return (error);
 
 	error = swaponsomething(vp, vp, nblks, swapdev_strategy, swapdev_close,
 	    NODEV, 0);
 	if (error != 0)
 		VOP_CLOSE(vp, FREAD | FWRITE, td->td_ucred, td);
 	return (error);
 }
 
 /*
  * Sysctl handler for the limit on asynchronous swap writes.  A read reports
  * nsw_wcount_async_max.  A write accepts values from 1 to nswbuf / 2 and
  * adjusts both the limit and the current free count, sleeping as needed
  * until the whole change has been applied.
  */
 static int
 sysctl_swap_async_max(SYSCTL_HANDLER_ARGS)
 {
 	int delta, error, new;
 
 	new = nsw_wcount_async_max;
 	error = sysctl_handle_int(oidp, &new, 0, req);
 	if (error != 0 || req->newptr == NULL)
 		return (error);
 
 	if (new < 1 || new > nswbuf / 2)
 		return (EINVAL);
 
 	mtx_lock(&swbuf_mtx);
 	for (;;) {
 		delta = new - nsw_wcount_async_max;
 		if (delta == 0)
 			break;
 		if (nsw_wcount_async + delta < 0) {
 			/*
 			 * The current async count is too low to absorb the
 			 * whole reduction, so squeeze the update in slowly:
 			 * take what is there now and wait for more.  Sleep
 			 * with a higher priority than getpbuf() to finish
 			 * faster.
 			 */
 			nsw_wcount_async_max -= nsw_wcount_async;
 			nsw_wcount_async = 0;
 			msleep(&nsw_wcount_async, &swbuf_mtx, PSWP,
 			    "swpsysctl", 0);
 			continue;
 		}
 		/* The remaining difference can be applied in one step. */
 		nsw_wcount_async += delta;
 		nsw_wcount_async_max += delta;
 		wakeup(&nsw_wcount_async);
 	}
 	mtx_unlock(&swbuf_mtx);
 
 	return (0);
 }
 
 /*
  * Account for a new writable mapping of [start, end) by adding its length
  * to the object's writemappings counter.  The object must not be anonymous.
  * The object lock is taken and released here.
  */
 static void
 swap_pager_update_writecount(vm_object_t object, vm_offset_t start,
     vm_offset_t end)
 {
 
 	VM_OBJECT_WLOCK(object);
 	KASSERT((object->flags & OBJ_ANON) == 0,
 	    ("Splittable object with writecount"));
 	object->un_pager.swp.writemappings =
 	    object->un_pager.swp.writemappings + ((vm_ooffset_t)end - start);
 	VM_OBJECT_WUNLOCK(object);
 }
 
 /*
  * Account for the removal of a writable mapping of [start, end) by
  * subtracting its length from the object's writemappings counter.  The
  * object must not be anonymous, and the counter must be at least as large
  * as the length removed.  The object lock is taken and released here.
  */
 static void
 swap_pager_release_writecount(vm_object_t object, vm_offset_t start,
     vm_offset_t end)
 {
 
 	VM_OBJECT_WLOCK(object);
 	KASSERT((object->flags & OBJ_ANON) == 0,
 	    ("Splittable object with writecount"));
 	KASSERT(object->un_pager.swp.writemappings >= (vm_ooffset_t)end - start,
 	    ("swap obj %p writecount %jx dec %jx", object,
 	    (uintmax_t)object->un_pager.swp.writemappings,
 	    (uintmax_t)((vm_ooffset_t)end - start)));
 	object->un_pager.swp.writemappings =
 	    object->un_pager.swp.writemappings - ((vm_ooffset_t)end - start);
 	VM_OBJECT_WUNLOCK(object);
 }
diff --git a/sys/vm/swap_pager.h b/sys/vm/swap_pager.h
index 3287886026f7..da1457762c0b 100644
--- a/sys/vm/swap_pager.h
+++ b/sys/vm/swap_pager.h
@@ -1,91 +1,92 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1990 University of Utah.
  * Copyright (c) 1991 The Regents of the University of California.
  * All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * the Systems Programming Group of the University of Utah Computer
  * Science Department.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #ifndef	_VM_SWAP_PAGER_H_
 #define	_VM_SWAP_PAGER_H_
 
 #include <sys/_types.h>
 
 /* Forward declarations needed by the callback typedefs below. */
 struct buf;
 struct swdevt;
 struct thread;
 /* Function types of the sw_strategy and sw_close members of struct swdevt. */
 typedef void sw_strategy_t(struct buf *, struct swdevt *);
 typedef void sw_close_t(struct thread *, struct swdevt *);
 
 /*
  * Swap device table
  *
  * NOTE(review): the per-field notes below are inferred from the field
  * names and types only; confirm them against swap_pager.c.
  */
 struct swdevt {
 	int	sw_flags;	/* presumably SW_* flag bits */
 	int	sw_nblks;	/* presumably the device size in blocks */
 	int     sw_used;	/* presumably the number of blocks in use */
 	dev_t	sw_dev;
 	struct vnode *sw_vp;
 	void	*sw_id;
 	__daddr_t sw_first;	/* presumably first block of this device */
 	__daddr_t sw_end;	/* presumably end block of this device */
 	struct blist *sw_blist;
 	TAILQ_ENTRY(swdevt)	sw_list;	/* linkage on a TAILQ of swdevt */
 	sw_strategy_t		*sw_strategy;	/* I/O callback (buf, swdevt) */
 	sw_close_t		*sw_close;	/* close callback (thread, swdevt) */
 };
 
 /* Flag bits, presumably stored in swdevt.sw_flags -- TODO confirm. */
 #define	SW_UNMAPPED	0x01
 #define	SW_CLOSING	0x04
 
 #ifdef _KERNEL
 
 /* Everything up to the matching #endif is visible to kernel code only. */
 
 /* Global state exported by the swap pager. */
+extern bool swap_pager_almost_full;
 extern int swap_pager_avail;
 extern int nsw_cluster_max;
 
 /* Swap pager entry points. */
 struct xswdev;
 int swap_dev_info(int name, struct xswdev *xs, char *devname, size_t len);
 void swap_pager_copy(vm_object_t, vm_object_t, vm_pindex_t, int);
 bool swap_pager_scan_all_shadowed(vm_object_t object);
 vm_pindex_t swap_pager_seek_data(vm_object_t object, vm_pindex_t pindex);
 vm_pindex_t swap_pager_seek_hole(vm_object_t object, vm_pindex_t pindex);
 void swap_pager_freespace(vm_object_t object, vm_pindex_t start,
     vm_size_t size, vm_size_t *freed);
 void swap_pager_swap_init(void);
 int swap_pager_nswapdev(void);
 int swap_pager_reserve(vm_object_t, vm_pindex_t, vm_pindex_t);
 void swap_pager_status(int *total, int *used);
 u_long swap_pager_swapped_pages(vm_object_t object);
 void swapoff_all(void);
 bool swap_pager_init_object(vm_object_t object, void *handle,
     struct ucred *cred, vm_ooffset_t size, vm_ooffset_t offset);
 #endif				/* _KERNEL */
 #endif				/* _VM_SWAP_PAGER_H_ */
diff --git a/sys/vm/vm_pageout.c b/sys/vm/vm_pageout.c
index b500eb8156bc..3f1be78342c9 100644
--- a/sys/vm/vm_pageout.c
+++ b/sys/vm/vm_pageout.c
@@ -1,2416 +1,2449 @@
 /*-
  * SPDX-License-Identifier: (BSD-4-Clause AND MIT-CMU)
  *
  * Copyright (c) 1991 Regents of the University of California.
  * All rights reserved.
  * Copyright (c) 1994 John S. Dyson
  * All rights reserved.
  * Copyright (c) 1994 David Greenman
  * All rights reserved.
  * Copyright (c) 2005 Yahoo! Technologies Norway AS
  * All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * The Mach Operating System project at Carnegie-Mellon University.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by the University of
  *	California, Berkeley and its contributors.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *
  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
  * All rights reserved.
  *
  * Authors: Avadis Tevanian, Jr., Michael Wayne Young
  *
  * Permission to use, copy, modify and distribute this software and
  * its documentation is hereby granted, provided that both the copyright
  * notice and this permission notice appear in all copies of the
  * software, derivative works or modified versions, and any portions
  * thereof, and that both notices appear in supporting documentation.
  *
  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
  *
  * Carnegie Mellon requests users of this software to return to
  *
  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
  *  School of Computer Science
  *  Carnegie Mellon University
  *  Pittsburgh PA 15213-3890
  *
  * any improvements or extensions that they make and grant Carnegie the
  * rights to redistribute these changes.
  */
 
 /*
  *	The proverbial page-out daemon.
  */
 
 #include <sys/cdefs.h>
 #include "opt_vm.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/blockcount.h>
 #include <sys/eventhandler.h>
+#include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/kthread.h>
 #include <sys/ktr.h>
 #include <sys/mount.h>
 #include <sys/racct.h>
 #include <sys/resourcevar.h>
 #include <sys/sched.h>
 #include <sys/sdt.h>
 #include <sys/signalvar.h>
 #include <sys/smp.h>
 #include <sys/time.h>
 #include <sys/vnode.h>
 #include <sys/vmmeter.h>
 #include <sys/rwlock.h>
 #include <sys/sx.h>
 #include <sys/sysctl.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/vm_map.h>
 #include <vm/vm_pageout.h>
 #include <vm/vm_pager.h>
 #include <vm/vm_phys.h>
 #include <vm/vm_pagequeue.h>
 #include <vm/vm_radix.h>
 #include <vm/swap_pager.h>
 #include <vm/vm_extern.h>
 #include <vm/uma.h>
 
 /*
  * System initialization
  */
 
 /*
  * Forward declarations.  vm_pageout() is the function named in the
  * "pagedaemon" kproc descriptor below.
  */
 static void vm_pageout(void);
 static void vm_pageout_init(void);
 static int vm_pageout_clean(vm_page_t m, int *numpagedout);
 static int vm_pageout_cluster(vm_page_t m);
 static void vm_pageout_mightbe_oom(struct vm_domain *vmd, int page_shortage,
     int starting_page_shortage);
 
 /* Registered at SI_SUB_KTHREAD_PAGE with SI_ORDER_FIRST. */
 SYSINIT(pagedaemon_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_FIRST, vm_pageout_init,
     NULL);
 
 /* Referenced by page_kp below via &pageproc. */
 struct proc *pageproc;
 
 /* Descriptor handed to kproc_start() by the SYSINIT that follows. */
 static struct kproc_desc page_kp = {
 	"pagedaemon",
 	vm_pageout,
 	&pageproc
 };
 /* Same subsystem as pagedaemon_init, but ordered SI_ORDER_SECOND. */
 SYSINIT(pagedaemon, SI_SUB_KTHREAD_PAGE, SI_ORDER_SECOND, kproc_start,
     &page_kp);
 
 /* DTrace provider "vm" and its vm__lowmem_scan probe. */
 SDT_PROVIDER_DEFINE(vm);
 SDT_PROBE_DEFINE(vm, , , vm__lowmem_scan);
 
 /* Pagedaemon activity rates, in subdivisions of one second. */
 #define	VM_LAUNDER_RATE		10
 #define	VM_INACT_SCAN_RATE	10
 
 /*
  * Set to 1 by vm_pageout_swapon() and cleared by vm_pageout_swapoff();
  * vm_pageout_launder() reads it to choose the first queue to scan.
  */
 static int swapdev_enabled;
 /* Cluster size used by vm_pageout_cluster() when grouping pages. */
 int vm_pageout_page_count = 32;
 
 /* vm.panic_on_oom: read/write, also settable as a loader tunable. */
 static int vm_panic_on_oom = 0;
 SYSCTL_INT(_vm, OID_AUTO, panic_on_oom,
     CTLFLAG_RWTUN, &vm_panic_on_oom, 0,
     "Panic on the given number of out-of-memory errors instead of "
     "killing the largest process");
 
 /* vm.pageout_update_period */
 static int vm_pageout_update_period;
 SYSCTL_INT(_vm, OID_AUTO, pageout_update_period,
     CTLFLAG_RWTUN, &vm_pageout_update_period, 0,
     "Maximum active LRU update period");
 
 /* vm.pageout_cpus_per_thread: read-only at run time (CTLFLAG_RDTUN). */
 static int pageout_cpus_per_thread = 16;
 SYSCTL_INT(_vm, OID_AUTO, pageout_cpus_per_thread, CTLFLAG_RDTUN,
     &pageout_cpus_per_thread, 0,
     "Number of CPUs per pagedaemon worker thread");
   
 /* vm.lowmem_period */
 static int lowmem_period = 10;
 SYSCTL_INT(_vm, OID_AUTO, lowmem_period, CTLFLAG_RWTUN, &lowmem_period, 0,
     "Low memory callback period");
 
 /*
  * vm.disable_swapspace_pageouts: when non-zero, vm_pageout_launder() does
  * not page out dirty pages of OBJ_SWAP objects.
  */
 static int disable_swap_pageouts;
 SYSCTL_INT(_vm, OID_AUTO, disable_swapspace_pageouts,
     CTLFLAG_RWTUN, &disable_swap_pageouts, 0,
     "Disallow swapout of dirty pages");
 
 /*
  * vm.pageout_lock_miss: incremented by vm_pageout_launder() each time
  * vm_pageout_clean() returns EDEADLK.
  */
 static int pageout_lock_miss;
 SYSCTL_INT(_vm, OID_AUTO, pageout_lock_miss,
     CTLFLAG_RD, &pageout_lock_miss, 0,
     "vget() lock misses during pageout");
 
 /* vm.pageout_oom_seq */
 static int vm_pageout_oom_seq = 12;
 SYSCTL_INT(_vm, OID_AUTO, pageout_oom_seq,
     CTLFLAG_RWTUN, &vm_pageout_oom_seq, 0,
     "back-to-back calls to oom detector to start OOM");
 
-static int act_scan_laundry_weight = 3;
-
 /*
  * Sysctl handler for the laundry weight knobs.  After this change it is
  * shared: arg1 points at the int being tuned and arg2 is its minimum.  A
  * request without new data only reports the current value; a new value
  * outside [arg2, 100] is rejected with EINVAL.
  */
 static int
-sysctl_act_scan_laundry_weight(SYSCTL_HANDLER_ARGS)
+sysctl_laundry_weight(SYSCTL_HANDLER_ARGS)
 {
-	int error, newval;
+	int error, val;
 
-	newval = act_scan_laundry_weight;
-	error = sysctl_handle_int(oidp, &newval, 0, req);
-	if (error || req->newptr == NULL)
+	val = *(int *)arg1;
+	error = sysctl_handle_int(oidp, &val, 0, req);
+	if (error != 0 || req->newptr == NULL)
 		return (error);
-	if (newval < 1)
+	if (val < arg2 || val > 100)
 		return (EINVAL);
-	act_scan_laundry_weight = newval;
+	*(int *)arg1 = val;
 	return (0);
 }
-SYSCTL_PROC(_vm, OID_AUTO, act_scan_laundry_weight, CTLFLAG_RWTUN | CTLTYPE_INT,
-    &act_scan_laundry_weight, 0, sysctl_act_scan_laundry_weight, "I",
+
+static int act_scan_laundry_weight = 3;
+SYSCTL_PROC(_vm, OID_AUTO, act_scan_laundry_weight,
+    CTLTYPE_INT | CTLFLAG_RWTUN, &act_scan_laundry_weight, 1,
+    sysctl_laundry_weight, "I",
     "weight given to clean vs. dirty pages in active queue scans");
 
+static int inact_scan_laundry_weight = 1;
+SYSCTL_PROC(_vm, OID_AUTO, inact_scan_laundry_weight,
+    CTLTYPE_INT | CTLFLAG_RWTUN, &inact_scan_laundry_weight, 0,
+    sysctl_laundry_weight, "I",
+    "weight given to clean vs. dirty pages in inactive queue scans");
+
 /* vm.background_launder_rate */
 static u_int vm_background_launder_rate = 4096;
 SYSCTL_UINT(_vm, OID_AUTO, background_launder_rate, CTLFLAG_RWTUN,
     &vm_background_launder_rate, 0,
     "background laundering rate, in kilobytes per second");
 
 /* vm.background_launder_max */
 static u_int vm_background_launder_max = 20 * 1024;
 SYSCTL_UINT(_vm, OID_AUTO, background_launder_max, CTLFLAG_RWTUN,
     &vm_background_launder_max, 0,
     "background laundering cap, in kilobytes");
 
 /* vm.max_user_wired: not static, so visible outside this file. */
 u_long vm_page_max_user_wired;
 SYSCTL_ULONG(_vm, OID_AUTO, max_user_wired, CTLFLAG_RW,
     &vm_page_max_user_wired, 0,
     "system-wide limit to user-wired page count");
 
 /* Forward declarations for the laundering code defined further down. */
 static u_int isqrt(u_int num);
 static int vm_pageout_launder(struct vm_domain *vmd, int launder,
     bool in_shortfall);
 static void vm_pageout_laundry_worker(void *arg);
 
 /*
  * State of one page queue scan, set up by vm_pageout_init_scan() and
  * consumed by vm_pageout_collect_batch() and vm_pageout_end_scan().
  */
 struct scan_state {
 	struct vm_batchqueue bq;	/* pages collected but not yet returned */
 	struct vm_pagequeue *pq;	/* queue being scanned */
 	vm_page_t	marker;		/* marker page; the scan resumes after it */
 	int		maxscan;	/* upper bound on "scanned" */
 	int		scanned;	/* queue entries stepped over so far */
 };
 
 /*
  * Begin a scan of page queue "pq".  The marker is linked into the queue,
  * right after "after" or at the head when "after" is NULL, and the scan
  * state is reset.  The queue lock must be held on entry and is released
  * before returning.
  */
 static void
 vm_pageout_init_scan(struct scan_state *ss, struct vm_pagequeue *pq,
     vm_page_t marker, vm_page_t after, int maxscan)
 {
 
 	vm_pagequeue_assert_locked(pq);
 	KASSERT((marker->a.flags & PGA_ENQUEUED) == 0,
 	    ("marker %p already enqueued", marker));
 
 	/* Place the marker at the requested starting point. */
 	if (after != NULL) {
 		TAILQ_INSERT_AFTER(&pq->pq_pl, after, marker, plinks.q);
 	} else {
 		TAILQ_INSERT_HEAD(&pq->pq_pl, marker, plinks.q);
 	}
 	vm_page_aflag_set(marker, PGA_ENQUEUED);
 
 	/* Start with an empty batch and a zeroed scan counter. */
 	ss->scanned = 0;
 	ss->maxscan = maxscan;
 	ss->marker = marker;
 	ss->pq = pq;
 	vm_batchqueue_init(&ss->bq);
 
 	vm_pagequeue_unlock(pq);
 }
 
 static void
 vm_pageout_end_scan(struct scan_state *ss)
 {
 	struct vm_pagequeue *pq;
 
 	pq = ss->pq;
 	vm_pagequeue_assert_locked(pq);
 	KASSERT((ss->marker->a.flags & PGA_ENQUEUED) != 0,
 	    ("marker %p not enqueued", ss->marker));
 
 	TAILQ_REMOVE(&pq->pq_pl, ss->marker, plinks.q);
 	vm_page_aflag_clear(ss->marker, PGA_ENQUEUED);
 	pq->pq_pdpages += ss->scanned;
 }
 
 /*
  * Add a small number of queued pages to a batch queue for later processing
  * without the corresponding queue lock held.  The caller must have enqueued a
  * marker page at the desired start point for the scan.  Pages will be
  * physically dequeued if the caller so requests.  Otherwise, the returned
  * batch may contain marker pages, and it is up to the caller to handle them.
  *
  * When processing the batch queue, vm_pageout_defer() must be used to
  * determine whether the page has been logically dequeued since the batch was
  * collected.
  */
 static __always_inline void
 vm_pageout_collect_batch(struct scan_state *ss, const bool dequeue)
 {
 	struct vm_pagequeue *pq;
 	vm_page_t m, marker, n;
 
 	marker = ss->marker;
 	pq = ss->pq;
 
 	KASSERT((marker->a.flags & PGA_ENQUEUED) != 0,
 	    ("marker %p not enqueued", ss->marker));
 
 	vm_pagequeue_lock(pq);
 	/*
 	 * Walk forward from the marker until the queue ends, the scan budget
 	 * (maxscan) is used up, or the batch is full.  The successor is saved
 	 * in "n" before "m" is possibly unlinked below.
 	 */
 	for (m = TAILQ_NEXT(marker, plinks.q); m != NULL &&
 	    ss->scanned < ss->maxscan && ss->bq.bq_cnt < VM_BATCHQUEUE_SIZE;
 	    m = n, ss->scanned++) {
 		n = TAILQ_NEXT(m, plinks.q);
 		if ((m->flags & PG_MARKER) == 0) {
 			KASSERT((m->a.flags & PGA_ENQUEUED) != 0,
 			    ("page %p not enqueued", m));
 			KASSERT((m->flags & PG_FICTITIOUS) == 0,
 			    ("Fictitious page %p cannot be in page queue", m));
 			KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 			    ("Unmanaged page %p cannot be in page queue", m));
 		} else if (dequeue)
 			/* Markers are left in the queue when dequeueing. */
 			continue;
 
 		(void)vm_batchqueue_insert(&ss->bq, m);
 		if (dequeue) {
 			TAILQ_REMOVE(&pq->pq_pl, m, plinks.q);
 			vm_page_aflag_clear(m, PGA_ENQUEUED);
 		}
 	}
 	/*
 	 * Move the marker so that it sits just before the first entry that
 	 * was not visited, or at the tail if the end of the queue was reached.
 	 */
 	TAILQ_REMOVE(&pq->pq_pl, marker, plinks.q);
 	if (__predict_true(m != NULL))
 		TAILQ_INSERT_BEFORE(m, marker, plinks.q);
 	else
 		TAILQ_INSERT_TAIL(&pq->pq_pl, marker, plinks.q);
 	/* Pages unlinked above no longer count towards the queue length. */
 	if (dequeue)
 		vm_pagequeue_cnt_add(pq, -ss->bq.bq_cnt);
 	vm_pagequeue_unlock(pq);
 }
 
 /*
  * Hand out the next page of the scan, refilling the batch from the page
  * queue first when it has run empty.  NULL means the scan is complete.
  */
 static __always_inline vm_page_t
 vm_pageout_next(struct scan_state *ss, const bool dequeue)
 {
 	struct vm_batchqueue *batch;
 
 	batch = &ss->bq;
 	if (batch->bq_cnt == 0)
 		vm_pageout_collect_batch(ss, dequeue);
 	return (vm_batchqueue_pop(batch));
 }
 
 /*
  * Decide whether the caller should leave page "m" alone for now.  Returns
  * true when the page's queue or enqueued state no longer matches what the
  * caller expects, or when a queue operation is pending on it; in the
  * latter case the pending operation is submitted for processing first.
  */
 static __always_inline bool
 vm_pageout_defer(vm_page_t m, const uint8_t queue, const bool enqueued)
 {
 	vm_page_astate_t state;
 	bool is_enqueued;
 
 	state = vm_page_astate_load(m);
 	is_enqueued = (state.flags & PGA_ENQUEUED) != 0;
 	if (__predict_false(state.queue != queue || is_enqueued != enqueued))
 		return (true);
 
 	/* Nothing pending: the page can be processed now. */
 	if ((state.flags & PGA_QUEUE_OP_MASK) == 0)
 		return (false);
 
 	vm_page_pqbatch_submit(m, queue);
 	return (true);
 }
 
 /*
  * Test whether a page may join a pageout cluster.  The page must be
  * exclusively busied here, must not be wired, must be dirty and in the
  * laundry queue, and its write mappings must be removable.  On success
  * the page is returned still exclusively busied; otherwise any busy
  * state acquired here is dropped.
  */
 static bool
 vm_pageout_flushable(vm_page_t m)
 {
 	bool flushable;
 
 	if (vm_page_tryxbusy(m) == 0)
 		return (false);
 
 	flushable = false;
 	if (!vm_page_wired(m)) {
 		vm_page_test_dirty(m);
 		flushable = m->dirty != 0 && vm_page_in_laundry(m) &&
 		    vm_page_try_remove_write(m);
 	}
 	if (!flushable)
 		vm_page_xunbusy(m);
 	return (flushable);
 }
 
 /*
  * Scan for pages at adjacent offsets within the given page's object that are
  * eligible for laundering, form a cluster of these pages and the given page,
  * and launder that cluster.
  */
 static int
 vm_pageout_cluster(vm_page_t m)
 {
 	struct pctrie_iter pages;
 	/*
 	 * Room for vm_pageout_page_count - 1 pages on either side of "m",
 	 * which is stored in the middle slot.
 	 */
 	vm_page_t mc[2 * vm_pageout_page_count - 1];
 	int alignment, page_base, pageout_count;
 
 	VM_OBJECT_ASSERT_WLOCKED(m->object);
 
 	vm_page_assert_xbusied(m);
 
 	vm_page_iter_init(&pages, m->object);
 	/* Offset of "m" within its vm_pageout_page_count-sized window. */
 	alignment = m->pindex % vm_pageout_page_count;
 	/* mc[page_base] is always the lowest-indexed page of the cluster. */
 	page_base = nitems(mc) / 2;
 	pageout_count = 1;
 	mc[page_base] = m;
 
 	/*
 	 * During heavy mmap/modification loads the pageout
 	 * daemon can really fragment the underlying file
 	 * due to flushing pages out of order and not trying to
 	 * align the clusters (which leaves sporadic out-of-order
 	 * holes).  To solve this problem we do the reverse scan
 	 * first and attempt to align our cluster, then do a 
 	 * forward scan if room remains.
 	 *
 	 * If we are at an alignment boundary, stop here, and switch directions.
 	 */
 	if (alignment > 0) {
 		/* Reverse scan: prepend at most "alignment" predecessors. */
 		pages.index = mc[page_base]->pindex;
 		do {
 			m = vm_radix_iter_prev(&pages);
 			if (m == NULL || !vm_pageout_flushable(m))
 				break;
 			mc[--page_base] = m;
 		} while (pageout_count++ < alignment);
 	}
 	if (pageout_count < vm_pageout_page_count) {
 		/* Forward scan: append successors of the cluster's last page. */
 		pages.index = mc[page_base + pageout_count - 1]->pindex;
 		do {
 			m = vm_radix_iter_next(&pages);
 			if (m == NULL || !vm_pageout_flushable(m))
 				break;
 			mc[page_base + pageout_count] = m;
 		} while (++pageout_count < vm_pageout_page_count);
 	}
 	/*
 	 * nitems(mc) / 2 - page_base is the number of pages prepended so far;
 	 * continue backwards only if the first reverse scan collected exactly
 	 * "alignment" pages.
 	 */
 	if (pageout_count < vm_pageout_page_count &&
 	    alignment == nitems(mc) / 2 - page_base) {
 		/* Resume the reverse scan. */
 		pages.index = mc[page_base]->pindex;
 		do {
 			m = vm_radix_iter_prev(&pages);
 			if (m == NULL || !vm_pageout_flushable(m))
 				break;
 			mc[--page_base] = m;
 		} while (++pageout_count < vm_pageout_page_count);
 	}
 
 	/* Launder the contiguous run mc[page_base .. page_base + count - 1]. */
 	return (vm_pageout_flush(&mc[page_base], pageout_count,
 	    VM_PAGER_PUT_NOREUSE, NULL));
 }
 
 /*
  * vm_pageout_flush() - launder the given pages
  *
  *	The given pages are laundered.  Note that we setup for the start of
  *	I/O ( i.e. busy the page ), mark it read-only, and bump the object
  *	reference count all in here rather than in the parent.  If we want
  *	the parent to do more sophisticated things we may have to change
  *	the ordering.
  *
  *	If eio is not NULL, returns the count of pages between 0 and first page
  *	with status VM_PAGER_AGAIN.  *eio is set to true if pager returned
  *	VM_PAGER_ERROR or VM_PAGER_FAIL for any page in that set.
  *
  *	Otherwise, returns the number of paged-out pages.
  */
 int
 vm_pageout_flush(vm_page_t *mc, int count, int flags, bool *eio)
 {
 	vm_object_t object = mc[0]->object;
 	/* Per-page result codes filled in by vm_pager_put_pages(). */
 	int pageout_status[count];
 	int numpagedout = 0;
 	int i, runlen;
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 
 	/*
 	 * Initiate I/O.  Mark the pages shared busy and verify that they're
 	 * valid and read-only.
 	 *
 	 * We do not have to fixup the clean/dirty bits here... we can
 	 * allow the pager to do it after the I/O completes.
 	 *
 	 * NOTE! mc[i]->dirty may be partial or fragmented due to an
 	 * edge case with file fragments.
 	 */
 	for (i = 0; i < count; i++) {
 		KASSERT(vm_page_all_valid(mc[i]),
 		    ("vm_pageout_flush: partially invalid page %p index %d/%d",
 			mc[i], i, count));
 		KASSERT((mc[i]->a.flags & PGA_WRITEABLE) == 0,
 		    ("vm_pageout_flush: writeable page %p", mc[i]));
 		vm_page_busy_downgrade(mc[i]);
 	}
 	/* One paging-in-progress reference per page; dropped per page below. */
 	vm_object_pip_add(object, count);
 
 	vm_pager_put_pages(object, mc, count, flags, pageout_status);
 
 	/*
 	 * runlen ends up as the index of the first VM_PAGER_AGAIN page, or
 	 * count if there is none.
 	 */
 	runlen = count;
 	if (eio != NULL)
 		*eio = false;
 	for (i = 0; i < count; i++) {
 		vm_page_t mt = mc[i];
 
 		KASSERT(pageout_status[i] == VM_PAGER_PEND ||
 		    !pmap_page_is_write_mapped(mt),
 		    ("vm_pageout_flush: page %p is not write protected", mt));
 		switch (pageout_status[i]) {
 		case VM_PAGER_OK:
 			/*
 			 * The page may have moved since laundering started, in
 			 * which case it should be left alone.
 			 */
 			if (vm_page_in_laundry(mt))
 				vm_page_deactivate_noreuse(mt);
 			/* FALLTHROUGH */
 		case VM_PAGER_PEND:
 			numpagedout++;
 			break;
 		case VM_PAGER_BAD:
 			/*
 			 * The page is outside the object's range.  We pretend
 			 * that the page out worked and clean the page, so the
 			 * changes will be lost if the page is reclaimed by
 			 * the page daemon.
 			 */
 			vm_page_undirty(mt);
 			if (vm_page_in_laundry(mt))
 				vm_page_deactivate_noreuse(mt);
 			break;
 		case VM_PAGER_ERROR:
 		case VM_PAGER_FAIL:
 			/*
 			 * If the page couldn't be paged out to swap because the
 			 * pager wasn't able to find space, place the page in
 			 * the PQ_UNSWAPPABLE holding queue.  This is an
 			 * optimization that prevents the page daemon from
 			 * wasting CPU cycles on pages that cannot be reclaimed
 			 * because no swap device is configured.
 			 *
 			 * Otherwise, reactivate the page so that it doesn't
 			 * clog the laundry and inactive queues.  (We will try
 			 * paging it out again later.)
 			 */
 			if ((object->flags & OBJ_SWAP) != 0 &&
 			    pageout_status[i] == VM_PAGER_FAIL) {
 				vm_page_unswappable(mt);
 				numpagedout++;
 			} else
 				vm_page_activate(mt);
 			if (eio != NULL)
 				*eio = true;
 			break;
 		case VM_PAGER_AGAIN:
 			/* Record only the first such page. */
 			if (runlen == count)
 				runlen = i;
 			break;
 		}
 
 		/*
 		 * If the operation is still going, leave the page busy to
 		 * block all other accesses. Also, leave the paging in
 		 * progress indicator set so that we don't attempt an object
 		 * collapse.
 		 */
 		if (pageout_status[i] != VM_PAGER_PEND) {
 			vm_object_pip_wakeup(object);
 			vm_page_sunbusy(mt);
 		}
 	}
 	/* The meaning of the return value depends on whether eio was given. */
 	if (eio != NULL)
 		return (runlen);
 	return (numpagedout);
 }
 
 /*
  * "swapon" event handler (registered in vm_pageout_laundry_worker()):
  * record that a swap device is available.  Both arguments are unused.
  */
 static void
 vm_pageout_swapon(void *arg __unused, struct swdevt *sp __unused)
 {
 
 	atomic_store_rel_int(&swapdev_enabled, 1);
 }
 
 /*
  * "swapoff" event handler: clear swapdev_enabled when
  * swap_pager_nswapdev() reports exactly one device.  Both arguments are
  * unused.
  * NOTE(review): presumably the device being removed is still counted at
  * this point, so a count of one means the last device is going away --
  * confirm against the swapoff path.
  */
 static void
 vm_pageout_swapoff(void *arg __unused, struct swdevt *sp __unused)
 {
 	int ndevs;
 
 	ndevs = swap_pager_nswapdev();
 	if (ndevs != 1)
 		return;
 	atomic_store_rel_int(&swapdev_enabled, 0);
 }
 
 /*
  * Attempt to acquire all of the necessary locks to launder a page and
  * then call through the clustering layer to PUTPAGES.  Wait a short
  * time for a vnode lock.
  *
  * Requires the page and object lock on entry, releases both before return.
  * Returns 0 on success and an errno otherwise.
  */
 static int
 vm_pageout_clean(vm_page_t m, int *numpagedout)
 {
 	struct vnode *vp;
 	struct mount *mp;
 	vm_object_t object;
 	vm_pindex_t pindex;
 	int error;
 
 	object = m->object;
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	error = 0;
 	vp = NULL;
 	mp = NULL;
 
 	/*
 	 * The object is already known NOT to be dead.   It
 	 * is possible for the vget() to block the whole
 	 * pageout daemon, but the new low-memory handling
 	 * code should prevent it.
 	 *
 	 * We can't wait forever for the vnode lock, we might
 	 * deadlock due to a vn_read() getting stuck in
 	 * vm_wait while holding this vnode.  We skip the 
 	 * vnode if we can't get it in a reasonable amount
 	 * of time.
 	 */
 	if (object->type == OBJT_VNODE) {
 		/*
 		 * The page is unbusied before the object lock is dropped
 		 * below; it is re-busied once the vnode has been acquired.
 		 */
 		vm_page_xunbusy(m);
 		vp = object->handle;
 		if (vp->v_type == VREG &&
 		    vn_start_write(vp, &mp, V_NOWAIT) != 0) {
 			mp = NULL;
 			error = EDEADLK;
 			goto unlock_all;
 		}
 		KASSERT(mp != NULL,
 		    ("vp %p with NULL v_mount", vp));
 		/* Hold a reference on the object while it is unlocked. */
 		vm_object_reference_locked(object);
 		/* Remember the offset so a reallocated page can be detected. */
 		pindex = m->pindex;
 		VM_OBJECT_WUNLOCK(object);
 		if (vget(vp, vn_lktype_write(NULL, vp) | LK_TIMELOCK) != 0) {
 			/* No vnode to release; the object is already unlocked. */
 			vp = NULL;
 			error = EDEADLK;
 			goto unlock_mp;
 		}
 		VM_OBJECT_WLOCK(object);
 
 		/*
 		 * Ensure that the object and vnode were not disassociated
 		 * while locks were dropped.
 		 */
 		if (vp->v_object != object) {
 			error = ENOENT;
 			goto unlock_all;
 		}
 
 		/*
 		 * While the object was unlocked, the page may have been:
 		 * (1) moved to a different queue,
 		 * (2) reallocated to a different object,
 		 * (3) reallocated to a different offset, or
 		 * (4) cleaned.
 		 */
 		if (!vm_page_in_laundry(m) || m->object != object ||
 		    m->pindex != pindex || m->dirty == 0) {
 			error = ENXIO;
 			goto unlock_all;
 		}
 
 		/*
 		 * The page may have been busied while the object lock was
 		 * released.
 		 */
 		if (vm_page_tryxbusy(m) == 0) {
 			error = EBUSY;
 			goto unlock_all;
 		}
 	}
 
 	/*
 	 * Remove all writeable mappings, failing if the page is wired.
 	 */
 	if (!vm_page_try_remove_write(m)) {
 		vm_page_xunbusy(m);
 		error = EBUSY;
 		goto unlock_all;
 	}
 
 	/*
 	 * If a page is dirty, then it is either being washed
 	 * (but not yet cleaned) or it is still in the
 	 * laundry.  If it is still in the laundry, then we
 	 * start the cleaning operation. 
 	 */
 	if ((*numpagedout = vm_pageout_cluster(m)) == 0)
 		error = EIO;
 
 unlock_all:
 	VM_OBJECT_WUNLOCK(object);
 
 unlock_mp:
 	/*
 	 * A non-NULL mp means vn_start_write() succeeded above: release the
 	 * vnode if it is still held, drop the object reference taken before
 	 * the lock was released, and end the write.
 	 */
 	if (mp != NULL) {
 		if (vp != NULL)
 			vput(vp);
 		vm_object_deallocate(object);
 		vn_finished_write(mp);
 	}
 
 	return (error);
 }
 
 /*
  * Attempt to launder the specified number of pages.
  *
  * Returns the number of pages successfully laundered.
  */
 static int
 vm_pageout_launder(struct vm_domain *vmd, int launder, bool in_shortfall)
 {
 	struct scan_state ss;
 	struct vm_pagequeue *pq;
 	vm_object_t object;
 	vm_page_t m, marker;
 	vm_page_astate_t new, old;
 	int act_delta, error, numpagedout, queue, refs, starting_target;
 	int vnodes_skipped;
 	bool pageout_ok;
 
 	/* "object" is the currently write-locked object, or NULL if none. */
 	object = NULL;
 	starting_target = launder;
 	vnodes_skipped = 0;
 
 	/*
 	 * Scan the laundry queues for pages eligible to be laundered.  We stop
 	 * once the target number of dirty pages have been laundered, or once
 	 * we've reached the end of the queue.  A single iteration of this loop
 	 * may cause more than one page to be laundered because of clustering.
 	 *
 	 * As an optimization, we avoid laundering from PQ_UNSWAPPABLE when no
 	 * swap devices are configured.
 	 */
 	if (atomic_load_acq_int(&swapdev_enabled))
 		queue = PQ_UNSWAPPABLE;
 	else
 		queue = PQ_LAUNDRY;
 
 scan:
 	marker = &vmd->vmd_markers[queue];
 	pq = &vmd->vmd_pagequeues[queue];
 	vm_pagequeue_lock(pq);
 	/* vm_pageout_init_scan() releases the queue lock taken above. */
 	vm_pageout_init_scan(&ss, pq, marker, NULL, pq->pq_cnt);
 	while (launder > 0 && (m = vm_pageout_next(&ss, false)) != NULL) {
 		if (__predict_false((m->flags & PG_MARKER) != 0))
 			continue;
 
 		/*
 		 * Don't touch a page that was removed from the queue after the
 		 * page queue lock was released.  Otherwise, ensure that any
 		 * pending queue operations, such as dequeues for wired pages,
 		 * are handled.
 		 */
 		if (vm_pageout_defer(m, queue, true))
 			continue;
 
 		/*
 		 * Lock the page's object.
 		 */
 		if (object == NULL || object != m->object) {
 			if (object != NULL)
 				VM_OBJECT_WUNLOCK(object);
 			object = atomic_load_ptr(&m->object);
 			if (__predict_false(object == NULL))
 				/* The page is being freed by another thread. */
 				continue;
 
 			/* Depends on type-stability. */
 			VM_OBJECT_WLOCK(object);
 			/* Recheck: the page may have changed objects meanwhile. */
 			if (__predict_false(m->object != object)) {
 				VM_OBJECT_WUNLOCK(object);
 				object = NULL;
 				continue;
 			}
 		}
 
 		if (vm_page_tryxbusy(m) == 0)
 			continue;
 
 		/*
 		 * Check for wirings now that we hold the object lock and have
 		 * exclusively busied the page.  If the page is mapped, it may
 		 * still be wired by pmap lookups.  The call to
 		 * vm_page_try_remove_all() below atomically checks for such
 		 * wirings and removes mappings.  If the page is unmapped, the
 		 * wire count is guaranteed not to increase after this check.
 		 */
 		if (__predict_false(vm_page_wired(m)))
 			goto skip_page;
 
 		/*
 		 * Invalid pages can be easily freed.  They cannot be
 		 * mapped; vm_page_free() asserts this.
 		 */
 		if (vm_page_none_valid(m))
 			goto free_page;
 
 		/* Reference bits are collected only for referenced objects. */
 		refs = object->ref_count != 0 ? pmap_ts_referenced(m) : 0;
 
 		/*
 		 * Each "continue" below follows a failed
 		 * vm_page_pqstate_commit() and retries the loop body.
 		 */
 		for (old = vm_page_astate_load(m);;) {
 			/*
 			 * Check to see if the page has been removed from the
 			 * queue since the first such check.  Leave it alone if
 			 * so, discarding any references collected by
 			 * pmap_ts_referenced().
 			 */
 			if (__predict_false(_vm_page_queue(old) == PQ_NONE))
 				goto skip_page;
 
 			new = old;
 			act_delta = refs;
 			if ((old.flags & PGA_REFERENCED) != 0) {
 				new.flags &= ~PGA_REFERENCED;
 				act_delta++;
 			}
 			if (act_delta == 0) {
 				;
 			} else if (object->ref_count != 0) {
 				/*
 				 * Increase the activation count if the page was
 				 * referenced while in the laundry queue.  This
 				 * makes it less likely that the page will be
 				 * returned prematurely to the laundry queue.
 				 */
 				new.act_count += ACT_ADVANCE +
 				    act_delta;
 				if (new.act_count > ACT_MAX)
 					new.act_count = ACT_MAX;
 
 				new.flags &= ~PGA_QUEUE_OP_MASK;
 				new.flags |= PGA_REQUEUE;
 				new.queue = PQ_ACTIVE;
 				if (!vm_page_pqstate_commit(m, &old, new))
 					continue;
 
 				/*
 				 * If this was a background laundering, count
 				 * activated pages towards our target.  The
 				 * purpose of background laundering is to ensure
 				 * that pages are eventually cycled through the
 				 * laundry queue, and an activation is a valid
 				 * way out.
 				 */
 				if (!in_shortfall)
 					launder--;
 				VM_CNT_INC(v_reactivated);
 				goto skip_page;
 			} else if ((object->flags & OBJ_DEAD) == 0) {
 				/*
 				 * Referenced page of an unreferenced, live
 				 * object: requeue it in its current queue.
 				 */
 				new.flags |= PGA_REQUEUE;
 				if (!vm_page_pqstate_commit(m, &old, new))
 					continue;
 				goto skip_page;
 			}
 			break;
 		}
 
 		/*
 		 * If the page appears to be clean at the machine-independent
 		 * layer, then remove all of its mappings from the pmap in
 		 * anticipation of freeing it.  If, however, any of the page's
 		 * mappings allow write access, then the page may still be
 		 * modified until the last of those mappings are removed.
 		 */
 		if (object->ref_count != 0) {
 			vm_page_test_dirty(m);
 			if (m->dirty == 0 && !vm_page_try_remove_all(m))
 				goto skip_page;
 		}
 
 		/*
 		 * Clean pages are freed, and dirty pages are paged out unless
 		 * they belong to a dead object.  Requeueing dirty pages from
 		 * dead objects is pointless, as they are being paged out and
 		 * freed by the thread that destroyed the object.
 		 */
 		if (m->dirty == 0) {
 free_page:
 			/*
 			 * Now we are guaranteed that no other threads are
 			 * manipulating the page, check for a last-second
 			 * reference.
 			 */
 			if (vm_pageout_defer(m, queue, true))
 				goto skip_page;
 			vm_page_free(m);
 			VM_CNT_INC(v_dfree);
 		} else if ((object->flags & OBJ_DEAD) == 0) {
 			/* Only OBJ_SWAP objects honor disable_swap_pageouts. */
 			if ((object->flags & OBJ_SWAP) != 0)
 				pageout_ok = disable_swap_pageouts == 0;
 			else
 				pageout_ok = true;
 			if (!pageout_ok) {
 				vm_page_launder(m);
 				goto skip_page;
 			}
 
 			/*
 			 * Form a cluster with adjacent, dirty pages from the
 			 * same object, and page out that entire cluster.
 			 *
 			 * The adjacent, dirty pages must also be in the
 			 * laundry.  However, their mappings are not checked
 			 * for new references.  Consequently, a recently
 			 * referenced page may be paged out.  However, that
 			 * page will not be prematurely reclaimed.  After page
 			 * out, the page will be placed in the inactive queue,
 			 * where any new references will be detected and the
 			 * page reactivated.
 			 */
 			error = vm_pageout_clean(m, &numpagedout);
 			if (error == 0) {
 				launder -= numpagedout;
 				ss.scanned += numpagedout;
 			} else if (error == EDEADLK) {
 				pageout_lock_miss++;
 				vnodes_skipped++;
 			}
 			/* vm_pageout_clean() returns with the object unlocked. */
 			object = NULL;
 		} else {
 skip_page:
 			vm_page_xunbusy(m);
 		}
 	}
 	if (object != NULL) {
 		VM_OBJECT_WUNLOCK(object);
 		object = NULL;
 	}
 	vm_pagequeue_lock(pq);
 	vm_pageout_end_scan(&ss);
 	vm_pagequeue_unlock(pq);
 
 	/*
 	 * The target was not met on PQ_UNSWAPPABLE; scan again, this time
 	 * over PQ_LAUNDRY.
 	 */
 	if (launder > 0 && queue == PQ_UNSWAPPABLE) {
 		queue = PQ_LAUNDRY;
 		goto scan;
 	}
 
 	/*
 	 * Wakeup the sync daemon if we skipped a vnode in a writeable object
 	 * and we didn't launder enough pages.
 	 */
 	if (vnodes_skipped > 0 && launder > 0)
 		(void)speedup_syncer();
 
 	return (starting_target - launder);
 }
 
 /*
  * Compute the integer square root of "num", rounded down.
  *
  * The root is built up one binary digit at a time, working downwards from
  * the largest power of four that does not exceed "num".
  */
 static u_int
 isqrt(u_int num)
 {
 	u_int candidate, mask, result;
 
 	/* fls(0) is 0, so zero cannot go through the shift below. */
 	if (num == 0)
 		return (0);
 
 	result = 0;
 	for (mask = 1u << ((fls(num) - 1) & ~1); mask != 0; mask >>= 2) {
 		candidate = result + mask;
 		result >>= 1;
 		if (num < candidate)
 			continue;
 		num -= candidate;
 		result += mask;
 	}
 	return (result);
 }
 
 /*
  * Perform the work of the laundry thread: periodically wake up and determine
  * whether any pages need to be laundered.  If so, determine the number of pages
  * that need to be laundered, and launder them.
  *
  * "arg" carries the index of the VM domain served by this thread, stored
  * directly in the pointer value.  The function loops forever and never
  * returns.
  *
  * State carried from one loop iteration to the next:
  *   target          - pages still to be laundered in the current run.
  *   shortfall       - non-zero when the previous iteration observed a
  *                     shortfall request and computed a new run size.
  *   in_shortfall,
  *   shortfall_cycle - whether a shortfall run is in progress and how many
  *                     cycles of it remain.
  *   nfreed          - running total of vmd_clean_pages_freed, cleared when
  *                     a background run consumes it.
  *   last_target     - value of "target" recorded when nfreed was last seen
  *                     to be non-zero during a background run.
  */
 static void
 vm_pageout_laundry_worker(void *arg)
 {
 	struct vm_domain *vmd;
 	struct vm_pagequeue *pq;
 	uint64_t nclean, ndirty, nfreed;
 	int domain, last_target, launder, shortfall, shortfall_cycle, target;
 	bool in_shortfall;
 
 	domain = (uintptr_t)arg;
 	vmd = VM_DOMAIN(domain);
 	pq = &vmd->vmd_pagequeues[PQ_LAUNDRY];
 	KASSERT(vmd->vmd_segs != 0, ("domain without segments"));
 
 	shortfall = 0;
 	in_shortfall = false;
 	shortfall_cycle = 0;
 	last_target = target = 0;
 	nfreed = 0;
 
 	/*
 	 * Calls to these handlers are serialized by the swap syscall lock.
 	 */
 	(void)EVENTHANDLER_REGISTER(swapon, vm_pageout_swapon, vmd,
 	    EVENTHANDLER_PRI_ANY);
 	(void)EVENTHANDLER_REGISTER(swapoff, vm_pageout_swapoff, vmd,
 	    EVENTHANDLER_PRI_ANY);
 
 	/*
 	 * The pageout laundry worker is never done, so loop forever.
 	 */
 	for (;;) {
 		KASSERT(target >= 0, ("negative target %d", target));
 		KASSERT(shortfall_cycle >= 0,
 		    ("negative cycle %d", shortfall_cycle));
 		launder = 0;
 
 		/*
 		 * First determine whether we need to launder pages to meet a
 		 * shortage of free pages.
 		 */
 		if (shortfall > 0) {
 			in_shortfall = true;
 			shortfall_cycle = VM_LAUNDER_RATE / VM_INACT_SCAN_RATE;
 			target = shortfall;
 		} else if (!in_shortfall)
 			goto trybackground;
 		else if (shortfall_cycle == 0 || vm_laundry_target(vmd) <= 0) {
 			/*
 			 * We recently entered shortfall and began laundering
 			 * pages.  If we have completed that laundering run
 			 * (and we are no longer in shortfall) or we have met
 			 * our laundry target through other activity, then we
 			 * can stop laundering pages.
 			 */
 			in_shortfall = false;
 			target = 0;
 			goto trybackground;
 		}
 		/*
 		 * Spread what is left of the target evenly over the cycles
 		 * that remain in this shortfall run, and consume one cycle.
 		 * The cycle count was either just reloaded or found to be
 		 * non-zero by the branch above (assuming VM_LAUNDER_RATE /
 		 * VM_INACT_SCAN_RATE is itself non-zero).
 		 */
 		launder = target / shortfall_cycle--;
 		goto dolaundry;
 
 		/*
 		 * There's no immediate need to launder any pages; see if we
 		 * meet the conditions to perform background laundering:
 		 *
 		 * 1. The ratio of dirty to clean inactive pages exceeds the
 		 *    background laundering threshold, or
 		 * 2. we haven't yet reached the target of the current
 		 *    background laundering run.
 		 *
 		 * The background laundering threshold is not a constant.
 		 * Instead, it is a slowly growing function of the number of
 		 * clean pages freed by the page daemon since the last
 		 * background laundering.  Thus, as the ratio of dirty to
 		 * clean inactive pages grows, the amount of memory pressure
 		 * required to trigger laundering decreases.  We ensure
 		 * that the threshold is non-zero after an inactive queue
 		 * scan, even if that scan failed to free a single clean page.
 		 */
 trybackground:
 		nclean = vmd->vmd_free_count +
 		    vmd->vmd_pagequeues[PQ_INACTIVE].pq_cnt;
 		ndirty = vmd->vmd_pagequeues[PQ_LAUNDRY].pq_cnt;
 		if (target == 0 && ndirty * isqrt(howmany(nfreed + 1,
 		    vmd->vmd_free_target - vmd->vmd_free_min)) >= nclean) {
 			target = vmd->vmd_background_launder_target;
 		}
 
 		/*
 		 * We have a non-zero background laundering target.  If we've
 		 * laundered up to our maximum without observing a page daemon
 		 * request, just stop.  This is a safety belt that ensures we
 		 * don't launder an excessive amount if memory pressure is low
 		 * and the ratio of dirty to clean pages is large.  Otherwise,
 		 * proceed at the background laundering rate.
 		 */
 		if (target > 0) {
 			if (nfreed > 0) {
 				nfreed = 0;
 				last_target = target;
 			} else if (last_target - target >=
 			    vm_background_launder_max * PAGE_SIZE / 1024) {
 				target = 0;
 			}
 			launder = vm_background_launder_rate * PAGE_SIZE / 1024;
 			launder /= VM_LAUNDER_RATE;
 			if (launder > target)
 				launder = target;
 		}
 
 dolaundry:
 		if (launder > 0) {
 			/*
 			 * Because of I/O clustering, the number of laundered
 			 * pages could exceed "target" by the maximum size of
 			 * a cluster minus one.
 			 *
 			 * The min() clamps the amount subtracted so that
 			 * "target" does not drop below zero.
 			 */
 			target -= min(vm_pageout_launder(vmd, launder,
 			    in_shortfall), target);
 			pause("laundp", hz / VM_LAUNDER_RATE);
 		}
 
 		/*
 		 * If we're not currently laundering pages and the page daemon
 		 * hasn't posted a new request, sleep until the page daemon
 		 * kicks us.
 		 */
 		vm_pagequeue_lock(pq);
 		if (target == 0 && vmd->vmd_laundry_request == VM_LAUNDRY_IDLE)
 			(void)mtx_sleep(&vmd->vmd_laundry_request,
 			    vm_pagequeue_lockptr(pq), PVM, "launds", 0);
 
 		/*
 		 * If the pagedaemon has indicated that it's in shortfall, start
 		 * a shortfall laundering unless we're already in the middle of
 		 * one.  This may preempt a background laundering.
 		 */
 		if (vmd->vmd_laundry_request == VM_LAUNDRY_SHORTFALL &&
 		    (!in_shortfall || shortfall_cycle == 0)) {
 			shortfall = vm_laundry_target(vmd) +
 			    vmd->vmd_pageout_deficit;
 			target = 0;
 		} else
 			shortfall = 0;
 
 		/*
 		 * With no run in progress, mark the request as consumed.  The
 		 * freed-page counter is drained into the local total while
 		 * the laundry queue lock is still held.
 		 */
 		if (target == 0)
 			vmd->vmd_laundry_request = VM_LAUNDRY_IDLE;
 		nfreed += vmd->vmd_clean_pages_freed;
 		vmd->vmd_clean_pages_freed = 0;
 		vm_pagequeue_unlock(pq);
 	}
 }
 
 /*
  * Compute the number of pages we want to try to move from the
  * active queue to either the inactive or laundry queue.
  *
  * When scanning active pages during a shortage, we make clean pages
  * count more heavily towards the page shortage than dirty pages.
  * This is because dirty pages must be laundered before they can be
  * reused and thus have less utility when attempting to quickly
  * alleviate a free page shortage.  However, this weighting also
  * causes the scan to deactivate dirty pages more aggressively,
  * improving the effectiveness of clustering.
  */
 static int
 vm_pageout_active_target(struct vm_domain *vmd)
 {
 	int shortage;
 
 	shortage = vmd->vmd_inactive_target + vm_paging_target(vmd) -
 	    (vmd->vmd_pagequeues[PQ_INACTIVE].pq_cnt +
 	    vmd->vmd_pagequeues[PQ_LAUNDRY].pq_cnt / act_scan_laundry_weight);
 	shortage *= act_scan_laundry_weight;
 	return (shortage);
 }
 
 /*
  * Scan the active queue.  If there is no shortage of inactive pages, scan a
  * small portion of the queue in order to maintain quasi-LRU.
  *
  * "vmd" is the domain whose PQ_ACTIVE queue is scanned.  "page_shortage" is
  * the caller's deactivation goal: each page moved out of the active queue
  * while the goal is positive reduces it by act_scan_laundry_weight if the
  * page's dirty field is zero, and by one otherwise.  The goal does not
  * terminate the loop by itself; the length of the scan is governed by the
  * "max_scan" value handed to vm_pageout_init_scan().
  */
 static void
 vm_pageout_scan_active(struct vm_domain *vmd, int page_shortage)
 {
 	struct scan_state ss;
 	vm_object_t object;
 	vm_page_t m, marker;
 	struct vm_pagequeue *pq;
 	vm_page_astate_t old, new;
 	long min_scan;
 	int act_delta, max_scan, ps_delta, refs, scan_tick;
 	uint8_t nqueue;
 
 	marker = &vmd->vmd_markers[PQ_ACTIVE];
 	pq = &vmd->vmd_pagequeues[PQ_ACTIVE];
 	vm_pagequeue_lock(pq);
 
 	/*
 	 * If we're just idle polling attempt to visit every
 	 * active page within 'update_period' seconds.
 	 */
 	scan_tick = ticks;
 	if (vm_pageout_update_period != 0) {
 		min_scan = pq->pq_cnt;
 		min_scan *= scan_tick - vmd->vmd_last_active_scan;
 		min_scan /= hz * vm_pageout_update_period;
 	} else
 		min_scan = 0;
 	/* Only restart the polling interval if a scan will really happen. */
 	if (min_scan > 0 || (page_shortage > 0 && pq->pq_cnt > 0))
 		vmd->vmd_last_active_scan = scan_tick;
 
 	/*
 	 * Scan the active queue for pages that can be deactivated.  Update
 	 * the per-page activity counter and use it to identify deactivation
 	 * candidates.  Held pages may be deactivated.
 	 *
 	 * To avoid requeuing each page that remains in the active queue, we
 	 * implement the CLOCK algorithm.  To keep the implementation of the
 	 * enqueue operation consistent for all page queues, we use two hands,
 	 * represented by marker pages. Scans begin at the first hand, which
 	 * precedes the second hand in the queue.  When the two hands meet,
 	 * they are moved back to the head and tail of the queue, respectively,
 	 * and scanning resumes.
 	 */
 	max_scan = page_shortage > 0 ? pq->pq_cnt : min_scan;
 act_scan:
 	vm_pageout_init_scan(&ss, pq, marker, &vmd->vmd_clock[0], max_scan);
 	while ((m = vm_pageout_next(&ss, false)) != NULL) {
 		if (__predict_false(m == &vmd->vmd_clock[1])) {
 			/*
 			 * The hands met: reset them to the two ends of the
 			 * queue and restart with whatever scan budget is
 			 * left.  The queue lock taken here is still held
 			 * when control returns to act_scan.
 			 */
 			vm_pagequeue_lock(pq);
 			TAILQ_REMOVE(&pq->pq_pl, &vmd->vmd_clock[0], plinks.q);
 			TAILQ_REMOVE(&pq->pq_pl, &vmd->vmd_clock[1], plinks.q);
 			TAILQ_INSERT_HEAD(&pq->pq_pl, &vmd->vmd_clock[0],
 			    plinks.q);
 			TAILQ_INSERT_TAIL(&pq->pq_pl, &vmd->vmd_clock[1],
 			    plinks.q);
 			max_scan -= ss.scanned;
 			vm_pageout_end_scan(&ss);
 			goto act_scan;
 		}
 		if (__predict_false((m->flags & PG_MARKER) != 0))
 			continue;
 
 		/*
 		 * Don't touch a page that was removed from the queue after the
 		 * page queue lock was released.  Otherwise, ensure that any
 		 * pending queue operations, such as dequeues for wired pages,
 		 * are handled.
 		 */
 		if (vm_pageout_defer(m, PQ_ACTIVE, true))
 			continue;
 
 		/*
 		 * A page's object pointer may be set to NULL before
 		 * the object lock is acquired.
 		 */
 		object = atomic_load_ptr(&m->object);
 		if (__predict_false(object == NULL))
 			/*
 			 * The page has been removed from its object.
 			 */
 			continue;
 
 		/* Deferred free of swap space. */
 		if ((m->a.flags & PGA_SWAP_FREE) != 0 &&
 		    VM_OBJECT_TRYWLOCK(object)) {
 			if (m->object == object)
 				vm_pager_page_unswapped(m);
 			VM_OBJECT_WUNLOCK(object);
 		}
 
 		/*
 		 * Check to see "how much" the page has been used.
 		 *
 		 * Test PGA_REFERENCED after calling pmap_ts_referenced() so
 		 * that a reference from a concurrently destroyed mapping is
 		 * observed here and now.
 		 *
 		 * Perform an unsynchronized object ref count check.  While
 		 * the page lock ensures that the page is not reallocated to
 		 * another object, in particular, one with unmanaged mappings
 		 * that cannot support pmap_ts_referenced(), two races are,
 		 * nonetheless, possible:
 		 * 1) The count was transitioning to zero, but we saw a non-
 		 *    zero value.  pmap_ts_referenced() will return zero
 		 *    because the page is not mapped.
 		 * 2) The count was transitioning to one, but we saw zero.
 		 *    This race delays the detection of a new reference.  At
 		 *    worst, we will deactivate and reactivate the page.
 		 *
 		 * NOTE(review): no page lock acquisition is visible in this
 		 * function; confirm whether the reference to "the page lock"
 		 * above is still accurate.
 		 */
 		refs = object->ref_count != 0 ? pmap_ts_referenced(m) : 0;
 
 		/*
 		 * Compute the new queue state from a snapshot and retry until
 		 * vm_page_pqstate_commit() accepts it.
 		 */
 		old = vm_page_astate_load(m);
 		do {
 			/*
 			 * Check to see if the page has been removed from the
 			 * queue since the first such check.  Leave it alone if
 			 * so, discarding any references collected by
 			 * pmap_ts_referenced().
 			 */
 			if (__predict_false(_vm_page_queue(old) == PQ_NONE)) {
 				ps_delta = 0;
 				break;
 			}
 
 			/*
 			 * Advance or decay the act_count based on recent usage.
 			 */
 			new = old;
 			act_delta = refs;
 			if ((old.flags & PGA_REFERENCED) != 0) {
 				new.flags &= ~PGA_REFERENCED;
 				act_delta++;
 			}
 			if (act_delta != 0) {
 				new.act_count += ACT_ADVANCE + act_delta;
 				if (new.act_count > ACT_MAX)
 					new.act_count = ACT_MAX;
 			} else {
 				new.act_count -= min(new.act_count,
 				    ACT_DECLINE);
 			}
 
 			if (new.act_count > 0) {
 				/*
 				 * Adjust the activation count and keep the page
 				 * in the active queue.  The count might be left
 				 * unchanged if it is saturated.  The page may
 				 * have been moved to a different queue since we
 				 * started the scan, in which case we move it
 				 * back.
 				 */
 				ps_delta = 0;
 				if (old.queue != PQ_ACTIVE) {
 					new.flags &= ~PGA_QUEUE_OP_MASK;
 					new.flags |= PGA_REQUEUE;
 					new.queue = PQ_ACTIVE;
 				}
 			} else {
 				/*
 				 * When not short for inactive pages, let dirty
 				 * pages go through the inactive queue before
 				 * moving to the laundry queue.  This gives them
 				 * some extra time to be reactivated,
 				 * potentially avoiding an expensive pageout.
 				 * However, during a page shortage, the inactive
 				 * queue is necessarily small, and so dirty
 				 * pages would only spend a trivial amount of
 				 * time in the inactive queue.  Therefore, we
 				 * might as well place them directly in the
 				 * laundry queue to reduce queuing overhead.
 				 *
 				 * Calling vm_page_test_dirty() here would
 				 * require acquisition of the object's write
 				 * lock.  However, during a page shortage,
 				 * directing dirty pages into the laundry queue
 				 * is only an optimization and not a
 				 * requirement.  Therefore, we simply rely on
 				 * the opportunistic updates to the page's dirty
 				 * field by the pmap.
 				 */
 				if (page_shortage <= 0) {
 					nqueue = PQ_INACTIVE;
 					ps_delta = 0;
 				} else if (m->dirty == 0) {
 					nqueue = PQ_INACTIVE;
 					ps_delta = act_scan_laundry_weight;
 				} else {
 					nqueue = PQ_LAUNDRY;
 					ps_delta = 1;
 				}
 
 				new.flags &= ~PGA_QUEUE_OP_MASK;
 				new.flags |= PGA_REQUEUE;
 				new.queue = nqueue;
 			}
 		} while (!vm_page_pqstate_commit(m, &old, new));
 
 		/* Credit the progress made by the committed transition. */
 		page_shortage -= ps_delta;
 	}
 	/*
 	 * Park the first clock hand right after the scan marker before the
 	 * scan is torn down.
 	 */
 	vm_pagequeue_lock(pq);
 	TAILQ_REMOVE(&pq->pq_pl, &vmd->vmd_clock[0], plinks.q);
 	TAILQ_INSERT_AFTER(&pq->pq_pl, marker, &vmd->vmd_clock[0], plinks.q);
 	vm_pageout_end_scan(&ss);
 	vm_pagequeue_unlock(pq);
 }
 
 /*
  * Put page "m" back on the inactive queue "pq", immediately in front of
  * "marker", provided that its atomic state still names PQ_INACTIVE and it is
  * not already flagged as enqueued.  The page queue lock must be held.
  *
  * Returns 1 if the page was inserted and 0 if it was left alone, so that the
  * caller can accumulate the change to the queue's page count.
  */
 static int
 vm_pageout_reinsert_inactive_page(struct vm_pagequeue *pq, vm_page_t marker,
     vm_page_t m)
 {
 	vm_page_astate_t state;
 
 	vm_pagequeue_assert_locked(pq);
 
 	state = vm_page_astate_load(m);
 	if (state.queue == PQ_INACTIVE && (state.flags & PGA_ENQUEUED) == 0) {
 		vm_page_aflag_set(m, PGA_ENQUEUED);
 		TAILQ_INSERT_BEFORE(marker, m, plinks.q);
 		return (1);
 	}
 	return (0);
 }
 
 /*
  * Re-add stuck pages to the inactive queue.  We will examine them again
  * during the next scan.  If the queue state of a page has changed since
  * it was physically removed from the page queue in
  * vm_pageout_collect_batch(), don't do anything with that page.
  *
  * When "m" is not NULL an attempt is first made to stash it in the batch
  * queue "bq"; if vm_batchqueue_insert() returns non-zero nothing else is
  * done.  Otherwise, and always when "m" is NULL, the page queue is locked,
  * "m" (if any) and every page held in "bq" are reinserted in front of the
  * scan marker, the queue's page count is adjusted, and "bq" is reset.
  */
 static void
 vm_pageout_reinsert_inactive(struct scan_state *ss, struct vm_batchqueue *bq,
     vm_page_t m)
 {
 	struct vm_pagequeue *queue;
 	vm_page_t mark;
 	int nadded;
 
 	mark = ss->marker;
 	queue = ss->pq;
 
 	/* Nothing to flush yet if the page could simply be batched. */
 	if (m != NULL && vm_batchqueue_insert(bq, m) != 0)
 		return;
 
 	nadded = 0;
 	vm_pagequeue_lock(queue);
 	if (m != NULL)
 		nadded += vm_pageout_reinsert_inactive_page(queue, mark, m);
 	for (;;) {
 		m = vm_batchqueue_pop(bq);
 		if (m == NULL)
 			break;
 		nadded += vm_pageout_reinsert_inactive_page(queue, mark, m);
 	}
 	vm_pagequeue_cnt_add(queue, nadded);
 	vm_pagequeue_unlock(queue);
 	vm_batchqueue_init(bq);
 }
 
 /*
  * Scan the inactive queue of domain "vmd" looking for pages that can be
  * freed.  "page_shortage" is the number of pages to free; it is decremented
  * once for every page released with vm_page_free().
  *
  * Pages that cannot be freed are reactivated, requeued, handed to the
  * laundry, or put back in front of the scan marker, as decided in the loop
  * body.  On return the number of pages freed, the elapsed time in
  * microseconds and the count of busy pages that were skipped have been
  * added to vmd_inactive_freed, vmd_inactive_us and vmd_addl_shortage
  * respectively.
  */
 static void
 vm_pageout_scan_inactive(struct vm_domain *vmd, int page_shortage)
 {
 	struct timeval start, end;
 	struct scan_state ss;
 	struct vm_batchqueue rq;
 	struct vm_page marker_page;
 	vm_page_t m, marker;
 	struct vm_pagequeue *pq;
 	vm_object_t object;
 	vm_page_astate_t old, new;
-	int act_delta, addl_page_shortage, starting_page_shortage, refs;
+	int act_delta, addl_page_shortage, dirty_count, dirty_thresh;
+	int starting_page_shortage, refs;
 
 	object = NULL;
 	vm_batchqueue_init(&rq);
 	getmicrouptime(&start);
 
 	/*
 	 * The addl_page_shortage is an estimate of the number of temporarily
 	 * stuck pages in the inactive queue.  In other words, the
 	 * number of pages from the inactive count that should be
 	 * discounted in setting the target for the active queue scan.
 	 */
 	addl_page_shortage = 0;
 
+	/*
+	 * dirty_count is the number of pages encountered that require
+	 * laundering before reclamation is possible.  If we encounter a large
+	 * number of dirty pages, we may abort the scan without meeting the page
+	 * shortage in the hope that laundering will allow a future scan to meet
+	 * the target.
+	 */
+	dirty_count = 0;
+	dirty_thresh = inact_scan_laundry_weight * page_shortage;
+	if (dirty_thresh == 0)
+		dirty_thresh = INT_MAX;
+
 	/*
 	 * Start scanning the inactive queue for pages that we can free.  The
 	 * scan will stop when we reach the target or we have scanned the
 	 * entire queue.  (Note that m->a.act_count is not used to make
 	 * decisions for the inactive queue, only for the active queue.)
 	 */
 	starting_page_shortage = page_shortage;
 	marker = &marker_page;
 	vm_page_init_marker(marker, PQ_INACTIVE, 0);
 	pq = &vmd->vmd_pagequeues[PQ_INACTIVE];
 	vm_pagequeue_lock(pq);
 	vm_pageout_init_scan(&ss, pq, marker, NULL, pq->pq_cnt);
-	while (page_shortage > 0) {
+	while (page_shortage > 0 && dirty_count < dirty_thresh) {
 		/*
 		 * If we need to refill the scan batch queue, release any
 		 * optimistically held object lock.  This gives someone else a
 		 * chance to grab the lock, and also avoids holding it while we
 		 * do unrelated work.
 		 */
 		if (object != NULL && vm_batchqueue_empty(&ss.bq)) {
 			VM_OBJECT_WUNLOCK(object);
 			object = NULL;
 		}
 
 		m = vm_pageout_next(&ss, true);
 		if (m == NULL)
 			break;
 		KASSERT((m->flags & PG_MARKER) == 0,
 		    ("marker page %p was dequeued", m));
 
 		/*
 		 * Don't touch a page that was removed from the queue after the
 		 * page queue lock was released.  Otherwise, ensure that any
 		 * pending queue operations, such as dequeues for wired pages,
 		 * are handled.
 		 */
 		if (vm_pageout_defer(m, PQ_INACTIVE, false))
 			continue;
 
 		/*
 		 * Lock the page's object.
 		 */
 		if (object == NULL || object != m->object) {
 			if (object != NULL)
 				VM_OBJECT_WUNLOCK(object);
 			object = atomic_load_ptr(&m->object);
 			if (__predict_false(object == NULL))
 				/* The page is being freed by another thread. */
 				continue;
 
 			/* Depends on type-stability. */
 			VM_OBJECT_WLOCK(object);
 			if (__predict_false(m->object != object)) {
 				VM_OBJECT_WUNLOCK(object);
 				object = NULL;
 				goto reinsert;
 			}
 		}
 
 		if (vm_page_tryxbusy(m) == 0) {
 			/*
 			 * Don't mess with busy pages.  Leave them at
 			 * the front of the queue.  Most likely, they
 			 * are being paged out and will leave the
 			 * queue shortly after the scan finishes.  So,
 			 * they ought to be discounted from the
 			 * inactive count.
 			 */
 			addl_page_shortage++;
 			goto reinsert;
 		}
 
 		/* Deferred free of swap space. */
 		if ((m->a.flags & PGA_SWAP_FREE) != 0)
 			vm_pager_page_unswapped(m);
 
 		/*
 		 * Check for wirings now that we hold the object lock and have
 		 * exclusively busied the page.  If the page is mapped, it may
 		 * still be wired by pmap lookups.  The call to
 		 * vm_page_try_remove_all() below atomically checks for such
 		 * wirings and removes mappings.  If the page is unmapped, the
 		 * wire count is guaranteed not to increase after this check.
 		 */
 		if (__predict_false(vm_page_wired(m)))
 			goto skip_page;
 
 		/*
 		 * Invalid pages can be easily freed. They cannot be
 		 * mapped, vm_page_free() asserts this.
 		 */
 		if (vm_page_none_valid(m))
 			goto free_page;
 
 		refs = object->ref_count != 0 ? pmap_ts_referenced(m) : 0;
 
 		/*
 		 * Decide the page's fate from a snapshot of its atomic state.
 		 * A "continue" inside this inner loop retries after a failed
 		 * vm_page_pqstate_commit(); "break" falls through to the
 		 * clean/dirty handling below.
 		 */
 		for (old = vm_page_astate_load(m);;) {
 			/*
 			 * Check to see if the page has been removed from the
 			 * queue since the first such check.  Leave it alone if
 			 * so, discarding any references collected by
 			 * pmap_ts_referenced().
 			 */
 			if (__predict_false(_vm_page_queue(old) == PQ_NONE))
 				goto skip_page;
 
 			new = old;
 			act_delta = refs;
 			if ((old.flags & PGA_REFERENCED) != 0) {
 				new.flags &= ~PGA_REFERENCED;
 				act_delta++;
 			}
 			if (act_delta == 0) {
 				;
 			} else if (object->ref_count != 0) {
 				/*
 				 * Increase the activation count if the
 				 * page was referenced while in the
 				 * inactive queue.  This makes it less
 				 * likely that the page will be returned
 				 * prematurely to the inactive queue.
 				 */
 				new.act_count += ACT_ADVANCE +
 				    act_delta;
 				if (new.act_count > ACT_MAX)
 					new.act_count = ACT_MAX;
 
 				new.flags &= ~PGA_QUEUE_OP_MASK;
 				new.flags |= PGA_REQUEUE;
 				new.queue = PQ_ACTIVE;
 				if (!vm_page_pqstate_commit(m, &old, new))
 					continue;
 
 				VM_CNT_INC(v_reactivated);
 				goto skip_page;
 			} else if ((object->flags & OBJ_DEAD) == 0) {
 				new.queue = PQ_INACTIVE;
 				new.flags |= PGA_REQUEUE;
 				if (!vm_page_pqstate_commit(m, &old, new))
 					continue;
 				goto skip_page;
 			}
 			break;
 		}
 
 		/*
 		 * If the page appears to be clean at the machine-independent
 		 * layer, then remove all of its mappings from the pmap in
 		 * anticipation of freeing it.  If, however, any of the page's
 		 * mappings allow write access, then the page may still be
 		 * modified until the last of those mappings are removed.
 		 */
 		if (object->ref_count != 0) {
 			vm_page_test_dirty(m);
 			if (m->dirty == 0 && !vm_page_try_remove_all(m))
 				goto skip_page;
 		}
 
 		/*
 		 * Clean pages can be freed, but dirty pages must be sent back
 		 * to the laundry, unless they belong to a dead object.
 		 * Requeueing dirty pages from dead objects is pointless, as
 		 * they are being paged out and freed by the thread that
 		 * destroyed the object.
 		 */
 		if (m->dirty == 0) {
 free_page:
 			/*
 			 * Now we are guaranteed that no other threads are
 			 * manipulating the page, check for a last-second
 			 * reference that would save it from doom.
 			 */
 			if (vm_pageout_defer(m, PQ_INACTIVE, false))
 				goto skip_page;
 
 			/*
 			 * Because we dequeued the page and have already checked
 			 * for pending dequeue and enqueue requests, we can
 			 * safely disassociate the page from the inactive queue
 			 * without holding the queue lock.
 			 */
 			m->a.queue = PQ_NONE;
 			vm_page_free(m);
 			page_shortage--;
 			continue;
 		}
-		if ((object->flags & OBJ_DEAD) == 0)
+		if ((object->flags & OBJ_DEAD) == 0) {
 			vm_page_launder(m);
+
+			/*
+			 * If the page would be paged out to a swap device, and
+			 * no devices are configured or they are all nearly
+			 * full, then don't count it against our threshold,
+			 * since it most likely can't be used to meet our
+			 * target.
+			 */
+			if ((object->flags & OBJ_SWAP) == 0 ||
+			    !atomic_load_bool(&swap_pager_almost_full))
+				dirty_count++;
+		}
 skip_page:
 		/* The page stays where it is; just drop our exclusive busy. */
 		vm_page_xunbusy(m);
 		continue;
 reinsert:
 		/* Reached only for pages that were never busied by this scan. */
 		vm_pageout_reinsert_inactive(&ss, &rq, m);
 	}
 	if (object != NULL)
 		VM_OBJECT_WUNLOCK(object);
 	/*
 	 * Flush both the local reinsertion batch and whatever is left in the
 	 * scan's own batch queue back onto the inactive queue.
 	 */
 	vm_pageout_reinsert_inactive(&ss, &rq, NULL);
 	vm_pageout_reinsert_inactive(&ss, &ss.bq, NULL);
 	vm_pagequeue_lock(pq);
 	vm_pageout_end_scan(&ss);
 	vm_pagequeue_unlock(pq);
 
 	/*
 	 * Record the remaining shortage and the progress and rate it was made.
 	 */
 	atomic_add_int(&vmd->vmd_addl_shortage, addl_page_shortage);
 	getmicrouptime(&end);
 	timevalsub(&end, &start);
 	atomic_add_int(&vmd->vmd_inactive_us,
 	    end.tv_sec * 1000000 + end.tv_usec);
 	atomic_add_int(&vmd->vmd_inactive_freed,
 	    starting_page_shortage - page_shortage);
 }
 
 /*
  * Dispatch a number of inactive threads according to load and collect the
  * results to present a coherent view of paging activity on this domain.
  *
  * "shortage" is the total number of pages to reclaim.  The calling thread
  * always runs a scan itself; when helpers are used each thread is given an
  * equal share and the caller additionally takes the remainder.  The return
  * value is "shortage" minus the number of pages reported freed.
  */
 static int
 vm_pageout_inactive_dispatch(struct vm_domain *vmd, int shortage)
 {
 	u_int elapsed_us, nfreed, nthreads, rate, remainder;
 
 	remainder = 0;
 	vmd->vmd_inactive_shortage = shortage;
 
 	/*
 	 * If we have more work than we can do in a quarter of our interval, we
 	 * fire off multiple threads to process it.
 	 */
 	nthreads = vmd->vmd_inactive_threads;
 	if (nthreads > 1 && vmd->vmd_helper_threads_enabled &&
 	    vmd->vmd_inactive_pps != 0 &&
 	    shortage > vmd->vmd_inactive_pps / VM_INACT_SCAN_RATE / 4) {
 		vmd->vmd_inactive_shortage /= nthreads;
 		remainder = shortage % nthreads;
 
 		/* Account for the helpers, then wake them. */
 		vm_domain_pageout_lock(vmd);
 		blockcount_acquire(&vmd->vmd_inactive_starting, nthreads - 1);
 		blockcount_acquire(&vmd->vmd_inactive_running, nthreads - 1);
 		wakeup(&vmd->vmd_inactive_shortage);
 		vm_domain_pageout_unlock(vmd);
 	}
 
 	/* Run the local thread scan. */
 	vm_pageout_scan_inactive(vmd, vmd->vmd_inactive_shortage + remainder);
 
 	/*
 	 * Block until helper threads report results and then accumulate
 	 * totals.
 	 */
 	blockcount_wait(&vmd->vmd_inactive_running, NULL, "vmpoid", PVM);
 	nfreed = atomic_readandclear_int(&vmd->vmd_inactive_freed);
 	VM_CNT_ADD(v_dfree, nfreed);
 
 	/*
 	 * Calculate the per-thread paging rate with an exponential decay of
 	 * prior results.  Careful to avoid integer rounding errors with large
 	 * us values.  Above one second the elapsed time is kept to tenths.
 	 */
 	elapsed_us = max(atomic_readandclear_int(&vmd->vmd_inactive_us), 1);
 	rate = elapsed_us > 1000000 ?
 	    (nfreed * 10) / ((elapsed_us * 10) / 1000000) :
 	    (1000000 / elapsed_us) * nfreed;
 	vmd->vmd_inactive_pps = (vmd->vmd_inactive_pps / 2) + (rate / 2);
 
 	return (shortage - nfreed);
 }
 
 /*
  * Attempt to reclaim the requested number of pages from the inactive queue.
  * Returns true if the shortage was addressed.
  *
  * "shortage" is the number of pages the caller wants freed; the domain's
  * accumulated allocation deficit is added to it.  On return "*addl_shortage"
  * holds the additional shortage reported by the scan plus that deficit.
  *
  * The shortage values are kept in signed variables on purpose: they are
  * compared against zero below and are passed to functions that take "int",
  * and a zero or negative value must be treated as "no shortage" rather than
  * as a very large unsigned one.
  */
 static int
 vm_pageout_inactive(struct vm_domain *vmd, int shortage, int *addl_shortage)
 {
 	struct vm_pagequeue *pq;
 	u_int addl_page_shortage, deficit;
 	int page_shortage, starting_page_shortage;
 
 	/*
 	 * vmd_pageout_deficit counts the number of pages requested in
 	 * allocations that failed because of a free page shortage.  We assume
 	 * that the allocations will be reattempted and thus include the deficit
 	 * in our scan target.
 	 */
 	deficit = atomic_readandclear_int(&vmd->vmd_pageout_deficit);
 	starting_page_shortage = shortage + deficit;
 
 	/*
 	 * Run the inactive scan on as many threads as is necessary.
 	 */
 	page_shortage = vm_pageout_inactive_dispatch(vmd, starting_page_shortage);
 	addl_page_shortage = atomic_readandclear_int(&vmd->vmd_addl_shortage);
 
 	/*
 	 * Wake up the laundry thread so that it can perform any needed
 	 * laundering.  If we didn't meet our target, we're in shortfall and
 	 * need to launder more aggressively.  If PQ_LAUNDRY is empty and no
 	 * swap devices are configured, the laundry thread has no work to do, so
 	 * don't bother waking it up.
 	 *
 	 * The laundry thread uses the number of inactive queue scans elapsed
 	 * since the last laundering to determine whether to launder again, so
 	 * keep count.
 	 */
 	if (starting_page_shortage > 0) {
 		pq = &vmd->vmd_pagequeues[PQ_LAUNDRY];
 		vm_pagequeue_lock(pq);
 		if (vmd->vmd_laundry_request == VM_LAUNDRY_IDLE &&
 		    (pq->pq_cnt > 0 || atomic_load_acq_int(&swapdev_enabled))) {
 			/*
 			 * The request is known to be idle here, so it only
 			 * remains to pick the kind of laundering to ask for.
 			 */
 			if (page_shortage > 0) {
 				vmd->vmd_laundry_request = VM_LAUNDRY_SHORTFALL;
 				VM_CNT_INC(v_pdshortfalls);
 			} else
 				vmd->vmd_laundry_request =
 				    VM_LAUNDRY_BACKGROUND;
 			wakeup(&vmd->vmd_laundry_request);
 		}
 		vmd->vmd_clean_pages_freed +=
 		    starting_page_shortage - page_shortage;
 		vm_pagequeue_unlock(pq);
 	}
 
 	/*
 	 * If the inactive queue scan fails repeatedly to meet its
 	 * target, kill the largest process.
 	 */
 	vm_pageout_mightbe_oom(vmd, page_shortage, starting_page_shortage);
 
 	/*
 	 * See the description of addl_page_shortage above.
 	 */
 	*addl_shortage = addl_page_shortage + deficit;
 
 	return (page_shortage <= 0);
 }
 
 /* Number of domains currently voting for an OOM kill. */
 static int vm_pageout_oom_vote;
 
 /*
  * The pagedaemon threads randomly select one to perform the
  * OOM.  Trying to kill processes before all pagedaemons
  * failed to reach free target is premature.
  *
  * Each domain keeps a count of consecutive scans that made no progress.
  * Once that count reaches vm_pageout_oom_seq the domain casts a vote, and
  * the thread whose vote completes the set of vm_ndomains votes calls
  * vm_pageout_oom().  "page_shortage" is the shortage left after the scan
  * and "starting_page_shortage" the shortage it began with.
  */
 static void
 vm_pageout_mightbe_oom(struct vm_domain *vmd, int page_shortage,
     int starting_page_shortage)
 {
 	bool stalled;
 
 	/*
 	 * Do not trigger an OOM kill if the page daemon is able to make
 	 * progress, or if there is no instantaneous shortage.  The latter case
 	 * can happen if the PID controller is still reacting to an acute
 	 * shortage, and the inactive queue is full of dirty pages.
 	 */
 	stalled = starting_page_shortage > 0 &&
 	    starting_page_shortage == page_shortage &&
 	    vm_paging_needed(vmd, vmd->vmd_free_count);
 	if (stalled)
 		vmd->vmd_oom_seq++;
 	else
 		vmd->vmd_oom_seq = 0;
 
 	if (vmd->vmd_oom_seq < vm_pageout_oom_seq) {
 		/* Not stalled for long enough: take back any vote cast. */
 		if (vmd->vmd_oom) {
 			vmd->vmd_oom = false;
 			atomic_subtract_int(&vm_pageout_oom_vote, 1);
 		}
 		return;
 	}
 
 	/*
 	 * Do not follow the call sequence until OOM condition is
 	 * cleared.
 	 */
 	vmd->vmd_oom_seq = 0;
 
 	/* This domain has already voted. */
 	if (vmd->vmd_oom)
 		return;
 
 	vmd->vmd_oom = true;
 	if (atomic_fetchadd_int(&vm_pageout_oom_vote, 1) != vm_ndomains - 1)
 		return;
 
 	/*
 	 * The current pagedaemon thread is the last in the quorum to
 	 * start OOM.  Initiate the selection and signaling of the
 	 * victim.
 	 */
 	vm_pageout_oom(VM_OOM_MEM);
 
 	/*
 	 * After one round of OOM terror, recall our vote.  On the
 	 * next pass, current pagedaemon would vote again if the low
 	 * memory condition is still there, due to vmd_oom being
 	 * false.
 	 */
 	vmd->vmd_oom = false;
 	atomic_subtract_int(&vm_pageout_oom_vote, 1);
 }
 
 /*
  * The OOM killer is the page daemon's action of last resort when
  * memory allocation requests have been stalled for a prolonged period
  * of time because it cannot reclaim memory.  This function computes
  * the approximate number of physical pages that could be reclaimed if
  * the specified address space is destroyed.
  *
  * Private, anonymous memory owned by the address space is the
  * principal resource that we expect to recover after an OOM kill.
  * Since the physical pages mapped by the address space's COW entries
  * are typically shared pages, they are unlikely to be released and so
  * they are not counted.
  *
  * To get to the point where the page daemon runs the OOM killer, its
  * efforts to write-back vnode-backed pages may have stalled.  This
  * could be caused by a memory allocation deadlock in the write path
  * that might be resolved by an OOM kill.  Therefore, physical pages
  * belonging to vnode-backed objects are counted, because they might
  * be freed without being written out first if the address space holds
  * the last reference to an unlinked vnode.
  *
  * Similarly, physical pages belonging to OBJT_PHYS objects are
  * counted because the address space might hold the last reference to
  * the object.
  *
  * The map of "vmspace" must be locked by the caller and must not be a
  * system map; both conditions are asserted.
  */
 static long
 vm_pageout_oom_pagecount(struct vmspace *vmspace)
 {
 	vm_map_t map;
 	vm_map_entry_t entry;
 	vm_object_t obj;
 	long total;
 	bool shared_cow;
 
 	map = &vmspace->vm_map;
 	KASSERT(!vm_map_is_system(map), ("system map"));
 	sx_assert(&map->lock, SA_LOCKED);
 
 	total = 0;
 	VM_MAP_ENTRY_FOREACH(entry, map) {
 		/* Submap entries do not reference an object directly. */
 		if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0)
 			continue;
 		obj = entry->object.vm_object;
 		if (obj == NULL)
 			continue;
 
 		/* A COW entry whose object has other references is skipped. */
 		shared_cow = (entry->eflags & MAP_ENTRY_NEEDS_COPY) != 0 &&
 		    obj->ref_count != 1;
 		if (!shared_cow && (obj->type == OBJT_PHYS ||
 		    obj->type == OBJT_VNODE || (obj->flags & OBJ_SWAP) != 0))
 			total += obj->resident_page_count;
 	}
 	return (total);
 }
 
 /*
  * State used to rate-limit OOM kills requested with VM_OOM_MEM_PF.
  *
  * vm_oom_ratelim_last holds the value of "ticks" sampled by the most recent
  * vm_pageout_oom() call that was allowed to proceed.  vm_oom_pf_secs is the
  * minimum interval, in seconds, enforced between such calls for
  * VM_OOM_MEM_PF requests; it is exported as the vm.oom_pf_secs sysctl.
  * vm_oom_ratelim_mtx is held while the timestamp is checked and updated.
  */
 static int vm_oom_ratelim_last;
 static int vm_oom_pf_secs = 10;
 SYSCTL_INT(_vm, OID_AUTO, oom_pf_secs, CTLFLAG_RWTUN, &vm_oom_pf_secs, 0,
     "");
 static struct mtx vm_oom_ratelim_mtx;
 
 /*
  * Pick the largest eligible process and kill it in response to a memory
  * or swap shortage.
  *
  * "shortage" is the reason code: VM_OOM_MEM, VM_OOM_MEM_PF or
  * VM_OOM_SWAPZ.  Any other value panics once a victim has been chosen.
  * A VM_OOM_MEM_PF request is dropped when fewer than hz * vm_oom_pf_secs
  * ticks have passed since the last request that was let through.
  *
  * A process is sized by vmspace_swap_count(); for VM_OOM_MEM and
  * VM_OOM_MEM_PF the result of vm_pageout_oom_pagecount() is added.  A
  * process is selected only if its size is strictly greater than the
  * largest seen so far (initially 0), so nothing is killed when every
  * candidate has size 0.
  */
 void
 vm_pageout_oom(int shortage)
 {
 	const char *reason;
 	struct proc *p, *bigproc;
 	vm_offset_t size, bigsize;
 	struct thread *td;
 	struct vmspace *vm;
 	int now;
 	bool breakout;
 
 	/*
 	 * For OOM requests originating from vm_fault(), there is a high
 	 * chance that a single large process faults simultaneously in
 	 * several threads.  Also, on an active system running many
 	 * processes of middle-size, like buildworld, all of them
 	 * could fault almost simultaneously as well.
 	 *
 	 * To avoid killing too many processes, rate-limit OOMs
 	 * initiated by vm_fault() time-outs on the waits for free
 	 * pages.
 	 */
 	mtx_lock(&vm_oom_ratelim_mtx);
 	now = ticks;
 	/* The unsigned cast keeps the elapsed-tick comparison wrap-safe. */
 	if (shortage == VM_OOM_MEM_PF &&
 	    (u_int)(now - vm_oom_ratelim_last) < hz * vm_oom_pf_secs) {
 		mtx_unlock(&vm_oom_ratelim_mtx);
 		return;
 	}
 	/* Every request that is let through, of any type, resets the timer. */
 	vm_oom_ratelim_last = now;
 	mtx_unlock(&vm_oom_ratelim_mtx);
 
 	/*
 	 * We keep the process bigproc locked once we find it to keep anyone
 	 * from messing with it; however, there is a possibility of
 	 * deadlock if process B is bigproc and one of its child processes
 	 * attempts to propagate a signal to B while we are waiting for A's
 	 * lock while walking this list.  To avoid this, we don't block on
 	 * the process lock but just skip a process if it is already locked.
 	 *
 	 * NOTE(review): the loop below takes each process lock with
 	 * PROC_LOCK() rather than a try-lock, and bigproc is kept held
 	 * (_PHOLD()/PRELE()) rather than locked, so the description above
 	 * looks out of date -- confirm.
 	 */
 	bigproc = NULL;
 	bigsize = 0;
 	sx_slock(&allproc_lock);
 	FOREACH_PROC_IN_SYSTEM(p) {
 		PROC_LOCK(p);
 
 		/*
 		 * If this is a system, protected or killed process, skip it.
 		 * Processes that are not in PRS_NORMAL state, are in exec or
 		 * exiting, or have pid 1 are skipped as well, as is any
 		 * process with a pid below 48 while swap_pager_avail is
 		 * non-zero.
 		 */
 		if (p->p_state != PRS_NORMAL || (p->p_flag & (P_INEXEC |
 		    P_PROTECTED | P_SYSTEM | P_WEXIT)) != 0 ||
 		    p->p_pid == 1 || P_KILLED(p) ||
 		    (p->p_pid < 48 && swap_pager_avail != 0)) {
 			PROC_UNLOCK(p);
 			continue;
 		}
 		/*
 		 * If the process is in a non-running type state,
 		 * don't touch it.  Check all the threads individually.
 		 * A single thread that is neither on a run queue, running,
 		 * sleeping nor suspended disqualifies the whole process.
 		 */
 		breakout = false;
 		FOREACH_THREAD_IN_PROC(p, td) {
 			thread_lock(td);
 			if (!TD_ON_RUNQ(td) &&
 			    !TD_IS_RUNNING(td) &&
 			    !TD_IS_SLEEPING(td) &&
 			    !TD_IS_SUSPENDED(td)) {
 				thread_unlock(td);
 				breakout = true;
 				break;
 			}
 			thread_unlock(td);
 		}
 		if (breakout) {
 			PROC_UNLOCK(p);
 			continue;
 		}
 		/*
 		 * get the process size
 		 */
 		vm = vmspace_acquire_ref(p);
 		if (vm == NULL) {
 			PROC_UNLOCK(p);
 			continue;
 		}
 		/*
 		 * Take a hold on p before dropping the process lock and
 		 * allproc_lock; every path below either releases the hold
 		 * with PRELE() or hands it over to bigproc.
 		 */
 		_PHOLD(p);
 		PROC_UNLOCK(p);
 		sx_sunlock(&allproc_lock);
 		/* Do not wait for the map lock; skip the process instead. */
 		if (!vm_map_trylock_read(&vm->vm_map)) {
 			vmspace_free(vm);
 			sx_slock(&allproc_lock);
 			PRELE(p);
 			continue;
 		}
 		size = vmspace_swap_count(vm);
 		/* Resident pages only matter for the two memory reasons. */
 		if (shortage == VM_OOM_MEM || shortage == VM_OOM_MEM_PF)
 			size += vm_pageout_oom_pagecount(vm);
 		vm_map_unlock_read(&vm->vm_map);
 		vmspace_free(vm);
 		sx_slock(&allproc_lock);
 
 		/*
 		 * If this process is bigger than the biggest one,
 		 * remember it.  The hold on the previous candidate, if any,
 		 * is dropped; otherwise the hold on p is dropped.
 		 */
 		if (size > bigsize) {
 			if (bigproc != NULL)
 				PRELE(bigproc);
 			bigproc = p;
 			bigsize = size;
 		} else {
 			PRELE(p);
 		}
 	}
 	sx_sunlock(&allproc_lock);
 
 	if (bigproc != NULL) {
 		/* Translate the reason code into the message given to killproc(). */
 		switch (shortage) {
 		case VM_OOM_MEM:
 			reason = "failed to reclaim memory";
 			break;
 		case VM_OOM_MEM_PF:
 			reason = "a thread waited too long to allocate a page";
 			break;
 		case VM_OOM_SWAPZ:
 			reason = "out of swap space";
 			break;
 		default:
 			panic("unknown OOM reason %d", shortage);
 		}
 		/*
 		 * When vm_panic_on_oom is non-zero it is decremented on each
 		 * OOM event, and the system panics instead of killing once it
 		 * reaches zero.
 		 */
 		if (vm_panic_on_oom != 0 && --vm_panic_on_oom == 0)
 			panic("%s", reason);
 		PROC_LOCK(bigproc);
 		killproc(bigproc, reason);
 		/* presumably lets the victim run and exit sooner -- confirm */
 		sched_nice(bigproc, PRIO_MIN);
 		/* Drop the hold taken in the scan loop above. */
 		_PRELE(bigproc);
 		PROC_UNLOCK(bigproc);
 	}
 }
 
 /*
  * Signal a free page shortage to subsystems that have registered an event
  * handler.  Reclaim memory from UMA in the event of a severe shortage.
  * Return true if the free page count should be re-evaluated.
  *
  * The vm_lowmem handlers and the UMA trim run only when at least
  * lowmem_period seconds' worth of ticks have elapsed since the previous
  * run; true is returned only in that case.  The asynchronous UMA reclaim
  * wakeup at the end is evaluated on every call.
  */
 static bool
 vm_pageout_lowmem(void)
 {
 	/* Value of "ticks" at the last run of the handlers below. */
 	static int lowmem_ticks = 0;
 	int last;
 	bool ret;
 
 	ret = false;
 
 	last = atomic_load_int(&lowmem_ticks);
 	while ((u_int)(ticks - last) / hz >= lowmem_period) {
 		/*
 		 * Claim this period by advancing lowmem_ticks.  If the
 		 * compare-and-set fails, go back and redo the period check;
 		 * atomic_fcmpset_int() is expected to have refreshed "last"
 		 * in that case (see atomic(9)).
 		 */
 		if (atomic_fcmpset_int(&lowmem_ticks, &last, ticks) == 0)
 			continue;
 
 		/*
 		 * Decrease registered cache sizes.
 		 */
 		SDT_PROBE0(vm, , , vm__lowmem_scan);
 		EVENTHANDLER_INVOKE(vm_lowmem, VM_LOW_PAGES);
 
 		/*
 		 * We do this explicitly after the caches have been
 		 * drained above.
 		 */
 		uma_reclaim(UMA_RECLAIM_TRIM);
 		ret = true;
 		/* The handlers run at most once per call. */
 		break;
 	}
 
 	/*
 	 * Kick off an asynchronous reclaim of cached memory if one of the
 	 * page daemons is failing to keep up with demand.  Use the "severe"
 	 * threshold instead of "min" to ensure that we do not blow away the
 	 * caches if a subset of the NUMA domains are depleted by kernel memory
 	 * allocations; the domainset iterators automatically skip domains
 	 * below the "min" threshold on the first pass.
 	 *
 	 * UMA reclaim worker has its own rate-limiting mechanism, so don't
 	 * worry about kicking it too often.
 	 */
 	if (vm_page_count_severe())
 		uma_reclaim_wakeup();
 
 	return (ret);
 }
 
 /*
  * Body of a per-domain page daemon thread.
  *
  * "arg" is the domain index cast to a pointer.  The function loops forever
  * and never returns: on each iteration it sleeps or pauses, asks the PID
  * controller for an inactive queue scan target, runs the inactive scan if
  * that target is positive, and then runs the active queue scan.
  */
 static void
 vm_pageout_worker(void *arg)
 {
 	struct vm_domain *vmd;
 	u_int ofree;
 	int addl_shortage, domain, shortage;
 	bool target_met;
 
 	domain = (uintptr_t)arg;
 	vmd = VM_DOMAIN(domain);
 	shortage = 0;
 	/* Starts out true so that the first iteration does not pause. */
 	target_met = true;
 
 	/*
 	 * XXXKIB It could be useful to bind pageout daemon threads to
 	 * the cores belonging to the domain, from which vm_page_array
 	 * is allocated.
 	 */
 
 	KASSERT(vmd->vmd_segs != 0, ("domain without segments"));
 	vmd->vmd_last_active_scan = ticks;
 
 	/*
 	 * The pageout daemon worker is never done, so loop forever.
 	 */
 	while (TRUE) {
 		vm_domain_pageout_lock(vmd);
 
 		/*
 		 * We need to clear wanted before we check the limits.  This
 		 * prevents races with wakers who will check wanted after they
 		 * reach the limit.
 		 */
 		atomic_store_int(&vmd->vmd_pageout_wanted, 0);
 
 		/*
 		 * Might the page daemon need to run again?
 		 */
 		if (vm_paging_needed(vmd, vmd->vmd_free_count)) {
 			/*
 			 * Yes.  If the scan failed to produce enough free
 			 * pages, sleep uninterruptibly for some time in the
 			 * hope that the laundry thread will clean some pages.
 			 */
 			vm_domain_pageout_unlock(vmd);
 			if (!target_met)
 				pause("pwait", hz / VM_INACT_SCAN_RATE);
 		} else {
 			/*
 			 * No, sleep until the next wakeup or until pages
 			 * need to have their reference stats updated.
 			 * PDROP is passed, so the pageout lock is not held
 			 * after the sleep (see sleep(9)).  Only a zero return
 			 * from mtx_sleep() is counted as a wakeup.
 			 */
 			if (mtx_sleep(&vmd->vmd_pageout_wanted,
 			    vm_domain_pageout_lockptr(vmd), PDROP | PVM,
 			    "psleep", hz / VM_INACT_SCAN_RATE) == 0)
 				VM_CNT_INC(v_pdwakeups);
 		}
 
 		/* Prevent spurious wakeups by ensuring that wanted is set. */
 		atomic_store_int(&vmd->vmd_pageout_wanted, 1);
 
 		/*
 		 * Use the controller to calculate how many pages to free in
 		 * this interval, and scan the inactive queue.  If the lowmem
 		 * handlers appear to have freed up some pages, subtract the
 		 * difference from the inactive queue scan target.
 		 */
 		shortage = pidctrl_daemon(&vmd->vmd_pid, vmd->vmd_free_count);
 		if (shortage > 0) {
 			ofree = vmd->vmd_free_count;
 			/*
 			 * The credit is capped at shortage itself, so the
 			 * scan target cannot become negative.
 			 */
 			if (vm_pageout_lowmem() && vmd->vmd_free_count > ofree)
 				shortage -= min(vmd->vmd_free_count - ofree,
 				    (u_int)shortage);
 			target_met = vm_pageout_inactive(vmd, shortage,
 			    &addl_shortage);
 		} else
 			addl_shortage = 0;
 
 		/*
 		 * Scan the active queue.  A positive value for shortage
 		 * indicates that we must aggressively deactivate pages to avoid
 		 * a shortfall.
 		 */
 		shortage = vm_pageout_active_target(vmd) + addl_shortage;
 		vm_pageout_scan_active(vmd, shortage);
 	}
 }
 
 /*
  * vm_pageout_helper runs additional pageout daemons in times of high paging
  * activity.
  *
  * "arg" is the domain index cast to a pointer.  The thread loops forever:
  * it sleeps on &vmd->vmd_inactive_shortage, and each time it is woken it
  * runs one inactive queue scan with the shortage currently stored there.
  */
 static void
 vm_pageout_helper(void *arg)
 {
 	struct vm_domain *vmd;
 	int domain;
 
 	domain = (uintptr_t)arg;
 	vmd = VM_DOMAIN(domain);
 
 	vm_domain_pageout_lock(vmd);
 	for (;;) {
 		/*
 		 * Wait to be woken with the domain pageout lock passed to
 		 * msleep(); PDROP is not given, so the lock is expected to
 		 * be held again on return (see sleep(9)).
 		 */
 		msleep(&vmd->vmd_inactive_shortage,
 		    vm_domain_pageout_lockptr(vmd), PVM, "psleep", 0);
 		/* Drop this thread's "starting" count now that it is awake. */
 		blockcount_release(&vmd->vmd_inactive_starting, 1);
 
 		/* The scan itself runs without the pageout lock. */
 		vm_domain_pageout_unlock(vmd);
 		vm_pageout_scan_inactive(vmd, vmd->vmd_inactive_shortage);
 		vm_domain_pageout_lock(vmd);
 
 		/*
 		 * Release the running count while the pageout lock is held to
 		 * prevent wakeup races.
 		 */
 		blockcount_release(&vmd->vmd_inactive_running, 1);
 	}
 }
 
 static int
 get_pageout_threads_per_domain(const struct vm_domain *vmd)
 {
 	unsigned total_pageout_threads, eligible_cpus, domain_cpus;
 
 	if (VM_DOMAIN_EMPTY(vmd->vmd_domain))
 		return (0);
 
 	/*
 	 * Semi-arbitrarily constrain pagedaemon threads to less than half the
 	 * total number of CPUs in the system as an upper limit.
 	 */
 	if (pageout_cpus_per_thread < 2)
 		pageout_cpus_per_thread = 2;
 	else if (pageout_cpus_per_thread > mp_ncpus)
 		pageout_cpus_per_thread = mp_ncpus;
 
 	total_pageout_threads = howmany(mp_ncpus, pageout_cpus_per_thread);
 	domain_cpus = CPU_COUNT(&cpuset_domain[vmd->vmd_domain]);
 
 	/* Pagedaemons are not run in empty domains. */
 	eligible_cpus = mp_ncpus;
 	for (unsigned i = 0; i < vm_ndomains; i++)
 		if (VM_DOMAIN_EMPTY(i))
 			eligible_cpus -= CPU_COUNT(&cpuset_domain[i]);
 
 	/*
 	 * Assign a portion of the total pageout threads to this domain
 	 * corresponding to the fraction of pagedaemon-eligible CPUs in the
 	 * domain.  In asymmetric NUMA systems, domains with more CPUs may be
 	 * allocated more threads than domains with fewer CPUs.
 	 */
 	return (howmany(total_pageout_threads * domain_cpus, eligible_cpus));
 }
 
 /*
  * Initialize basic pageout daemon settings.  See the comment above the
  * definition of vm_domain for some explanation of how these thresholds are
  * used.
  *
  * "domain" is the index of the domain to set up.  Besides the free page
  * thresholds, this initializes the domain's PID controller, registers its
  * sysctl nodes, and records the number of inactive queue scanning threads
  * in vmd_inactive_threads.
  */
 static void
 vm_pageout_init_domain(int domain)
 {
 	struct vm_domain *vmd;
 	struct sysctl_oid *oid;
 
 	vmd = VM_DOMAIN(domain);
 	vmd->vmd_interrupt_free_min = 2;
 
 	/*
 	 * v_free_reserved needs to include enough for the largest
 	 * swap pager structures plus enough for any pv_entry structs
 	 * when paging.
 	 */
 	vmd->vmd_pageout_free_min = 2 * MAXBSIZE / PAGE_SIZE +
 	    vmd->vmd_interrupt_free_min;
 	vmd->vmd_free_reserved = vm_pageout_page_count +
 	    vmd->vmd_pageout_free_min + vmd->vmd_page_count / 768;
 	/*
 	 * The order of the next five assignments matters: vmd_free_severe
 	 * and vmd_free_target are derived from the base value of
 	 * vmd_free_min (page count / 200), and only afterwards is
 	 * vmd_free_reserved added to vmd_free_min and vmd_free_severe.
 	 */
 	vmd->vmd_free_min = vmd->vmd_page_count / 200;
 	vmd->vmd_free_severe = vmd->vmd_free_min / 2;
 	vmd->vmd_free_target = 4 * vmd->vmd_free_min + vmd->vmd_free_reserved;
 	vmd->vmd_free_min += vmd->vmd_free_reserved;
 	vmd->vmd_free_severe += vmd->vmd_free_reserved;
 	/*
 	 * The inactive target is 1.5 times the free target, capped at a
 	 * third of the domain's current free page count.
 	 */
 	vmd->vmd_inactive_target = (3 * vmd->vmd_free_target) / 2;
 	if (vmd->vmd_inactive_target > vmd->vmd_free_count / 3)
 		vmd->vmd_inactive_target = vmd->vmd_free_count / 3;
 
 	/*
 	 * Set the default wakeup threshold to be 10% below the paging
 	 * target.  This keeps the steady state out of shortfall.
 	 */
 	vmd->vmd_pageout_wakeup_thresh = (vmd->vmd_free_target / 10) * 9;
 
 	/*
 	 * Target amount of memory to move out of the laundry queue during a
 	 * background laundering.  This is proportional to the amount of system
 	 * memory.
 	 */
 	vmd->vmd_background_launder_target = (vmd->vmd_free_target -
 	    vmd->vmd_free_min) / 10;
 
 	/* Initialize the pageout daemon pid controller. */
 	pidctrl_init(&vmd->vmd_pid, hz / VM_INACT_SCAN_RATE,
 	    vmd->vmd_free_target, PIDCTRL_BOUND,
 	    PIDCTRL_KPD, PIDCTRL_KID, PIDCTRL_KDD);
 	/* Expose the controller under a "pidctrl" node of the domain's OID. */
 	oid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(vmd->vmd_oid), OID_AUTO,
 	    "pidctrl", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "");
 	pidctrl_init_sysctl(&vmd->vmd_pid, SYSCTL_CHILDREN(oid));
 
 	/* Thread count consumed by vm_pageout() when it starts helpers. */
 	vmd->vmd_inactive_threads = get_pageout_threads_per_domain(vmd);
 	SYSCTL_ADD_BOOL(NULL, SYSCTL_CHILDREN(vmd->vmd_oid), OID_AUTO,
 	    "pageout_helper_threads_enabled", CTLFLAG_RWTUN,
 	    &vmd->vmd_helper_threads_enabled, 0,
 	    "Enable multi-threaded inactive queue scanning");
 }
 
 static void
 vm_pageout_init(void)
 {
 	u_long freecount;
 	int i;
 
 	/*
 	 * Initialize some paging parameters.
 	 */
 	freecount = 0;
 	for (i = 0; i < vm_ndomains; i++) {
 		struct vm_domain *vmd;
 
 		vm_pageout_init_domain(i);
 		vmd = VM_DOMAIN(i);
 		vm_cnt.v_free_reserved += vmd->vmd_free_reserved;
 		vm_cnt.v_free_target += vmd->vmd_free_target;
 		vm_cnt.v_free_min += vmd->vmd_free_min;
 		vm_cnt.v_inactive_target += vmd->vmd_inactive_target;
 		vm_cnt.v_pageout_free_min += vmd->vmd_pageout_free_min;
 		vm_cnt.v_interrupt_free_min += vmd->vmd_interrupt_free_min;
 		vm_cnt.v_free_severe += vmd->vmd_free_severe;
 		freecount += vmd->vmd_free_count;
 	}
 
 	/*
 	 * Set interval in seconds for active scan.  We want to visit each
 	 * page at least once every ten minutes.  This is to prevent worst
 	 * case paging behaviors with stale active LRU.
 	 */
 	if (vm_pageout_update_period == 0)
 		vm_pageout_update_period = 600;
 
 	/*
 	 * Set the maximum number of user-wired virtual pages.  Historically the
 	 * main source of such pages was mlock(2) and mlockall(2).  Hypervisors
 	 * may also request user-wired memory.
 	 */
 	if (vm_page_max_user_wired == 0)
 		vm_page_max_user_wired = 4 * freecount / 5;
 }
 
 /*
  *     vm_pageout is the high level pageout daemon.
  *
  * It initializes the OOM rate-limit mutex and the swap pager, then starts
  * the paging threads in the current process.  For every non-empty domain
  * it creates vmd_inactive_threads - 1 helper threads and one laundry
  * thread.  The first non-empty domain's worker loop is run by the calling
  * thread itself; every other non-empty domain gets a kthread running
  * vm_pageout_worker().  A single "uma" thread running uma_reclaim_worker()
  * is also created.  Any kthread_add() failure panics.
  *
  * The final call to vm_pageout_worker() loops forever, so this function
  * does not return.
  */
 static void
 vm_pageout(void)
 {
 	struct proc *p;
 	struct thread *td;
 	int error, first, i, j, pageout_threads;
 
 	p = curproc;
 	td = curthread;
 
 	mtx_init(&vm_oom_ratelim_mtx, "vmoomr", NULL, MTX_DEF);
 	swap_pager_swap_init();
 	/* "first" records the first non-empty domain; -1 means none yet. */
 	for (first = -1, i = 0; i < vm_ndomains; i++) {
 		if (VM_DOMAIN_EMPTY(i)) {
 			if (bootverbose)
 				printf("domain %d empty; skipping pageout\n",
 				    i);
 			continue;
 		}
 		/*
 		 * The first non-empty domain is served by this thread (see
 		 * the end of the function); all later ones get a new thread.
 		 */
 		if (first == -1)
 			first = i;
 		else {
 			error = kthread_add(vm_pageout_worker,
 			    (void *)(uintptr_t)i, p, NULL, 0, 0, "dom%d", i);
 			if (error != 0)
 				panic("starting pageout for domain %d: %d\n",
 				    i, error);
 		}
 		/*
 		 * One of the domain's inactive scanning threads is the
 		 * worker itself, so only the remainder are helpers.
 		 */
 		pageout_threads = VM_DOMAIN(i)->vmd_inactive_threads;
 		for (j = 0; j < pageout_threads - 1; j++) {
 			error = kthread_add(vm_pageout_helper,
 			    (void *)(uintptr_t)i, p, NULL, 0, 0,
 			    "dom%d helper%d", i, j);
 			if (error != 0)
 				panic("starting pageout helper %d for domain "
 				    "%d: %d\n", j, i, error);
 		}
 		error = kthread_add(vm_pageout_laundry_worker,
 		    (void *)(uintptr_t)i, p, NULL, 0, 0, "laundry: dom%d", i);
 		if (error != 0)
 			panic("starting laundry for domain %d: %d", i, error);
 	}
 	error = kthread_add(uma_reclaim_worker, NULL, p, NULL, 0, 0, "uma");
 	if (error != 0)
 		panic("starting uma_reclaim helper, error %d\n", error);
 
 	/*
 	 * Rename this thread after the domain it serves and become its
 	 * worker.
 	 *
 	 * NOTE(review): if every domain were empty, "first" would still be
 	 * -1 here -- presumably that cannot happen; confirm.
 	 */
 	snprintf(td->td_name, sizeof(td->td_name), "dom%d", first);
 	vm_pageout_worker((void *)(uintptr_t)first);
 }
 
 /*
  * Perform an advisory wakeup of the page daemon.
  *
  * "domain" selects the domain whose page daemon is to be woken.  The
  * domain's pageout lock must not be held by the caller (asserted).  The
  * call does nothing when made from the page daemon process itself.
  */
 void
 pagedaemon_wakeup(int domain)
 {
 	struct vm_domain *vmd;
 
 	vmd = VM_DOMAIN(domain);
 	vm_domain_pageout_assert_unlocked(vmd);
 	if (curproc == pageproc)
 		return;
 
 	/*
 	 * Only a caller that gets 0 back from the fetch-and-add, i.e. one
 	 * that found vmd_pageout_wanted clear, takes the lock and issues the
 	 * wakeup; everyone else returns without locking.
 	 */
 	if (atomic_fetchadd_int(&vmd->vmd_pageout_wanted, 1) == 0) {
 		vm_domain_pageout_lock(vmd);
 		/*
 		 * Store 1 back, discarding any increments made by concurrent
 		 * callers in the meantime.
 		 */
 		atomic_store_int(&vmd->vmd_pageout_wanted, 1);
 		wakeup(&vmd->vmd_pageout_wanted);
 		vm_domain_pageout_unlock(vmd);
 	}
 }