Index: head/sys/vm/vm_glue.c
===================================================================
--- head/sys/vm/vm_glue.c	(revision 339600)
+++ head/sys/vm/vm_glue.c	(revision 339601)
@@ -1,599 +1,611 @@
 /*-
  * SPDX-License-Identifier: (BSD-3-Clause AND MIT-CMU)
  *
  * Copyright (c) 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * The Mach Operating System project at Carnegie-Mellon University.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from: @(#)vm_glue.c	8.6 (Berkeley) 1/5/94
  *
  *
  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
  * All rights reserved.
  *
  * Permission to use, copy, modify and distribute this software and
  * its documentation is hereby granted, provided that both the copyright
  * notice and this permission notice appear in all copies of the
  * software, derivative works or modified versions, and any portions
  * thereof, and that both notices appear in supporting documentation.
  *
  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
  *
  * Carnegie Mellon requests users of this software to return to
  *
  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
  *  School of Computer Science
  *  Carnegie Mellon University
  *  Pittsburgh PA 15213-3890
  *
  * any improvements or extensions that they make and grant Carnegie the
  * rights to redistribute these changes.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_vm.h"
 #include "opt_kstack_pages.h"
 #include "opt_kstack_max_pages.h"
 #include "opt_kstack_usage_prof.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
+#include <sys/domainset.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/racct.h>
 #include <sys/resourcevar.h>
 #include <sys/rwlock.h>
 #include <sys/sched.h>
 #include <sys/sf_buf.h>
 #include <sys/shm.h>
 #include <sys/vmmeter.h>
 #include <sys/vmem.h>
 #include <sys/sx.h>
 #include <sys/sysctl.h>
 #include <sys/_kstack_cache.h>
 #include <sys/eventhandler.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/unistd.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/pmap.h>
 #include <vm/vm_domainset.h>
 #include <vm/vm_map.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pageout.h>
 #include <vm/vm_object.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_pager.h>
 #include <vm/swap_pager.h>
 
 #include <machine/cpu.h>
 
 /*
  * MPSAFE
  *
  * WARNING!  This code calls vm_map_check_protection() which only checks
  * the associated vm_map_entry range.  It does not determine whether the
  * contents of the memory is actually readable or writable.  In most cases
  * just checking the vm_map_entry is sufficient within the kernel's address
  * space.
  */
 int
 kernacc(void *addr, int len, int rw)
 {
 	boolean_t rv;
 	vm_offset_t saddr, eaddr;
 	vm_prot_t prot;
 
 	KASSERT((rw & ~VM_PROT_ALL) == 0,
 	    ("illegal ``rw'' argument to kernacc (%x)\n", rw));
 
 	if ((vm_offset_t)addr + len > vm_map_max(kernel_map) ||
 	    (vm_offset_t)addr + len < (vm_offset_t)addr)
 		return (FALSE);
 
 	prot = rw;
 	saddr = trunc_page((vm_offset_t)addr);
 	eaddr = round_page((vm_offset_t)addr + len);
 	vm_map_lock_read(kernel_map);
 	rv = vm_map_check_protection(kernel_map, saddr, eaddr, prot);
 	vm_map_unlock_read(kernel_map);
 	return (rv == TRUE);
 }
 
 /*
  * MPSAFE
  *
  * WARNING!  This code calls vm_map_check_protection() which only checks
  * the associated vm_map_entry range.  It does not determine whether the
  * contents of the memory is actually readable or writable.  vmapbuf(),
  * vm_fault_quick(), or copyin()/copout()/su*()/fu*() functions should be
  * used in conjunction with this call.
  */
 int
 useracc(void *addr, int len, int rw)
 {
 	boolean_t rv;
 	vm_prot_t prot;
 	vm_map_t map;
 
 	KASSERT((rw & ~VM_PROT_ALL) == 0,
 	    ("illegal ``rw'' argument to useracc (%x)\n", rw));
 	prot = rw;
 	map = &curproc->p_vmspace->vm_map;
 	if ((vm_offset_t)addr + len > vm_map_max(map) ||
 	    (vm_offset_t)addr + len < (vm_offset_t)addr) {
 		return (FALSE);
 	}
 	vm_map_lock_read(map);
 	rv = vm_map_check_protection(map, trunc_page((vm_offset_t)addr),
 	    round_page((vm_offset_t)addr + len), prot);
 	vm_map_unlock_read(map);
 	return (rv == TRUE);
 }
 
 int
 vslock(void *addr, size_t len)
 {
 	vm_offset_t end, last, start;
 	vm_size_t npages;
 	int error;
 
 	last = (vm_offset_t)addr + len;
 	start = trunc_page((vm_offset_t)addr);
 	end = round_page(last);
 	if (last < (vm_offset_t)addr || end < (vm_offset_t)addr)
 		return (EINVAL);
 	npages = atop(end - start);
 	if (npages > vm_page_max_wired)
 		return (ENOMEM);
 #if 0
 	/*
 	 * XXX - not yet
 	 *
 	 * The limit for transient usage of wired pages should be
 	 * larger than for "permanent" wired pages (mlock()).
 	 *
 	 * Also, the sysctl code, which is the only present user
 	 * of vslock(), does a hard loop on EAGAIN.
 	 */
 	if (npages + vm_wire_count() > vm_page_max_wired)
 		return (EAGAIN);
 #endif
 	error = vm_map_wire(&curproc->p_vmspace->vm_map, start, end,
 	    VM_MAP_WIRE_SYSTEM | VM_MAP_WIRE_NOHOLES);
 	if (error == KERN_SUCCESS) {
 		curthread->td_vslock_sz += len;
 		return (0);
 	}
 
 	/*
 	 * Return EFAULT on error to match copy{in,out}() behaviour
 	 * rather than returning ENOMEM like mlock() would.
 	 */
 	return (EFAULT);
 }
 
 void
 vsunlock(void *addr, size_t len)
 {
 
 	/* Rely on the parameter sanity checks performed by vslock(). */
 	MPASS(curthread->td_vslock_sz >= len);
 	curthread->td_vslock_sz -= len;
 	(void)vm_map_unwire(&curproc->p_vmspace->vm_map,
 	    trunc_page((vm_offset_t)addr), round_page((vm_offset_t)addr + len),
 	    VM_MAP_WIRE_SYSTEM | VM_MAP_WIRE_NOHOLES);
 }
 
 /*
  * Pin the page contained within the given object at the given offset.  If the
  * page is not resident, allocate and load it using the given object's pager.
  * Return the pinned page if successful; otherwise, return NULL.
  */
 static vm_page_t
 vm_imgact_hold_page(vm_object_t object, vm_ooffset_t offset)
 {
 	vm_page_t m;
 	vm_pindex_t pindex;
 	int rv;
 
 	VM_OBJECT_WLOCK(object);
 	pindex = OFF_TO_IDX(offset);
 	m = vm_page_grab(object, pindex, VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY);
 	if (m->valid != VM_PAGE_BITS_ALL) {
 		vm_page_xbusy(m);
 		rv = vm_pager_get_pages(object, &m, 1, NULL, NULL);
 		if (rv != VM_PAGER_OK) {
 			vm_page_lock(m);
 			vm_page_free(m);
 			vm_page_unlock(m);
 			m = NULL;
 			goto out;
 		}
 		vm_page_xunbusy(m);
 	}
 	vm_page_lock(m);
 	vm_page_hold(m);
 	vm_page_activate(m);
 	vm_page_unlock(m);
 out:
 	VM_OBJECT_WUNLOCK(object);
 	return (m);
 }
 
 /*
  * Return a CPU private mapping to the page at the given offset within the
  * given object.  The page is pinned before it is mapped.
  */
 struct sf_buf *
 vm_imgact_map_page(vm_object_t object, vm_ooffset_t offset)
 {
 	vm_page_t m;
 
 	m = vm_imgact_hold_page(object, offset);
 	if (m == NULL)
 		return (NULL);
 	sched_pin();
 	return (sf_buf_alloc(m, SFB_CPUPRIVATE));
 }
 
 /*
  * Destroy the given CPU private mapping and unpin the page that it mapped.
  */
 void
 vm_imgact_unmap_page(struct sf_buf *sf)
 {
 	vm_page_t m;
 
 	m = sf_buf_page(sf);
 	sf_buf_free(sf);
 	sched_unpin();
 	vm_page_lock(m);
 	vm_page_unhold(m);
 	vm_page_unlock(m);
 }
 
 void
 vm_sync_icache(vm_map_t map, vm_offset_t va, vm_offset_t sz)
 {
 
 	pmap_sync_icache(map->pmap, va, sz);
 }
 
 struct kstack_cache_entry *kstack_cache;
 static int kstack_cache_size = 128;
-static int kstacks;
+static int kstacks, kstack_domain_iter;
 static struct mtx kstack_cache_mtx;
 MTX_SYSINIT(kstack_cache, &kstack_cache_mtx, "kstkch", MTX_DEF);
 
 SYSCTL_INT(_vm, OID_AUTO, kstack_cache_size, CTLFLAG_RW, &kstack_cache_size, 0,
     "");
 SYSCTL_INT(_vm, OID_AUTO, kstacks, CTLFLAG_RD, &kstacks, 0,
     "");
 
 /*
  * Create the kernel stack (including pcb for i386) for a new thread.
  * This routine directly affects the fork perf for a process and
  * create performance for a thread.
  */
 int
 vm_thread_new(struct thread *td, int pages)
 {
 	vm_object_t ksobj;
 	vm_offset_t ks;
 	vm_page_t ma[KSTACK_MAX_PAGES];
 	struct kstack_cache_entry *ks_ce;
 	int i;
 
 	/* Bounds check */
 	if (pages <= 1)
 		pages = kstack_pages;
 	else if (pages > KSTACK_MAX_PAGES)
 		pages = KSTACK_MAX_PAGES;
 
 	if (pages == kstack_pages && kstack_cache != NULL) {
 		mtx_lock(&kstack_cache_mtx);
 		if (kstack_cache != NULL) {
 			ks_ce = kstack_cache;
 			kstack_cache = ks_ce->next_ks_entry;
 			mtx_unlock(&kstack_cache_mtx);
 
 			td->td_kstack_obj = ks_ce->ksobj;
 			td->td_kstack = (vm_offset_t)ks_ce;
 			td->td_kstack_pages = kstack_pages;
 			return (1);
 		}
 		mtx_unlock(&kstack_cache_mtx);
 	}
 
 	/*
 	 * Allocate an object for the kstack.
 	 */
 	ksobj = vm_object_allocate(OBJT_DEFAULT, pages);
 	
 	/*
 	 * Get a kernel virtual address for this thread's kstack.
 	 */
 #if defined(__mips__)
 	/*
 	 * We need to align the kstack's mapped address to fit within
 	 * a single TLB entry.
 	 */
 	if (vmem_xalloc(kernel_arena, (pages + KSTACK_GUARD_PAGES) * PAGE_SIZE,
 	    PAGE_SIZE * 2, 0, 0, VMEM_ADDR_MIN, VMEM_ADDR_MAX,
 	    M_BESTFIT | M_NOWAIT, &ks)) {
 		ks = 0;
 	}
 #else
 	ks = kva_alloc((pages + KSTACK_GUARD_PAGES) * PAGE_SIZE);
 #endif
 	if (ks == 0) {
 		printf("vm_thread_new: kstack allocation failed\n");
 		vm_object_deallocate(ksobj);
 		return (0);
+	}
+
+	/*
+	 * Ensure that kstack objects can draw pages from any memory
+	 * domain.  Otherwise a local memory shortage can block a process
+	 * swap-in.
+	 */
+	if (vm_ndomains > 1) {
+		ksobj->domain.dr_policy = DOMAINSET_RR();
+		ksobj->domain.dr_iter = atomic_fetchadd_int(&kstack_domain_iter,
+		    1);
 	}
 
 	atomic_add_int(&kstacks, 1);
 	if (KSTACK_GUARD_PAGES != 0) {
 		pmap_qremove(ks, KSTACK_GUARD_PAGES);
 		ks += KSTACK_GUARD_PAGES * PAGE_SIZE;
 	}
 	td->td_kstack_obj = ksobj;
 	td->td_kstack = ks;
 	/*
 	 * Knowing the number of pages allocated is useful when you
 	 * want to deallocate them.
 	 */
 	td->td_kstack_pages = pages;
 	/* 
 	 * For the length of the stack, link in a real page of ram for each
 	 * page of stack.
 	 */
 	VM_OBJECT_WLOCK(ksobj);
 	(void)vm_page_grab_pages(ksobj, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY |
 	    VM_ALLOC_WIRED, ma, pages);
 	for (i = 0; i < pages; i++)
 		ma[i]->valid = VM_PAGE_BITS_ALL;
 	VM_OBJECT_WUNLOCK(ksobj);
 	pmap_qenter(ks, ma, pages);
 	return (1);
 }
 
 static void
 vm_thread_stack_dispose(vm_object_t ksobj, vm_offset_t ks, int pages)
 {
 	vm_page_t m;
 	int i;
 
 	atomic_add_int(&kstacks, -1);
 	pmap_qremove(ks, pages);
 	VM_OBJECT_WLOCK(ksobj);
 	for (i = 0; i < pages; i++) {
 		m = vm_page_lookup(ksobj, i);
 		if (m == NULL)
 			panic("vm_thread_dispose: kstack already missing?");
 		vm_page_lock(m);
 		vm_page_unwire(m, PQ_NONE);
 		vm_page_free(m);
 		vm_page_unlock(m);
 	}
 	VM_OBJECT_WUNLOCK(ksobj);
 	vm_object_deallocate(ksobj);
 	kva_free(ks - (KSTACK_GUARD_PAGES * PAGE_SIZE),
 	    (pages + KSTACK_GUARD_PAGES) * PAGE_SIZE);
 }
 
 /*
  * Dispose of a thread's kernel stack.
  */
 void
 vm_thread_dispose(struct thread *td)
 {
 	vm_object_t ksobj;
 	vm_offset_t ks;
 	struct kstack_cache_entry *ks_ce;
 	int pages;
 
 	pages = td->td_kstack_pages;
 	ksobj = td->td_kstack_obj;
 	ks = td->td_kstack;
 	td->td_kstack = 0;
 	td->td_kstack_pages = 0;
 	if (pages == kstack_pages && kstacks <= kstack_cache_size) {
 		ks_ce = (struct kstack_cache_entry *)ks;
 		ks_ce->ksobj = ksobj;
 		mtx_lock(&kstack_cache_mtx);
 		ks_ce->next_ks_entry = kstack_cache;
 		kstack_cache = ks_ce;
 		mtx_unlock(&kstack_cache_mtx);
 		return;
 	}
 	vm_thread_stack_dispose(ksobj, ks, pages);
 }
 
 static void
 vm_thread_stack_lowmem(void *nulll)
 {
 	struct kstack_cache_entry *ks_ce, *ks_ce1;
 
 	mtx_lock(&kstack_cache_mtx);
 	ks_ce = kstack_cache;
 	kstack_cache = NULL;
 	mtx_unlock(&kstack_cache_mtx);
 
 	while (ks_ce != NULL) {
 		ks_ce1 = ks_ce;
 		ks_ce = ks_ce->next_ks_entry;
 
 		vm_thread_stack_dispose(ks_ce1->ksobj, (vm_offset_t)ks_ce1,
 		    kstack_pages);
 	}
 }
 
 static void
 kstack_cache_init(void *nulll)
 {
 
 	EVENTHANDLER_REGISTER(vm_lowmem, vm_thread_stack_lowmem, NULL,
 	    EVENTHANDLER_PRI_ANY);
 }
 
 SYSINIT(vm_kstacks, SI_SUB_KTHREAD_INIT, SI_ORDER_ANY, kstack_cache_init, NULL);
 
 #ifdef KSTACK_USAGE_PROF
 /*
  * Track maximum stack used by a thread in kernel.
  */
 static int max_kstack_used;
 
 SYSCTL_INT(_debug, OID_AUTO, max_kstack_used, CTLFLAG_RD,
     &max_kstack_used, 0,
     "Maxiumum stack depth used by a thread in kernel");
 
 void
 intr_prof_stack_use(struct thread *td, struct trapframe *frame)
 {
 	vm_offset_t stack_top;
 	vm_offset_t current;
 	int used, prev_used;
 
 	/*
 	 * Testing for interrupted kernel mode isn't strictly
 	 * needed. It optimizes the execution, since interrupts from
 	 * usermode will have only the trap frame on the stack.
 	 */
 	if (TRAPF_USERMODE(frame))
 		return;
 
 	stack_top = td->td_kstack + td->td_kstack_pages * PAGE_SIZE;
 	current = (vm_offset_t)(uintptr_t)&stack_top;
 
 	/*
 	 * Try to detect if interrupt is using kernel thread stack.
 	 * Hardware could use a dedicated stack for interrupt handling.
 	 */
 	if (stack_top <= current || current < td->td_kstack)
 		return;
 
 	used = stack_top - current;
 	for (;;) {
 		prev_used = max_kstack_used;
 		if (prev_used >= used)
 			break;
 		if (atomic_cmpset_int(&max_kstack_used, prev_used, used))
 			break;
 	}
 }
 #endif /* KSTACK_USAGE_PROF */
 
 /*
  * Implement fork's actions on an address space.
  * Here we arrange for the address space to be copied or referenced,
  * allocate a user struct (pcb and kernel stack), then call the
  * machine-dependent layer to fill those in and make the new process
  * ready to run.  The new process is set up so that it returns directly
  * to user mode to avoid stack copying and relocation problems.
  */
 int
 vm_forkproc(struct thread *td, struct proc *p2, struct thread *td2,
     struct vmspace *vm2, int flags)
 {
 	struct proc *p1 = td->td_proc;
 	struct domainset *dset;
 	int error;
 
 	if ((flags & RFPROC) == 0) {
 		/*
 		 * Divorce the memory, if it is shared, essentially
 		 * this changes shared memory amongst threads, into
 		 * COW locally.
 		 */
 		if ((flags & RFMEM) == 0) {
 			if (p1->p_vmspace->vm_refcnt > 1) {
 				error = vmspace_unshare(p1);
 				if (error)
 					return (error);
 			}
 		}
 		cpu_fork(td, p2, td2, flags);
 		return (0);
 	}
 
 	if (flags & RFMEM) {
 		p2->p_vmspace = p1->p_vmspace;
 		atomic_add_int(&p1->p_vmspace->vm_refcnt, 1);
 	}
 	dset = td2->td_domain.dr_policy;
 	while (vm_page_count_severe_set(&dset->ds_mask)) {
 		vm_wait_doms(&dset->ds_mask);
 	}
 
 	if ((flags & RFMEM) == 0) {
 		p2->p_vmspace = vm2;
 		if (p1->p_vmspace->vm_shm)
 			shmfork(p1, p2);
 	}
 
 	/*
 	 * cpu_fork will copy and update the pcb, set up the kernel stack,
 	 * and make the child ready to run.
 	 */
 	cpu_fork(td, p2, td2, flags);
 	return (0);
 }
 
 /*
  * Called after process has been wait(2)'ed upon and is being reaped.
  * The idea is to reclaim resources that we could not reclaim while
  * the process was still executing.
  */
 void
 vm_waitproc(p)
 	struct proc *p;
 {
 
 	vmspace_exitfree(p);		/* and clean-out the vmspace */
 }
 
 void
 kick_proc0(void)
 {
 
 	wakeup(&proc0);
 }
Index: head/sys/vm/vm_swapout.c
===================================================================
--- head/sys/vm/vm_swapout.c	(revision 339600)
+++ head/sys/vm/vm_swapout.c	(revision 339601)
@@ -1,963 +1,964 @@
 /*-
  * SPDX-License-Identifier: (BSD-4-Clause AND MIT-CMU)
  *
  * Copyright (c) 1991 Regents of the University of California.
  * All rights reserved.
  * Copyright (c) 1994 John S. Dyson
  * All rights reserved.
  * Copyright (c) 1994 David Greenman
  * All rights reserved.
  * Copyright (c) 2005 Yahoo! Technologies Norway AS
  * All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * The Mach Operating System project at Carnegie-Mellon University.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by the University of
  *	California, Berkeley and its contributors.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from: @(#)vm_pageout.c	7.4 (Berkeley) 5/7/91
  *
  *
  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
  * All rights reserved.
  *
  * Authors: Avadis Tevanian, Jr., Michael Wayne Young
  *
  * Permission to use, copy, modify and distribute this software and
  * its documentation is hereby granted, provided that both the copyright
  * notice and this permission notice appear in all copies of the
  * software, derivative works or modified versions, and any portions
  * thereof, and that both notices appear in supporting documentation.
  *
  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
  *
  * Carnegie Mellon requests users of this software to return to
  *
  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
  *  School of Computer Science
  *  Carnegie Mellon University
  *  Pittsburgh PA 15213-3890
  *
  * any improvements or extensions that they make and grant Carnegie the
  * rights to redistribute these changes.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_kstack_pages.h"
 #include "opt_kstack_max_pages.h"
 #include "opt_vm.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/limits.h>
 #include <sys/kernel.h>
 #include <sys/eventhandler.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/_kstack_cache.h>
 #include <sys/kthread.h>
 #include <sys/ktr.h>
 #include <sys/mount.h>
 #include <sys/racct.h>
 #include <sys/resourcevar.h>
 #include <sys/sched.h>
 #include <sys/sdt.h>
 #include <sys/signalvar.h>
 #include <sys/smp.h>
 #include <sys/time.h>
 #include <sys/vnode.h>
 #include <sys/vmmeter.h>
 #include <sys/rwlock.h>
 #include <sys/sx.h>
 #include <sys/sysctl.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/vm_map.h>
 #include <vm/vm_pageout.h>
 #include <vm/vm_pager.h>
 #include <vm/vm_phys.h>
 #include <vm/swap_pager.h>
 #include <vm/vm_extern.h>
 #include <vm/uma.h>
 
 /* the kernel process "vm_daemon" */
 static void vm_daemon(void);
 static struct proc *vmproc;
 
 static struct kproc_desc vm_kp = {
 	"vmdaemon",
 	vm_daemon,
 	&vmproc
 };
 SYSINIT(vmdaemon, SI_SUB_KTHREAD_VM, SI_ORDER_FIRST, kproc_start, &vm_kp);
 
 static int vm_swap_enabled = 1;
 static int vm_swap_idle_enabled = 0;
 
 SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled, CTLFLAG_RW,
     &vm_swap_enabled, 0,
     "Enable entire process swapout");
 SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled, CTLFLAG_RW,
     &vm_swap_idle_enabled, 0,
     "Allow swapout on idle criteria");
 
 /*
  * Swap_idle_threshold1 is the guaranteed swapped in time for a process
  */
 static int swap_idle_threshold1 = 2;
 SYSCTL_INT(_vm, OID_AUTO, swap_idle_threshold1, CTLFLAG_RW,
     &swap_idle_threshold1, 0,
     "Guaranteed swapped in time for a process");
 
 /*
  * Swap_idle_threshold2 is the time that a process can be idle before
  * it will be swapped out, if idle swapping is enabled.
  */
 static int swap_idle_threshold2 = 10;
 SYSCTL_INT(_vm, OID_AUTO, swap_idle_threshold2, CTLFLAG_RW,
     &swap_idle_threshold2, 0,
     "Time before a process will be swapped out");
 
 static int vm_pageout_req_swapout;	/* XXX */
 static int vm_daemon_needed;
 static struct mtx vm_daemon_mtx;
 /* Allow for use by vm_pageout before vm_daemon is initialized. */
 MTX_SYSINIT(vm_daemon, &vm_daemon_mtx, "vm daemon", MTX_DEF);
 
 static int swapped_cnt;
 static int swap_inprogress;	/* Pending swap-ins done outside swapper. */
 static int last_swapin;
 
 static void swapclear(struct proc *);
 static int swapout(struct proc *);
 static void vm_swapout_map_deactivate_pages(vm_map_t, long);
 static void vm_swapout_object_deactivate_pages(pmap_t, vm_object_t, long);
 static void swapout_procs(int action);
 static void vm_req_vmdaemon(int req);
 static void vm_thread_swapout(struct thread *td);
 
 /*
  *	vm_swapout_object_deactivate_pages
  *
  *	Deactivate enough pages to satisfy the inactive target
  *	requirements.
  *
  *	The object and map must be locked.
  */
 static void
 vm_swapout_object_deactivate_pages(pmap_t pmap, vm_object_t first_object,
     long desired)
 {
 	vm_object_t backing_object, object;
 	vm_page_t p;
 	int act_delta, remove_mode;
 
 	VM_OBJECT_ASSERT_LOCKED(first_object);
 	if ((first_object->flags & OBJ_FICTITIOUS) != 0)
 		return;
 	for (object = first_object;; object = backing_object) {
 		if (pmap_resident_count(pmap) <= desired)
 			goto unlock_return;
 		VM_OBJECT_ASSERT_LOCKED(object);
 		if ((object->flags & OBJ_UNMANAGED) != 0 ||
 		    object->paging_in_progress != 0)
 			goto unlock_return;
 
 		remove_mode = 0;
 		if (object->shadow_count > 1)
 			remove_mode = 1;
 		/*
 		 * Scan the object's entire memory queue.
 		 */
 		TAILQ_FOREACH(p, &object->memq, listq) {
 			if (pmap_resident_count(pmap) <= desired)
 				goto unlock_return;
 			if (should_yield())
 				goto unlock_return;
 			if (vm_page_busied(p))
 				continue;
 			VM_CNT_INC(v_pdpages);
 			vm_page_lock(p);
 			if (vm_page_held(p) ||
 			    !pmap_page_exists_quick(pmap, p)) {
 				vm_page_unlock(p);
 				continue;
 			}
 			act_delta = pmap_ts_referenced(p);
 			if ((p->aflags & PGA_REFERENCED) != 0) {
 				if (act_delta == 0)
 					act_delta = 1;
 				vm_page_aflag_clear(p, PGA_REFERENCED);
 			}
 			if (!vm_page_active(p) && act_delta != 0) {
 				vm_page_activate(p);
 				p->act_count += act_delta;
 			} else if (vm_page_active(p)) {
 				if (act_delta == 0) {
 					p->act_count -= min(p->act_count,
 					    ACT_DECLINE);
 					if (!remove_mode && p->act_count == 0) {
 						pmap_remove_all(p);
 						vm_page_deactivate(p);
 					} else
 						vm_page_requeue(p);
 				} else {
 					vm_page_activate(p);
 					if (p->act_count < ACT_MAX -
 					    ACT_ADVANCE)
 						p->act_count += ACT_ADVANCE;
 					vm_page_requeue(p);
 				}
 			} else if (vm_page_inactive(p))
 				pmap_remove_all(p);
 			vm_page_unlock(p);
 		}
 		if ((backing_object = object->backing_object) == NULL)
 			goto unlock_return;
 		VM_OBJECT_RLOCK(backing_object);
 		if (object != first_object)
 			VM_OBJECT_RUNLOCK(object);
 	}
 unlock_return:
 	if (object != first_object)
 		VM_OBJECT_RUNLOCK(object);
 }
 
 /*
  * deactivate some number of pages in a map, try to do it fairly, but
  * that is really hard to do.
  */
 static void
 vm_swapout_map_deactivate_pages(vm_map_t map, long desired)
 {
 	vm_map_entry_t tmpe;
 	vm_object_t obj, bigobj;
 	int nothingwired;
 
 	if (!vm_map_trylock_read(map))
 		return;
 
 	bigobj = NULL;
 	nothingwired = TRUE;
 
 	/*
 	 * first, search out the biggest object, and try to free pages from
 	 * that.
 	 */
 	tmpe = map->header.next;
 	while (tmpe != &map->header) {
 		if ((tmpe->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
 			obj = tmpe->object.vm_object;
 			if (obj != NULL && VM_OBJECT_TRYRLOCK(obj)) {
 				if (obj->shadow_count <= 1 &&
 				    (bigobj == NULL ||
 				     bigobj->resident_page_count <
 				     obj->resident_page_count)) {
 					if (bigobj != NULL)
 						VM_OBJECT_RUNLOCK(bigobj);
 					bigobj = obj;
 				} else
 					VM_OBJECT_RUNLOCK(obj);
 			}
 		}
 		if (tmpe->wired_count > 0)
 			nothingwired = FALSE;
 		tmpe = tmpe->next;
 	}
 
 	if (bigobj != NULL) {
 		vm_swapout_object_deactivate_pages(map->pmap, bigobj, desired);
 		VM_OBJECT_RUNLOCK(bigobj);
 	}
 	/*
 	 * Next, hunt around for other pages to deactivate.  We actually
 	 * do this search sort of wrong -- .text first is not the best idea.
 	 */
 	tmpe = map->header.next;
 	while (tmpe != &map->header) {
 		if (pmap_resident_count(vm_map_pmap(map)) <= desired)
 			break;
 		if ((tmpe->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
 			obj = tmpe->object.vm_object;
 			if (obj != NULL) {
 				VM_OBJECT_RLOCK(obj);
 				vm_swapout_object_deactivate_pages(map->pmap,
 				    obj, desired);
 				VM_OBJECT_RUNLOCK(obj);
 			}
 		}
 		tmpe = tmpe->next;
 	}
 
 	/*
 	 * Remove all mappings if a process is swapped out, this will free page
 	 * table pages.
 	 */
 	if (desired == 0 && nothingwired) {
 		pmap_remove(vm_map_pmap(map), vm_map_min(map),
 		    vm_map_max(map));
 	}
 
 	vm_map_unlock_read(map);
 }
 
 /*
  * Swap out requests
  */
 #define VM_SWAP_NORMAL 1
 #define VM_SWAP_IDLE 2
 
 void
 vm_swapout_run(void)
 {
 
 	if (vm_swap_enabled)
 		vm_req_vmdaemon(VM_SWAP_NORMAL);
 }
 
 /*
  * Idle process swapout -- run once per second when pagedaemons are
  * reclaiming pages.
  */
 void
 vm_swapout_run_idle(void)
 {
 	static long lsec;
 
 	if (!vm_swap_idle_enabled || time_second == lsec)
 		return;
 	vm_req_vmdaemon(VM_SWAP_IDLE);
 	lsec = time_second;
 }
 
 static void
 vm_req_vmdaemon(int req)
 {
 	static int lastrun = 0;
 
 	mtx_lock(&vm_daemon_mtx);
 	vm_pageout_req_swapout |= req;
 	if ((ticks > (lastrun + hz)) || (ticks < lastrun)) {
 		wakeup(&vm_daemon_needed);
 		lastrun = ticks;
 	}
 	mtx_unlock(&vm_daemon_mtx);
 }
 
 static void
 vm_daemon(void)
 {
 	struct rlimit rsslim;
 	struct proc *p;
 	struct thread *td;
 	struct vmspace *vm;
 	int breakout, swapout_flags, tryagain, attempts;
 #ifdef RACCT
 	uint64_t rsize, ravailable;
 #endif
 
 	while (TRUE) {
 		mtx_lock(&vm_daemon_mtx);
 		msleep(&vm_daemon_needed, &vm_daemon_mtx, PPAUSE, "psleep",
 #ifdef RACCT
 		    racct_enable ? hz : 0
 #else
 		    0
 #endif
 		);
 		swapout_flags = vm_pageout_req_swapout;
 		vm_pageout_req_swapout = 0;
 		mtx_unlock(&vm_daemon_mtx);
 		if (swapout_flags != 0) {
 			/*
 			 * Drain the per-CPU page queue batches as a deadlock
 			 * avoidance measure.
 			 */
 			if ((swapout_flags & VM_SWAP_NORMAL) != 0)
 				vm_page_drain_pqbatch();
 			swapout_procs(swapout_flags);
 		}
 
 		/*
 		 * scan the processes for exceeding their rlimits or if
 		 * process is swapped out -- deactivate pages
 		 */
 		tryagain = 0;
 		attempts = 0;
 again:
 		attempts++;
 		sx_slock(&allproc_lock);
 		FOREACH_PROC_IN_SYSTEM(p) {
 			vm_pindex_t limit, size;
 
 			/*
 			 * if this is a system process or if we have already
 			 * looked at this process, skip it.
 			 */
 			PROC_LOCK(p);
 			if (p->p_state != PRS_NORMAL ||
 			    p->p_flag & (P_INEXEC | P_SYSTEM | P_WEXIT)) {
 				PROC_UNLOCK(p);
 				continue;
 			}
 			/*
 			 * if the process is in a non-running type state,
 			 * don't touch it.
 			 */
 			breakout = 0;
 			FOREACH_THREAD_IN_PROC(p, td) {
 				thread_lock(td);
 				if (!TD_ON_RUNQ(td) &&
 				    !TD_IS_RUNNING(td) &&
 				    !TD_IS_SLEEPING(td) &&
 				    !TD_IS_SUSPENDED(td)) {
 					thread_unlock(td);
 					breakout = 1;
 					break;
 				}
 				thread_unlock(td);
 			}
 			if (breakout) {
 				PROC_UNLOCK(p);
 				continue;
 			}
 			/*
 			 * get a limit
 			 */
 			lim_rlimit_proc(p, RLIMIT_RSS, &rsslim);
 			limit = OFF_TO_IDX(
 			    qmin(rsslim.rlim_cur, rsslim.rlim_max));
 
 			/*
 			 * let processes that are swapped out really be
 			 * swapped out set the limit to nothing (will force a
 			 * swap-out.)
 			 */
 			if ((p->p_flag & P_INMEM) == 0)
 				limit = 0;	/* XXX */
 			vm = vmspace_acquire_ref(p);
 			_PHOLD_LITE(p);
 			PROC_UNLOCK(p);
 			if (vm == NULL) {
 				PRELE(p);
 				continue;
 			}
 			sx_sunlock(&allproc_lock);
 
 			size = vmspace_resident_count(vm);
 			if (size >= limit) {
 				vm_swapout_map_deactivate_pages(
 				    &vm->vm_map, limit);
 				size = vmspace_resident_count(vm);
 			}
 #ifdef RACCT
 			if (racct_enable) {
 				rsize = IDX_TO_OFF(size);
 				PROC_LOCK(p);
 				if (p->p_state == PRS_NORMAL)
 					racct_set(p, RACCT_RSS, rsize);
 				ravailable = racct_get_available(p, RACCT_RSS);
 				PROC_UNLOCK(p);
 				if (rsize > ravailable) {
 					/*
 					 * Don't be overly aggressive; this
 					 * might be an innocent process,
 					 * and the limit could've been exceeded
 					 * by some memory hog.  Don't try
 					 * to deactivate more than 1/4th
 					 * of process' resident set size.
 					 */
 					if (attempts <= 8) {
 						if (ravailable < rsize -
 						    (rsize / 4)) {
 							ravailable = rsize -
 							    (rsize / 4);
 						}
 					}
 					vm_swapout_map_deactivate_pages(
 					    &vm->vm_map,
 					    OFF_TO_IDX(ravailable));
 					/* Update RSS usage after paging out. */
 					size = vmspace_resident_count(vm);
 					rsize = IDX_TO_OFF(size);
 					PROC_LOCK(p);
 					if (p->p_state == PRS_NORMAL)
 						racct_set(p, RACCT_RSS, rsize);
 					PROC_UNLOCK(p);
 					if (rsize > ravailable)
 						tryagain = 1;
 				}
 			}
 #endif
 			vmspace_free(vm);
 			sx_slock(&allproc_lock);
 			PRELE(p);
 		}
 		sx_sunlock(&allproc_lock);
 		if (tryagain != 0 && attempts <= 10) {
 			maybe_yield();
 			goto again;
 		}
 	}
 }
 
 /*
  * Allow a thread's kernel stack to be paged out.
  */
 static void
 vm_thread_swapout(struct thread *td)
 {
 	vm_object_t ksobj;
 	vm_page_t m;
 	int i, pages;
 
 	cpu_thread_swapout(td);
 	pages = td->td_kstack_pages;
 	ksobj = td->td_kstack_obj;
 	pmap_qremove(td->td_kstack, pages);
 	VM_OBJECT_WLOCK(ksobj);
 	for (i = 0; i < pages; i++) {
 		m = vm_page_lookup(ksobj, i);
 		if (m == NULL)
 			panic("vm_thread_swapout: kstack already missing?");
 		vm_page_dirty(m);
 		vm_page_lock(m);
 		vm_page_unwire(m, PQ_LAUNDRY);
 		vm_page_unlock(m);
 	}
 	VM_OBJECT_WUNLOCK(ksobj);
 }
 
 /*
  * Bring the kernel stack for a specified thread back in.
  */
 static void
 vm_thread_swapin(struct thread *td, int oom_alloc)
 {
 	vm_object_t ksobj;
 	vm_page_t ma[KSTACK_MAX_PAGES];
 	int a, count, i, j, pages, rv;
 
 	pages = td->td_kstack_pages;
 	ksobj = td->td_kstack_obj;
 	VM_OBJECT_WLOCK(ksobj);
 	(void)vm_page_grab_pages(ksobj, 0, oom_alloc | VM_ALLOC_WIRED, ma,
 	    pages);
 	for (i = 0; i < pages;) {
 		vm_page_assert_xbusied(ma[i]);
 		if (ma[i]->valid == VM_PAGE_BITS_ALL) {
 			vm_page_xunbusy(ma[i]);
 			i++;
 			continue;
 		}
 		vm_object_pip_add(ksobj, 1);
 		for (j = i + 1; j < pages; j++)
 			if (ma[j]->valid == VM_PAGE_BITS_ALL)
 				break;
 		rv = vm_pager_has_page(ksobj, ma[i]->pindex, NULL, &a);
 		KASSERT(rv == 1, ("%s: missing page %p", __func__, ma[i]));
 		count = min(a + 1, j - i);
 		rv = vm_pager_get_pages(ksobj, ma + i, count, NULL, NULL);
 		KASSERT(rv == VM_PAGER_OK, ("%s: cannot get kstack for proc %d",
 		    __func__, td->td_proc->p_pid));
 		vm_object_pip_wakeup(ksobj);
 		for (j = i; j < i + count; j++)
 			vm_page_xunbusy(ma[j]);
 		i += count;
 	}
 	VM_OBJECT_WUNLOCK(ksobj);
 	pmap_qenter(td->td_kstack, ma, pages);
 	cpu_thread_swapin(td);
 }
 
 void
 faultin(struct proc *p)
 {
 	struct thread *td;
 	int oom_alloc;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 
 	/*
 	 * If another process is swapping in this process,
 	 * just wait until it finishes.
 	 */
 	if (p->p_flag & P_SWAPPINGIN) {
 		while (p->p_flag & P_SWAPPINGIN)
 			msleep(&p->p_flag, &p->p_mtx, PVM, "faultin", 0);
 		return;
 	}
 
 	if ((p->p_flag & P_INMEM) == 0) {
 		oom_alloc = (p->p_flag & P_WKILLED) != 0 ? VM_ALLOC_SYSTEM :
 		    VM_ALLOC_NORMAL;
 
 		/*
 		 * Don't let another thread swap process p out while we are
 		 * busy swapping it in.
 		 */
 		++p->p_lock;
 		p->p_flag |= P_SWAPPINGIN;
 		PROC_UNLOCK(p);
 		sx_xlock(&allproc_lock);
 		MPASS(swapped_cnt > 0);
 		swapped_cnt--;
 		if (curthread != &thread0)
 			swap_inprogress++;
 		sx_xunlock(&allproc_lock);
 
 		/*
 		 * We hold no lock here because the list of threads
 		 * can not change while all threads in the process are
 		 * swapped out.
 		 */
 		FOREACH_THREAD_IN_PROC(p, td)
 			vm_thread_swapin(td, oom_alloc);
 
 		if (curthread != &thread0) {
 			sx_xlock(&allproc_lock);
 			MPASS(swap_inprogress > 0);
 			swap_inprogress--;
 			last_swapin = ticks;
 			sx_xunlock(&allproc_lock);
 		}
 		PROC_LOCK(p);
 		swapclear(p);
 		p->p_swtick = ticks;
 
 		/* Allow other threads to swap p out now. */
 		wakeup(&p->p_flag);
 		--p->p_lock;
 	}
 }
 
 /*
  * This swapin algorithm attempts to swap-in processes only if there
  * is enough space for them.  Of course, if a process waits for a long
  * time, it will be swapped in anyway.
  */
 
 static struct proc *
 swapper_selector(bool wkilled_only)
 {
 	struct proc *p, *res;
 	struct thread *td;
 	int ppri, pri, slptime, swtime;
 
 	sx_assert(&allproc_lock, SA_SLOCKED);
 	if (swapped_cnt == 0)
 		return (NULL);
 	res = NULL;
 	ppri = INT_MIN;
 	FOREACH_PROC_IN_SYSTEM(p) {
 		PROC_LOCK(p);
 		if (p->p_state == PRS_NEW || (p->p_flag & (P_SWAPPINGOUT |
 		    P_SWAPPINGIN | P_INMEM)) != 0) {
 			PROC_UNLOCK(p);
 			continue;
 		}
 		if (p->p_state == PRS_NORMAL && (p->p_flag & P_WKILLED) != 0) {
 			/*
 			 * A swapped-out process might have mapped a
 			 * large portion of the system's pages as
 			 * anonymous memory.  There is no other way to
 			 * release the memory other than to kill the
 			 * process, for which we need to swap it in.
 			 */
 			return (p);
 		}
 		if (wkilled_only) {
 			PROC_UNLOCK(p);
 			continue;
 		}
 		swtime = (ticks - p->p_swtick) / hz;
 		FOREACH_THREAD_IN_PROC(p, td) {
 			/*
 			 * An otherwise runnable thread of a process
 			 * swapped out has only the TDI_SWAPPED bit set.
 			 */
 			thread_lock(td);
 			if (td->td_inhibitors == TDI_SWAPPED) {
 				slptime = (ticks - td->td_slptick) / hz;
 				pri = swtime + slptime;
 				if ((td->td_flags & TDF_SWAPINREQ) == 0)
 					pri -= p->p_nice * 8;
 				/*
 				 * if this thread is higher priority
 				 * and there is enough space, then select
 				 * this process instead of the previous
 				 * selection.
 				 */
 				if (pri > ppri) {
 					res = p;
 					ppri = pri;
 				}
 			}
 			thread_unlock(td);
 		}
 		PROC_UNLOCK(p);
 	}
 
 	if (res != NULL)
 		PROC_LOCK(res);
 	return (res);
 }
 
 #define	SWAPIN_INTERVAL	(MAXSLP * hz / 2)
 
 /*
  * Limit swapper to swap in one non-WKILLED process in MAXSLP/2
  * interval, assuming that there is:
- * - no memory shortage;
+ * - there exists at least one domain that is not suffering from a shortage of
+ *   free memory;
  * - no parallel swap-ins;
  * - no other swap-ins in the current SWAPIN_INTERVAL.
  */
 static bool
 swapper_wkilled_only(void)
 {
 
-	return (vm_page_count_min() || swap_inprogress > 0 ||
+	return (vm_page_count_min_set(&all_domains) || swap_inprogress > 0 ||
 	    (u_int)(ticks - last_swapin) < SWAPIN_INTERVAL);
 }
 
 void
 swapper(void)
 {
 	struct proc *p;
 
 	for (;;) {
 		sx_slock(&allproc_lock);
 		p = swapper_selector(swapper_wkilled_only());
 		sx_sunlock(&allproc_lock);
 
 		if (p == NULL) {
 			tsleep(&proc0, PVM, "swapin", SWAPIN_INTERVAL);
 		} else {
 			PROC_LOCK_ASSERT(p, MA_OWNED);
 
 			/*
 			 * Another process may be bringing or may have
 			 * already brought this process in while we
 			 * traverse all threads.  Or, this process may
 			 * have exited or even being swapped out
 			 * again.
 			 */
 			if (p->p_state == PRS_NORMAL && (p->p_flag & (P_INMEM |
 			    P_SWAPPINGOUT | P_SWAPPINGIN)) == 0) {
 				faultin(p);
 			}
 			PROC_UNLOCK(p);
 		}
 	}
 }
 
 /*
  * First, if any processes have been sleeping or stopped for at least
  * "swap_idle_threshold1" seconds, they are swapped out.  If, however,
  * no such processes exist, then the longest-sleeping or stopped
  * process is swapped out.  Finally, and only as a last resort, if
  * there are no sleeping or stopped processes, the longest-resident
  * process is swapped out.
  */
 static void
 swapout_procs(int action)
 {
 	struct proc *p;
 	struct thread *td;
 	int slptime;
 	bool didswap, doswap;
 
 	MPASS((action & (VM_SWAP_NORMAL | VM_SWAP_IDLE)) != 0);
 
 	didswap = false;
 	sx_slock(&allproc_lock);
 	FOREACH_PROC_IN_SYSTEM(p) {
 		/*
 		 * Filter out not yet fully constructed processes.  Do
 		 * not swap out held processes.  Avoid processes which
 		 * are system, exiting, execing, traced, already swapped
 		 * out or are in the process of being swapped in or out.
 		 */
 		PROC_LOCK(p);
 		if (p->p_state != PRS_NORMAL || p->p_lock != 0 || (p->p_flag &
 		    (P_SYSTEM | P_WEXIT | P_INEXEC | P_STOPPED_SINGLE |
 		    P_TRACED | P_SWAPPINGOUT | P_SWAPPINGIN | P_INMEM)) !=
 		    P_INMEM) {
 			PROC_UNLOCK(p);
 			continue;
 		}
 
 		/*
 		 * Further consideration of this process for swap out
 		 * requires iterating over its threads.  We release
 		 * allproc_lock here so that process creation and
 		 * destruction are not blocked while we iterate.
 		 *
 		 * To later reacquire allproc_lock and resume
 		 * iteration over the allproc list, we will first have
 		 * to release the lock on the process.  We place a
 		 * hold on the process so that it remains in the
 		 * allproc list while it is unlocked.
 		 */
 		_PHOLD_LITE(p);
 		sx_sunlock(&allproc_lock);
 
 		/*
 		 * Do not swapout a realtime process.
 		 * Guarantee swap_idle_threshold1 time in memory.
 		 * If the system is under memory stress, or if we are
 		 * swapping idle processes >= swap_idle_threshold2,
 		 * then swap the process out.
 		 */
 		doswap = true;
 		FOREACH_THREAD_IN_PROC(p, td) {
 			thread_lock(td);
 			slptime = (ticks - td->td_slptick) / hz;
 			if (PRI_IS_REALTIME(td->td_pri_class) ||
 			    slptime < swap_idle_threshold1 ||
 			    !thread_safetoswapout(td) ||
 			    ((action & VM_SWAP_NORMAL) == 0 &&
 			    slptime < swap_idle_threshold2))
 				doswap = false;
 			thread_unlock(td);
 			if (!doswap)
 				break;
 		}
 		if (doswap && swapout(p) == 0)
 			didswap = true;
 
 		PROC_UNLOCK(p);
 		if (didswap) {
 			sx_xlock(&allproc_lock);
 			swapped_cnt++;
 			sx_downgrade(&allproc_lock);
 		} else
 			sx_slock(&allproc_lock);
 		PRELE(p);
 	}
 	sx_sunlock(&allproc_lock);
 
 	/*
 	 * If we swapped something out, and another process needed memory,
 	 * then wakeup the sched process.
 	 */
 	if (didswap)
 		wakeup(&proc0);
 }
 
 static void
 swapclear(struct proc *p)
 {
 	struct thread *td;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 
 	FOREACH_THREAD_IN_PROC(p, td) {
 		thread_lock(td);
 		td->td_flags |= TDF_INMEM;
 		td->td_flags &= ~TDF_SWAPINREQ;
 		TD_CLR_SWAPPED(td);
 		if (TD_CAN_RUN(td))
 			if (setrunnable(td)) {
 #ifdef INVARIANTS
 				/*
 				 * XXX: We just cleared TDI_SWAPPED
 				 * above and set TDF_INMEM, so this
 				 * should never happen.
 				 */
 				panic("not waking up swapper");
 #endif
 			}
 		thread_unlock(td);
 	}
 	p->p_flag &= ~(P_SWAPPINGIN | P_SWAPPINGOUT);
 	p->p_flag |= P_INMEM;
 }
 
 static int
 swapout(struct proc *p)
 {
 	struct thread *td;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 
 	/*
 	 * The states of this process and its threads may have changed
 	 * by now.  Assuming that there is only one pageout daemon thread,
 	 * this process should still be in memory.
 	 */
 	KASSERT((p->p_flag & (P_INMEM | P_SWAPPINGOUT | P_SWAPPINGIN)) ==
 	    P_INMEM, ("swapout: lost a swapout race?"));
 
 	/*
 	 * Remember the resident count.
 	 */
 	p->p_vmspace->vm_swrss = vmspace_resident_count(p->p_vmspace);
 
 	/*
 	 * Check and mark all threads before we proceed.
 	 */
 	p->p_flag &= ~P_INMEM;
 	p->p_flag |= P_SWAPPINGOUT;
 	FOREACH_THREAD_IN_PROC(p, td) {
 		thread_lock(td);
 		if (!thread_safetoswapout(td)) {
 			thread_unlock(td);
 			swapclear(p);
 			return (EBUSY);
 		}
 		td->td_flags &= ~TDF_INMEM;
 		TD_SET_SWAPPED(td);
 		thread_unlock(td);
 	}
 	td = FIRST_THREAD_IN_PROC(p);
 	++td->td_ru.ru_nswap;
 	PROC_UNLOCK(p);
 
 	/*
 	 * This list is stable because all threads are now prevented from
 	 * running.  The list is only modified in the context of a running
 	 * thread in this process.
 	 */
 	FOREACH_THREAD_IN_PROC(p, td)
 		vm_thread_swapout(td);
 
 	PROC_LOCK(p);
 	p->p_flag &= ~P_SWAPPINGOUT;
 	p->p_swtick = ticks;
 	return (0);
 }