D14000.id.diff
Index: sys/amd64/amd64/machdep.c
===================================================================
--- sys/amd64/amd64/machdep.c
+++ sys/amd64/amd64/machdep.c
@@ -279,7 +279,7 @@
memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10;
freeenv(sysenv);
}
- if (memsize < ptoa((uintmax_t)vm_cnt.v_free_count))
+ if (memsize < ptoa((uintmax_t)vm_free_count()))
memsize = ptoa((uintmax_t)Maxmem);
printf("real memory = %ju (%ju MB)\n", memsize, memsize >> 20);
realmem = atop(memsize);
@@ -306,8 +306,8 @@
vm_ksubmap_init(&kmi);
printf("avail memory = %ju (%ju MB)\n",
- ptoa((uintmax_t)vm_cnt.v_free_count),
- ptoa((uintmax_t)vm_cnt.v_free_count) / 1048576);
+ ptoa((uintmax_t)vm_free_count()),
+ ptoa((uintmax_t)vm_free_count()) / 1048576);
/*
* Set up buffers, so they can be used to read disk labels.
Index: sys/arm/arm/machdep.c
===================================================================
--- sys/arm/arm/machdep.c
+++ sys/arm/arm/machdep.c
@@ -228,8 +228,8 @@
(uintmax_t)arm32_ptob(realmem),
(uintmax_t)arm32_ptob(realmem) / mbyte);
printf("avail memory = %ju (%ju MB)\n",
- (uintmax_t)arm32_ptob(vm_cnt.v_free_count),
- (uintmax_t)arm32_ptob(vm_cnt.v_free_count) / mbyte);
+ (uintmax_t)arm32_ptob(vm_free_count()),
+ (uintmax_t)arm32_ptob(vm_free_count()) / mbyte);
if (bootverbose) {
arm_physmem_print_tables();
devmap_print_table();
Index: sys/arm/arm/pmap-v4.c
===================================================================
--- sys/arm/arm/pmap-v4.c
+++ sys/arm/arm/pmap-v4.c
@@ -3817,7 +3817,7 @@
pv_entry_count++;
if (pv_entry_count > pv_entry_high_water)
- pagedaemon_wakeup();
+ pagedaemon_wakeup(0); /* XXX ARM NUMA */
ret_value = uma_zalloc(pvzone, M_NOWAIT);
return ret_value;
}
Index: sys/cddl/compat/opensolaris/sys/kmem.h
===================================================================
--- sys/cddl/compat/opensolaris/sys/kmem.h
+++ sys/cddl/compat/opensolaris/sys/kmem.h
@@ -78,7 +78,7 @@
int kmem_debugging(void);
void *calloc(size_t n, size_t s);
-#define freemem vm_cnt.v_free_count
+#define freemem vm_free_count()
#define minfree vm_cnt.v_free_min
#define heap_arena kernel_arena
#define zio_arena NULL
Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
===================================================================
--- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
+++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
@@ -379,7 +379,7 @@
arc_free_target_init(void *unused __unused)
{
- zfs_arc_free_target = vm_pageout_wakeup_thresh;
+ zfs_arc_free_target = (vm_cnt.v_free_min / 10) * 11;
}
SYSINIT(arc_free_target_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_ANY,
arc_free_target_init, NULL);
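The new initializer above is not arbitrary: vm_pageout_wakeup_thresh, which this diff retires (see sys/vm/vm_pageout.c below), was computed as (v_free_min / 10) * 11, i.e. 110% of the free-page minimum, so inlining the same expression keeps the ARC's default free target unchanged while the global goes away. A worked illustration with a hypothetical v_free_min:

/*
 * Illustration only; 12768 is a made-up v_free_min value.
 * Integer division truncates before the multiply.
 */
u_int v_free_min = 12768;
u_int target = (v_free_min / 10) * 11;	/* 1276 * 11 = 14036 pages */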
Index: sys/compat/linprocfs/linprocfs.c
===================================================================
--- sys/compat/linprocfs/linprocfs.c
+++ sys/compat/linprocfs/linprocfs.c
@@ -156,7 +156,7 @@
/*
* The correct thing here would be:
*
- memfree = vm_cnt.v_free_count * PAGE_SIZE;
+ memfree = vm_free_count() * PAGE_SIZE;
memused = memtotal - memfree;
*
* but it might mislead linux binaries into thinking there
@@ -178,7 +178,7 @@
* like unstaticizing it just for linprocfs's sake.
*/
buffers = 0;
- cached = vm_cnt.v_inactive_count * PAGE_SIZE;
+ cached = vm_inactive_count() * PAGE_SIZE;
sbuf_printf(sb,
"MemTotal: %9lu kB\n"
Index: sys/fs/tmpfs/tmpfs_subr.c
===================================================================
--- sys/fs/tmpfs/tmpfs_subr.c
+++ sys/fs/tmpfs/tmpfs_subr.c
@@ -106,7 +106,8 @@
{
vm_ooffset_t avail;
- avail = swap_pager_avail + vm_cnt.v_free_count - tmpfs_pages_reserved;
+ /* XXX */
+ avail = swap_pager_avail + vm_free_count() - tmpfs_pages_reserved;
if (__predict_false(avail < 0))
avail = 0;
return (avail);
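The clamp above works because vm_ooffset_t is a signed 64-bit type: when tmpfs_pages_reserved transiently exceeds free pages plus available swap, the expression goes negative instead of wrapping. A sketch with hypothetical counts:

/* Hypothetical values, for illustration only. */
vm_ooffset_t avail;

avail = 100 /* swap */ + 50 /* free */ - 200 /* reserved */;	/* -50 */
if (__predict_false(avail < 0))
	avail = 0;	/* report no space rather than a huge unsigned value */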
Index: sys/i386/i386/machdep.c
===================================================================
--- sys/i386/i386/machdep.c
+++ sys/i386/i386/machdep.c
@@ -271,7 +271,7 @@
memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10;
freeenv(sysenv);
}
- if (memsize < ptoa((uintmax_t)vm_cnt.v_free_count))
+ if (memsize < ptoa((uintmax_t)vm_free_count()))
memsize = ptoa((uintmax_t)Maxmem);
printf("real memory = %ju (%ju MB)\n", memsize, memsize >> 20);
realmem = atop(memsize);
@@ -298,8 +298,8 @@
vm_ksubmap_init(&kmi);
printf("avail memory = %ju (%ju MB)\n",
- ptoa((uintmax_t)vm_cnt.v_free_count),
- ptoa((uintmax_t)vm_cnt.v_free_count) / 1048576);
+ ptoa((uintmax_t)vm_free_count()),
+ ptoa((uintmax_t)vm_free_count()) / 1048576);
/*
* Set up buffers, so they can be used to read disk labels.
Index: sys/kern/init_main.c
===================================================================
--- sys/kern/init_main.c
+++ sys/kern/init_main.c
@@ -87,6 +87,7 @@
#include <vm/vm.h>
#include <vm/vm_param.h>
+#include <vm/vm_extern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <sys/copyright.h>
@@ -555,7 +556,7 @@
p->p_limit->pl_rlimit[RLIMIT_STACK].rlim_cur = dflssiz;
p->p_limit->pl_rlimit[RLIMIT_STACK].rlim_max = maxssiz;
/* Cast to avoid overflow on i386/PAE. */
- pageablemem = ptoa((vm_paddr_t)vm_cnt.v_free_count);
+ pageablemem = ptoa((vm_paddr_t)vm_free_count());
p->p_limit->pl_rlimit[RLIMIT_RSS].rlim_cur =
p->p_limit->pl_rlimit[RLIMIT_RSS].rlim_max = pageablemem;
p->p_limit->pl_rlimit[RLIMIT_MEMLOCK].rlim_cur = pageablemem / 3;
Index: sys/kern/subr_vmem.c
===================================================================
--- sys/kern/subr_vmem.c
+++ sys/kern/subr_vmem.c
@@ -59,6 +59,7 @@
#include <sys/sysctl.h>
#include <sys/taskqueue.h>
#include <sys/vmem.h>
+#include <sys/vmmeter.h>
#include "opt_vm.h"
@@ -72,6 +73,8 @@
#include <vm/vm_param.h>
#include <vm/vm_page.h>
#include <vm/vm_pageout.h>
+#include <vm/vm_phys.h>
+#include <vm/vm_pagequeue.h>
#define VMEM_OPTORDER 5
#define VMEM_OPTVALUE (1 << VMEM_OPTORDER)
@@ -641,7 +644,7 @@
* possible due to M_USE_RESERVE page allocation.
*/
if (wait & M_WAITOK)
- VM_WAIT;
+ vm_wait_domain(domain);
return (NULL);
}
mtx_unlock(&vmem_bt_lock);
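This hunk shows the conversion pattern used throughout the diff: a failed allocation no longer sleeps on the single global free-page channel (VM_WAIT) but waits on the specific domain it tried, so a wakeup in one domain does not rouse waiters on every other. A minimal sketch of the retry shape, mirroring the call site above; alloc_page_domain() is a hypothetical stand-in for the caller's allocator:

/* Sketch only; alloc_page_domain() is hypothetical. */
for (;;) {
	if ((m = alloc_page_domain(domain, flags)) != NULL)
		break;
	if ((flags & M_WAITOK) == 0)
		return (NULL);		/* caller cannot sleep */
	vm_wait_domain(domain);		/* sleep for pages in this domain */
}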
Index: sys/kern/subr_witness.c
===================================================================
--- sys/kern/subr_witness.c
+++ sys/kern/subr_witness.c
@@ -139,7 +139,7 @@
#define WITNESS_COUNT 1536
#endif
#define WITNESS_HASH_SIZE 251 /* Prime, gives load factor < 2 */
-#define WITNESS_PENDLIST (2048 + MAXCPU)
+#define WITNESS_PENDLIST (2048 + (MAXCPU * 4))
/* Allocate 256 KB of stack data space */
#define WITNESS_LO_DATA_COUNT 2048
Index: sys/mips/mips/machdep.c
===================================================================
--- sys/mips/mips/machdep.c
+++ sys/mips/mips/machdep.c
@@ -210,8 +210,8 @@
vm_ksubmap_init(&kmi);
printf("avail memory = %ju (%juMB)\n",
- ptoa((uintmax_t)vm_cnt.v_free_count),
- ptoa((uintmax_t)vm_cnt.v_free_count) / 1048576);
+ ptoa((uintmax_t)vm_free_count()),
+ ptoa((uintmax_t)vm_free_count()) / 1048576);
cpu_init_interrupts();
/*
Index: sys/powerpc/booke/pmap.c
===================================================================
--- sys/powerpc/booke/pmap.c
+++ sys/powerpc/booke/pmap.c
@@ -1183,7 +1183,7 @@
pv_entry_count++;
if (pv_entry_count > pv_entry_high_water)
- pagedaemon_wakeup();
+ pagedaemon_wakeup(0); /* XXX powerpc NUMA */
pv = uma_zalloc(pvzone, M_NOWAIT);
return (pv);
Index: sys/powerpc/powerpc/machdep.c
===================================================================
--- sys/powerpc/powerpc/machdep.c
+++ sys/powerpc/powerpc/machdep.c
@@ -213,8 +213,8 @@
vm_ksubmap_init(&kmi);
printf("avail memory = %ju (%ju MB)\n",
- ptoa((uintmax_t)vm_cnt.v_free_count),
- ptoa((uintmax_t)vm_cnt.v_free_count) / 1048576);
+ ptoa((uintmax_t)vm_free_count()),
+ ptoa((uintmax_t)vm_free_count()) / 1048576);
/*
* Set up buffers, so they can be used to read disk labels.
Index: sys/sparc64/sparc64/machdep.c
===================================================================
--- sys/sparc64/sparc64/machdep.c
+++ sys/sparc64/sparc64/machdep.c
@@ -190,8 +190,8 @@
EVENTHANDLER_REGISTER(shutdown_final, sparc64_shutdown_final, NULL,
SHUTDOWN_PRI_LAST);
- printf("avail memory = %lu (%lu MB)\n", vm_cnt.v_free_count * PAGE_SIZE,
- vm_cnt.v_free_count / ((1024 * 1024) / PAGE_SIZE));
+ printf("avail memory = %lu (%lu MB)\n", vm_free_count() * PAGE_SIZE,
+ vm_free_count() / ((1024 * 1024) / PAGE_SIZE));
if (bootverbose)
printf("machine: %s\n", sparc64_model);
Index: sys/sys/vmmeter.h
===================================================================
--- sys/sys/vmmeter.h
+++ sys/sys/vmmeter.h
@@ -141,23 +141,23 @@
u_int v_interrupt_free_min; /* (c) reserved pages for int code */
u_int v_free_severe; /* (c) severe page depletion point */
u_int v_wire_count VMMETER_ALIGNED; /* (a) pages wired down */
- u_int v_active_count VMMETER_ALIGNED; /* (a) pages active */
- u_int v_inactive_count VMMETER_ALIGNED; /* (a) pages inactive */
- u_int v_laundry_count VMMETER_ALIGNED; /* (a) pages eligible for
- laundering */
- u_int v_free_count VMMETER_ALIGNED; /* (f) pages free */
};
#endif /* _KERNEL || _WANT_VMMETER */
#ifdef _KERNEL
+#include <sys/domainset.h>
+
extern struct vmmeter vm_cnt;
-extern u_int vm_pageout_wakeup_thresh;
+extern domainset_t vm_min_domains;
+extern domainset_t vm_severe_domains;
#define VM_CNT_ADD(var, x) counter_u64_add(vm_cnt.var, x)
#define VM_CNT_INC(var) VM_CNT_ADD(var, 1)
#define VM_CNT_FETCH(var) counter_u64_fetch(vm_cnt.var)
+u_int vm_free_count(void);
+
/*
* Return TRUE if we are under our severe low-free-pages threshold
*
@@ -168,7 +168,7 @@
vm_page_count_severe(void)
{
- return (vm_cnt.v_free_severe > vm_cnt.v_free_count);
+ return (!DOMAINSET_EMPTY(&vm_severe_domains));
}
/*
@@ -184,50 +184,8 @@
vm_page_count_min(void)
{
- return (vm_cnt.v_free_min > vm_cnt.v_free_count);
+ return (!DOMAINSET_EMPTY(&vm_min_domains));
}
-/*
- * Return TRUE if we have not reached our free page target during
- * free page recovery operations.
- */
-static inline int
-vm_page_count_target(void)
-{
-
- return (vm_cnt.v_free_target > vm_cnt.v_free_count);
-}
-
-/*
- * Return the number of pages we need to free-up or cache
- * A positive number indicates that we do not have enough free pages.
- */
-static inline int
-vm_paging_target(void)
-{
-
- return (vm_cnt.v_free_target - vm_cnt.v_free_count);
-}
-
-/*
- * Returns TRUE if the pagedaemon needs to be woken up.
- */
-static inline int
-vm_paging_needed(u_int free_count)
-{
-
- return (free_count < vm_pageout_wakeup_thresh);
-}
-
-/*
- * Return the number of pages we need to launder.
- * A positive number indicates that we have a shortfall of clean pages.
- */
-static inline int
-vm_laundry_target(void)
-{
-
- return (vm_paging_target());
-}
#endif /* _KERNEL */
#endif /* _SYS_VMMETER_H_ */
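With the counters split per domain, the low-memory predicates above become set-membership tests: each domain registers itself in vm_min_domains or vm_severe_domains when it crosses its local threshold (see vm_domain_set() in sys/vm/vm_page.c below), and the system is considered low when any domain is. Callers keep the same shape, as in this sketch mirroring the sys/vm/vm_glue.c hunk later in the diff:

/* Back off while any domain is below its severe threshold. */
while (vm_page_count_severe())
	vm_wait_severe();	/* sleeps until no domain is in vm_severe_domains */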
Index: sys/vm/swap_pager.c
===================================================================
--- sys/vm/swap_pager.c
+++ sys/vm/swap_pager.c
@@ -2327,7 +2327,7 @@
* of data we will have to page back in, plus an epsilon so
* the system doesn't become critically low on swap space.
*/
- if (vm_cnt.v_free_count + swap_pager_avail < nblks + nswap_lowat)
+ if (vm_free_count() + swap_pager_avail < nblks + nswap_lowat)
return (ENOMEM);
/*
Index: sys/vm/uma.h
===================================================================
--- sys/vm/uma.h
+++ sys/vm/uma.h
@@ -47,6 +47,7 @@
/* Types and type defs */
struct uma_zone;
+struct vm_domain_iterator;
/* Opaque type used as a handle to the zone */
typedef struct uma_zone * uma_zone_t;
Index: sys/vm/uma_core.c
===================================================================
--- sys/vm/uma_core.c
+++ sys/vm/uma_core.c
@@ -3409,7 +3409,7 @@
slab->us_data = (void *)addr;
slab->us_flags = UMA_SLAB_KERNEL | UMA_SLAB_MALLOC;
slab->us_size = size;
- slab->us_domain = vm_phys_domidx(PHYS_TO_VM_PAGE(
+ slab->us_domain = vm_phys_domain(PHYS_TO_VM_PAGE(
pmap_kextract(addr)));
uma_total_inc(size);
} else {
Index: sys/vm/vm_extern.h
===================================================================
--- sys/vm/vm_extern.h
+++ sys/vm/vm_extern.h
@@ -122,5 +122,9 @@
void vm_imgact_unmap_page(struct sf_buf *sf);
void vm_thread_dispose(struct thread *td);
int vm_thread_new(struct thread *td, int pages);
+u_int vm_active_count(void);
+u_int vm_inactive_count(void);
+u_int vm_laundry_count(void);
+u_int vm_wait_count(void);
#endif /* _KERNEL */
#endif /* !_VM_EXTERN_H_ */
Index: sys/vm/vm_glue.c
===================================================================
--- sys/vm/vm_glue.c
+++ sys/vm/vm_glue.c
@@ -552,7 +552,7 @@
}
while (vm_page_count_severe()) {
- VM_WAIT;
+ vm_wait_severe();
}
if ((flags & RFMEM) == 0) {
Index: sys/vm/vm_init.c
===================================================================
--- sys/vm/vm_init.c
+++ sys/vm/vm_init.c
@@ -89,6 +89,7 @@
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_phys.h>
+#include <vm/vm_pagequeue.h>
#include <vm/vm_map.h>
#include <vm/vm_pager.h>
#include <vm/vm_extern.h>
Index: sys/vm/vm_kern.c
===================================================================
--- sys/vm/vm_kern.c
+++ sys/vm/vm_kern.c
@@ -92,6 +92,7 @@
#include <vm/vm_page.h>
#include <vm/vm_pageout.h>
#include <vm/vm_phys.h>
+#include <vm/vm_pagequeue.h>
#include <vm/vm_radix.h>
#include <vm/vm_extern.h>
#include <vm/uma.h>
@@ -196,7 +197,7 @@
if (!vm_page_reclaim_contig_domain(domain,
pflags, 1, low, high, PAGE_SIZE, 0) &&
(flags & M_WAITOK) != 0)
- VM_WAIT;
+ vm_wait_domain(domain);
VM_OBJECT_WLOCK(object);
tries++;
goto retry;
@@ -205,9 +206,9 @@
vmem_free(vmem, addr, size);
return (0);
}
- KASSERT(vm_phys_domidx(m) == domain,
+ KASSERT(vm_phys_domain(m) == domain,
("kmem_alloc_attr_domain: Domain mismatch %d != %d",
- vm_phys_domidx(m), domain));
+ vm_phys_domain(m), domain));
if ((flags & M_ZERO) && (m->flags & PG_ZERO) == 0)
pmap_zero_page(m);
m->valid = VM_PAGE_BITS_ALL;
@@ -280,7 +281,7 @@
if (!vm_page_reclaim_contig_domain(domain, pflags,
npages, low, high, alignment, boundary) &&
(flags & M_WAITOK) != 0)
- VM_WAIT;
+ vm_wait_domain(domain);
VM_OBJECT_WLOCK(object);
tries++;
goto retry;
@@ -288,9 +289,9 @@
vmem_free(vmem, addr, size);
return (0);
}
- KASSERT(vm_phys_domidx(m) == domain,
+ KASSERT(vm_phys_domain(m) == domain,
("kmem_alloc_contig_domain: Domain mismatch %d != %d",
- vm_phys_domidx(m), domain));
+ vm_phys_domain(m), domain));
end_m = m + npages;
tmp = addr;
for (; m < end_m; m++) {
@@ -452,9 +453,9 @@
kmem_unback(object, addr, i);
return (KERN_NO_SPACE);
}
- KASSERT(vm_phys_domidx(m) == domain,
+ KASSERT(vm_phys_domain(m) == domain,
("kmem_back_domain: Domain mismatch %d != %d",
- vm_phys_domidx(m), domain));
+ vm_phys_domain(m), domain));
if (flags & M_ZERO && (m->flags & PG_ZERO) == 0)
pmap_zero_page(m);
KASSERT((m->oflags & VPO_UNMANAGED) != 0,
@@ -514,7 +515,7 @@
end = offset + size;
VM_OBJECT_WLOCK(object);
m = vm_page_lookup(object, atop(offset));
- domain = vm_phys_domidx(m);
+ domain = vm_phys_domain(m);
for (; offset < end; offset += PAGE_SIZE, m = next) {
next = vm_page_next(m);
vm_page_unwire(m, PQ_NONE);
Index: sys/vm/vm_map.c
===================================================================
--- sys/vm/vm_map.c
+++ sys/vm/vm_map.c
@@ -2016,7 +2016,7 @@
* free pages allocating pv entries.
*/
if (((flags & MAP_PREFAULT_MADVISE) != 0 &&
- vm_cnt.v_free_count < vm_cnt.v_free_reserved) ||
+ vm_page_count_severe()) ||
((flags & MAP_PREFAULT_PARTIAL) != 0 &&
tmpidx >= threshold)) {
psize = tmpidx;
Index: sys/vm/vm_meter.c
===================================================================
--- sys/vm/vm_meter.c
+++ sys/vm/vm_meter.c
@@ -53,6 +53,8 @@
#include <vm/vm_page.h>
#include <vm/vm_extern.h>
#include <vm/vm_param.h>
+#include <vm/vm_phys.h>
+#include <vm/vm_pagequeue.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
@@ -213,9 +215,6 @@
total.t_dw++;
else
total.t_sl++;
- if (td->td_wchan ==
- &vm_cnt.v_free_count)
- total.t_pw++;
}
break;
case TDS_CAN_RUN:
@@ -283,7 +282,8 @@
}
}
mtx_unlock(&vm_object_list_mtx);
- total.t_free = vm_cnt.v_free_count;
+ total.t_pw = vm_wait_count();
+ total.t_free = vm_free_count();
#if defined(COMPAT_FREEBSD11)
/* sysctl(8) allocates twice as much memory as reported by sysctl(3) */
if (curproc->p_osrel < P_OSREL_VMTOTAL64 && (req->oldlen ==
@@ -339,7 +339,7 @@
#define VM_STATS(parent, var, descr) \
SYSCTL_OID(parent, OID_AUTO, var, CTLTYPE_U64 | CTLFLAG_MPSAFE | \
- CTLFLAG_RD, &vm_cnt.var, 0, sysctl_handle_vmstat, "QU", descr);
+ CTLFLAG_RD, &vm_cnt.var, 0, sysctl_handle_vmstat, "QU", descr)
#define VM_STATS_VM(var, descr) VM_STATS(_vm_stats_vm, var, descr)
#define VM_STATS_SYS(var, descr) VM_STATS(_vm_stats_sys, var, descr)
@@ -379,19 +379,36 @@
VM_STATS_VM(v_rforkpages, "VM pages affected by rfork()");
VM_STATS_VM(v_kthreadpages, "VM pages affected by fork() by kernel");
+static int
+sysctl_handle_vmstat_proc(SYSCTL_HANDLER_ARGS)
+{
+ u_int (*fn)(void);
+ uint32_t val;
+
+ fn = arg1;
+ val = fn();
+ return (SYSCTL_OUT(req, &val, sizeof(val)));
+}
+
+#define VM_STATS_PROC(var, descr, fn) \
+ SYSCTL_OID(_vm_stats_vm, OID_AUTO, var, CTLTYPE_U32 | CTLFLAG_MPSAFE | \
+ CTLFLAG_RD, fn, 0, sysctl_handle_vmstat_proc, "IU", descr)
+
#define VM_STATS_UINT(var, descr) \
SYSCTL_UINT(_vm_stats_vm, OID_AUTO, var, CTLFLAG_RD, &vm_cnt.var, 0, descr)
+
VM_STATS_UINT(v_page_size, "Page size in bytes");
VM_STATS_UINT(v_page_count, "Total number of pages in system");
VM_STATS_UINT(v_free_reserved, "Pages reserved for deadlock");
VM_STATS_UINT(v_free_target, "Pages desired free");
VM_STATS_UINT(v_free_min, "Minimum low-free-pages threshold");
-VM_STATS_UINT(v_free_count, "Free pages");
+VM_STATS_PROC(v_free_count, "Free pages", vm_free_count);
VM_STATS_UINT(v_wire_count, "Wired pages");
-VM_STATS_UINT(v_active_count, "Active pages");
+VM_STATS_PROC(v_active_count, "Active pages", vm_active_count);
VM_STATS_UINT(v_inactive_target, "Desired inactive pages");
-VM_STATS_UINT(v_inactive_count, "Inactive pages");
-VM_STATS_UINT(v_laundry_count, "Pages eligible for laundering");
+VM_STATS_PROC(v_inactive_count, "Inactive pages", vm_inactive_count);
+VM_STATS_PROC(v_laundry_count, "Pages eligible for laundering",
+ vm_laundry_count);
VM_STATS_UINT(v_pageout_free_min, "Min pages reserved for kernel");
VM_STATS_UINT(v_interrupt_free_min, "Reserved pages for interrupt code");
VM_STATS_UINT(v_free_severe, "Severe page depletion point");
@@ -406,3 +423,52 @@
SYSCTL_UINT(_vm_stats_vm, OID_AUTO, v_tcached, CTLFLAG_RD,
SYSCTL_NULL_UINT_PTR, 0, "Dummy for compatibility");
#endif
+
+u_int
+vm_free_count(void)
+{
+ u_int v;
+ int i;
+
+ v = 0;
+ for (i = 0; i < vm_ndomains; i++)
+ v += vm_dom[i].vmd_free_count;
+
+ return (v);
+}
+
+static u_int
+vm_pagequeue_count(int pq)
+{
+ u_int v;
+ int i;
+
+ v = 0;
+ for (i = 0; i < vm_ndomains; i++)
+ v += vm_dom[i].vmd_pagequeues[pq].pq_cnt;
+
+ return (v);
+}
+
+u_int
+vm_active_count(void)
+{
+
+ return vm_pagequeue_count(PQ_ACTIVE);
+}
+
+u_int
+vm_inactive_count(void)
+{
+
+ return vm_pagequeue_count(PQ_INACTIVE);
+}
+
+u_int
+vm_laundry_count(void)
+{
+
+ return vm_pagequeue_count(PQ_LAUNDRY);
+}
+
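Because the free, active, inactive, and laundry counts are now computed on demand by summing per-domain state, the corresponding sysctls switch from exporting a plain variable to calling a handler; the MIB names and types are unchanged, so existing userland keeps working. A small userland sketch reading one of them:

/* Illustration: read vm.stats.vm.v_free_count from userland. */
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
	u_int free_pages;
	size_t len = sizeof(free_pages);

	if (sysctlbyname("vm.stats.vm.v_free_count", &free_pages, &len,
	    NULL, 0) == -1) {
		perror("sysctlbyname");
		return (1);
	}
	printf("free pages: %u\n", free_pages);
	return (0);
}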
Index: sys/vm/vm_object.h
===================================================================
--- sys/vm/vm_object.h
+++ sys/vm/vm_object.h
@@ -297,6 +297,17 @@
}
}
+static __inline bool
+vm_object_reserv(vm_object_t object)
+{
+
+ if (object != NULL &&
+ (object->flags & (OBJ_COLORED | OBJ_FICTITIOUS)) == OBJ_COLORED) {
+ return (true);
+ }
+ return (false);
+}
+
void vm_object_clear_flag(vm_object_t object, u_short bits);
void vm_object_pip_add(vm_object_t object, short i);
void vm_object_pip_subtract(vm_object_t object, short i);
Index: sys/vm/vm_object.c
===================================================================
--- sys/vm/vm_object.c
+++ sys/vm/vm_object.c
@@ -96,6 +96,8 @@
#include <vm/vm_page.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
+#include <vm/vm_phys.h>
+#include <vm/vm_pagequeue.h>
#include <vm/swap_pager.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>
Index: sys/vm/vm_page.h
===================================================================
--- sys/vm/vm_page.h
+++ sys/vm/vm_page.h
@@ -218,54 +218,10 @@
#endif
SLIST_HEAD(spglist, vm_page);
-struct vm_pagequeue {
- struct mtx pq_mutex;
- struct pglist pq_pl;
- int pq_cnt;
- u_int * const pq_vcnt;
- const char * const pq_name;
-} __aligned(CACHE_LINE_SIZE);
-
-
-struct vm_domain {
- struct vm_pagequeue vmd_pagequeues[PQ_COUNT];
- struct vmem *vmd_kernel_arena;
- u_int vmd_page_count;
- u_int vmd_free_count;
- long vmd_segs; /* bitmask of the segments */
- boolean_t vmd_oom;
- int vmd_oom_seq;
- int vmd_last_active_scan;
- struct vm_page vmd_laundry_marker;
- struct vm_page vmd_marker; /* marker for pagedaemon private use */
- struct vm_page vmd_inacthead; /* marker for LRU-defeating insertions */
-};
-
-extern struct vm_domain vm_dom[MAXMEMDOM];
-
-#define vm_pagequeue_assert_locked(pq) mtx_assert(&(pq)->pq_mutex, MA_OWNED)
-#define vm_pagequeue_lock(pq) mtx_lock(&(pq)->pq_mutex)
-#define vm_pagequeue_lockptr(pq) (&(pq)->pq_mutex)
-#define vm_pagequeue_unlock(pq) mtx_unlock(&(pq)->pq_mutex)
-
#ifdef _KERNEL
extern vm_page_t bogus_page;
-
-static __inline void
-vm_pagequeue_cnt_add(struct vm_pagequeue *pq, int addend)
-{
-
-#ifdef notyet
- vm_pagequeue_assert_locked(pq);
-#endif
- pq->pq_cnt += addend;
- atomic_add_int(pq->pq_vcnt, addend);
-}
-#define vm_pagequeue_cnt_inc(pq) vm_pagequeue_cnt_add((pq), 1)
-#define vm_pagequeue_cnt_dec(pq) vm_pagequeue_cnt_add((pq), -1)
#endif /* _KERNEL */
-extern struct mtx_padalign vm_page_queue_free_mtx;
extern struct mtx_padalign pa_lock[];
#if defined(__arm__)
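The queue and domain definitions deleted here move to the new sys/vm/vm_pagequeue.h, which the diff includes in many files but does not itself show. The following is an abridged reconstruction inferred only from the uses visible in this diff, not the authoritative header; note that pq_vcnt disappears because queue counts no longer shadow the global vm_cnt fields:

/* Abridged sketch of sys/vm/vm_pagequeue.h, inferred from this diff. */
struct vm_pagequeue {
	struct mtx	pq_mutex;
	struct pglist	pq_pl;
	int		pq_cnt;
	const char * const pq_name;
} __aligned(CACHE_LINE_SIZE);

struct vm_domain {
	struct vm_pagequeue vmd_pagequeues[PQ_COUNT];
	struct mtx_padalign vmd_free_mtx; /* replaces vm_page_queue_free_mtx */
	int		vmd_domain;	/* this domain's index */
	u_int		vmd_page_count;
	u_int		vmd_free_count;
	/* Per-domain copies of the old global thresholds. */
	u_int		vmd_free_reserved;
	u_int		vmd_free_target;
	u_int		vmd_free_min;
	u_int		vmd_free_severe;
	u_int		vmd_interrupt_free_min;
	u_int		vmd_pageout_free_min;
	u_int		vmd_inactive_target;
	u_int		vmd_background_launder_target;
	/* Per-domain pagedaemon state, formerly globals in vm_pageout.c. */
	int		vmd_pageout_deficit;
	int		vmd_pageout_pages_needed;
	bool		vmd_pages_needed;
	bool		vmd_pageout_wanted;
	bool		vmd_minset;	/* member of vm_min_domains */
	bool		vmd_severeset;	/* member of vm_severe_domains */
	/* Laundry, OOM, waiter, and marker fields omitted here. */
};

extern struct vm_domain vm_dom[MAXMEMDOM];

#define	VM_DOMAIN(n)		(&vm_dom[(n)])
#define	vm_domain_free_lock(d)		mtx_lock(&(d)->vmd_free_mtx)
#define	vm_domain_free_unlock(d)	mtx_unlock(&(d)->vmd_free_mtx)
#define	vm_domain_free_assert_locked(d)	\
	mtx_assert(&(d)->vmd_free_mtx, MA_OWNED)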
Index: sys/vm/vm_page.c
===================================================================
--- sys/vm/vm_page.c
+++ sys/vm/vm_page.c
@@ -115,8 +115,9 @@
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pageout.h>
-#include <vm/vm_pager.h>
#include <vm/vm_phys.h>
+#include <vm/vm_pagequeue.h>
+#include <vm/vm_pager.h>
#include <vm/vm_radix.h>
#include <vm/vm_reserv.h>
#include <vm/vm_extern.h>
@@ -131,10 +132,16 @@
*/
struct vm_domain vm_dom[MAXMEMDOM];
-struct mtx_padalign __exclusive_cache_line vm_page_queue_free_mtx;
struct mtx_padalign __exclusive_cache_line pa_lock[PA_LOCK_COUNT];
+struct mtx_padalign __exclusive_cache_line vm_domainset_lock;
+domainset_t __exclusive_cache_line vm_min_domains;
+domainset_t __exclusive_cache_line vm_severe_domains;
+static int vm_min_waiters;
+static int vm_severe_waiters;
+static int vm_pageproc_waiters;
+
/*
* bogus page -- for I/O to/from partially complete buffers,
* or for paging into sparsely invalid regions.
@@ -159,24 +166,22 @@
SYSCTL_PROC(_vm, OID_AUTO, page_blacklist, CTLTYPE_STRING | CTLFLAG_RD |
CTLFLAG_MPSAFE, NULL, 0, sysctl_vm_page_blacklist, "A", "Blacklist pages");
-/* Is the page daemon waiting for free pages? */
-static int vm_pageout_pages_needed;
-
static uma_zone_t fakepg_zone;
static void vm_page_alloc_check(vm_page_t m);
static void vm_page_clear_dirty_mask(vm_page_t m, vm_page_bits_t pagebits);
static void vm_page_enqueue(uint8_t queue, vm_page_t m);
static void vm_page_free_phys(vm_page_t m);
-static void vm_page_free_wakeup(void);
static void vm_page_init(void *dummy);
static int vm_page_insert_after(vm_page_t m, vm_object_t object,
vm_pindex_t pindex, vm_page_t mpred);
static void vm_page_insert_radixdone(vm_page_t m, vm_object_t object,
vm_page_t mpred);
-static int vm_page_reclaim_run(int req_class, u_long npages, vm_page_t m_run,
- vm_paddr_t high);
-static int vm_page_alloc_fail(vm_object_t object, int req);
+static int vm_page_reclaim_run(int req_class, int domain, u_long npages,
+ vm_page_t m_run, vm_paddr_t high);
+static void vm_domain_free_wakeup(struct vm_domain *);
+static int vm_domain_alloc_fail(struct vm_domain *vmd, vm_object_t object,
+ int req);
SYSINIT(vm_page, SI_SUB_VM, SI_ORDER_SECOND, vm_page_init, NULL);
@@ -313,6 +318,7 @@
static void
vm_page_blacklist_check(char *list, char *end)
{
+ struct vm_domain *vmd;
vm_paddr_t pa;
vm_page_t m;
char *next;
@@ -325,9 +331,10 @@
m = vm_phys_paddr_to_vm_page(pa);
if (m == NULL)
continue;
- mtx_lock(&vm_page_queue_free_mtx);
+ vmd = vm_pagequeue_domain(m);
+ vm_domain_free_lock(vmd);
ret = vm_phys_unfree_page(m);
- mtx_unlock(&vm_page_queue_free_mtx);
+ vm_domain_free_unlock(vmd);
if (ret == TRUE) {
TAILQ_INSERT_TAIL(&blacklist_head, m, listq);
if (bootverbose)
@@ -390,28 +397,23 @@
}
static void
-vm_page_domain_init(struct vm_domain *vmd)
+vm_page_domain_init(int domain)
{
+ struct vm_domain *vmd;
struct vm_pagequeue *pq;
int i;
+ vmd = VM_DOMAIN(domain);
+ bzero(vmd, sizeof(*vmd));
*__DECONST(char **, &vmd->vmd_pagequeues[PQ_INACTIVE].pq_name) =
"vm inactive pagequeue";
- *__DECONST(u_int **, &vmd->vmd_pagequeues[PQ_INACTIVE].pq_vcnt) =
- &vm_cnt.v_inactive_count;
*__DECONST(char **, &vmd->vmd_pagequeues[PQ_ACTIVE].pq_name) =
"vm active pagequeue";
- *__DECONST(u_int **, &vmd->vmd_pagequeues[PQ_ACTIVE].pq_vcnt) =
- &vm_cnt.v_active_count;
*__DECONST(char **, &vmd->vmd_pagequeues[PQ_LAUNDRY].pq_name) =
"vm laundry pagequeue";
- *__DECONST(int **, &vmd->vmd_pagequeues[PQ_LAUNDRY].pq_vcnt) =
- &vm_cnt.v_laundry_count;
*__DECONST(char **, &vmd->vmd_pagequeues[PQ_UNSWAPPABLE].pq_name) =
"vm unswappable pagequeue";
- /* Unswappable dirty pages are counted as being in the laundry. */
- *__DECONST(int **, &vmd->vmd_pagequeues[PQ_UNSWAPPABLE].pq_vcnt) =
- &vm_cnt.v_laundry_count;
+ vmd->vmd_domain = domain;
vmd->vmd_page_count = 0;
vmd->vmd_free_count = 0;
vmd->vmd_segs = 0;
@@ -422,6 +424,7 @@
mtx_init(&pq->pq_mutex, pq->pq_name, "vm pagequeue",
MTX_DEF | MTX_DUPOK);
}
+ mtx_init(&vmd->vmd_free_mtx, "vm page free queue", NULL, MTX_DEF);
}
/*
@@ -458,7 +461,6 @@
vm_offset_t
vm_page_startup(vm_offset_t vaddr)
{
- struct vm_domain *vmd;
struct vm_phys_seg *seg;
vm_page_t m;
char *list, *listend;
@@ -489,11 +491,11 @@
/*
* Initialize the page and queue locks.
*/
- mtx_init(&vm_page_queue_free_mtx, "vm page free queue", NULL, MTX_DEF);
+ mtx_init(&vm_domainset_lock, "vm domainset lock", NULL, MTX_DEF);
for (i = 0; i < PA_LOCK_COUNT; i++)
mtx_init(&pa_lock[i], "vm page", NULL, MTX_DEF);
for (i = 0; i < vm_ndomains; i++)
- vm_page_domain_init(&vm_dom[i]);
+ vm_page_domain_init(i);
/*
* Almost all of the pages needed for bootstrapping UMA are used
@@ -691,7 +693,6 @@
* physical memory allocator's free lists.
*/
vm_cnt.v_page_count = 0;
- vm_cnt.v_free_count = 0;
for (segind = 0; segind < vm_phys_nsegs; segind++) {
seg = &vm_phys_segs[segind];
for (m = seg->first_page, pa = seg->start; pa < seg->end;
@@ -706,6 +707,8 @@
* or doesn't overlap any of them.
*/
for (i = 0; phys_avail[i + 1] != 0; i += 2) {
+ struct vm_domain *vmd;
+
if (seg->start < phys_avail[i] ||
seg->end > phys_avail[i + 1])
continue;
@@ -713,13 +716,14 @@
m = seg->first_page;
pagecount = (u_long)atop(seg->end - seg->start);
- mtx_lock(&vm_page_queue_free_mtx);
+ vmd = VM_DOMAIN(seg->domain);
+ vm_domain_free_lock(vmd);
vm_phys_free_contig(m, pagecount);
- vm_phys_freecnt_adj(m, (int)pagecount);
- mtx_unlock(&vm_page_queue_free_mtx);
+ vm_domain_freecnt_adj(vmd, (int)pagecount);
+ vm_domain_free_unlock(vmd);
vm_cnt.v_page_count += (u_int)pagecount;
- vmd = &vm_dom[seg->domain];
+	vmd = VM_DOMAIN(seg->domain);
vmd->vmd_page_count += (u_int)pagecount;
vmd->vmd_segs |= 1UL << m->segind;
break;
@@ -1644,12 +1648,40 @@
return (m);
}
+/*
+ * Returns true if the number of free pages exceeds the minimum
+ * for the request class and false otherwise.
+ */
+int
+vm_domain_available(struct vm_domain *vmd, int req, int npages)
+{
+
+ vm_domain_free_assert_locked(vmd);
+ req = req & VM_ALLOC_CLASS_MASK;
+
+ /*
+ * The page daemon is allowed to dig deeper into the free page list.
+ */
+ if (curproc == pageproc && req != VM_ALLOC_INTERRUPT)
+ req = VM_ALLOC_SYSTEM;
+
+ if (vmd->vmd_free_count >= npages + vmd->vmd_free_reserved ||
+ (req == VM_ALLOC_SYSTEM &&
+ vmd->vmd_free_count >= npages + vmd->vmd_interrupt_free_min) ||
+ (req == VM_ALLOC_INTERRUPT &&
+ vmd->vmd_free_count >= npages))
+ return (1);
+
+ return (0);
+}
+
vm_page_t
vm_page_alloc_domain_after(vm_object_t object, vm_pindex_t pindex, int domain,
int req, vm_page_t mpred)
{
+ struct vm_domain *vmd;
vm_page_t m;
- int flags, req_class;
+ int flags;
u_int free_count;
KASSERT((object != NULL) == ((req & VM_ALLOC_NOOBJ) == 0) &&
@@ -1665,34 +1697,27 @@
if (object != NULL)
VM_OBJECT_ASSERT_WLOCKED(object);
- req_class = req & VM_ALLOC_CLASS_MASK;
-
- /*
- * The page daemon is allowed to dig deeper into the free page list.
- */
- if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT)
- req_class = VM_ALLOC_SYSTEM;
-
- /*
- * Allocate a page if the number of free pages exceeds the minimum
- * for the request class.
- */
again:
m = NULL;
- mtx_lock(&vm_page_queue_free_mtx);
- if (vm_cnt.v_free_count > vm_cnt.v_free_reserved ||
- (req_class == VM_ALLOC_SYSTEM &&
- vm_cnt.v_free_count > vm_cnt.v_interrupt_free_min) ||
- (req_class == VM_ALLOC_INTERRUPT &&
- vm_cnt.v_free_count > 0)) {
+#if VM_NRESERVLEVEL > 0
+ if (vm_object_reserv(object) &&
+ (m = vm_reserv_extend(req, object, pindex, domain, mpred))
+ != NULL) {
+ domain = vm_phys_domain(m);
+ vmd = VM_DOMAIN(domain);
+ goto found;
+ }
+#endif
+ vmd = VM_DOMAIN(domain);
+ vm_domain_free_lock(vmd);
+ if (vm_domain_available(vmd, req, 1)) {
/*
* Can we allocate the page from a reservation?
*/
#if VM_NRESERVLEVEL > 0
- if (object == NULL || (object->flags & (OBJ_COLORED |
- OBJ_FICTITIOUS)) != OBJ_COLORED || (m =
- vm_reserv_alloc_page(object, pindex, domain,
- mpred)) == NULL)
+ if (!vm_object_reserv(object) ||
+ (m = vm_reserv_alloc_page(object, pindex,
+ domain, mpred)) == NULL)
#endif
{
/*
@@ -1714,7 +1739,7 @@
/*
* Not allocatable, give up.
*/
- if (vm_page_alloc_fail(object, req))
+ if (vm_domain_alloc_fail(vmd, object, req))
goto again;
return (NULL);
}
@@ -1723,8 +1748,18 @@
* At this point we had better have found a good page.
*/
KASSERT(m != NULL, ("missing page"));
- free_count = vm_phys_freecnt_adj(m, -1);
- mtx_unlock(&vm_page_queue_free_mtx);
+ free_count = vm_domain_freecnt_adj(vmd, -1);
+ vm_domain_free_unlock(vmd);
+
+ /*
+ * Don't wakeup too often - wakeup the pageout daemon when
+ * we would be nearly out of memory.
+ */
+ if (vm_paging_needed(vmd, free_count))
+ pagedaemon_wakeup(vmd->vmd_domain);
+#if VM_NRESERVLEVEL > 0
+found:
+#endif
vm_page_alloc_check(m);
/*
@@ -1757,7 +1792,7 @@
if (object != NULL) {
if (vm_page_insert_after(m, object, pindex, mpred)) {
- pagedaemon_wakeup();
+ pagedaemon_wakeup(domain);
if (req & VM_ALLOC_WIRED) {
atomic_subtract_int(&vm_cnt.v_wire_count, 1);
m->wire_count = 0;
@@ -1782,13 +1817,6 @@
} else
m->pindex = pindex;
- /*
- * Don't wakeup too often - wakeup the pageout daemon when
- * we would be nearly out of memory.
- */
- if (vm_paging_needed(free_count))
- pagedaemon_wakeup();
-
return (m);
}
@@ -1856,9 +1884,9 @@
int req, u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment,
vm_paddr_t boundary, vm_memattr_t memattr)
{
+ struct vm_domain *vmd;
vm_page_t m, m_ret, mpred;
u_int busy_lock, flags, oflags;
- int req_class;
mpred = NULL; /* XXX: pacify gcc */
KASSERT((object != NULL) == ((req & VM_ALLOC_NOOBJ) == 0) &&
@@ -1876,14 +1904,7 @@
object));
}
KASSERT(npages > 0, ("vm_page_alloc_contig: npages is zero"));
- req_class = req & VM_ALLOC_CLASS_MASK;
- /*
- * The page daemon is allowed to dig deeper into the free page list.
- */
- if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT)
- req_class = VM_ALLOC_SYSTEM;
-
if (object != NULL) {
mpred = vm_radix_lookup_le(&object->rtree, pindex);
KASSERT(mpred == NULL || mpred->pindex != pindex,
@@ -1895,19 +1916,25 @@
* below the lower bound for the allocation class?
*/
again:
+#if VM_NRESERVLEVEL > 0
+ if (vm_object_reserv(object) &&
+ (m_ret = vm_reserv_extend_contig(req, object, pindex, domain,
+ npages, low, high, alignment, boundary, mpred)) != NULL) {
+ domain = vm_phys_domain(m_ret);
+ vmd = VM_DOMAIN(domain);
+ goto found;
+ }
+#endif
m_ret = NULL;
- mtx_lock(&vm_page_queue_free_mtx);
- if (vm_cnt.v_free_count >= npages + vm_cnt.v_free_reserved ||
- (req_class == VM_ALLOC_SYSTEM &&
- vm_cnt.v_free_count >= npages + vm_cnt.v_interrupt_free_min) ||
- (req_class == VM_ALLOC_INTERRUPT &&
- vm_cnt.v_free_count >= npages)) {
+ vmd = VM_DOMAIN(domain);
+ vm_domain_free_lock(vmd);
+ if (vm_domain_available(vmd, req, npages)) {
/*
* Can we allocate the pages from a reservation?
*/
#if VM_NRESERVLEVEL > 0
retry:
- if (object == NULL || (object->flags & OBJ_COLORED) == 0 ||
+ if (!vm_object_reserv(object) ||
(m_ret = vm_reserv_alloc_contig(object, pindex, domain,
npages, low, high, alignment, boundary, mpred)) == NULL)
#endif
@@ -1923,12 +1950,15 @@
#endif
}
if (m_ret == NULL) {
- if (vm_page_alloc_fail(object, req))
+ if (vm_domain_alloc_fail(vmd, object, req))
goto again;
return (NULL);
}
- vm_phys_freecnt_adj(m_ret, -npages);
- mtx_unlock(&vm_page_queue_free_mtx);
+ vm_domain_freecnt_adj(vmd, -npages);
+ vm_domain_free_unlock(vmd);
+#if VM_NRESERVLEVEL > 0
+found:
+#endif
for (m = m_ret; m < &m_ret[npages]; m++)
vm_page_alloc_check(m);
@@ -1964,7 +1994,7 @@
m->oflags = oflags;
if (object != NULL) {
if (vm_page_insert_after(m, object, pindex, mpred)) {
- pagedaemon_wakeup();
+ pagedaemon_wakeup(domain);
if ((req & VM_ALLOC_WIRED) != 0)
atomic_subtract_int(
&vm_cnt.v_wire_count, npages);
@@ -1994,8 +2024,9 @@
pmap_page_set_memattr(m, memattr);
pindex++;
}
- if (vm_paging_needed(vm_cnt.v_free_count))
- pagedaemon_wakeup();
+ vmd = VM_DOMAIN(domain);
+ if (vm_paging_needed(vmd, vmd->vmd_free_count))
+ pagedaemon_wakeup(domain);
return (m_ret);
}
@@ -2057,37 +2088,26 @@
vm_page_t
vm_page_alloc_freelist_domain(int domain, int freelist, int req)
{
+ struct vm_domain *vmd;
vm_page_t m;
u_int flags, free_count;
- int req_class;
- req_class = req & VM_ALLOC_CLASS_MASK;
-
/*
- * The page daemon is allowed to dig deeper into the free page list.
- */
- if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT)
- req_class = VM_ALLOC_SYSTEM;
-
- /*
* Do not allocate reserved pages unless the req has asked for it.
*/
+ vmd = VM_DOMAIN(domain);
again:
- mtx_lock(&vm_page_queue_free_mtx);
- if (vm_cnt.v_free_count > vm_cnt.v_free_reserved ||
- (req_class == VM_ALLOC_SYSTEM &&
- vm_cnt.v_free_count > vm_cnt.v_interrupt_free_min) ||
- (req_class == VM_ALLOC_INTERRUPT &&
- vm_cnt.v_free_count > 0))
+ vm_domain_free_lock(vmd);
+ if (vm_domain_available(vmd, req, 1))
m = vm_phys_alloc_freelist_pages(domain, freelist,
VM_FREEPOOL_DIRECT, 0);
if (m == NULL) {
- if (vm_page_alloc_fail(NULL, req))
+ if (vm_domain_alloc_fail(vmd, NULL, req))
goto again;
return (NULL);
}
- free_count = vm_phys_freecnt_adj(m, -1);
- mtx_unlock(&vm_page_queue_free_mtx);
+ free_count = vm_domain_freecnt_adj(vmd, -1);
+ vm_domain_free_unlock(vmd);
vm_page_alloc_check(m);
/*
@@ -2108,8 +2128,8 @@
}
/* Unmanaged pages don't use "act_count". */
m->oflags = VPO_UNMANAGED;
- if (vm_paging_needed(free_count))
- pagedaemon_wakeup();
+ if (vm_paging_needed(vmd, free_count))
+ pagedaemon_wakeup(domain);
return (m);
}
@@ -2331,9 +2351,10 @@
* "req_class" must be an allocation class.
*/
static int
-vm_page_reclaim_run(int req_class, u_long npages, vm_page_t m_run,
+vm_page_reclaim_run(int req_class, int domain, u_long npages, vm_page_t m_run,
vm_paddr_t high)
{
+ struct vm_domain *vmd;
struct mtx *m_mtx;
struct spglist free;
vm_object_t object;
@@ -2483,7 +2504,9 @@
unlock:
VM_OBJECT_WUNLOCK(object);
} else {
- mtx_lock(&vm_page_queue_free_mtx);
+ MPASS(vm_phys_domain(m) == domain);
+ vmd = VM_DOMAIN(domain);
+ vm_domain_free_lock(vmd);
order = m->order;
if (order < VM_NFREEORDER) {
/*
@@ -2500,7 +2523,7 @@
else if (vm_reserv_is_page_free(m))
order = 0;
#endif
- mtx_unlock(&vm_page_queue_free_mtx);
+ vm_domain_free_unlock(vmd);
if (order == VM_NFREEORDER)
error = EINVAL;
}
@@ -2508,13 +2531,15 @@
if (m_mtx != NULL)
mtx_unlock(m_mtx);
if ((m = SLIST_FIRST(&free)) != NULL) {
- mtx_lock(&vm_page_queue_free_mtx);
+ vmd = VM_DOMAIN(domain);
+ vm_domain_free_lock(vmd);
do {
+ MPASS(vm_phys_domain(m) == domain);
SLIST_REMOVE_HEAD(&free, plinks.s.ss);
vm_page_free_phys(m);
} while ((m = SLIST_FIRST(&free)) != NULL);
- vm_page_free_wakeup();
- mtx_unlock(&vm_page_queue_free_mtx);
+ vm_domain_free_wakeup(vmd);
+ vm_domain_free_unlock(vmd);
}
return (error);
}
@@ -2554,6 +2579,7 @@
vm_page_reclaim_contig_domain(int domain, int req, u_long npages,
vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary)
{
+ struct vm_domain *vmd;
vm_paddr_t curr_low;
vm_page_t m_run, m_runs[NRUNS];
u_long count, reclaimed;
@@ -2574,9 +2600,10 @@
* Return if the number of free pages cannot satisfy the requested
* allocation.
*/
- count = vm_cnt.v_free_count;
- if (count < npages + vm_cnt.v_free_reserved || (count < npages +
- vm_cnt.v_interrupt_free_min && req_class == VM_ALLOC_SYSTEM) ||
+ vmd = VM_DOMAIN(domain);
+ count = vmd->vmd_free_count;
+ if (count < npages + vmd->vmd_free_reserved || (count < npages +
+ vmd->vmd_interrupt_free_min && req_class == VM_ALLOC_SYSTEM) ||
(count < npages && req_class == VM_ALLOC_INTERRUPT))
return (false);
@@ -2612,8 +2639,8 @@
for (i = 0; count > 0 && i < NRUNS; i++) {
count--;
m_run = m_runs[RUN_INDEX(count)];
- error = vm_page_reclaim_run(req_class, npages, m_run,
- high);
+ error = vm_page_reclaim_run(req_class, domain, npages,
+ m_run, high);
if (error == 0) {
reclaimed += npages;
if (reclaimed >= MIN_RECLAIM)
@@ -2653,66 +2680,190 @@
return (ret);
}
+/*
+ * Set the domain in the appropriate page level domainset.
+ */
+void
+vm_domain_set(struct vm_domain *vmd)
+{
+
+ mtx_lock(&vm_domainset_lock);
+ if (!vmd->vmd_minset && vm_paging_min(vmd)) {
+ vmd->vmd_minset = 1;
+ DOMAINSET_SET(vmd->vmd_domain, &vm_min_domains);
+ }
+ if (!vmd->vmd_severeset && vm_paging_severe(vmd)) {
+ vmd->vmd_severeset = 1;
+		DOMAINSET_SET(vmd->vmd_domain, &vm_severe_domains);
+ }
+ mtx_unlock(&vm_domainset_lock);
+}
+
/*
- * vm_wait: (also see VM_WAIT macro)
+ * Clear the domain from the appropriate page level domainset.
+ */
+static void
+vm_domain_clear(struct vm_domain *vmd)
+{
+
+ mtx_lock(&vm_domainset_lock);
+ if (vmd->vmd_minset && !vm_paging_min(vmd)) {
+ vmd->vmd_minset = 0;
+ DOMAINSET_CLR(vmd->vmd_domain, &vm_min_domains);
+ if (vm_min_waiters != 0) {
+ vm_min_waiters = 0;
+ wakeup(&vm_min_domains);
+ }
+ }
+ if (vmd->vmd_severeset && !vm_paging_severe(vmd)) {
+ vmd->vmd_severeset = 0;
+ DOMAINSET_CLR(vmd->vmd_domain, &vm_severe_domains);
+ if (vm_severe_waiters != 0) {
+ vm_severe_waiters = 0;
+ wakeup(&vm_severe_domains);
+ }
+ }
+ mtx_unlock(&vm_domainset_lock);
+}
+
+/*
+ * Wait for free pages to exceed the min threshold globally.
+ */
+void
+vm_wait_min(void)
+{
+
+ mtx_lock(&vm_domainset_lock);
+ while (vm_page_count_min()) {
+ vm_min_waiters++;
+ msleep(&vm_min_domains, &vm_domainset_lock, PVM, "vmwait", 0);
+ }
+ mtx_unlock(&vm_domainset_lock);
+}
+
+/*
+ * Wait for free pages to exceed the severe threshold globally.
+ */
+void
+vm_wait_severe(void)
+{
+
+ mtx_lock(&vm_domainset_lock);
+ while (vm_page_count_severe()) {
+ vm_severe_waiters++;
+		msleep(&vm_severe_domains, &vm_domainset_lock, PVM, "vmwait", 0);
+ }
+ mtx_unlock(&vm_domainset_lock);
+}
+
+u_int
+vm_wait_count(void)
+{
+ u_int cnt;
+ int i;
+
+ cnt = 0;
+ for (i = 0; i < vm_ndomains; i++)
+ cnt += VM_DOMAIN(i)->vmd_waiters;
+ cnt += vm_severe_waiters + vm_min_waiters;
+
+ return (cnt);
+}
+
+/*
+ * vm_wait_domain:
*
* Sleep until free pages are available for allocation.
- * - Called in various places before memory allocations.
+ * - Called in various places after failed memory allocations.
*/
-static void
-_vm_wait(void)
+void
+vm_wait_domain(int domain)
{
+ struct vm_domain *vmd;
- mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
+ vmd = VM_DOMAIN(domain);
+ vm_domain_free_assert_locked(vmd);
+
if (curproc == pageproc) {
- vm_pageout_pages_needed = 1;
- msleep(&vm_pageout_pages_needed, &vm_page_queue_free_mtx,
- PDROP | PSWP, "VMWait", 0);
+ vmd->vmd_pageout_pages_needed = 1;
+ msleep(&vmd->vmd_pageout_pages_needed,
+ vm_domain_free_lockptr(vmd), PDROP | PSWP, "VMWait", 0);
} else {
if (pageproc == NULL)
panic("vm_wait in early boot");
- pagedaemon_wait(PVM, "vmwait");
+ pagedaemon_wait(domain, PVM, "vmwait");
}
}
+/*
+ * vm_wait: (also see VM_WAIT macro)
+ *
+ * Sleep until free pages are available for allocation.
+ * - Called in various places after failed memory allocations.
+ */
void
vm_wait(void)
{
- mtx_lock(&vm_page_queue_free_mtx);
- _vm_wait();
+ /*
+	 * We use racy wakeup synchronization to avoid expensive global
+ * locking for the pageproc when sleeping with a non-specific vm_wait.
+ * To handle this, we only sleep for one tick in this instance. It
+ * is expected that most allocations for the pageproc will come from
+ * kmem or vm_page_grab* which will use the more specific and
+ * race-free vm_wait_domain().
+ */
+ if (curproc == pageproc) {
+ mtx_lock(&vm_domainset_lock);
+ vm_pageproc_waiters++;
+ msleep(&vm_pageproc_waiters, &vm_domainset_lock, PVM,
+ "pageprocwait", 1);
+ mtx_unlock(&vm_domainset_lock);
+ } else {
+ /*
+ * XXX Ideally we would wait only until the allocation could
+ * be satisfied. This condition can cause new allocators to
+ * consume all freed pages while old allocators wait.
+ */
+ mtx_lock(&vm_domainset_lock);
+ if (vm_page_count_min()) {
+ vm_min_waiters++;
+ msleep(&vm_min_domains, &vm_domainset_lock, PVM,
+ "vmwait", 0);
+ }
+ mtx_unlock(&vm_domainset_lock);
+ }
}
/*
- * vm_page_alloc_fail:
+ * vm_domain_alloc_fail:
*
* Called when a page allocation function fails. Informs the
* pagedaemon and performs the requested wait. Requires the
- * page_queue_free and object lock on entry. Returns with the
+ * domain_free and object lock on entry. Returns with the
* object lock held and free lock released. Returns an error when
* retry is necessary.
*
*/
static int
-vm_page_alloc_fail(vm_object_t object, int req)
+vm_domain_alloc_fail(struct vm_domain *vmd, vm_object_t object, int req)
{
- mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
+ vm_domain_free_assert_locked(vmd);
- atomic_add_int(&vm_pageout_deficit,
+ atomic_add_int(&vmd->vmd_pageout_deficit,
max((u_int)req >> VM_ALLOC_COUNT_SHIFT, 1));
if (req & (VM_ALLOC_WAITOK | VM_ALLOC_WAITFAIL)) {
if (object != NULL)
VM_OBJECT_WUNLOCK(object);
- _vm_wait();
+ vm_wait_domain(vmd->vmd_domain);
if (object != NULL)
VM_OBJECT_WLOCK(object);
if (req & VM_ALLOC_WAITOK)
return (EAGAIN);
} else {
- mtx_unlock(&vm_page_queue_free_mtx);
- pagedaemon_wakeup();
+ vm_domain_free_unlock(vmd);
+ pagedaemon_wakeup(vmd->vmd_domain);
}
return (0);
}
@@ -2731,18 +2882,19 @@
vm_waitpfault(void)
{
- mtx_lock(&vm_page_queue_free_mtx);
- pagedaemon_wait(PUSER, "pfault");
+ mtx_lock(&vm_domainset_lock);
+ if (vm_page_count_min()) {
+ vm_min_waiters++;
+ msleep(&vm_min_domains, &vm_domainset_lock, PUSER, "pfault", 0);
+ }
+ mtx_unlock(&vm_domainset_lock);
}
struct vm_pagequeue *
vm_page_pagequeue(vm_page_t m)
{
- if (vm_page_in_laundry(m))
- return (&vm_dom[0].vmd_pagequeues[m->queue]);
- else
- return (&vm_phys_domain(m)->vmd_pagequeues[m->queue]);
+ return (&vm_pagequeue_domain(m)->vmd_pagequeues[m->queue]);
}
/*
@@ -2804,10 +2956,7 @@
KASSERT(queue < PQ_COUNT,
("vm_page_enqueue: invalid queue %u request for page %p",
queue, m));
- if (queue == PQ_LAUNDRY || queue == PQ_UNSWAPPABLE)
- pq = &vm_dom[0].vmd_pagequeues[queue];
- else
- pq = &vm_phys_domain(m)->vmd_pagequeues[queue];
+ pq = &vm_pagequeue_domain(m)->vmd_pagequeues[queue];
vm_pagequeue_lock(pq);
m->queue = queue;
TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q);
@@ -2889,7 +3038,7 @@
}
/*
- * vm_page_free_wakeup:
+ * vm_domain_free_wakeup:
*
* Helper routine for vm_page_free_toq(). This routine is called
* when a page is added to the free queues.
@@ -2897,28 +3046,39 @@
* The page queues must be locked.
*/
static void
-vm_page_free_wakeup(void)
+vm_domain_free_wakeup(struct vm_domain *vmd)
{
- mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
+ vm_domain_free_assert_locked(vmd);
+
/*
* if pageout daemon needs pages, then tell it that there are
* some free.
*/
- if (vm_pageout_pages_needed &&
- vm_cnt.v_free_count >= vm_cnt.v_pageout_free_min) {
- wakeup(&vm_pageout_pages_needed);
- vm_pageout_pages_needed = 0;
+ if (vmd->vmd_pageout_pages_needed &&
+ vmd->vmd_free_count >= vmd->vmd_pageout_free_min) {
+ wakeup(&vmd->vmd_pageout_pages_needed);
+ vmd->vmd_pageout_pages_needed = 0;
}
/*
* wakeup processes that are waiting on memory if we hit a
* high water mark. And wakeup scheduler process if we have
* lots of memory. this process will swapin processes.
*/
- if (vm_pages_needed && !vm_page_count_min()) {
- vm_pages_needed = false;
- wakeup(&vm_cnt.v_free_count);
+ if (vmd->vmd_pages_needed && !vm_paging_min(vmd)) {
+ vmd->vmd_pages_needed = false;
+ wakeup(&vmd->vmd_free_count);
}
+ if ((vmd->vmd_minset && !vm_paging_min(vmd)) ||
+ (vmd->vmd_severeset && !vm_paging_severe(vmd)))
+ vm_domain_clear(vmd);
+
+	/* See comments in vm_wait(). */
+ if (vm_pageproc_waiters) {
+ vm_pageproc_waiters = 0;
+ wakeup(&vm_pageproc_waiters);
+ }
+
}
/*
@@ -3008,9 +3168,9 @@
vm_page_free_phys(vm_page_t m)
{
- mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
+ vm_domain_free_assert_locked(vm_pagequeue_domain(m));
- vm_phys_freecnt_adj(m, 1);
+ vm_domain_freecnt_adj(vm_pagequeue_domain(m), 1);
#if VM_NRESERVLEVEL > 0
if (!vm_reserv_free_page(m))
#endif
@@ -3020,15 +3180,27 @@
void
vm_page_free_phys_pglist(struct pglist *tq)
{
+ struct vm_domain *vmd;
vm_page_t m;
if (TAILQ_EMPTY(tq))
return;
- mtx_lock(&vm_page_queue_free_mtx);
- TAILQ_FOREACH(m, tq, listq)
+ vmd = NULL;
+ TAILQ_FOREACH(m, tq, listq) {
+ if (vmd != vm_pagequeue_domain(m)) {
+ if (vmd != NULL) {
+ vm_domain_free_wakeup(vmd);
+ vm_domain_free_unlock(vmd);
+ }
+ vmd = vm_pagequeue_domain(m);
+ vm_domain_free_lock(vmd);
+ }
vm_page_free_phys(m);
- vm_page_free_wakeup();
- mtx_unlock(&vm_page_queue_free_mtx);
+ }
+ if (vmd != NULL) {
+ vm_domain_free_wakeup(vmd);
+ vm_domain_free_unlock(vmd);
+ }
}
/*
@@ -3043,13 +3215,15 @@
void
vm_page_free_toq(vm_page_t m)
{
+ struct vm_domain *vmd;
if (!vm_page_free_prep(m, false))
return;
- mtx_lock(&vm_page_queue_free_mtx);
+ vmd = vm_pagequeue_domain(m);
+ vm_domain_free_lock(vmd);
vm_page_free_phys(m);
- vm_page_free_wakeup();
- mtx_unlock(&vm_page_queue_free_mtx);
+ vm_domain_free_wakeup(vmd);
+ vm_domain_free_unlock(vmd);
}
/*
@@ -3160,7 +3334,7 @@
if ((queue = m->queue) == PQ_INACTIVE && !noreuse)
return;
if (m->wire_count == 0 && (m->oflags & VPO_UNMANAGED) == 0) {
- pq = &vm_phys_domain(m)->vmd_pagequeues[PQ_INACTIVE];
+ pq = &vm_pagequeue_domain(m)->vmd_pagequeues[PQ_INACTIVE];
/* Avoid multiple acquisitions of the inactive queue lock. */
if (queue == PQ_INACTIVE) {
vm_pagequeue_lock(pq);
@@ -3172,8 +3346,9 @@
}
m->queue = PQ_INACTIVE;
if (noreuse)
- TAILQ_INSERT_BEFORE(&vm_phys_domain(m)->vmd_inacthead,
- m, plinks.q);
+ TAILQ_INSERT_BEFORE(
+ &vm_pagequeue_domain(m)->vmd_inacthead, m,
+ plinks.q);
else
TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q);
vm_pagequeue_cnt_inc(pq);
@@ -3950,10 +4125,10 @@
DB_SHOW_COMMAND(page, vm_page_print_page_info)
{
- db_printf("vm_cnt.v_free_count: %d\n", vm_cnt.v_free_count);
- db_printf("vm_cnt.v_inactive_count: %d\n", vm_cnt.v_inactive_count);
- db_printf("vm_cnt.v_active_count: %d\n", vm_cnt.v_active_count);
- db_printf("vm_cnt.v_laundry_count: %d\n", vm_cnt.v_laundry_count);
+ db_printf("vm_cnt.v_free_count: %d\n", vm_free_count());
+ db_printf("vm_cnt.v_inactive_count: %d\n", vm_inactive_count());
+ db_printf("vm_cnt.v_active_count: %d\n", vm_active_count());
+ db_printf("vm_cnt.v_laundry_count: %d\n", vm_laundry_count());
db_printf("vm_cnt.v_wire_count: %d\n", vm_cnt.v_wire_count);
db_printf("vm_cnt.v_free_reserved: %d\n", vm_cnt.v_free_reserved);
db_printf("vm_cnt.v_free_min: %d\n", vm_cnt.v_free_min);
@@ -3965,7 +4140,7 @@
{
int dom;
- db_printf("pq_free %d\n", vm_cnt.v_free_count);
+ db_printf("pq_free %d\n", vm_free_count());
for (dom = 0; dom < vm_ndomains; dom++) {
db_printf(
"dom %d page_cnt %d free %d pq_act %d pq_inact %d pq_laund %d pq_unsw %d\n",
Index: sys/vm/vm_pageout.h
===================================================================
--- sys/vm/vm_pageout.h
+++ sys/vm/vm_pageout.h
@@ -74,9 +74,7 @@
*/
extern int vm_page_max_wired;
-extern int vm_pageout_deficit;
extern int vm_pageout_page_count;
-extern bool vm_pages_needed;
#define VM_OOM_MEM 1
#define VM_OOM_SWAPZ 2
@@ -95,12 +93,15 @@
* Signal pageout-daemon and wait for it.
*/
-void pagedaemon_wait(int pri, const char *wmesg);
-void pagedaemon_wakeup(void);
+void pagedaemon_wait(int domain, int pri, const char *wmesg);
+void pagedaemon_wakeup(int domain);
#define VM_WAIT vm_wait()
#define VM_WAITPFAULT vm_waitpfault()
void vm_wait(void);
void vm_waitpfault(void);
+void vm_wait_domain(int domain);
+void vm_wait_min(void);
+void vm_wait_severe(void);
#ifdef _KERNEL
int vm_pageout_flush(vm_page_t *, int, int, int, int *, boolean_t *);
Index: sys/vm/vm_pageout.c
===================================================================
--- sys/vm/vm_pageout.c
+++ sys/vm/vm_pageout.c
@@ -110,6 +110,7 @@
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
#include <vm/vm_phys.h>
+#include <vm/vm_pagequeue.h>
#include <vm/swap_pager.h>
#include <vm/vm_extern.h>
#include <vm/uma.h>
@@ -147,20 +148,8 @@
#define VM_LAUNDER_RATE 10
#define VM_INACT_SCAN_RATE 2
-int vm_pageout_deficit; /* Estimated number of pages deficit */
-u_int vm_pageout_wakeup_thresh;
static int vm_pageout_oom_seq = 12;
-static bool vm_pageout_wanted; /* Event on which pageout daemon sleeps */
-bool vm_pages_needed; /* Are threads waiting for free pages? */
-/* Pending request for dirty page laundering. */
-static enum {
- VM_LAUNDRY_IDLE,
- VM_LAUNDRY_BACKGROUND,
- VM_LAUNDRY_SHORTFALL
-} vm_laundry_request = VM_LAUNDRY_IDLE;
-static int vm_inactq_scans;
-
static int vm_pageout_update_period;
static int disable_swap_pageouts;
static int lowmem_period = 10;
@@ -173,10 +162,6 @@
CTLFLAG_RWTUN, &vm_panic_on_oom, 0,
"panic on out of memory instead of killing the largest process");
-SYSCTL_INT(_vm, OID_AUTO, pageout_wakeup_thresh,
- CTLFLAG_RWTUN, &vm_pageout_wakeup_thresh, 0,
- "free page threshold for waking up the pageout daemon");
-
SYSCTL_INT(_vm, OID_AUTO, pageout_update_period,
CTLFLAG_RWTUN, &vm_pageout_update_period, 0,
"Maximum active LRU update period");
@@ -200,11 +185,6 @@
&act_scan_laundry_weight, 0,
"weight given to clean vs. dirty pages in active queue scans");
-static u_int vm_background_launder_target;
-SYSCTL_UINT(_vm, OID_AUTO, background_launder_target, CTLFLAG_RWTUN,
- &vm_background_launder_target, 0,
- "background laundering target, in pages");
-
static u_int vm_background_launder_rate = 4096;
SYSCTL_UINT(_vm, OID_AUTO, background_launder_rate, CTLFLAG_RWTUN,
&vm_background_launder_rate, 0,
@@ -959,18 +939,18 @@
static void
vm_pageout_laundry_worker(void *arg)
{
- struct vm_domain *domain;
+ struct vm_domain *vmd;
struct vm_pagequeue *pq;
uint64_t nclean, ndirty;
u_int inactq_scans, last_launder;
- int domidx, last_target, launder, shortfall, shortfall_cycle, target;
+ int domain, last_target, launder, shortfall, shortfall_cycle, target;
bool in_shortfall;
- domidx = (uintptr_t)arg;
- domain = &vm_dom[domidx];
- pq = &domain->vmd_pagequeues[PQ_LAUNDRY];
- KASSERT(domain->vmd_segs != 0, ("domain without segments"));
- vm_pageout_init_marker(&domain->vmd_laundry_marker, PQ_LAUNDRY);
+ domain = (uintptr_t)arg;
+ vmd = VM_DOMAIN(domain);
+ pq = &vmd->vmd_pagequeues[PQ_LAUNDRY];
+ KASSERT(vmd->vmd_segs != 0, ("domain without segments"));
+ vm_pageout_init_marker(&vmd->vmd_laundry_marker, PQ_LAUNDRY);
shortfall = 0;
in_shortfall = false;
@@ -982,9 +962,9 @@
/*
* Calls to these handlers are serialized by the swap syscall lock.
*/
- (void)EVENTHANDLER_REGISTER(swapon, vm_pageout_swapon, domain,
+ (void)EVENTHANDLER_REGISTER(swapon, vm_pageout_swapon, vmd,
EVENTHANDLER_PRI_ANY);
- (void)EVENTHANDLER_REGISTER(swapoff, vm_pageout_swapoff, domain,
+ (void)EVENTHANDLER_REGISTER(swapoff, vm_pageout_swapoff, vmd,
EVENTHANDLER_PRI_ANY);
/*
@@ -1006,7 +986,7 @@
target = shortfall;
} else if (!in_shortfall)
goto trybackground;
- else if (shortfall_cycle == 0 || vm_laundry_target() <= 0) {
+ else if (shortfall_cycle == 0 || vm_laundry_target(vmd) <= 0) {
/*
* We recently entered shortfall and began laundering
* pages. If we have completed that laundering run
@@ -1040,11 +1020,12 @@
* memory pressure required to trigger laundering decreases.
*/
trybackground:
- nclean = vm_cnt.v_inactive_count + vm_cnt.v_free_count;
- ndirty = vm_cnt.v_laundry_count;
+ nclean = vmd->vmd_free_count +
+ vmd->vmd_pagequeues[PQ_INACTIVE].pq_cnt;
+ ndirty = vmd->vmd_pagequeues[PQ_LAUNDRY].pq_cnt;
if (target == 0 && inactq_scans != last_launder &&
ndirty * isqrt(inactq_scans - last_launder) >= nclean) {
- target = vm_background_launder_target;
+ target = vmd->vmd_background_launder_target;
}
/*
@@ -1076,7 +1057,7 @@
* pages could exceed "target" by the maximum size of
* a cluster minus one.
*/
- target -= min(vm_pageout_launder(domain, launder,
+ target -= min(vm_pageout_launder(vmd, launder,
in_shortfall), target);
pause("laundp", hz / VM_LAUNDER_RATE);
}
@@ -1087,8 +1068,8 @@
* kicks us.
*/
vm_pagequeue_lock(pq);
- if (target == 0 && vm_laundry_request == VM_LAUNDRY_IDLE)
- (void)mtx_sleep(&vm_laundry_request,
+ if (target == 0 && vmd->vmd_laundry_request == VM_LAUNDRY_IDLE)
+ (void)mtx_sleep(&vmd->vmd_laundry_request,
vm_pagequeue_lockptr(pq), PVM, "launds", 0);
/*
@@ -1096,16 +1077,17 @@
* a shortfall laundering unless we're already in the middle of
* one. This may preempt a background laundering.
*/
- if (vm_laundry_request == VM_LAUNDRY_SHORTFALL &&
+ if (vmd->vmd_laundry_request == VM_LAUNDRY_SHORTFALL &&
(!in_shortfall || shortfall_cycle == 0)) {
- shortfall = vm_laundry_target() + vm_pageout_deficit;
+ shortfall = vm_laundry_target(vmd) +
+ vmd->vmd_pageout_deficit;
target = 0;
} else
shortfall = 0;
if (target == 0)
- vm_laundry_request = VM_LAUNDRY_IDLE;
- inactq_scans = vm_inactq_scans;
+ vmd->vmd_laundry_request = VM_LAUNDRY_IDLE;
+ inactq_scans = vmd->vmd_inactq_scans;
vm_pagequeue_unlock(pq);
}
}
@@ -1134,7 +1116,7 @@
* If we need to reclaim memory ask kernel caches to return
* some. We rate limit to avoid thrashing.
*/
- if (vmd == &vm_dom[0] && pass > 0 &&
+ if (vmd == VM_DOMAIN(0) && pass > 0 &&
(time_uptime - lowmem_uptime) >= lowmem_period) {
/*
* Decrease registered cache sizes.
@@ -1163,8 +1145,8 @@
* the page daemon and this calculation.
*/
if (pass > 0) {
- deficit = atomic_readandclear_int(&vm_pageout_deficit);
- page_shortage = vm_paging_target() + deficit;
+ deficit = atomic_readandclear_int(&vmd->vmd_pageout_deficit);
+ page_shortage = vm_paging_target(vmd) + deficit;
} else
page_shortage = deficit = 0;
starting_page_shortage = page_shortage;
@@ -1357,18 +1339,20 @@
* keep count.
*/
if (starting_page_shortage > 0) {
- pq = &vm_dom[0].vmd_pagequeues[PQ_LAUNDRY];
+ pq = &vmd->vmd_pagequeues[PQ_LAUNDRY];
vm_pagequeue_lock(pq);
- if (vm_laundry_request == VM_LAUNDRY_IDLE &&
+ if (vmd->vmd_laundry_request == VM_LAUNDRY_IDLE &&
(pq->pq_cnt > 0 || atomic_load_acq_int(&swapdev_enabled))) {
if (page_shortage > 0) {
- vm_laundry_request = VM_LAUNDRY_SHORTFALL;
+ vmd->vmd_laundry_request = VM_LAUNDRY_SHORTFALL;
VM_CNT_INC(v_pdshortfalls);
- } else if (vm_laundry_request != VM_LAUNDRY_SHORTFALL)
- vm_laundry_request = VM_LAUNDRY_BACKGROUND;
- wakeup(&vm_laundry_request);
+ } else if (vmd->vmd_laundry_request !=
+ VM_LAUNDRY_SHORTFALL)
+ vmd->vmd_laundry_request =
+ VM_LAUNDRY_BACKGROUND;
+ wakeup(&vmd->vmd_laundry_request);
}
- vm_inactq_scans++;
+ vmd->vmd_inactq_scans++;
vm_pagequeue_unlock(pq);
}
@@ -1397,9 +1381,9 @@
* more aggressively, improving the effectiveness of clustering and
* ensuring that they can eventually be reused.
*/
- inactq_shortage = vm_cnt.v_inactive_target - (vm_cnt.v_inactive_count +
- vm_cnt.v_laundry_count / act_scan_laundry_weight) +
- vm_paging_target() + deficit + addl_page_shortage;
+ inactq_shortage = vmd->vmd_inactive_target - (pq->pq_cnt +
+ vmd->vmd_pagequeues[PQ_LAUNDRY].pq_cnt / act_scan_laundry_weight) +
+ vm_paging_target(vmd) + deficit + addl_page_shortage;
inactq_shortage *= act_scan_laundry_weight;
pq = &vmd->vmd_pagequeues[PQ_ACTIVE];
@@ -1742,6 +1726,8 @@
}
sx_sunlock(&allproc_lock);
if (bigproc != NULL) {
+ int i;
+
if (vm_panic_on_oom != 0)
panic("out of swap space");
PROC_LOCK(bigproc);
@@ -1749,19 +1735,20 @@
sched_nice(bigproc, PRIO_MIN);
_PRELE(bigproc);
PROC_UNLOCK(bigproc);
- wakeup(&vm_cnt.v_free_count);
+ for (i = 0; i < vm_ndomains; i++)
+ wakeup(&VM_DOMAIN(i)->vmd_free_count);
}
}
static void
vm_pageout_worker(void *arg)
{
- struct vm_domain *domain;
- int domidx, pass;
+ struct vm_domain *vmd;
+ int domain, pass;
bool target_met;
- domidx = (uintptr_t)arg;
- domain = &vm_dom[domidx];
+ domain = (uintptr_t)arg;
+ vmd = VM_DOMAIN(domain);
pass = 0;
target_met = true;
@@ -1771,18 +1758,18 @@
* is allocated.
*/
- KASSERT(domain->vmd_segs != 0, ("domain without segments"));
- domain->vmd_last_active_scan = ticks;
- vm_pageout_init_marker(&domain->vmd_marker, PQ_INACTIVE);
- vm_pageout_init_marker(&domain->vmd_inacthead, PQ_INACTIVE);
- TAILQ_INSERT_HEAD(&domain->vmd_pagequeues[PQ_INACTIVE].pq_pl,
- &domain->vmd_inacthead, plinks.q);
+ KASSERT(vmd->vmd_segs != 0, ("domain without segments"));
+ vmd->vmd_last_active_scan = ticks;
+ vm_pageout_init_marker(&vmd->vmd_marker, PQ_INACTIVE);
+ vm_pageout_init_marker(&vmd->vmd_inacthead, PQ_INACTIVE);
+ TAILQ_INSERT_HEAD(&vmd->vmd_pagequeues[PQ_INACTIVE].pq_pl,
+ &vmd->vmd_inacthead, plinks.q);
/*
* The pageout daemon worker is never done, so loop forever.
*/
while (TRUE) {
- mtx_lock(&vm_page_queue_free_mtx);
+ vm_domain_free_lock(vmd);
/*
* Generally, after a level >= 1 scan, if there are enough
@@ -1796,34 +1783,34 @@
* thread will, nonetheless, wait until another page is freed
* or this wakeup is performed.
*/
- if (vm_pages_needed && !vm_page_count_min()) {
- vm_pages_needed = false;
- wakeup(&vm_cnt.v_free_count);
+ if (vmd->vmd_pages_needed && !vm_paging_min(vmd)) {
+ vmd->vmd_pages_needed = false;
+ wakeup(&vmd->vmd_free_count);
}
/*
- * Do not clear vm_pageout_wanted until we reach our free page
+ * Do not clear vmd_pageout_wanted until we reach our free page
* target. Otherwise, we may be awakened over and over again,
* wasting CPU time.
*/
- if (vm_pageout_wanted && target_met)
- vm_pageout_wanted = false;
+ if (vmd->vmd_pageout_wanted && target_met)
+ vmd->vmd_pageout_wanted = false;
/*
* Might the page daemon receive a wakeup call?
*/
- if (vm_pageout_wanted) {
+ if (vmd->vmd_pageout_wanted) {
/*
- * No. Either vm_pageout_wanted was set by another
+ * No. Either vmd_pageout_wanted was set by another
* thread during the previous scan, which must have
- * been a level 0 scan, or vm_pageout_wanted was
+ * been a level 0 scan, or vmd_pageout_wanted was
* already set and the scan failed to free enough
* pages. If we haven't yet performed a level >= 1
* (page reclamation) scan, then increase the level
* and scan again now. Otherwise, sleep a bit and
* try again later.
*/
- mtx_unlock(&vm_page_queue_free_mtx);
+ vm_domain_free_unlock(vmd);
if (pass >= 1)
pause("pwait", hz / VM_INACT_SCAN_RATE);
pass++;
@@ -1834,20 +1821,20 @@
* sleep until the next wakeup or until pages need to
* have their reference stats updated.
*/
- if (vm_pages_needed) {
- mtx_unlock(&vm_page_queue_free_mtx);
+ if (vmd->vmd_pages_needed) {
+ vm_domain_free_unlock(vmd);
if (pass == 0)
pass++;
- } else if (mtx_sleep(&vm_pageout_wanted,
- &vm_page_queue_free_mtx, PDROP | PVM, "psleep",
- hz) == 0) {
+ } else if (mtx_sleep(&vmd->vmd_pageout_wanted,
+ vm_domain_free_lockptr(vmd), PDROP | PVM,
+ "psleep", hz) == 0) {
VM_CNT_INC(v_pdwakeups);
pass = 1;
} else
pass = 0;
}
- target_met = vm_pageout_scan(domain, pass);
+ target_met = vm_pageout_scan(vmd, pass);
}
}
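
With the global free-page mutex split up, each worker sleeps on its own domain's vmd_pageout_wanted channel under that domain's free lock, so pressure in one domain no longer disturbs every pageout thread. A sketch of the advisory wakeup pattern used elsewhere in this diff (the helper name is hypothetical; compare the vm_reserv_extend() hunk below):

static void
maybe_wake_pagedaemon(int domain, u_int free_count)
{
	struct vm_domain *vmd;

	vmd = VM_DOMAIN(domain);
	/* Advisory: wakes only this domain's pageout worker. */
	if (vm_paging_needed(vmd, free_count))
		pagedaemon_wakeup(domain);
}
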
@@ -1855,43 +1842,78 @@
* vm_pageout_init initialises basic pageout daemon settings.
*/
static void
-vm_pageout_init(void)
+vm_pageout_init_domain(int domain)
{
- /*
- * Initialize some paging parameters.
- */
- vm_cnt.v_interrupt_free_min = 2;
- if (vm_cnt.v_page_count < 2000)
- vm_pageout_page_count = 8;
+ struct vm_domain *vmd;
+ vmd = VM_DOMAIN(domain);
+ vmd->vmd_interrupt_free_min = 2;
+
/*
* v_free_reserved needs to include enough for the largest
* swap pager structures plus enough for any pv_entry structs
* when paging.
*/
- if (vm_cnt.v_page_count > 1024)
- vm_cnt.v_free_min = 4 + (vm_cnt.v_page_count - 1024) / 200;
+ if (vmd->vmd_page_count > 1024)
+ vmd->vmd_free_min = 4 + (vmd->vmd_page_count - 1024) / 200;
else
- vm_cnt.v_free_min = 4;
- vm_cnt.v_pageout_free_min = (2*MAXBSIZE)/PAGE_SIZE +
- vm_cnt.v_interrupt_free_min;
- vm_cnt.v_free_reserved = vm_pageout_page_count +
- vm_cnt.v_pageout_free_min + (vm_cnt.v_page_count / 768);
- vm_cnt.v_free_severe = vm_cnt.v_free_min / 2;
- vm_cnt.v_free_target = 4 * vm_cnt.v_free_min + vm_cnt.v_free_reserved;
- vm_cnt.v_free_min += vm_cnt.v_free_reserved;
- vm_cnt.v_free_severe += vm_cnt.v_free_reserved;
- vm_cnt.v_inactive_target = (3 * vm_cnt.v_free_target) / 2;
- if (vm_cnt.v_inactive_target > vm_cnt.v_free_count / 3)
- vm_cnt.v_inactive_target = vm_cnt.v_free_count / 3;
+ vmd->vmd_free_min = 4;
+ vmd->vmd_pageout_free_min = (2*MAXBSIZE)/PAGE_SIZE +
+ vmd->vmd_interrupt_free_min;
+ vmd->vmd_free_reserved = vm_pageout_page_count +
+ vmd->vmd_pageout_free_min + (vmd->vmd_page_count / 768);
+ vmd->vmd_free_severe = vmd->vmd_free_min / 2;
+ vmd->vmd_free_target = 4 * vmd->vmd_free_min + vmd->vmd_free_reserved;
+ vmd->vmd_free_min += vmd->vmd_free_reserved;
+ vmd->vmd_free_severe += vmd->vmd_free_reserved;
+ vmd->vmd_inactive_target = (3 * vmd->vmd_free_target) / 2;
+ if (vmd->vmd_inactive_target > vmd->vmd_free_count / 3)
+ vmd->vmd_inactive_target = vmd->vmd_free_count / 3;
/*
* Set the default wakeup threshold to be 10% above the minimum
* page limit. This keeps the steady state out of shortfall.
*/
- vm_pageout_wakeup_thresh = (vm_cnt.v_free_min / 10) * 11;
+ vmd->vmd_pageout_wakeup_thresh = (vmd->vmd_free_min / 10) * 11;
/*
+ * Target amount of memory to move out of the laundry queue during a
+ * background laundering. This is proportional to the amount of system
+ * memory.
+ */
+ vmd->vmd_background_launder_target = (vmd->vmd_free_target -
+ vmd->vmd_free_min) / 10;
+}
+
+static void
+vm_pageout_init(void)
+{
+ u_int freecount;
+ int i;
+
+ /*
+ * Initialize some paging parameters.
+ */
+ if (vm_cnt.v_page_count < 2000)
+ vm_pageout_page_count = 8;
+
+ freecount = 0;
+ for (i = 0; i < vm_ndomains; i++) {
+ struct vm_domain *vmd;
+
+ vm_pageout_init_domain(i);
+ vmd = VM_DOMAIN(i);
+ vm_cnt.v_free_reserved += vmd->vmd_free_reserved;
+ vm_cnt.v_free_target += vmd->vmd_free_target;
+ vm_cnt.v_free_min += vmd->vmd_free_min;
+ vm_cnt.v_inactive_target += vmd->vmd_inactive_target;
+ vm_cnt.v_pageout_free_min += vmd->vmd_pageout_free_min;
+ vm_cnt.v_interrupt_free_min += vmd->vmd_interrupt_free_min;
+ vm_cnt.v_free_severe += vmd->vmd_free_severe;
+ freecount += vmd->vmd_free_count;
+ }
+
+ /*
* Set interval in seconds for active scan. We want to visit each
* page at least once every ten minutes. This is to prevent worst
* case paging behaviors with stale active LRU.
@@ -1899,17 +1921,8 @@
if (vm_pageout_update_period == 0)
vm_pageout_update_period = 600;
- /* XXX does not really belong here */
if (vm_page_max_wired == 0)
- vm_page_max_wired = vm_cnt.v_free_count / 3;
-
- /*
- * Target amount of memory to move out of the laundry queue during a
- * background laundering. This is proportional to the amount of system
- * memory.
- */
- vm_background_launder_target = (vm_cnt.v_free_target -
- vm_cnt.v_free_min) / 10;
+ vm_page_max_wired = freecount / 3;
}
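
To make vm_pageout_init_domain() concrete, consider a hypothetical domain with vmd_page_count = 1048576 (4 GB of 4 KB pages), assuming MAXBSIZE is 65536 and vm_pageout_page_count keeps its usual value of 32 (it drops to 8 only on systems with fewer than 2000 pages, per the hunk above):

	vmd_free_min (initial)        = 4 + (1048576 - 1024) / 200 = 5241
	vmd_pageout_free_min          = (2 * 65536) / 4096 + 2     = 34
	vmd_free_reserved             = 32 + 34 + 1048576 / 768    = 1431
	vmd_free_target               = 4 * 5241 + 1431            = 22395
	vmd_free_min (final)          = 5241 + 1431                = 6672
	vmd_free_severe               = 5241 / 2 + 1431            = 4051
	vmd_pageout_wakeup_thresh     = (6672 / 10) * 11           = 7337
	vmd_background_launder_target = (22395 - 6672) / 10        = 1572

vm_pageout_init() then sums these per-domain values back into vm_cnt, so the legacy global counters remain meaningful.
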
/*
@@ -1933,6 +1946,12 @@
panic("starting pageout for domain %d, error %d\n",
i, error);
}
+ error = kthread_add(vm_pageout_laundry_worker,
+ (void *)(uintptr_t)i, curproc, NULL, 0, 0,
+ "laundry: dom%d", i);
+ if (error != 0)
+ panic("starting laundry for domain %d, error %d",
+ i, error);
}
error = kthread_add(uma_reclaim_worker, NULL, curproc, NULL,
0, 0, "uma");
@@ -1945,14 +1964,16 @@
* Perform an advisory wakeup of the page daemon.
*/
void
-pagedaemon_wakeup(void)
+pagedaemon_wakeup(int domain)
{
+ struct vm_domain *vmd;
- mtx_assert(&vm_page_queue_free_mtx, MA_NOTOWNED);
+ vmd = VM_DOMAIN(domain);
+ vm_domain_free_assert_unlocked(vmd);
- if (!vm_pageout_wanted && curthread->td_proc != pageproc) {
- vm_pageout_wanted = true;
- wakeup(&vm_pageout_wanted);
+ if (!vmd->vmd_pageout_wanted && curthread->td_proc != pageproc) {
+ vmd->vmd_pageout_wanted = true;
+ wakeup(&vmd->vmd_pageout_wanted);
}
}
@@ -1962,22 +1983,26 @@
* This function returns with the free queues mutex unlocked.
*/
void
-pagedaemon_wait(int pri, const char *wmesg)
+pagedaemon_wait(int domain, int pri, const char *wmesg)
{
+ struct vm_domain *vmd;
- mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
+ vmd = VM_DOMAIN(domain);
+ vm_domain_free_assert_locked(vmd);
/*
- * vm_pageout_wanted may have been set by an advisory wakeup, but if the
- * page daemon is running on a CPU, the wakeup will have been lost.
+ * vmd_pageout_wanted may have been set by an advisory wakeup, but if
+ * the page daemon is running on a CPU, the wakeup will have been lost.
* Thus, deliver a potentially spurious wakeup to ensure that the page
* daemon has been notified of the shortage.
*/
- if (!vm_pageout_wanted || !vm_pages_needed) {
- vm_pageout_wanted = true;
- wakeup(&vm_pageout_wanted);
+ if (!vmd->vmd_pageout_wanted || !vmd->vmd_pages_needed) {
+ vmd->vmd_pageout_wanted = true;
+ wakeup(&vmd->vmd_pageout_wanted);
}
- vm_pages_needed = true;
- msleep(&vm_cnt.v_free_count, &vm_page_queue_free_mtx, PDROP | pri,
+ vmd->vmd_pages_needed = true;
+ vmd->vmd_waiters++;
+ msleep(&vmd->vmd_free_count, vm_domain_free_lockptr(vmd), PDROP | pri,
wmesg, 0);
+ vmd->vmd_waiters--;
}
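
pagedaemon_wakeup() and pagedaemon_wait() together give allocators a per-domain slow path: the waiter sleeps on the domain's vmd_free_count and is released by that domain's worker (or the OOM handler) alone. An illustrative caller, with alloc_retry() being a hypothetical name (the real allocation slow path lives in vm_page.c, outside this excerpt):

static void
alloc_retry(int domain)
{
	struct vm_domain *vmd;

	vmd = VM_DOMAIN(domain);
	vm_domain_free_lock(vmd);
	/*
	 * Wakes the domain's pageout worker if needed, then sleeps on
	 * vmd_free_count; PDROP releases the domain free lock.
	 */
	pagedaemon_wait(domain, PVM, "vmwait");
}
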
Index: sys/vm/vm_pagequeue.h
===================================================================
--- sys/vm/vm_pagequeue.h
+++ sys/vm/vm_pagequeue.h
@@ -0,0 +1,235 @@
+/*-
+ * SPDX-License-Identifier: (BSD-3-Clause AND MIT-CMU)
+ *
+ * Copyright (c) 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * The Mach Operating System project at Carnegie-Mellon University.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * from: @(#)vm_page.h 8.2 (Berkeley) 12/13/93
+ *
+ *
+ * Copyright (c) 1987, 1990 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Authors: Avadis Tevanian, Jr., Michael Wayne Young
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VM_PAGEQUEUE_
+#define _VM_PAGEQUEUE_
+
+#ifdef _KERNEL
+struct vm_pagequeue {
+ struct mtx pq_mutex;
+ struct pglist pq_pl;
+ int pq_cnt;
+ const char * const pq_name;
+} __aligned(CACHE_LINE_SIZE);
+
+struct vm_domain {
+ struct vm_pagequeue vmd_pagequeues[PQ_COUNT];
+ struct mtx_padalign vmd_free_mtx;
+ struct vmem *vmd_kernel_arena;
+ u_int vmd_domain; /* Domain number. */
+ u_int vmd_page_count;
+ long vmd_segs; /* bitmask of the segments */
+
+ /* Paging control variables, locked by vmd_free_mtx. */
+ u_int vmd_free_count;
+ boolean_t vmd_oom;
+ int vmd_oom_seq;
+ int vmd_last_active_scan;
+ struct vm_page vmd_laundry_marker;
+ struct vm_page vmd_marker; /* marker for pagedaemon private use */
+ struct vm_page vmd_inacthead; /* marker for LRU-defeating insertions */
+
+ int vmd_pageout_pages_needed; /* page daemon waiting for pages? */
+ int vmd_pageout_deficit; /* Estimated number of pages deficit */
+ int vmd_waiters; /* Pageout waiters. */
+ bool vmd_pages_needed; /* Are threads waiting for free pages? */
+ bool vmd_pageout_wanted; /* pageout daemon wait channel */
+ bool vmd_minset; /* Are we in vm_min_domains? */
+ bool vmd_severeset; /* Are we in vm_severe_domains? */
+ int vmd_inactq_scans;
+ enum {
+ VM_LAUNDRY_IDLE = 0,
+ VM_LAUNDRY_BACKGROUND,
+ VM_LAUNDRY_SHORTFALL
+ } vmd_laundry_request;
+
+ /* Paging thresholds. */
+ u_int vmd_background_launder_target;
+ u_int vmd_free_reserved; /* (c) pages reserved for deadlock */
+ u_int vmd_free_target; /* (c) pages desired free */
+ u_int vmd_free_min; /* (c) minimum pages desired free */
+ u_int vmd_inactive_target; /* (c) pages desired inactive */
+ u_int vmd_pageout_free_min; /* (c) min pages reserved for kernel */
+ u_int vmd_pageout_wakeup_thresh;/* (c) min pages to wake pagedaemon */
+ u_int vmd_interrupt_free_min; /* (c) reserved pages for int code */
+ u_int vmd_free_severe; /* (c) severe page depletion point */
+} __aligned(CACHE_LINE_SIZE);
+
+extern struct vm_domain vm_dom[MAXMEMDOM];
+
+#define VM_DOMAIN(n) (&vm_dom[(n)])
+
+#define vm_pagequeue_assert_locked(pq) mtx_assert(&(pq)->pq_mutex, MA_OWNED)
+#define vm_pagequeue_lock(pq) mtx_lock(&(pq)->pq_mutex)
+#define vm_pagequeue_lockptr(pq) (&(pq)->pq_mutex)
+#define vm_pagequeue_unlock(pq) mtx_unlock(&(pq)->pq_mutex)
+
+#define vm_domain_free_assert_locked(n) \
+ mtx_assert(vm_domain_free_lockptr((n)), MA_OWNED)
+#define vm_domain_free_assert_unlocked(n) \
+ mtx_assert(vm_domain_free_lockptr((n)), MA_NOTOWNED)
+#define vm_domain_free_lock(d) \
+ mtx_lock(vm_domain_free_lockptr((d)))
+#define vm_domain_free_lockptr(d) \
+ (&(d)->vmd_free_mtx)
+#define vm_domain_free_unlock(d) \
+ mtx_unlock(vm_domain_free_lockptr((d)))
+
+static __inline void
+vm_pagequeue_cnt_add(struct vm_pagequeue *pq, int addend)
+{
+
+#ifdef notyet
+ vm_pagequeue_assert_locked(pq);
+#endif
+ pq->pq_cnt += addend;
+}
+#define vm_pagequeue_cnt_inc(pq) vm_pagequeue_cnt_add((pq), 1)
+#define vm_pagequeue_cnt_dec(pq) vm_pagequeue_cnt_add((pq), -1)
+
+void vm_domain_set(struct vm_domain *vmd);
+int vm_domain_available(struct vm_domain *vmd, int req, int npages);
+
+/*
+ * vm_pagequeue_domain:
+ *
+ * Return the memory domain the page belongs to.
+ */
+static inline struct vm_domain *
+vm_pagequeue_domain(vm_page_t m)
+{
+
+ return (VM_DOMAIN(vm_phys_domain(m)));
+}
+
+/*
+ * Return the number of pages we need to free to reach the free target.
+ * A positive number indicates that we do not have enough free pages.
+ */
+static inline int
+vm_paging_target(struct vm_domain *vmd)
+{
+
+ return (vmd->vmd_free_target - vmd->vmd_free_count);
+}
+
+/*
+ * Returns TRUE if the pagedaemon needs to be woken up.
+ */
+static inline int
+vm_paging_needed(struct vm_domain *vmd, u_int free_count)
+{
+
+ return (free_count < vmd->vmd_pageout_wakeup_thresh);
+}
+
+/*
+ * Returns TRUE if the domain is below the min paging target.
+ */
+static inline int
+vm_paging_min(struct vm_domain *vmd)
+{
+
+ return (vmd->vmd_free_min > vmd->vmd_free_count);
+}
+
+/*
+ * Returns TRUE if the domain is below the severe paging target.
+ */
+static inline int
+vm_paging_severe(struct vm_domain *vmd)
+{
+
+ return (vmd->vmd_free_severe > vmd->vmd_free_count);
+}
+
+/*
+ * Return the number of pages we need to launder.
+ * A positive number indicates that we have a shortfall of clean pages.
+ */
+static inline int
+vm_laundry_target(struct vm_domain *vmd)
+{
+
+ return (vm_paging_target(vmd));
+}
+
+static inline u_int
+vm_domain_freecnt_adj(struct vm_domain *vmd, int adj)
+{
+ u_int ret;
+
+ vm_domain_free_assert_locked(vmd);
+ ret = vmd->vmd_free_count += adj;
+ if ((!vmd->vmd_minset && vm_paging_min(vmd)) ||
+ (!vmd->vmd_severeset && vm_paging_severe(vmd)))
+ vm_domain_set(vmd);
+
+ return (ret);
+}
+
+#endif /* _KERNEL */
+#endif /* !_VM_PAGEQUEUE_ */
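
As an illustration of the locking discipline this header establishes, a sketch of a minimal single-page free path (free_one_page() is hypothetical; the real paths are in vm_page.c, not shown in this excerpt):

static void
free_one_page(vm_page_t m)
{
	struct vm_domain *vmd;

	vmd = vm_pagequeue_domain(m);
	vm_domain_free_lock(vmd);
	vm_phys_free_pages(m, 0);
	/* Adjusts vmd_free_count and updates the min/severe domain sets. */
	vm_domain_freecnt_adj(vmd, 1);
	/* Release threads sleeping in pagedaemon_wait() for this domain. */
	if (vmd->vmd_pages_needed && !vm_paging_min(vmd)) {
		vmd->vmd_pages_needed = false;
		wakeup(&vmd->vmd_free_count);
	}
	vm_domain_free_unlock(vmd);
}
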
Index: sys/vm/vm_phys.h
===================================================================
--- sys/vm/vm_phys.h
+++ sys/vm/vm_phys.h
@@ -96,12 +96,12 @@
/*
*
- * vm_phys_domidx:
+ * vm_phys_domain:
*
* Return the index of the domain the page belongs to.
*/
static inline int
-vm_phys_domidx(vm_page_t m)
+vm_phys_domain(vm_page_t m)
{
#ifdef NUMA
int domn, segind;
@@ -115,27 +115,6 @@
#else
return (0);
#endif
-}
-
-/*
- * vm_phys_domain:
- *
- * Return the memory domain the page belongs to.
- */
-static inline struct vm_domain *
-vm_phys_domain(vm_page_t m)
-{
-
- return (&vm_dom[vm_phys_domidx(m)]);
-}
-
-static inline u_int
-vm_phys_freecnt_adj(vm_page_t m, int adj)
-{
-
- mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
- vm_phys_domain(m)->vmd_free_count += adj;
- return (vm_cnt.v_free_count += adj);
}
#endif /* _KERNEL */
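
Note the rename: vm_phys_domain() now returns the domain index (what vm_phys_domidx() used to return), and the structure lookup moves to vm_pagequeue_domain() in vm_pagequeue.h. Call sites that previously took a struct pointer now go through the index, e.g.:

	int domain;
	struct vm_domain *vmd;

	domain = vm_phys_domain(m);	/* an index, no longer a pointer */
	vmd = VM_DOMAIN(domain);	/* or, directly: vm_pagequeue_domain(m) */
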
Index: sys/vm/vm_phys.c
===================================================================
--- sys/vm/vm_phys.c
+++ sys/vm/vm_phys.c
@@ -67,6 +67,7 @@
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_phys.h>
+#include <vm/vm_pagequeue.h>
_Static_assert(sizeof(long) * NBBY >= VM_PHYSSEG_MAX,
"Too many physsegs.");
@@ -653,7 +654,7 @@
if (flind < 0)
return (NULL);
- mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
+ vm_domain_free_assert_locked(VM_DOMAIN(domain));
fl = &vm_phys_free_queues[domain][flind][pool][0];
for (oind = order; oind < VM_NFREEORDER; oind++) {
m = TAILQ_FIRST(&fl[oind].pl);
@@ -906,8 +907,8 @@
m, m->pool));
KASSERT(order < VM_NFREEORDER,
("vm_phys_free_pages: order %d is out of range", order));
- mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
seg = &vm_phys_segs[m->segind];
+ vm_domain_free_assert_locked(VM_DOMAIN(seg->domain));
if (order < VM_NFREEORDER - 1) {
pa = VM_PAGE_TO_PHYS(m);
do {
@@ -945,7 +946,7 @@
* Avoid unnecessary coalescing by freeing the pages in the largest
* possible power-of-two-sized subsets.
*/
- mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
+ vm_domain_free_assert_locked(vm_pagequeue_domain(m));
for (;; npages -= n) {
/*
* Unsigned "min" is used here so that "order" is assigned
@@ -1051,14 +1052,13 @@
vm_page_t m_set, m_tmp;
int order;
- mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
-
/*
* First, find the contiguous, power of two-sized set of free
* physical pages containing the given physical page "m" and
* assign it to "m_set".
*/
seg = &vm_phys_segs[m->segind];
+ vm_domain_free_assert_locked(VM_DOMAIN(seg->domain));
for (m_set = m, order = 0; m_set->order == VM_NFREEORDER &&
order < VM_NFREEORDER - 1; ) {
order++;
@@ -1122,7 +1122,7 @@
KASSERT(npages > 0, ("npages is 0"));
KASSERT(powerof2(alignment), ("alignment is not a power of 2"));
KASSERT(powerof2(boundary), ("boundary is not a power of 2"));
- mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
+ vm_domain_free_assert_locked(VM_DOMAIN(domain));
if (low >= high)
return (NULL);
m_run = NULL;
@@ -1167,7 +1167,7 @@
KASSERT(npages > 0, ("npages is 0"));
KASSERT(powerof2(alignment), ("alignment is not a power of 2"));
KASSERT(powerof2(boundary), ("boundary is not a power of 2"));
- mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
+ vm_domain_free_assert_locked(VM_DOMAIN(seg->domain));
/* Compute the queue that is the best fit for npages. */
for (order = 0; (1 << order) < npages; order++);
/* Search for a run satisfying the specified conditions. */
Index: sys/vm/vm_reserv.h
===================================================================
--- sys/vm/vm_reserv.h
+++ sys/vm/vm_reserv.h
@@ -50,8 +50,14 @@
vm_page_t vm_reserv_alloc_contig(vm_object_t object, vm_pindex_t pindex,
int domain, u_long npages, vm_paddr_t low, vm_paddr_t high,
u_long alignment, vm_paddr_t boundary, vm_page_t mpred);
+vm_page_t vm_reserv_extend_contig(int req, vm_object_t object,
+ vm_pindex_t pindex, int domain, u_long npages,
+ vm_paddr_t low, vm_paddr_t high, u_long alignment,
+ vm_paddr_t boundary, vm_page_t mpred);
vm_page_t vm_reserv_alloc_page(vm_object_t object, vm_pindex_t pindex,
int domain, vm_page_t mpred);
+vm_page_t vm_reserv_extend(int req, vm_object_t object,
+ vm_pindex_t pindex, int domain, vm_page_t mpred);
void vm_reserv_break_all(vm_object_t object);
boolean_t vm_reserv_free_page(vm_page_t m);
void vm_reserv_init(void);
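
The split into vm_reserv_extend()/vm_reserv_extend_contig() and the vm_reserv_alloc_*() functions lets the common case (growing an existing reservation) run without the caller holding the domain free lock; only the fallback that may create a new reservation takes it. A sketch of the expected call sequence, assuming the caller holds the object write lock throughout:

	vm_page_t m;

	/* Fast path: extend an existing reservation, if any. */
	m = vm_reserv_extend(req, object, pindex, domain, mpred);
	if (m == NULL) {
		/* Slow path: may create a new reservation. */
		vm_domain_free_lock(VM_DOMAIN(domain));
		m = vm_reserv_alloc_page(object, pindex, domain, mpred);
		vm_domain_free_unlock(VM_DOMAIN(domain));
	}
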
Index: sys/vm/vm_reserv.c
===================================================================
--- sys/vm/vm_reserv.c
+++ sys/vm/vm_reserv.c
@@ -59,7 +59,9 @@
#include <vm/vm_param.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
+#include <vm/vm_pageout.h>
#include <vm/vm_phys.h>
+#include <vm/vm_pagequeue.h>
#include <vm/vm_radix.h>
#include <vm/vm_reserv.h>
@@ -163,17 +165,21 @@
* object's list of reservations.
*
* A partially populated reservation can be broken and reclaimed at any time.
+ *
+ * f - vm_domain_free_lock
+ * o - vm_reserv_object_lock
+ * c - constant after boot
*/
struct vm_reserv {
- TAILQ_ENTRY(vm_reserv) partpopq;
- LIST_ENTRY(vm_reserv) objq;
- vm_object_t object; /* containing object */
- vm_pindex_t pindex; /* offset within object */
- vm_page_t pages; /* first page of a superpage */
- int domain; /* NUMA domain */
- int popcnt; /* # of pages in use */
- char inpartpopq;
- popmap_t popmap[NPOPMAP]; /* bit vector of used pages */
+ TAILQ_ENTRY(vm_reserv) partpopq; /* (f) per-domain queue. */
+ LIST_ENTRY(vm_reserv) objq; /* (o, f) object queue */
+ vm_object_t object; /* (o, f) containing object */
+ vm_pindex_t pindex; /* (o, f) offset in object */
+ vm_page_t pages; /* (c) first page */
+ int domain; /* (c) NUMA domain. */
+ int popcnt; /* (f) # of pages in use */
+ char inpartpopq; /* (f) */
+ popmap_t popmap[NPOPMAP]; /* (f) bit vector, used pages */
};
/*
@@ -234,6 +240,25 @@
SYSCTL_LONG(_vm_reserv, OID_AUTO, reclaimed, CTLFLAG_RD,
&vm_reserv_reclaimed, 0, "Cumulative number of reclaimed reservations");
+/*
+ * The object lock pool is used to synchronize the rvq.  We cannot use a
+ * pool mutex because these locks are needed before malloc(9) works.
+ *
+ * The "hash" function could be made faster without divide and modulo.
+ */
+#define VM_RESERV_OBJ_LOCK_COUNT MAXCPU
+
+struct mtx_padalign vm_reserv_object_mtx[VM_RESERV_OBJ_LOCK_COUNT];
+
+#define vm_reserv_object_lock_idx(object) \
+ (((uintptr_t)object / sizeof(*object)) % VM_RESERV_OBJ_LOCK_COUNT)
+#define vm_reserv_object_lock_ptr(object) \
+ &vm_reserv_object_mtx[vm_reserv_object_lock_idx((object))]
+#define vm_reserv_object_lock(object) \
+ mtx_lock(vm_reserv_object_lock_ptr((object)))
+#define vm_reserv_object_unlock(object) \
+ mtx_unlock(vm_reserv_object_lock_ptr((object)))
+
static void vm_reserv_break(vm_reserv_t rv, vm_page_t m);
static void vm_reserv_depopulate(vm_reserv_t rv, int index);
static vm_reserv_t vm_reserv_from_page(vm_page_t m);
@@ -288,12 +313,12 @@
for (level = -1; level <= VM_NRESERVLEVEL - 2; level++) {
counter = 0;
unused_pages = 0;
- mtx_lock(&vm_page_queue_free_mtx);
+ vm_domain_free_lock(VM_DOMAIN(domain));
TAILQ_FOREACH(rv, &vm_rvq_partpop[domain], partpopq) {
counter++;
unused_pages += VM_LEVEL_0_NPAGES - rv->popcnt;
}
- mtx_unlock(&vm_page_queue_free_mtx);
+ vm_domain_free_unlock(VM_DOMAIN(domain));
sbuf_printf(&sbuf, "%6d, %7d, %6dK, %6d\n",
domain, level,
unused_pages * ((int)PAGE_SIZE / 1024), counter);
@@ -305,6 +330,49 @@
}
/*
+ * Remove a reservation from the object's objq.
+ */
+static void
+vm_reserv_remove(vm_reserv_t rv)
+{
+ vm_object_t object;
+
+ KASSERT(rv->object != NULL,
+ ("vm_reserv_remove: reserv %p is free", rv));
+ KASSERT(!rv->inpartpopq,
+ ("vm_reserv_remove: reserv %p's inpartpopq is TRUE", rv));
+ object = rv->object;
+ vm_reserv_object_lock(object);
+ LIST_REMOVE(rv, objq);
+ rv->object = NULL;
+ vm_reserv_object_unlock(object);
+}
+
+/*
+ * Insert a new reservation into the object's objq.
+ */
+static void
+vm_reserv_insert(vm_reserv_t rv, vm_object_t object, vm_pindex_t pindex)
+{
+ int i;
+
+ KASSERT(rv->object == NULL,
+ ("vm_reserv_insert: reserv %p isn't free", rv));
+ KASSERT(rv->popcnt == 0,
+ ("vm_reserv_insert: reserv %p's popcnt is corrupted", rv));
+ KASSERT(!rv->inpartpopq,
+ ("vm_reserv_insert: reserv %p's inpartpopq is TRUE", rv));
+ for (i = 0; i < NPOPMAP; i++)
+ KASSERT(rv->popmap[i] == 0,
+ ("vm_reserv_insert: reserv %p's popmap is corrupted", rv));
+ vm_reserv_object_lock(object);
+ rv->pindex = pindex;
+ rv->object = object;
+ LIST_INSERT_HEAD(&object->rvq, rv, objq);
+ vm_reserv_object_unlock(object);
+}
+
+/*
* Reduces the given reservation's population count. If the population count
* becomes zero, the reservation is destroyed. Additionally, moves the
* reservation to the tail of the partially populated reservation queue if the
@@ -316,7 +384,7 @@
vm_reserv_depopulate(vm_reserv_t rv, int index)
{
- mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
+ vm_domain_free_assert_locked(VM_DOMAIN(rv->domain));
KASSERT(rv->object != NULL,
("vm_reserv_depopulate: reserv %p is free", rv));
KASSERT(popmap_is_set(rv->popmap, index),
@@ -339,9 +407,7 @@
popmap_clear(rv->popmap, index);
rv->popcnt--;
if (rv->popcnt == 0) {
- LIST_REMOVE(rv, objq);
- rv->object = NULL;
- rv->domain = -1;
+ vm_reserv_remove(rv);
vm_phys_free_pages(rv->pages, VM_LEVEL_0_ORDER);
vm_reserv_freed++;
} else {
@@ -361,6 +427,43 @@
}
/*
+ * Returns an existing reservation or NULL, and initializes *msuccp.
+ */
+static vm_reserv_t
+vm_reserv_from_object(vm_object_t object, vm_pindex_t pindex,
+ vm_page_t mpred, vm_page_t *msuccp)
+{
+ vm_reserv_t rv;
+ vm_page_t msucc;
+
+ msucc = NULL;
+ if (mpred != NULL) {
+ KASSERT(mpred->object == object,
+ ("vm_reserv_from_object: object doesn't contain mpred"));
+ KASSERT(mpred->pindex < pindex,
+ ("vm_reserv_from_object: mpred doesn't precede pindex"));
+ rv = vm_reserv_from_page(mpred);
+ if (rv->object == object && vm_reserv_has_pindex(rv, pindex))
+ goto found;
+ msucc = TAILQ_NEXT(mpred, listq);
+ } else
+ msucc = TAILQ_FIRST(&object->memq);
+ if (msucc != NULL) {
+ KASSERT(msucc->pindex > pindex,
+ ("vm_reserv_from_object: msucc doesn't succeed pindex"));
+ rv = vm_reserv_from_page(msucc);
+ if (rv->object == object && vm_reserv_has_pindex(rv, pindex))
+ goto found;
+ }
+ rv = NULL;
+
+found:
+ *msuccp = msucc;
+
+ return (rv);
+}
+
+/*
* Returns TRUE if the given reservation contains the given page index and
* FALSE otherwise.
*/
@@ -381,7 +484,7 @@
vm_reserv_populate(vm_reserv_t rv, int index)
{
- mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
+ vm_domain_free_assert_locked(VM_DOMAIN(rv->domain));
KASSERT(rv->object != NULL,
("vm_reserv_populate: reserv %p is free", rv));
KASSERT(popmap_is_clear(rv->popmap, index),
@@ -423,6 +526,100 @@
* The object and free page queue must be locked.
*/
vm_page_t
+vm_reserv_extend_contig(int req, vm_object_t object, vm_pindex_t pindex,
+ int domain, u_long npages, vm_paddr_t low, vm_paddr_t high,
+ u_long alignment, vm_paddr_t boundary, vm_page_t mpred)
+{
+ struct vm_domain *vmd;
+ vm_paddr_t pa, size;
+ vm_page_t m, msucc;
+ vm_reserv_t rv;
+ int i, index;
+
+ VM_OBJECT_ASSERT_WLOCKED(object);
+ KASSERT(npages != 0, ("vm_reserv_extend_contig: npages is 0"));
+
+ /*
+ * Is a reservation fundamentally impossible?
+ */
+ if (pindex < VM_RESERV_INDEX(object, pindex) ||
+ pindex + npages > object->size || object->resident_page_count == 0)
+ return (NULL);
+
+ /*
+ * All reservations of a particular size have the same alignment.
+ * Assuming that the first page is allocated from a reservation, the
+ * least significant bits of its physical address can be determined
+ * from its offset from the beginning of the reservation and the size
+ * of the reservation.
+ *
+ * Could the specified index within a reservation of the smallest
+ * possible size satisfy the alignment and boundary requirements?
+ */
+ pa = VM_RESERV_INDEX(object, pindex) << PAGE_SHIFT;
+ if ((pa & (alignment - 1)) != 0)
+ return (NULL);
+ size = npages << PAGE_SHIFT;
+ if (((pa ^ (pa + size - 1)) & ~(boundary - 1)) != 0)
+ return (NULL);
+
+ /*
+ * Look for an existing reservation.
+ */
+ rv = vm_reserv_from_object(object, pindex, mpred, &msucc);
+ if (rv == NULL)
+ return (NULL);
+ KASSERT(object != kernel_object || rv->domain == domain,
+ ("vm_reserv_extend_contig: Domain mismatch from reservation."));
+ index = VM_RESERV_INDEX(object, pindex);
+ /* Does the allocation fit within the reservation? */
+ if (index + npages > VM_LEVEL_0_NPAGES)
+ return (NULL);
+ domain = rv->domain;
+ vmd = VM_DOMAIN(domain);
+ vm_domain_free_lock(vmd);
+ if (rv->object != object || !vm_domain_available(vmd, req, npages)) {
+ m = NULL;
+ goto out;
+ }
+ m = &rv->pages[index];
+ pa = VM_PAGE_TO_PHYS(m);
+ if (pa < low || pa + size > high || (pa & (alignment - 1)) != 0 ||
+ ((pa ^ (pa + size - 1)) & ~(boundary - 1)) != 0) {
+ m = NULL;
+ goto out;
+ }
+ /* Handle vm_page_rename(m, new_object, ...). */
+ for (i = 0; i < npages; i++) {
+ if (popmap_is_set(rv->popmap, index + i)) {
+ m = NULL;
+ goto out;
+ }
+ }
+ for (i = 0; i < npages; i++)
+ vm_reserv_populate(rv, index + i);
+ vm_domain_freecnt_adj(vmd, -npages);
+out:
+ vm_domain_free_unlock(vmd);
+ return (m);
+}
+
+/*
+ * Allocates a contiguous set of physical pages of the given size "npages"
+ * from newly created reservations. All of the physical pages
+ * must be at or above the given physical address "low" and below the given
+ * physical address "high". The given value "alignment" determines the
+ * alignment of the first physical page in the set. If the given value
+ * "boundary" is non-zero, then the set of physical pages cannot cross any
+ * physical address boundary that is a multiple of that value. Both
+ * "alignment" and "boundary" must be a power of two.
+ *
+ * The page "mpred" must immediately precede the offset "pindex" within the
+ * specified object.
+ *
+ * The object and free page queue must be locked.
+ */
+vm_page_t
vm_reserv_alloc_contig(vm_object_t object, vm_pindex_t pindex, int domain,
u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment,
vm_paddr_t boundary, vm_page_t mpred)
@@ -434,7 +631,7 @@
u_long allocpages, maxpages, minpages;
int i, index, n;
- mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
+ vm_domain_free_assert_locked(VM_DOMAIN(domain));
VM_OBJECT_ASSERT_WLOCKED(object);
KASSERT(npages != 0, ("vm_reserv_alloc_contig: npages is 0"));
@@ -463,52 +660,48 @@
return (NULL);
/*
- * Look for an existing reservation.
+ * Callers should've extended an existing reservation prior to
+ * calling this function. If a reservation exists it is
+ * incompatible with the allocation.
*/
- if (mpred != NULL) {
- KASSERT(mpred->object == object,
- ("vm_reserv_alloc_contig: object doesn't contain mpred"));
- KASSERT(mpred->pindex < pindex,
- ("vm_reserv_alloc_contig: mpred doesn't precede pindex"));
- rv = vm_reserv_from_page(mpred);
- if (rv->object == object && vm_reserv_has_pindex(rv, pindex))
- goto found;
- msucc = TAILQ_NEXT(mpred, listq);
- } else
- msucc = TAILQ_FIRST(&object->memq);
- if (msucc != NULL) {
- KASSERT(msucc->pindex > pindex,
- ("vm_reserv_alloc_contig: msucc doesn't succeed pindex"));
- rv = vm_reserv_from_page(msucc);
- if (rv->object == object && vm_reserv_has_pindex(rv, pindex))
- goto found;
- }
+ rv = vm_reserv_from_object(object, pindex, mpred, &msucc);
+ if (rv != NULL)
+ return (NULL);
/*
* Could at least one reservation fit between the first index to the
* left that can be used ("leftcap") and the first index to the right
* that cannot be used ("rightcap")?
+ *
+ * We must synchronize with the reserv object lock to protect the
+ * pindex/object of the resulting reservations against rename while
+ * we are inspecting.
*/
first = pindex - VM_RESERV_INDEX(object, pindex);
+ minpages = VM_RESERV_INDEX(object, pindex) + npages;
+ maxpages = roundup2(minpages, VM_LEVEL_0_NPAGES);
+ allocpages = maxpages;
+ vm_reserv_object_lock(object);
if (mpred != NULL) {
if ((rv = vm_reserv_from_page(mpred))->object != object)
leftcap = mpred->pindex + 1;
else
leftcap = rv->pindex + VM_LEVEL_0_NPAGES;
- if (leftcap > first)
+ if (leftcap > first) {
+ vm_reserv_object_unlock(object);
return (NULL);
+ }
}
- minpages = VM_RESERV_INDEX(object, pindex) + npages;
- maxpages = roundup2(minpages, VM_LEVEL_0_NPAGES);
- allocpages = maxpages;
if (msucc != NULL) {
if ((rv = vm_reserv_from_page(msucc))->object != object)
rightcap = msucc->pindex;
else
rightcap = rv->pindex;
if (first + maxpages > rightcap) {
- if (maxpages == VM_LEVEL_0_NPAGES)
+ if (maxpages == VM_LEVEL_0_NPAGES) {
+ vm_reserv_object_unlock(object);
return (NULL);
+ }
/*
* At least one reservation will fit between "leftcap"
@@ -519,6 +712,7 @@
allocpages = minpages;
}
}
+ vm_reserv_object_unlock(object);
/*
* Would the last new reservation extend past the end of the object?
@@ -549,7 +743,7 @@
VM_LEVEL_0_SIZE), boundary > VM_LEVEL_0_SIZE ? boundary : 0);
if (m == NULL)
return (NULL);
- KASSERT(vm_phys_domidx(m) == domain,
+ KASSERT(vm_phys_domain(m) == domain,
("vm_reserv_alloc_contig: Page domain does not match requested."));
/*
@@ -565,22 +759,7 @@
KASSERT(rv->pages == m,
("vm_reserv_alloc_contig: reserv %p's pages is corrupted",
rv));
- KASSERT(rv->object == NULL,
- ("vm_reserv_alloc_contig: reserv %p isn't free", rv));
- LIST_INSERT_HEAD(&object->rvq, rv, objq);
- rv->object = object;
- rv->pindex = first;
- rv->domain = domain;
- KASSERT(rv->popcnt == 0,
- ("vm_reserv_alloc_contig: reserv %p's popcnt is corrupted",
- rv));
- KASSERT(!rv->inpartpopq,
- ("vm_reserv_alloc_contig: reserv %p's inpartpopq is TRUE",
- rv));
- for (i = 0; i < NPOPMAP; i++)
- KASSERT(rv->popmap[i] == 0,
- ("vm_reserv_alloc_contig: reserv %p's popmap is corrupted",
- rv));
+ vm_reserv_insert(rv, object, first);
n = ulmin(VM_LEVEL_0_NPAGES - index, npages);
for (i = 0; i < n; i++)
vm_reserv_populate(rv, index + i);
@@ -594,31 +773,68 @@
allocpages -= VM_LEVEL_0_NPAGES;
} while (allocpages >= VM_LEVEL_0_NPAGES);
return (m_ret);
+}
+/*
+ * Attempts to extend an existing reservation and allocate the page to the
+ * object.
+ *
+ * The page "mpred" must immediately precede the offset "pindex" within the
+ * specified object.
+ *
+ * The object must be locked.
+ */
+vm_page_t
+vm_reserv_extend(int req, vm_object_t object, vm_pindex_t pindex, int domain,
+ vm_page_t mpred)
+{
+ struct vm_domain *vmd;
+ vm_page_t m, msucc;
+ vm_reserv_t rv;
+ int index, free_count;
+
+ VM_OBJECT_ASSERT_WLOCKED(object);
+
/*
- * Found a matching reservation.
+ * Could a reservation currently exist?
*/
-found:
- index = VM_RESERV_INDEX(object, pindex);
- /* Does the allocation fit within the reservation? */
- if (index + npages > VM_LEVEL_0_NPAGES)
+ if (pindex < VM_RESERV_INDEX(object, pindex) ||
+ pindex >= object->size || object->resident_page_count == 0)
return (NULL);
- m = &rv->pages[index];
- pa = VM_PAGE_TO_PHYS(m);
- if (pa < low || pa + size > high || (pa & (alignment - 1)) != 0 ||
- ((pa ^ (pa + size - 1)) & ~(boundary - 1)) != 0)
+
+ /*
+ * Look for an existing reservation.
+ */
+ rv = vm_reserv_from_object(object, pindex, mpred, &msucc);
+ if (rv == NULL)
return (NULL);
- /* Handle vm_page_rename(m, new_object, ...). */
- for (i = 0; i < npages; i++)
- if (popmap_is_set(rv->popmap, index + i))
- return (NULL);
- for (i = 0; i < npages; i++)
- vm_reserv_populate(rv, index + i);
+
+ KASSERT(object != kernel_object || rv->domain == domain,
+ ("vm_reserv_extend: Domain mismatch from reservation."));
+ domain = rv->domain;
+ vmd = VM_DOMAIN(domain);
+ index = VM_RESERV_INDEX(object, pindex);
+ m = &rv->pages[index];
+ vm_domain_free_lock(vmd);
+ if (vm_domain_available(vmd, req, 1) == 0 ||
+ /* Handle reclaim race. */
+ rv->object != object ||
+ /* Handle vm_page_rename(m, new_object, ...). */
+ popmap_is_set(rv->popmap, index))
+ m = NULL;
+ if (m != NULL)
+ vm_reserv_populate(rv, index);
+ free_count = vm_domain_freecnt_adj(vmd, -1);
+ vm_domain_free_unlock(vmd);
+
+ if (vm_paging_needed(vmd, free_count))
+ pagedaemon_wakeup(domain);
+
return (m);
}
/*
- * Allocates a page from an existing or newly created reservation.
+ * Allocates a page from a newly created reservation.
*
* The page "mpred" must immediately precede the offset "pindex" within the
* specified object.
@@ -632,9 +848,9 @@
vm_page_t m, msucc;
vm_pindex_t first, leftcap, rightcap;
vm_reserv_t rv;
- int i, index;
+ int index;
- mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
+ vm_domain_free_assert_locked(VM_DOMAIN(domain));
VM_OBJECT_ASSERT_WLOCKED(object);
/*
@@ -645,48 +861,45 @@
return (NULL);
/*
- * Look for an existing reservation.
+ * Callers should've extended an existing reservation prior to
+ * calling this function. If a reservation exists it is
+ * incompatible with the allocation.
*/
- if (mpred != NULL) {
- KASSERT(mpred->object == object,
- ("vm_reserv_alloc_page: object doesn't contain mpred"));
- KASSERT(mpred->pindex < pindex,
- ("vm_reserv_alloc_page: mpred doesn't precede pindex"));
- rv = vm_reserv_from_page(mpred);
- if (rv->object == object && vm_reserv_has_pindex(rv, pindex))
- goto found;
- msucc = TAILQ_NEXT(mpred, listq);
- } else
- msucc = TAILQ_FIRST(&object->memq);
- if (msucc != NULL) {
- KASSERT(msucc->pindex > pindex,
- ("vm_reserv_alloc_page: msucc doesn't succeed pindex"));
- rv = vm_reserv_from_page(msucc);
- if (rv->object == object && vm_reserv_has_pindex(rv, pindex))
- goto found;
- }
+ rv = vm_reserv_from_object(object, pindex, mpred, &msucc);
+ if (rv != NULL)
+ return (NULL);
/*
* Could a reservation fit between the first index to the left that
* can be used and the first index to the right that cannot be used?
+ *
+ * We must synchronize with the reserv object lock to protect the
+ * pindex/object of the resulting reservations against rename while
+ * we are inspecting.
*/
first = pindex - VM_RESERV_INDEX(object, pindex);
+ vm_reserv_object_lock(object);
if (mpred != NULL) {
if ((rv = vm_reserv_from_page(mpred))->object != object)
leftcap = mpred->pindex + 1;
else
leftcap = rv->pindex + VM_LEVEL_0_NPAGES;
- if (leftcap > first)
+ if (leftcap > first) {
+ vm_reserv_object_unlock(object);
return (NULL);
+ }
}
if (msucc != NULL) {
if ((rv = vm_reserv_from_page(msucc))->object != object)
rightcap = msucc->pindex;
else
rightcap = rv->pindex;
- if (first + VM_LEVEL_0_NPAGES > rightcap)
+ if (first + VM_LEVEL_0_NPAGES > rightcap) {
+ vm_reserv_object_unlock(object);
return (NULL);
+ }
}
+ vm_reserv_object_unlock(object);
/*
* Would a new reservation extend past the end of the object?
@@ -712,37 +925,10 @@
rv = vm_reserv_from_page(m);
KASSERT(rv->pages == m,
("vm_reserv_alloc_page: reserv %p's pages is corrupted", rv));
- KASSERT(rv->object == NULL,
- ("vm_reserv_alloc_page: reserv %p isn't free", rv));
- LIST_INSERT_HEAD(&object->rvq, rv, objq);
- rv->object = object;
- rv->pindex = first;
- rv->domain = domain;
- KASSERT(rv->popcnt == 0,
- ("vm_reserv_alloc_page: reserv %p's popcnt is corrupted", rv));
- KASSERT(!rv->inpartpopq,
- ("vm_reserv_alloc_page: reserv %p's inpartpopq is TRUE", rv));
- for (i = 0; i < NPOPMAP; i++)
- KASSERT(rv->popmap[i] == 0,
- ("vm_reserv_alloc_page: reserv %p's popmap is corrupted",
- rv));
+ vm_reserv_insert(rv, object, first);
index = VM_RESERV_INDEX(object, pindex);
vm_reserv_populate(rv, index);
return (&rv->pages[index]);
-
- /*
- * Found a matching reservation.
- */
-found:
- index = VM_RESERV_INDEX(object, pindex);
- m = &rv->pages[index];
- KASSERT(object != kernel_object || vm_phys_domidx(m) == domain,
- ("vm_reserv_alloc_page: Domain mismatch from reservation."));
- /* Handle vm_page_rename(m, new_object, ...). */
- if (popmap_is_set(rv->popmap, index))
- return (NULL);
- vm_reserv_populate(rv, index);
- return (m);
}
/*
@@ -759,14 +945,8 @@
{
int begin_zeroes, hi, i, lo;
- mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
- KASSERT(rv->object != NULL,
- ("vm_reserv_break: reserv %p is free", rv));
- KASSERT(!rv->inpartpopq,
- ("vm_reserv_break: reserv %p's inpartpopq is TRUE", rv));
- LIST_REMOVE(rv, objq);
- rv->object = NULL;
- rv->domain = -1;
+ vm_domain_free_assert_locked(VM_DOMAIN(rv->domain));
+ vm_reserv_remove(rv);
if (m != NULL) {
/*
* Since the reservation is being broken, there is no harm in
@@ -830,9 +1010,26 @@
vm_reserv_break_all(vm_object_t object)
{
vm_reserv_t rv;
+ struct vm_domain *vmd;
- mtx_lock(&vm_page_queue_free_mtx);
+ /*
+ * This access of object->rvq is unsynchronized so that the
+ * object rvq lock can nest after the domain_free lock. We
+ * must check for races in the results. However, the object
+ * lock prevents new additions, so we are guaranteed that when
+ * it returns NULL the object is properly empty.
+ */
+ vmd = NULL;
while ((rv = LIST_FIRST(&object->rvq)) != NULL) {
+ if (vmd != VM_DOMAIN(rv->domain)) {
+ if (vmd != NULL)
+ vm_domain_free_unlock(vmd);
+ vmd = VM_DOMAIN(rv->domain);
+ vm_domain_free_lock(vmd);
+ }
+ /* Reclaim race. */
+ if (rv->object != object)
+ continue;
KASSERT(rv->object == object,
("vm_reserv_break_all: reserv %p is corrupted", rv));
if (rv->inpartpopq) {
@@ -841,7 +1038,8 @@
}
vm_reserv_break(rv, NULL);
}
- mtx_unlock(&vm_page_queue_free_mtx);
+ if (vmd != NULL)
+ vm_domain_free_unlock(vmd);
}
/*
@@ -855,8 +1053,8 @@
{
vm_reserv_t rv;
- mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
rv = vm_reserv_from_page(m);
+ vm_domain_free_assert_locked(VM_DOMAIN(rv->domain));
if (rv->object == NULL)
return (FALSE);
vm_reserv_depopulate(rv, m - rv->pages);
@@ -886,6 +1084,8 @@
while (paddr + VM_LEVEL_0_SIZE <= seg->end) {
vm_reserv_array[paddr >> VM_LEVEL_0_SHIFT].pages =
PHYS_TO_VM_PAGE(paddr);
+ vm_reserv_array[paddr >> VM_LEVEL_0_SHIFT].domain =
+ seg->domain;
paddr += VM_LEVEL_0_SIZE;
}
}
@@ -902,8 +1102,8 @@
{
vm_reserv_t rv;
- mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
rv = vm_reserv_from_page(m);
+ vm_domain_free_assert_locked(VM_DOMAIN(rv->domain));
if (rv->object == NULL)
return (false);
return (popmap_is_clear(rv->popmap, m - rv->pages));
@@ -945,7 +1145,7 @@
vm_reserv_reclaim(vm_reserv_t rv)
{
- mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
+ vm_domain_free_assert_locked(VM_DOMAIN(rv->domain));
KASSERT(rv->inpartpopq,
("vm_reserv_reclaim: reserv %p's inpartpopq is FALSE", rv));
KASSERT(rv->domain >= 0 && rv->domain < vm_ndomains,
@@ -969,7 +1169,7 @@
{
vm_reserv_t rv;
- mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
+ vm_domain_free_assert_locked(VM_DOMAIN(domain));
if ((rv = TAILQ_FIRST(&vm_rvq_partpop[domain])) != NULL) {
vm_reserv_reclaim(rv);
return (TRUE);
@@ -993,7 +1193,7 @@
vm_reserv_t rv;
int hi, i, lo, low_index, next_free;
- mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
+ vm_domain_free_assert_locked(VM_DOMAIN(domain));
if (npages > VM_LEVEL_0_NPAGES - 1)
return (FALSE);
size = npages << PAGE_SHIFT;
@@ -1084,14 +1284,19 @@
VM_OBJECT_ASSERT_WLOCKED(new_object);
rv = vm_reserv_from_page(m);
if (rv->object == old_object) {
- mtx_lock(&vm_page_queue_free_mtx);
+ vm_domain_free_lock(VM_DOMAIN(rv->domain));
if (rv->object == old_object) {
+ vm_reserv_object_lock(old_object);
+ rv->object = NULL;
LIST_REMOVE(rv, objq);
- LIST_INSERT_HEAD(&new_object->rvq, rv, objq);
+ vm_reserv_object_unlock(old_object);
+ vm_reserv_object_lock(new_object);
rv->object = new_object;
rv->pindex -= old_object_offset;
+ LIST_INSERT_HEAD(&new_object->rvq, rv, objq);
+ vm_reserv_object_unlock(new_object);
}
- mtx_unlock(&vm_page_queue_free_mtx);
+ vm_domain_free_unlock(VM_DOMAIN(rv->domain));
}
}
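
The rename path shows the lock ordering this diff establishes: the domain free lock is taken first, and the per-object reservation pool lock nests inside it. rv->object only changes while the pool lock of the respective object is held, so lookups such as vm_reserv_from_object(), which hold only the object locks, never observe a half-moved reservation. In outline:

	vm_domain_free_lock(VM_DOMAIN(rv->domain));
	vm_reserv_object_lock(old_object);	/* nests after the free lock */
	rv->object = NULL;
	LIST_REMOVE(rv, objq);
	vm_reserv_object_unlock(old_object);
	vm_reserv_object_lock(new_object);
	rv->object = new_object;
	rv->pindex -= old_object_offset;
	LIST_INSERT_HEAD(&new_object->rvq, rv, objq);
	vm_reserv_object_unlock(new_object);
	vm_domain_free_unlock(VM_DOMAIN(rv->domain));
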
@@ -1121,6 +1326,7 @@
{
vm_paddr_t new_end;
size_t size;
+ int i;
/*
* Calculate the size (in bytes) of the reservation array. Round up
@@ -1139,6 +1345,10 @@
vm_reserv_array = (void *)(uintptr_t)pmap_map(vaddr, new_end, end,
VM_PROT_READ | VM_PROT_WRITE);
bzero(vm_reserv_array, size);
+
+ for (i = 0; i < VM_RESERV_OBJ_LOCK_COUNT; i++)
+ mtx_init(&vm_reserv_object_mtx[i], "resv obj lock", NULL,
+ MTX_DEF);
/*
* Return the next available physical address.
Index: sys/vm/vm_swapout.c
===================================================================
--- sys/vm/vm_swapout.c
+++ sys/vm/vm_swapout.c
@@ -650,7 +650,7 @@
loop:
if (vm_page_count_min()) {
- VM_WAIT;
+ vm_wait_min();
goto loop;
}
Index: sys/vm/vnode_pager.c
===================================================================
--- sys/vm/vnode_pager.c
+++ sys/vm/vnode_pager.c
@@ -1167,7 +1167,7 @@
* daemon up. This should probably be addressed XXX.
*/
- if (vm_cnt.v_free_count < vm_cnt.v_pageout_free_min)
+ if (vm_page_count_min())
flags |= VM_PAGER_PUT_SYNC;
/*