Index: head/share/man/man4/numa.4
===================================================================
--- head/share/man/man4/numa.4	(revision 339615)
+++ head/share/man/man4/numa.4	(revision 339616)
@@ -1,140 +1,150 @@
 .\" Copyright (c) 2015 Adrian Chadd <adrian@FreeBSD.org>
 .\" All rights reserved.
 .\"
 .\" Redistribution and use in source and binary forms, with or without
 .\" modification, are permitted provided that the following conditions
 .\" are met:
 .\" 1. Redistributions of source code must retain the above copyright
 .\"    notice, this list of conditions and the following disclaimer.
 .\" 2. Redistributions in binary form must reproduce the above copyright
 .\"    notice, this list of conditions and the following disclaimer in the
 .\"    documentation and/or other materials provided with the distribution.
 .\"
 .\" THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
 .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 .\" ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
 .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 .\" SUCH DAMAGE.
 .\"
 .\" $FreeBSD$
 .\"
-.Dd July 10, 2018
+.Dd October 22, 2018
 .Dt NUMA 4
 .Os
 .Sh NAME
 .Nm NUMA
 .Nd Non-Uniform Memory Access
 .Sh SYNOPSIS
-.Cd options SMP
-.Cd options MAXMEMDOM=16
+.Cd options MAXMEMDOM
+.Cd options NUMA
 .Pp
-.In sys/cpuset.h
-.In sys/bus.h
 .Sh DESCRIPTION
 Non-Uniform Memory Access is a computer architecture design which
 involves unequal costs between processors, memory and IO devices
 in a given system.
 .Pp
 In a
 .Nm
 architecture, the latency to access specific memory or IO devices
 depends upon which processor the memory or device is attached to.
 Accessing memory local to a processor is faster than accessing memory
 that is connected to one of the other processors.
+.Fx
+implements NUMA-aware memory allocation policies.
+By default it attempts to ensure that allocations are balanced across
+each domain.
+Users may override the default domain selection policy using
+.Xr cpuset 1 .
 .Pp
 .Nm
-is enabled when the
+support is enabled when the
 .Cd NUMA
-option is used in a kernel configuration
-file and the
+option is specified in the kernel configuration file.
+Each platform defines the
 .Cd MAXMEMDOM
-option is set to a value greater than 1.
+constant, which specifies the maximum number of supported NUMA domains.
+This constant may be specified in the kernel configuration file.
+.Nm
+support can be disabled at boot time by setting the
+.Va vm.numa.disabled
+tunable to 1.
+Other values for this tunable are currently ignored.
 .Pp
 Thread and process
 .Nm
 policies are controlled with the
 .Xr cpuset_getdomain 2
 and
 .Xr cpuset_setdomain 2
 syscalls.
 The
 .Xr cpuset 1
 tool is available for starting processes with a non-default
 policy, or to change the policy of an existing thread or process.
 .Pp
 Systems with non-uniform access to I/O devices may mark those devices
 with the local VM domain identifier.
 Drivers can find out their local domain information by calling
 .Xr bus_get_domain 9 .
 .Ss MIB Variables
 The operation of
 .Nm
 is controlled, and its state exposed, through these
 .Xr sysctl 8
 MIB variables:
 .Pp
 .Bl -tag -width indent -compact
 .It Va vm.ndomains
 The number of VM domains which have been detected.
 .Pp
 .It Va vm.phys_locality
 A table indicating the relative cost of each VM domain to each other.
 A value of 10 indicates equal cost.
 A value of -1 means the locality map is not available or no
 locality information is available.
 .Pp
 .It Va vm.phys_segs
 The map of physical memory, grouped by VM domain.
 .El
 .Sh IMPLEMENTATION NOTES
 The current
 .Nm
 implementation is VM-focused.
 The hardware
 .Nm
 domains are mapped into a contiguous, non-sparse
 VM domain space, starting from 0.
 Thus, VM domain information (for example, the domain identifier) is not
 necessarily the same as is found in the hardware specific information.
 Policy information is available in both struct thread and struct proc.
 .Sh SEE ALSO
 .Xr cpuset 1 ,
 .Xr cpuset_getaffinity 2 ,
 .Xr cpuset_setaffinity 2 ,
 .Xr bus_get_domain 9
 .Sh HISTORY
 .Nm
 first appeared in
 .Fx 9.0
 as a first-touch allocation policy with a fail-over to round-robin allocation
 and was not configurable.
 It was then modified in
 .Fx 10.0
 to implement a round-robin allocation policy and was also not configurable.
 .Pp
 The
 .Xr numa_getaffinity 2
 and
 .Xr numa_setaffinity 2
 syscalls and the
 .Xr numactl 1
 tool first appeared in
 .Fx 11.0
 and were removed in
 .Fx 12.0 .
-Current implementation appeared in
+The current implementation appeared in
 .Fx 12.0 .
 .Pp
 .Sh AUTHORS
 This manual page was written by
 .An Adrian Chadd Aq Mt adrian@FreeBSD.org .
 .Sh NOTES
 No statistics are kept to indicate how often
 .Nm
 allocation policies succeed or fail.
Index: head/sys/arm64/arm64/mp_machdep.c
===================================================================
--- head/sys/arm64/arm64/mp_machdep.c	(revision 339615)
+++ head/sys/arm64/arm64/mp_machdep.c	(revision 339616)
@@ -1,894 +1,895 @@
 /*-
  * Copyright (c) 2015-2016 The FreeBSD Foundation
  * All rights reserved.
  *
  * This software was developed by Andrew Turner under
  * sponsorship from the FreeBSD Foundation.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  */
 
 #include "opt_acpi.h"
 #include "opt_kstack_pages.h"
 #include "opt_platform.h"
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bus.h>
 #include <sys/cpu.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/module.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/sched.h>
 #include <sys/smp.h>
 
 #include <vm/vm.h>
 #include <vm/pmap.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_kern.h>
 
 #include <machine/machdep.h>
 #include <machine/intr.h>
 #include <machine/smp.h>
 #ifdef VFP
 #include <machine/vfp.h>
 #endif
 
 #ifdef DEV_ACPI
 #include <contrib/dev/acpica/include/acpi.h>
 #include <dev/acpica/acpivar.h>
 #endif
 
 #ifdef FDT
 #include <dev/ofw/openfirm.h>
 #include <dev/ofw/ofw_bus.h>
 #include <dev/ofw/ofw_bus_subr.h>
 #include <dev/ofw/ofw_cpu.h>
 #endif
 
 #include <dev/psci/psci.h>
 
 #include "pic_if.h"
 
 #define	MP_QUIRK_CPULIST	0x01	/* The list of cpus may be wrong, */
 					/* don't panic if one fails to start */
 static uint32_t mp_quirks;
 
 #ifdef FDT
 /*
  * Quirks keyed by a device-tree "compatible" string.  cpu_mp_start() tests
  * every entry against the root node and stores the quirks of a matching
  * entry in mp_quirks.  The table ends at the entry whose compat is NULL.
  */
 static struct {
 	const char *compat;
 	uint32_t quirks;
 } fdt_quirks[] = {
 	{ "arm,foundation-aarch64",	MP_QUIRK_CPULIST },
 	{ "arm,fvp-base",		MP_QUIRK_CPULIST },
 	/* This is incorrect in some DTS files */
 	{ "arm,vfp-base",		MP_QUIRK_CPULIST },
 	{ NULL, 0 },
 };
 #endif
 
 typedef void intr_ipi_send_t(void *, cpuset_t, u_int);
 typedef void intr_ipi_handler_t(void *);
 
 #define INTR_IPI_NAMELEN	(MAXCOMLEN + 1)
 /*
  * Per-IPI bookkeeping; ipi_sources[] holds one of these per IPI number.
  * Entries are filled in by intr_pic_ipi_setup().  A NULL ii_count marks an
  * entry that has not been set up.
  */
 struct intr_ipi {
 	/* Called from intr_ipi_dispatch() when the IPI is received. */
 	intr_ipi_handler_t *	ii_handler;
 	/* Handler argument; when NULL the trapframe is passed instead. */
 	void *			ii_handler_arg;
 	/* Called from intr_ipi_send() to deliver the IPI to a CPU set. */
 	intr_ipi_send_t *	ii_send;
 	/* First argument handed to ii_send. */
 	void *			ii_send_arg;
 	/* Copy of the name supplied at setup time. */
 	char			ii_name[INTR_IPI_NAMELEN];
 	/* Value returned by intr_ipi_setup_counters(). */
 	u_long *		ii_count;
 };
 
 static struct intr_ipi ipi_sources[INTR_IPI_COUNT];
 
 static struct intr_ipi *intr_ipi_lookup(u_int);
 static void intr_pic_ipi_setup(u_int, const char *, intr_ipi_handler_t *,
     void *);
 
 extern struct pcpu __pcpu[];
 
 static device_identify_t arm64_cpu_identify;
 static device_probe_t arm64_cpu_probe;
 static device_attach_t arm64_cpu_attach;
 
 static void ipi_ast(void *);
 static void ipi_hardclock(void *);
 static void ipi_preempt(void *);
 static void ipi_rendezvous(void *);
 static void ipi_stop(void *);
 
 struct mtx ap_boot_mtx;
 struct pcb stoppcbs[MAXCPU];
 
 static device_t cpu_list[MAXCPU];
 
 /*
  * Not all systems boot from the first CPU in the device tree. To work around
  * this we need to find which CPU we have booted from so when we later
  * enable the secondary CPUs we skip this one.
  */
 static int cpu0 = -1;
 
 void mpentry(unsigned long cpuid);
 void init_secondary(uint64_t);
 
 uint8_t secondary_stacks[MAXCPU - 1][PAGE_SIZE * KSTACK_PAGES] __aligned(16);
 
 /* Set to 1 once we're ready to let the APs out of the pen. */
 volatile int aps_ready = 0;
 
 /* Temporary variables for init_secondary()  */
 void *dpcpu[MAXCPU - 1];
 
 /*
  * Method table binding the identify, probe and attach routines defined
  * below to the "arm64_cpu" driver.
  */
 static device_method_t arm64_cpu_methods[] = {
 	/* Device interface */
 	DEVMETHOD(device_identify,	arm64_cpu_identify),
 	DEVMETHOD(device_probe,		arm64_cpu_probe),
 	DEVMETHOD(device_attach,	arm64_cpu_attach),
 
 	DEVMETHOD_END
 };
 
 static devclass_t arm64_cpu_devclass;
 /* Driver description: name, method table, and a zero third member. */
 static driver_t arm64_cpu_driver = {
 	"arm64_cpu",
 	arm64_cpu_methods,
 	0
 };
 
 /* NOTE(review): "cpu" is presumably the parent bus name - confirm. */
 DRIVER_MODULE(arm64_cpu, cpu, arm64_cpu_driver, arm64_cpu_devclass, 0, 0);
 
 /*
  * Identify method: ensure the parent has an "arm64_cpu" child, adding one
  * when none is found.  A failed add is only reported, not propagated.
  */
 static void
 arm64_cpu_identify(driver_t *driver, device_t parent)
 {
 	device_t child;
 
 	child = device_find_child(parent, "arm64_cpu", -1);
 	if (child == NULL) {
 		/* No child yet, so create it. */
 		child = BUS_ADD_CHILD(parent, 0, "arm64_cpu", -1);
 		if (child == NULL)
 			device_printf(parent, "add child failed\n");
 	}
 }
 
 /*
  * Probe method: accept the device only when its unit number is a usable
  * CPU id, i.e. below MAXCPU and not above mp_maxid.  Accepted devices are
  * marked quiet.  Returns 0 on success and EINVAL otherwise.
  */
 static int
 arm64_cpu_probe(device_t dev)
 {
 	u_int unit;
 
 	unit = device_get_unit(dev);
 	if (unit < MAXCPU && unit <= mp_maxid) {
 		device_quiet(dev);
 		return (0);
 	}
 
 	return (EINVAL);
 }
 
 /*
  * Attach method: validate the unit number, fetch the CPU's identifier
  * words with cpu_get_cpuid(), print them when booting verbosely and record
  * the device in cpu_list[].
  *
  * Returns 0 on success, or EINVAL when the unit number is out of range or
  * cpu_get_cpuid() returns NULL.
  */
 static int
 arm64_cpu_attach(device_t dev)
 {
 	const uint32_t *reg;
 	size_t i, reg_size;
 	u_int cpuid;
 
 	cpuid = device_get_unit(dev);
 
 	if (cpuid >= MAXCPU || cpuid > mp_maxid)
 		return (EINVAL);
 	KASSERT(cpu_list[cpuid] == NULL, ("Already have cpu %u", cpuid));
 
 	reg = cpu_get_cpuid(dev, &reg_size);
 	if (reg == NULL)
 		return (EINVAL);
 
 	if (bootverbose) {
 		device_printf(dev, "register <");
 		/*
 		 * The index has the same type as reg_size so the loop bound
 		 * is not a mixed signed/unsigned comparison.
 		 */
 		for (i = 0; i < reg_size; i++)
 			printf("%s%x", (i == 0) ? "" : " ", reg[i]);
 		printf(">\n");
 	}
 
 	/* Set the device to start it later */
 	cpu_list[cpuid] = dev;
 
 	return (0);
 }
 
 /*
  * Register the IPI handlers, set aps_ready and wake the secondary CPUs,
  * then poll until smp_started becomes non-zero.  Gives up and prints
  * "APs not started" once 2000 polls pass without smp_cpus growing.
  * Does nothing when mp_ncpus is 1.
  */
 static void
 release_aps(void *dummy __unused)
 {
 	int i, started;
 
 	/* Only release CPUs if they exist */
 	if (mp_ncpus == 1)
 		return;
 
 	intr_pic_ipi_setup(IPI_AST, "ast", ipi_ast, NULL);
 	intr_pic_ipi_setup(IPI_PREEMPT, "preempt", ipi_preempt, NULL);
 	intr_pic_ipi_setup(IPI_RENDEZVOUS, "rendezvous", ipi_rendezvous, NULL);
 	intr_pic_ipi_setup(IPI_STOP, "stop", ipi_stop, NULL);
 	intr_pic_ipi_setup(IPI_STOP_HARD, "stop hard", ipi_stop, NULL);
 	intr_pic_ipi_setup(IPI_HARDCLOCK, "hardclock", ipi_hardclock, NULL);
 
 	/* init_secondary() waits on aps_ready in a wfe loop. */
 	atomic_store_rel_int(&aps_ready, 1);
 	/* Wake up the other CPUs */
 	__asm __volatile(
 	    "dsb ishst	\n"
 	    "sev	\n"
 	    ::: "memory");
 
 	printf("Release APs...");
 
 	started = 0;
 	for (i = 0; i < 2000; i++) {
 		if (smp_started) {
 			printf("done\n");
 			return;
 		}
 		/*
 		 * Don't time out while we are making progress. Some large
 		 * systems can take a while to start all CPUs.
 		 */
 		if (smp_cpus > started) {
 			/* Restart the poll budget and remember the new count. */
 			i = 0;
 			started = smp_cpus;
 		}
 		DELAY(1000);
 	}
 
 	printf("APs not started\n");
 }
 SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL);
 
 /*
  * C-level initialisation of a secondary CPU.  The cpu argument indexes
  * __pcpu[].  The function waits for aps_ready, initialises per-CPU state,
  * bumps smp_cpus and finally calls sched_throw(); it panics if that call
  * ever returns.
  */
 void
 init_secondary(uint64_t cpu)
 {
 	struct pcpu *pcpup;
 
 	pcpup = &__pcpu[cpu];
 	/*
 	 * Set the pcpu pointer with a backup in tpidr_el1 to be
 	 * loaded when entering the kernel from userland.
 	 */
 	__asm __volatile(
 	    "mov x18, %0 \n"
 	    "msr tpidr_el1, %0" :: "r"(pcpup));
 
 	/* Spin until the BSP releases the APs */
 	while (!aps_ready)
 		__asm __volatile("wfe");
 
 	/* Initialize curthread */
 	KASSERT(PCPU_GET(idlethread) != NULL, ("no idle thread"));
 	pcpup->pc_curthread = pcpup->pc_idlethread;
 	pcpup->pc_curpcb = pcpup->pc_idlethread->td_pcb;
 
 	/*
 	 * Identify current CPU. This is necessary to setup
 	 * affinity registers and to provide support for
 	 * runtime chip identification.
 	 */
 	identify_cpu();
 	install_cpu_errata();
 
 	intr_pic_init_secondary();
 
 	/* Start per-CPU event timers. */
 	cpu_initclocks_ap();
 
 #ifdef VFP
 	vfp_init();
 #endif
 
 	dbg_init();
 	pan_enable();
 
 	/* Enable interrupts */
 	intr_enable();
 
 	/* The count update and the smp_started check run under ap_boot_mtx. */
 	mtx_lock_spin(&ap_boot_mtx);
 
 	atomic_add_rel_32(&smp_cpus, 1);
 
 	/* The CPU that brings the count up to mp_ncpus sets smp_started. */
 	if (smp_cpus == mp_ncpus) {
 		/* enable IPI's, tlb shootdown, freezes etc */
 		atomic_store_rel_int(&smp_started, 1);
 	}
 
 	mtx_unlock_spin(&ap_boot_mtx);
 
 	/* Enter the scheduler */
 	sched_throw(NULL);
 
 	panic("scheduler returned us to init_secondary");
 	/* NOTREACHED */
 }
 
 /*
  *  Send an IPI through the root interrupt controller.
  *
  *  Installed as ii_send by intr_pic_ipi_setup(); arg is the ii_send_arg
  *  stored there.  Asserts that intr_irq_root_dev is set.
  */
 static void
 pic_ipi_send(void *arg, cpuset_t cpus, u_int ipi)
 {
 
 	KASSERT(intr_irq_root_dev != NULL, ("%s: no root attached", __func__));
 	PIC_IPI_SEND(intr_irq_root_dev, arg, cpus, ipi);
 }
 
 /*
  *  Register handler hand, with argument arg, for IPI number ipi on the
  *  root interrupt controller and fill in the matching ipi_sources[] entry.
  *  Returns without touching the entry when PIC_IPI_SETUP() fails.
  *
  *  Not SMP coherent.
  */
 static void
 intr_pic_ipi_setup(u_int ipi, const char *name, intr_ipi_handler_t *hand,
     void *arg)
 {
 	struct intr_irqsrc *src;
 	struct intr_ipi *slot;
 
 	KASSERT(intr_irq_root_dev != NULL, ("%s: no root attached", __func__));
 	KASSERT(hand != NULL, ("%s: ipi %u no handler", __func__, ipi));
 
 	if (PIC_IPI_SETUP(intr_irq_root_dev, ipi, &src) != 0)
 		return;
 	src->isrc_handlers++;
 
 	slot = intr_ipi_lookup(ipi);
 	KASSERT(slot->ii_count == NULL, ("%s: ipi %u reused", __func__, ipi));
 
 	/* Receive side. */
 	slot->ii_handler = hand;
 	slot->ii_handler_arg = arg;
 
 	/* Send side. */
 	slot->ii_send = pic_ipi_send;
 	slot->ii_send_arg = src;
 
 	/* ii_count is written last; non-NULL marks the entry as set up. */
 	strlcpy(slot->ii_name, name, INTR_IPI_NAMELEN);
 	slot->ii_count = intr_ipi_setup_counters(name);
 }
 
 /*
  * Deliver IPI number ipi to the CPUs in cpus using the send routine
  * recorded for that IPI.  Panics when the IPI has not been set up.
  */
 static void
 intr_ipi_send(cpuset_t cpus, u_int ipi)
 {
 	struct intr_ipi *slot;
 
 	slot = intr_ipi_lookup(ipi);
 	if (slot->ii_count == NULL)
 		panic("%s: not setup IPI %u", __func__, ipi);
 
 	(*slot->ii_send)(slot->ii_send_arg, cpus, ipi);
 }
 
 /* IPI_AST handler: only records a KTR_SMP trace entry. */
 static void
 ipi_ast(void *dummy __unused)
 {
 
 	CTR0(KTR_SMP, "IPI_AST");
 }
 
 /* IPI_HARDCLOCK handler: traces the event and calls hardclockintr(). */
 static void
 ipi_hardclock(void *dummy __unused)
 {
 
 	CTR1(KTR_SMP, "%s: IPI_HARDCLOCK", __func__);
 	hardclockintr();
 }
 
 /* IPI_PREEMPT handler: traces the event and calls sched_preempt(curthread). */
 static void
 ipi_preempt(void *dummy __unused)
 {
 	CTR1(KTR_SMP, "%s: IPI_PREEMPT", __func__);
 	sched_preempt(curthread);
 }
 
 /* IPI_RENDEZVOUS handler: traces the event and runs smp_rendezvous_action(). */
 static void
 ipi_rendezvous(void *dummy __unused)
 {
 
 	CTR0(KTR_SMP, "IPI_RENDEZVOUS");
 	smp_rendezvous_action();
 }
 
 /*
  * Handler registered for both IPI_STOP and IPI_STOP_HARD.  Saves this
  * CPU's context in stoppcbs[], sets its bit in stopped_cpus and spins
  * until its bit appears in started_cpus, then clears both bits.
  */
 static void
 ipi_stop(void *dummy __unused)
 {
 	u_int cpu;
 
 	CTR0(KTR_SMP, "IPI_STOP");
 
 	cpu = PCPU_GET(cpuid);
 	savectx(&stoppcbs[cpu]);
 
 	/* Indicate we are stopped */
 	CPU_SET_ATOMIC(cpu, &stopped_cpus);
 
 	/* Wait for restart */
 	while (!CPU_ISSET(cpu, &started_cpus))
 		cpu_spinwait();
 
 	/* Clear both bits for this CPU now that it has been restarted. */
 	CPU_CLR_ATOMIC(cpu, &started_cpus);
 	CPU_CLR_ATOMIC(cpu, &stopped_cpus);
 	CTR0(KTR_SMP, "IPI_STOP (restart)");
 }
 
 /* Return the CPU topology; this always returns smp_topo_none(). */
 struct cpu_group *
 cpu_topo(void)
 {
 
 	return (smp_topo_none());
 }
 
 /*
  * Determine whether we are running on an MP machine.  Currently always
  * returns 1; see the TODO below.
  */
 int
 cpu_mp_probe(void)
 {
 
 	/* ARM64TODO: Read the u bit of mpidr_el1 to determine this */
 	return (1);
 }
 
 /*
  * Start the CPU with firmware-table index id, using target_cpu as the
  * psci_cpu_on() target.
  *
  * Returns false only when id is greater than mp_maxid.  Returns true for
  * the boot CPU (id == cpu0) without doing anything.  It also returns true
  * when psci_cpu_on() fails: the per-CPU data is torn down, mp_ncpus is
  * decremented and a message is printed.
  */
 static bool
 start_cpu(u_int id, uint64_t target_cpu)
 {
 	struct pcpu *pcpup;
 	vm_paddr_t pa;
 	u_int cpuid;
 	int err;
 
 	/* Check we are able to start this cpu */
 	if (id > mp_maxid)
 		return (false);
 
 	KASSERT(id < MAXCPU, ("Too many CPUs"));
 
 	/* We are already running on cpu 0 */
 	if (id == cpu0)
 		return (true);
 
 	/*
 	 * Rotate the CPU IDs to put the boot CPU as CPU 0. We keep the other
 	 * CPUs ordered as they are likely grouped into clusters so it can be
 	 * useful to keep that property, e.g. for the GICv3 driver to send
 	 * an IPI to all CPUs in the cluster.
 	 */
 	cpuid = id;
 	if (cpuid < cpu0)
 		cpuid += mp_maxid + 1;
 	cpuid -= cpu0;
 
 	pcpup = &__pcpu[cpuid];
 	pcpu_init(pcpup, cpuid, sizeof(struct pcpu));
 
 	/* dpcpu[] has no slot for the boot CPU, hence the cpuid - 1 index. */
 	dpcpu[cpuid - 1] = (void *)kmem_malloc(DPCPU_SIZE, M_WAITOK | M_ZERO);
 	dpcpu_init(dpcpu[cpuid - 1], cpuid);
 
 	printf("Starting CPU %u (%lx)\n", cpuid, target_cpu);
 	pa = pmap_extract(kernel_pmap, (vm_offset_t)mpentry);
 
 	err = psci_cpu_on(target_cpu, pa, cpuid);
 	if (err != PSCI_RETVAL_SUCCESS) {
 		/*
 		 * Panic here if INVARIANTS are enabled and PSCI failed to
 		 * start the requested CPU, unless psci_cpu_on returned
 		 * PSCI_MISSING to indicate we are unable to use it to start
 		 * the given CPU, or the MP_QUIRK_CPULIST quirk is set.
 		 */
 		KASSERT(err == PSCI_MISSING ||
 		    (mp_quirks & MP_QUIRK_CPULIST) == MP_QUIRK_CPULIST,
 		    ("Failed to start CPU %u (%lx)\n", id, target_cpu));
 
 		/* Undo the per-CPU setup done above. */
 		pcpu_destroy(pcpup);
 		kmem_free((vm_offset_t)dpcpu[cpuid - 1], DPCPU_SIZE);
 		dpcpu[cpuid - 1] = NULL;
 		mp_ncpus--;
 
 		/* Notify the user that the CPU failed to start */
 		printf("Failed to start CPU %u (%lx)\n", id, target_cpu);
 	} else
 		CPU_SET(cpuid, &all_cpus);
 
 	return (true);
 }
 
 #ifdef DEV_ACPI
 /*
  * MADT subtable callback used by cpu_init_acpi().  For each generic
  * interrupt entry, start the CPU at the running index that arg points to,
  * then advance the index.  Other subtable types are ignored.
  */
 static void
 madt_handler(ACPI_SUBTABLE_HEADER *entry, void *arg)
 {
 	ACPI_MADT_GENERIC_INTERRUPT *gicc;
 	u_int *next_id;
 
 	if (entry->Type != ACPI_MADT_TYPE_GENERIC_INTERRUPT)
 		return;
 
 	gicc = (ACPI_MADT_GENERIC_INTERRUPT *)entry;
 	next_id = arg;
 
 	start_cpu(*next_id, gicc->ArmMpidr);
 	(*next_id)++;
 }
 
 /*
  * Find and map the MADT, then run madt_handler() over its subtables with
  * a CPU index that starts at 0.  Returns silently when no MADT is found
  * and prints a message when the table cannot be mapped.
  */
 static void
 cpu_init_acpi(void)
 {
 	ACPI_TABLE_MADT *table;
 	vm_paddr_t pa;
 	u_int next_id;
 
 	pa = acpi_find_table(ACPI_SIG_MADT);
 	if (pa == 0)
 		return;
 
 	table = acpi_map_table(pa, ACPI_SIG_MADT);
 	if (table != NULL) {
 		next_id = 0;
 		acpi_walk_subtables(table + 1,
 		    (char *)table + table->Header.Length, madt_handler,
 		    &next_id);
 		acpi_unmap_table(table);
 	} else {
 		printf("Unable to map the MADT, not starting APs\n");
 	}
 }
 #endif
 
 #ifdef FDT
 static boolean_t
 cpu_init_fdt(u_int id, phandle_t node, u_int addr_size, pcell_t *reg)
 {
 	uint64_t target_cpu;
 	int domain;
 
 	target_cpu = reg[0];
 	if (addr_size == 2) {
 		target_cpu <<= 32;
 		target_cpu |= reg[1];
 	}
 
 	if (!start_cpu(id, target_cpu))
 		return (FALSE);
 
 	/* Try to read the numa node of this cpu */
-	if (OF_getencprop(node, "numa-node-id", &domain, sizeof(domain)) > 0) {
-		__pcpu[id].pc_domain = domain;
-		if (domain < MAXMEMDOM)
-			CPU_SET(id, &cpuset_domain[domain]);
-	}
+	if (vm_ndomains == 1 ||
+	    OF_getencprop(node, "numa-node-id", &domain, sizeof(domain)) <= 0)
+		domain = 0;
+	__pcpu[id].pc_domain = domain;
+	if (domain < MAXMEMDOM)
+		CPU_SET(id, &cpuset_domain[domain]);
 
 	return (TRUE);
 }
 #endif
 
 /*
  * Initialize and fire up non-boot processors.
  *
  * Sets up ap_boot_mtx, marks CPU 0 in all_cpus and then starts the other
  * CPUs through the ACPI or FDT enumeration selected by arm64_bus_method.
  * Both paths assert that cpu0 has already been determined.
  */
 void
 cpu_mp_start(void)
 {
 #ifdef FDT
 	phandle_t node;
 	int i;
 #endif
 
 	mtx_init(&ap_boot_mtx, "ap boot", NULL, MTX_SPIN);
 
 	CPU_SET(0, &all_cpus);
 
 	switch(arm64_bus_method) {
 #ifdef DEV_ACPI
 	case ARM64_BUS_ACPI:
 		KASSERT(cpu0 >= 0, ("Current CPU was not found"));
 		cpu_init_acpi();
 		break;
 #endif
 #ifdef FDT
 	case ARM64_BUS_FDT:
 		node = OF_peer(0);
 		/* The loop does not break, so the last matching entry wins. */
 		for (i = 0; fdt_quirks[i].compat != NULL; i++) {
 			if (ofw_bus_node_is_compatible(node,
 			    fdt_quirks[i].compat) != 0) {
 				mp_quirks = fdt_quirks[i].quirks;
 			}
 		}
 		KASSERT(cpu0 >= 0, ("Current CPU was not found"));
 		ofw_cpu_early_foreach(cpu_init_fdt, true);
 		break;
 #endif
 	default:
 		break;
 	}
 }
 
 /* Introduce rest of cores to the world; currently a no-op. */
 void
 cpu_mp_announce(void)
 {
 }
 
 #ifdef DEV_ACPI
 /*
  * MADT subtable callback used by cpu_count_acpi().  Counts generic
  * interrupt entries in the u_int that arg points to.  While cpu0 is still
  * negative, it is set to the index of the entry whose ArmMpidr equals
  * this CPU's masked mpidr_el1.
  */
 static void
 cpu_count_acpi_handler(ACPI_SUBTABLE_HEADER *entry, void *arg)
 {
 	ACPI_MADT_GENERIC_INTERRUPT *gicc;
 	u_int *count;
 	uint64_t self;
 
 	if (entry->Type != ACPI_MADT_TYPE_GENERIC_INTERRUPT)
 		return;
 
 	count = arg;
 	gicc = (ACPI_MADT_GENERIC_INTERRUPT *)entry;
 
 	if (cpu0 < 0) {
 		/* Compare only the bits kept by the mask. */
 		self = READ_SPECIALREG(mpidr_el1);
 		if ((self & 0xff00fffffful) == gicc->ArmMpidr)
 			cpu0 = *count;
 	}
 	(*count)++;
 }
 
 /*
  * Count the CPUs described by the MADT.  Returns 0 when the table is
  * missing or cannot be mapped; the mapping failure is also printed.
  */
 static u_int
 cpu_count_acpi(void)
 {
 	ACPI_TABLE_MADT *table;
 	vm_paddr_t pa;
 	u_int found;
 
 	found = 0;
 
 	pa = acpi_find_table(ACPI_SIG_MADT);
 	if (pa == 0)
 		return (found);
 
 	table = acpi_map_table(pa, ACPI_SIG_MADT);
 	if (table == NULL) {
 		printf("Unable to map the MADT, not starting APs\n");
 		return (found);
 	}
 
 	acpi_walk_subtables(table + 1, (char *)table + table->Header.Length,
 	    cpu_count_acpi_handler, &found);
 	acpi_unmap_table(table);
 
 	return (found);
 }
 #endif
 
 #ifdef FDT
 /*
  * Callback passed to ofw_cpu_early_foreach() by cpu_mp_setmaxid().  While
  * cpu0 is still negative, compare the node's reg value (one or two cells)
  * with this CPU's masked mpidr_el1 and record id in cpu0 on a match.
  * Always returns TRUE.
  */
 static boolean_t
 cpu_find_cpu0_fdt(u_int id, phandle_t node, u_int addr_size, pcell_t *reg)
 {
 	uint64_t wanted, self;
 
 	/* The boot CPU has already been found. */
 	if (cpu0 >= 0)
 		return (TRUE);
 
 	wanted = reg[0];
 	if (addr_size == 2)
 		wanted = (wanted << 32) | reg[1];
 
 	self = READ_SPECIALREG(mpidr_el1);
 	if ((self & 0xff00fffffful) == wanted)
 		cpu0 = id;
 
 	return (TRUE);
 }
 #endif
 
 /*
  * Set mp_ncpus and mp_maxid from the CPU count reported by the ACPI or
  * FDT enumeration selected by arm64_bus_method, capped at MAXCPU.  Falls
  * back to a single CPU when no count is available.
  */
 void
 cpu_mp_setmaxid(void)
 {
 #if defined(DEV_ACPI) || defined(FDT)
 	int cores;
 #endif
 
 	switch(arm64_bus_method) {
 #ifdef DEV_ACPI
 	case ARM64_BUS_ACPI:
 		/* cpu_count_acpi_handler() also sets cpu0 during this walk. */
 		cores = cpu_count_acpi();
 		if (cores > 0) {
 			cores = MIN(cores, MAXCPU);
 			if (bootverbose)
 				printf("Found %d CPUs in the ACPI tables\n",
 				    cores);
 			mp_ncpus = cores;
 			mp_maxid = cores - 1;
 			return;
 		}
 		break;
 #endif
 #ifdef FDT
 	case ARM64_BUS_FDT:
 		/* cpu_find_cpu0_fdt() sets cpu0 as a side effect of the walk. */
 		cores = ofw_cpu_early_foreach(cpu_find_cpu0_fdt, false);
 		if (cores > 0) {
 			cores = MIN(cores, MAXCPU);
 			if (bootverbose)
 				printf("Found %d CPUs in the device tree\n",
 				    cores);
 			mp_ncpus = cores;
 			mp_maxid = cores - 1;
 			return;
 		}
 		break;
 #endif
 	default:
 		break;
 	}
 
 	/* Reached when no enumeration method produced a positive count. */
 	if (bootverbose)
 		printf("No CPU data, limiting to 1 core\n");
 	mp_ncpus = 1;
 	mp_maxid = 0;
 }
 
 /*
  *  Return the ipi_sources[] entry for IPI number ipi.
  *  Panics when ipi is not below INTR_IPI_COUNT.
  */
 static struct intr_ipi *
 intr_ipi_lookup(u_int ipi)
 {
 
 	if (ipi >= INTR_IPI_COUNT)
 		panic("%s: no such IPI %u", __func__, ipi);
 
 	return (ipi_sources + ipi);
 }
 
 /*
  *  IPI dispatch entry point, meant to be called by the interrupt
  *  controller (or anybody who has the interrupt source mapped) once the
  *  IPI number is known.  Bumps the per-CPU counter for the IPI and runs
  *  its handler.  Panics when the IPI has not been set up.
  */
 void
 intr_ipi_dispatch(u_int ipi, struct trapframe *tf)
 {
 	struct intr_ipi *slot;
 	void *harg;
 
 	slot = intr_ipi_lookup(ipi);
 	if (slot->ii_count == NULL)
 		panic("%s: not setup IPI %u", __func__, ipi);
 
 	intr_ipi_increment_count(slot->ii_count, PCPU_GET(cpuid));
 
 	/* Fall back to the trapframe when no argument was registered. */
 	harg = slot->ii_handler_arg;
 	if (harg == NULL)
 		harg = tf;
 	slot->ii_handler(harg);
 }
 
 #ifdef notyet
 /*
  *  Map IPI into interrupt controller.
  *
  *  Fills in the namespace fields of isrc and registers it with the root
  *  interrupt controller.  Returns the PIC_REGISTER() result; on success
  *  isrc_dev is set and ipi_next_num is advanced.  Panics when ipi is not
  *  below INTR_IPI_COUNT.
  *
  *  NOTE(review): this code is compiled only under "notyet", and no
  *  declaration of ipi_next_num is visible in this part of the file.
  *
  *  Not SMP coherent.
  */
 static int
 ipi_map(struct intr_irqsrc *isrc, u_int ipi)
 {
 	/* Written by PIC_REGISTER() but not read afterwards. */
 	boolean_t is_percpu;
 	int error;
 
 	if (ipi >= INTR_IPI_COUNT)
 		panic("%s: no such IPI %u", __func__, ipi);
 
 	KASSERT(intr_irq_root_dev != NULL, ("%s: no root attached", __func__));
 
 	isrc->isrc_type = INTR_ISRCT_NAMESPACE;
 	isrc->isrc_nspc_type = INTR_IRQ_NSPC_IPI;
 	isrc->isrc_nspc_num = ipi_next_num;
 
 	error = PIC_REGISTER(intr_irq_root_dev, isrc, &is_percpu);
 	if (error == 0) {
 		isrc->isrc_dev = intr_irq_root_dev;
 		ipi_next_num++;
 	}
 	return (error);
 }
 
 /*
  *  Setup IPI handler to interrupt source.
  *
  *  Note that there could be more ways how to send and receive IPIs
  *  on a platform like fast interrupts for example. In that case,
  *  one can call this function with AISHF_NOALLOC flag set and then
  *  call intr_ipi_dispatch() when appropriate.
  *
  *  Returns EINVAL when filter is NULL, EEXIST when a filter is already
  *  installed, the ipi_map() error when mapping fails, and 0 otherwise.
  *
  *  NOTE(review): this code is compiled only under "notyet".  It assigns
  *  the struct intr_ipi * returned by intr_ipi_lookup() to a
  *  struct intr_irqsrc * and names intr_ipi_filter_t, which is not
  *  declared in this part of the file - reconcile before enabling.
  *
  *  Not SMP coherent.
  */
 int
 intr_ipi_set_handler(u_int ipi, const char *name, intr_ipi_filter_t *filter,
     void *arg, u_int flags)
 {
 	struct intr_irqsrc *isrc;
 	int error;
 
 	if (filter == NULL)
 		return(EINVAL);
 
 	isrc = intr_ipi_lookup(ipi);
 	if (isrc->isrc_ipifilter != NULL)
 		return (EEXIST);
 
 	/* Map the IPI into the controller unless the caller opted out. */
 	if ((flags & AISHF_NOALLOC) == 0) {
 		error = ipi_map(isrc, ipi);
 		if (error != 0)
 			return (error);
 	}
 
 	isrc->isrc_ipifilter = filter;
 	isrc->isrc_arg = arg;
 	isrc->isrc_handlers = 1;
 	isrc->isrc_count = intr_ipi_setup_counters(name);
 	isrc->isrc_index = 0; /* it should not be used in IPI case */
 
 	/* isrc_dev is set by ipi_map() when registration succeeds. */
 	if (isrc->isrc_dev != NULL) {
 		PIC_ENABLE_INTR(isrc->isrc_dev, isrc);
 		PIC_ENABLE_SOURCE(isrc->isrc_dev, isrc);
 	}
 	return (0);
 }
 #endif
 
 /* Send ipi to every CPU in all_cpus except the one running this code. */
 void
 ipi_all_but_self(u_int ipi)
 {
 	cpuset_t others;
 
 	CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi);
 
 	others = all_cpus;
 	CPU_CLR(PCPU_GET(cpuid), &others);
 	intr_ipi_send(others, ipi);
 }
 
 /* Send ipi to the single CPU numbered cpu. */
 void
 ipi_cpu(int cpu, u_int ipi)
 {
 	cpuset_t target;
 
 	CTR3(KTR_SMP, "%s: cpu: %d, ipi: %x", __func__, cpu, ipi);
 
 	CPU_ZERO(&target);
 	CPU_SET(cpu, &target);
 	intr_ipi_send(target, ipi);
 }
 
 /* Send ipi to every CPU in the caller-supplied set cpus. */
 void
 ipi_selected(cpuset_t cpus, u_int ipi)
 {
 
 	CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi);
 	intr_ipi_send(cpus, ipi);
 }
Index: head/sys/kern/kern_cpuset.c
===================================================================
--- head/sys/kern/kern_cpuset.c	(revision 339615)
+++ head/sys/kern/kern_cpuset.c	(revision 339616)
@@ -1,2279 +1,2285 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2008,  Jeffrey Roberson <jeff@freebsd.org>
  * All rights reserved.
  * 
  * Copyright (c) 2008 Nokia Corporation
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice unmodified, this list of conditions, and the following
  *    disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ddb.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/sysctl.h>
 #include <sys/ctype.h>
 #include <sys/sysproto.h>
 #include <sys/jail.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/refcount.h>
 #include <sys/sched.h>
 #include <sys/smp.h>
 #include <sys/syscallsubr.h>
 #include <sys/capsicum.h>
 #include <sys/cpuset.h>
 #include <sys/domainset.h>
 #include <sys/sx.h>
 #include <sys/queue.h>
 #include <sys/libkern.h>
 #include <sys/limits.h>
 #include <sys/bus.h>
 #include <sys/interrupt.h>
 #include <sys/vmmeter.h>
 
 #include <vm/uma.h>
 #include <vm/vm.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pageout.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_param.h>
 #include <vm/vm_phys.h>
 #include <vm/vm_pagequeue.h>
 
 #ifdef DDB
 #include <ddb/ddb.h>
 #endif /* DDB */
 
 /*
  * cpusets provide a mechanism for creating and manipulating sets of
  * processors for the purpose of constraining the scheduling of threads to
  * specific processors.
  *
  * Each process belongs to an identified set, by default this is set 1.  Each
  * thread may further restrict the cpus it may run on to a subset of this
  * named set.  This creates an anonymous set which other threads and processes
  * may not join by number.
  *
  * The named set is referred to herein as the 'base' set to avoid ambiguity.
  * This set is usually a child of a 'root' set while the anonymous set may
  * simply be referred to as a mask.  In the syscall api these are referred to
  * as the ROOT, CPUSET, and MASK levels where CPUSET is called 'base' here.
  *
  * Threads inherit their set from their creator whether it be anonymous or
  * not.  This means that anonymous sets are immutable because they may be
  * shared.  To modify an anonymous set a new set is created with the desired
  * mask and the same parent as the existing anonymous set.  This gives the
  * illusion of each thread having a private mask.
  *
  * Via the syscall apis a user may ask to retrieve or modify the root, base,
  * or mask that is discovered via a pid, tid, or setid.  Modifying a set
  * modifies all numbered and anonymous child sets to comply with the new mask.
  * Modifying a pid or tid's mask applies only to that tid but must still
  * exist within the assigned parent set.
  *
  * A thread may not be assigned to a group separate from other threads in
  * the process.  This is to remove ambiguity when the setid is queried with
  * a pid argument.  There is no other technical limitation.
  *
  * This somewhat complex arrangement is intended to make it easy for
  * applications to query available processors and bind their threads to
  * specific processors while also allowing administrators to dynamically
  * reprovision by changing sets which apply to groups of processes.
  *
  * A simple application should not concern itself with sets at all and
  * rather apply masks to its own threads via CPU_WHICH_TID and a -1 id
  * meaning 'curthread'.  It may query available cpus for that tid with a
  * getaffinity call using (CPU_LEVEL_CPUSET, CPU_WHICH_PID, -1, ...).
  */
 
 LIST_HEAD(domainlist, domainset);
 struct domainset __read_mostly domainset_prefer[MAXMEMDOM];
 struct domainset __read_mostly domainset_roundrobin;
 
 static uma_zone_t cpuset_zone;
 static uma_zone_t domainset_zone;
 static struct mtx cpuset_lock;
 static struct setlist cpuset_ids;
 static struct domainlist cpuset_domains;
 static struct unrhdr *cpuset_unr;
 static struct cpuset *cpuset_zero, *cpuset_default, *cpuset_kernel;
 static struct domainset domainset0, domainset2;
 
 /* Return the size of cpuset_t at the kernel level */
 SYSCTL_INT(_kern_sched, OID_AUTO, cpusetsize, CTLFLAG_RD | CTLFLAG_CAPRD,
     SYSCTL_NULL_INT_PTR, sizeof(cpuset_t), "sizeof(cpuset_t)");
 
 cpuset_t *cpuset_root;
 cpuset_t cpuset_domain[MAXMEMDOM];
 
 static int domainset_valid(const struct domainset *, const struct domainset *);
 
 /*
  * Return the base (non-anonymous) set for 'set': the set itself when it
  * carries a valid id, otherwise its parent.
  */
 static struct cpuset *
 cpuset_getbase(struct cpuset *set)
 {
 
 	/* Anonymous sets are marked by CPUSET_INVALID. */
 	return (set->cs_id == CPUSET_INVALID ? set->cs_parent : set);
 }
 
 /*
  * Climb the parent chain from 'set' and return the first set flagged
  * CPU_SET_ROOT, or the topmost set if no ancestor carries the flag.
  */
 static struct cpuset *
 cpuset_getroot(struct cpuset *set)
 {
 
 	for (;;) {
 		if ((set->cs_flags & CPU_SET_ROOT) != 0 ||
 		    set->cs_parent == NULL)
 			return (set);
 		set = set->cs_parent;
 	}
 }
 
 /*
  * Acquire a reference to a cpuset, all pointers must be tracked with refs.
  *
  * Bumps set->cs_ref and returns 'set' so the call can be used inline in an
  * assignment or return statement.
  */
 struct cpuset *
 cpuset_ref(struct cpuset *set)
 {
 
 	refcount_acquire(&set->cs_ref);
 	return (set);
 }
 
 /*
  * Locate the root of the tree containing 'set' and return it with an
  * additional reference held.
  */
 static struct cpuset *
 cpuset_refroot(struct cpuset *set)
 {
 	struct cpuset *root;
 
 	root = cpuset_getroot(set);
 	return (cpuset_ref(root));
 }
 
 /*
  * Locate the first non-anonymous set starting from 'set' and return it
  * with an additional reference held.  When 'set' itself is not anonymous
  * it is the one returned.
  */
 static struct cpuset *
 cpuset_refbase(struct cpuset *set)
 {
 	struct cpuset *base;
 
 	base = cpuset_getbase(set);
 	return (cpuset_ref(base));
 }
 
 /*
  * Release a reference in a context where it is safe to allocate.
  *
  * When refcount_release() returns non-zero the set is unlinked from its
  * sibling list (and from the id list if it is numbered), the reference it
  * held on its parent is released recursively, the set is returned to
  * cpuset_zone and its id, if any, is returned to cpuset_unr.
  */
 void
 cpuset_rel(struct cpuset *set)
 {
 	cpusetid_t id;
 
 	if (refcount_release(&set->cs_ref) == 0)
 		return;
 	mtx_lock_spin(&cpuset_lock);
 	LIST_REMOVE(set, cs_siblings);
 	/* Save the id: it is still needed after the set has been freed. */
 	id = set->cs_id;
 	if (id != CPUSET_INVALID)
 		LIST_REMOVE(set, cs_link);
 	mtx_unlock_spin(&cpuset_lock);
 	/* Parent release and frees happen with the spin lock dropped. */
 	cpuset_rel(set->cs_parent);
 	uma_zfree(cpuset_zone, set);
 	if (id != CPUSET_INVALID)
 		free_unr(cpuset_unr, id);
 }
 
 /*
  * Deferred release must be used when in a context that is not safe to
  * allocate/free.  This places any unreferenced sets on the list 'head'.
  *
  * The set is unlinked from the sibling and id lists here, but neither the
  * parent reference nor the memory is released; cpuset_rel_complete() does
  * that later.  Note that cs_link is reused to chain the set onto 'head'.
  */
 static void
 cpuset_rel_defer(struct setlist *head, struct cpuset *set)
 {
 
 	if (refcount_release(&set->cs_ref) == 0)
 		return;
 	mtx_lock_spin(&cpuset_lock);
 	LIST_REMOVE(set, cs_siblings);
 	if (set->cs_id != CPUSET_INVALID)
 		LIST_REMOVE(set, cs_link);
 	LIST_INSERT_HEAD(head, set, cs_link);
 	mtx_unlock_spin(&cpuset_lock);
 }
 
 /*
  * Complete a deferred release.  Removes the set from the list provided to
  * cpuset_rel_defer, drops the reference the set held on its parent and
  * returns the set to cpuset_zone.
  *
  * NOTE(review): unlike cpuset_rel() the id is not handed back to
  * cpuset_unr here; confirm this is intended for numbered sets.
  */
 static void
 cpuset_rel_complete(struct cpuset *set)
 {
 	LIST_REMOVE(set, cs_link);
 	cpuset_rel(set->cs_parent);
 	uma_zfree(cpuset_zone, set);
 }
 
 /*
  * Find a set based on an id.  Returns it with a ref.
  *
  * Returns NULL when 'setid' is CPUSET_INVALID, when no set on cpuset_ids
  * has that id, or when 'td' is jailed and the set found is not the jail's
  * cpuset or one of its descendants.  'td' must not be NULL.
  */
 static struct cpuset *
 cpuset_lookup(cpusetid_t setid, struct thread *td)
 {
 	struct cpuset *set;
 
 	if (setid == CPUSET_INVALID)
 		return (NULL);
 	mtx_lock_spin(&cpuset_lock);
 	LIST_FOREACH(set, &cpuset_ids, cs_link)
 		if (set->cs_id == setid)
 			break;
 	/* Take the reference before the list lock is dropped. */
 	if (set)
 		cpuset_ref(set);
 	mtx_unlock_spin(&cpuset_lock);
 
 	KASSERT(td != NULL, ("[%s:%d] td is NULL", __func__, __LINE__));
 	if (set != NULL && jailed(td->td_ucred)) {
 		struct cpuset *jset, *tset;
 
 		/* Walk towards the root looking for the jail's own set. */
 		jset = td->td_ucred->cr_prison->pr_cpuset;
 		for (tset = set; tset != NULL; tset = tset->cs_parent)
 			if (tset == jset)
 				break;
 		/* Not under the jail's set: hide it from the caller. */
 		if (tset == NULL) {
 			cpuset_rel(set);
 			set = NULL;
 		}
 	}
 
 	return (set);
 }
 
 /*
  * Create a set in the space provided in 'set' with the provided parameters.
  * The set is returned with a single ref.  May return EDEADLK if the set
  * will have no valid cpu based on restrictions from the parent.
  *
  * A NULL 'mask' or 'domain' is replaced by the parent's.  EDEADLK is also
  * returned when 'domain' fails domainset_valid() against the parent's
  * domain.  On success the new set holds a reference on 'parent', is linked
  * into the parent's child list and, when 'id' is not CPUSET_INVALID, into
  * cpuset_ids.
  */
 static int
 _cpuset_create(struct cpuset *set, struct cpuset *parent,
     const cpuset_t *mask, struct domainset *domain, cpusetid_t id)
 {
 
 	if (domain == NULL)
 		domain = parent->cs_domain;
 	if (mask == NULL)
 		mask = &parent->cs_mask;
 	if (!CPU_OVERLAP(&parent->cs_mask, mask))
 		return (EDEADLK);
 	/* The domain must be prepared ahead of time. */
 	if (!domainset_valid(parent->cs_domain, domain))
 		return (EDEADLK);
 	CPU_COPY(mask, &set->cs_mask);
 	LIST_INIT(&set->cs_children);
 	refcount_init(&set->cs_ref, 1);
 	set->cs_flags = 0;
 	mtx_lock_spin(&cpuset_lock);
 	set->cs_domain = domain;
 	/* Clip the requested mask to the parent's under the lock. */
 	CPU_AND(&set->cs_mask, &parent->cs_mask);
 	set->cs_id = id;
 	set->cs_parent = cpuset_ref(parent);
 	LIST_INSERT_HEAD(&parent->cs_children, set, cs_siblings);
 	if (set->cs_id != CPUSET_INVALID)
 		LIST_INSERT_HEAD(&cpuset_ids, set, cs_link);
 	mtx_unlock_spin(&cpuset_lock);
 
 	return (0);
 }
 
 /*
  * Create a new non-anonymous set with the requested parent and mask.  May
  * return failures if the mask is invalid or a new number can not be
  * allocated.
  *
  * Returns ENFILE when no id can be allocated from cpuset_unr, or the error
  * from _cpuset_create().  '*setp' is written only on success; on failure
  * it is left untouched so the caller never sees a pointer to a set that
  * has already been returned to cpuset_zone.
  */
 static int
 cpuset_create(struct cpuset **setp, struct cpuset *parent, const cpuset_t *mask)
 {
 	struct cpuset *set;
 	cpusetid_t id;
 	int error;
 
 	id = alloc_unr(cpuset_unr);
 	if (id == -1)
 		return (ENFILE);
 	set = uma_zalloc(cpuset_zone, M_WAITOK | M_ZERO);
 	error = _cpuset_create(set, parent, mask, NULL, id);
 	if (error == 0) {
 		/* Publish the set only once it is fully constructed. */
 		*setp = set;
 		return (0);
 	}
 	/* Creation failed: give back the id and the storage. */
 	free_unr(cpuset_unr, id);
 	uma_zfree(cpuset_zone, set);
 
 	return (error);
 }
 
 /*
  * Allocate 'count' zeroed cpusets from cpuset_zone and push each onto the
  * head of 'list'.  The allocations use M_WAITOK.
  */
 static void
 cpuset_freelist_add(struct setlist *list, int count)
 {
 	struct cpuset *fresh;
 	int remaining;
 
 	for (remaining = count; remaining > 0; remaining--) {
 		fresh = uma_zalloc(cpuset_zone, M_ZERO | M_WAITOK);
 		LIST_INSERT_HEAD(list, fresh, cs_link);
 	}
 }
 
 /*
  * Initialize 'list' as an empty list and pre-populate it with 'count'
  * freshly allocated cpusets.
  */
 static void
 cpuset_freelist_init(struct setlist *list, int count)
 {
 
 	LIST_INIT(list);
 	cpuset_freelist_add(list, count);
 }
 
 /*
  * Drain 'list', returning every cpuset still on it to cpuset_zone.
  */
 static void
 cpuset_freelist_free(struct setlist *list)
 {
 	struct cpuset *entry;
 
 	for (entry = LIST_FIRST(list); entry != NULL;
 	    entry = LIST_FIRST(list)) {
 		LIST_REMOVE(entry, cs_link);
 		uma_zfree(cpuset_zone, entry);
 	}
 }
 
 /*
  * Allocate 'count' zeroed domainsets from domainset_zone and push each
  * onto the head of 'list'.  The allocations use M_WAITOK.
  */
 static void
 domainset_freelist_add(struct domainlist *list, int count)
 {
 	struct domainset *fresh;
 	int remaining;
 
 	for (remaining = count; remaining > 0; remaining--) {
 		fresh = uma_zalloc(domainset_zone, M_ZERO | M_WAITOK);
 		LIST_INSERT_HEAD(list, fresh, ds_link);
 	}
 }
 
 /*
  * Initialize 'list' as an empty list and pre-populate it with 'count'
  * freshly allocated domainsets.
  */
 static void
 domainset_freelist_init(struct domainlist *list, int count)
 {
 
 	LIST_INIT(list);
 	domainset_freelist_add(list, count);
 }
 
 /*
  * Drain 'list', returning every domainset still on it to domainset_zone.
  */
 static void
 domainset_freelist_free(struct domainlist *list)
 {
 	struct domainset *entry;
 
 	for (entry = LIST_FIRST(list); entry != NULL;
 	    entry = LIST_FIRST(list)) {
 		LIST_REMOVE(entry, ds_link);
 		uma_zfree(domainset_zone, entry);
 	}
 }
 
 /*
  * Copy a domainset preserving mask and policy.
  *
  * Copies ds_mask, ds_policy and ds_prefer from 'from' to 'to'; no other
  * field of 'to' is written.
  */
 static void
 domainset_copy(const struct domainset *from, struct domainset *to)
 {
 
 	DOMAINSET_COPY(&from->ds_mask, &to->ds_mask);
 	to->ds_policy = from->ds_policy;
 	to->ds_prefer = from->ds_prefer;
 }
 
 /*
  * Compare two domainsets by key.  Returns 1 when mask, policy and
  * preferred domain all match, otherwise 0.
  */
 static int
 domainset_equal(const struct domainset *one, const struct domainset *two)
 {
 
 	if (DOMAINSET_CMP(&one->ds_mask, &two->ds_mask) != 0)
 		return (0);
 	if (one->ds_policy != two->ds_policy)
 		return (0);
 	return (one->ds_prefer == two->ds_prefer);
 }
 
 /*
  * Return non-zero if child is a valid subset of parent.  For the PREFER
  * policy only the preferred domain has to be present in the parent's
  * mask; for every other policy the whole child mask is checked.
  */
 static int
 domainset_valid(const struct domainset *parent, const struct domainset *child)
 {
 	if (child->ds_policy == DOMAINSET_POLICY_PREFER)
 		return (DOMAINSET_ISSET(child->ds_prefer, &parent->ds_mask));
 	return (DOMAINSET_SUBSET(&parent->ds_mask, &child->ds_mask));
 }
 
 /*
  * Return non-zero if restricting 'child' by 'parent' leaves something
  * usable.  For the PREFER policy the preferred domain must be present in
  * the parent's mask; otherwise the two masks merely have to overlap.
  */
 static int
 domainset_restrict(const struct domainset *parent,
     const struct domainset *child)
 {
 	if (child->ds_policy == DOMAINSET_POLICY_PREFER)
 		return (DOMAINSET_ISSET(child->ds_prefer, &parent->ds_mask));
 	return (DOMAINSET_OVERLAP(&parent->ds_mask, &child->ds_mask));
 }
 
 /*
  * Lookup or create a domainset.  The key is provided in ds_mask and
  * ds_policy.  If the domainset does not yet exist the storage in
  * 'domain' is used to insert.  Otherwise this storage is freed to the
  * domainset_zone and the existing domainset is returned.
  *
  * Equality is decided by domainset_equal().  When 'freelist' is not NULL
  * an unused 'domain' is pushed back onto it instead of being freed, so
  * the function can be used by callers that cannot free in their context.
  *
  * NOTE(review): the ds_cnt assertion runs before ds_cnt is computed below,
  * so it checks whatever value the caller left in the storage.
  */
 static struct domainset *
 _domainset_create(struct domainset *domain, struct domainlist *freelist)
 {
 	struct domainset *ndomain;
 	int i, j, max;
 
+	KASSERT(domain->ds_cnt <= vm_ndomains,
+	    ("invalid domain count in domainset %p", domain));
+	KASSERT(domain->ds_policy != DOMAINSET_POLICY_PREFER ||
+	    domain->ds_prefer < vm_ndomains,
+	    ("invalid preferred domain in domains %p", domain));
+
 	mtx_lock_spin(&cpuset_lock);
 	LIST_FOREACH(ndomain, &cpuset_domains, ds_link)
 		if (domainset_equal(ndomain, domain))
 			break;
 	/*
 	 * If the domain does not yet exist we insert it and initialize
 	 * various iteration helpers which are not part of the key.
 	 */
 	if (ndomain == NULL) {
 		LIST_INSERT_HEAD(&cpuset_domains, domain, ds_link);
 		domain->ds_cnt = DOMAINSET_COUNT(&domain->ds_mask);
 		max = DOMAINSET_FLS(&domain->ds_mask) + 1;
 		/* ds_order lists the member domains in ascending order. */
 		for (i = 0, j = 0; i < max; i++)
 			if (DOMAINSET_ISSET(i, &domain->ds_mask))
 				domain->ds_order[j++] = i;
 	}
 	mtx_unlock_spin(&cpuset_lock);
 	if (ndomain == NULL)
 		return (domain);
 	/* An equal set already exists: recycle the caller's storage. */
 	if (freelist != NULL)
 		LIST_INSERT_HEAD(freelist, domain, ds_link);
 	else
 		uma_zfree(domainset_zone, domain);
 	return (ndomain);
 	
 }
 
 /*
  * Strip from the mask every domain for which VM_DOMAIN_EMPTY() is true.
  * The removal is silent.  Returns true when nothing is left in the mask
  * afterwards, which the caller must treat as a failure.
  */
 static bool
 domainset_empty_vm(struct domainset *domain)
 {
 	int dom, limit;
 
 	limit = DOMAINSET_FLS(&domain->ds_mask) + 1;
 	for (dom = 0; dom < limit; dom++) {
 		if (!DOMAINSET_ISSET(dom, &domain->ds_mask))
 			continue;
 		if (VM_DOMAIN_EMPTY(dom))
 			DOMAINSET_CLR(dom, &domain->ds_mask);
 	}
 
 	return (DOMAINSET_EMPTY(&domain->ds_mask));
 }
 
 /*
  * Create or lookup a domainset based on the key held in 'domain'.
  *
  * Returns NULL when the key fails validation; otherwise returns the
  * domainset produced by _domainset_create() for a zone-allocated copy of
  * the key.  'domain' itself is never modified or retained.
  */
 struct domainset *
 domainset_create(const struct domainset *domain)
 {
 	struct domainset *ndomain;
 
 	/*
 	 * Validate the policy.  It must specify a useable policy number with
 	 * only valid domains.  Preferred must include the preferred domain
 	 * in the mask.
 	 */
 	if (domain->ds_policy <= DOMAINSET_POLICY_INVALID ||
 	    domain->ds_policy > DOMAINSET_POLICY_MAX)
 		return (NULL);
 	if (domain->ds_policy == DOMAINSET_POLICY_PREFER &&
 	    !DOMAINSET_ISSET(domain->ds_prefer, &domain->ds_mask))
 		return (NULL);
 	/* The mask may not name domains outside domainset0. */
 	if (!DOMAINSET_SUBSET(&domainset0.ds_mask, &domain->ds_mask))
 		return (NULL);
 	ndomain = uma_zalloc(domainset_zone, M_WAITOK | M_ZERO);
 	domainset_copy(domain, ndomain);
 	return _domainset_create(ndomain, NULL);
 }
 
 /*
  * Update thread domainset pointers.
  *
  * Walks every process in the system (skipping those still in PRS_NEW) and
  * reloads each thread's td_domain.dr_policy from its cpuset's cs_domain.
  * The kernel object's policy is then reloaded from cpuset_kernel.
  */
 static void
 domainset_notify(void)
 {
 	struct thread *td;
 	struct proc *p;
 
 	sx_slock(&allproc_lock);
 	FOREACH_PROC_IN_SYSTEM(p) {
 		PROC_LOCK(p);
 		if (p->p_state == PRS_NEW) {
 			PROC_UNLOCK(p);
 			continue;
 		}
 		FOREACH_THREAD_IN_PROC(p, td) {
 			thread_lock(td);
 			td->td_domain.dr_policy = td->td_cpuset->cs_domain;
 			thread_unlock(td);
 		}
 		PROC_UNLOCK(p);
 	}
 	sx_sunlock(&allproc_lock);
 	kernel_object->domain.dr_policy = cpuset_kernel->cs_domain;
 }
 
 /*
  * Create a new set that is a subset of a parent.
  *
  * Takes storage from the head of 'freelist', fills it with the key from
  * 'domain' restricted to 'pdomain's mask and hands it to
  * _domainset_create().  'freelist' is dereferenced without a check, so
  * the caller must guarantee it holds at least one entry.
  */
 static struct domainset *
 domainset_shadow(const struct domainset *pdomain,
     const struct domainset *domain, struct domainlist *freelist)
 {
 	struct domainset *ndomain;
 
 	ndomain = LIST_FIRST(freelist);
 	LIST_REMOVE(ndomain, ds_link);
 
 	/*
 	 * Initialize the key from the request.
 	 */
 	domainset_copy(domain, ndomain);
 
 	/*
 	 * Restrict the key by the parent.
 	 */
 	DOMAINSET_AND(&ndomain->ds_mask, &pdomain->ds_mask);
 
 	return _domainset_create(ndomain, freelist);
 }
 
 /*
  * Recursively check for errors that would occur from applying mask to
  * the tree of sets starting at 'set'.  Checks for sets that would become
  * empty as well as RDONLY flags.
  *
  * Returns EPERM for a CPU_SET_RDONLY set, EDEADLK when 'check_mask' is
  * set and 'mask' does not overlap the set's mask, and 0 otherwise.  With
  * 'check_mask' clear, 'mask' is taken as the set's new mask verbatim;
  * children are always checked against the resulting mask.  Must be called
  * with cpuset_lock held.
  */
 static int
 cpuset_testupdate(struct cpuset *set, cpuset_t *mask, int check_mask)
 {
 	struct cpuset *nset;
 	cpuset_t newmask;
 	int error;
 
 	mtx_assert(&cpuset_lock, MA_OWNED);
 	if (set->cs_flags & CPU_SET_RDONLY)
 		return (EPERM);
 	if (check_mask) {
 		if (!CPU_OVERLAP(&set->cs_mask, mask))
 			return (EDEADLK);
 		/* The mask this set would end up with. */
 		CPU_COPY(&set->cs_mask, &newmask);
 		CPU_AND(&newmask, mask);
 	} else
 		CPU_COPY(mask, &newmask);
 	error = 0;
 	LIST_FOREACH(nset, &set->cs_children, cs_siblings) 
 		if ((error = cpuset_testupdate(nset, &newmask, 1)) != 0)
 			break;
 	return (error);
 }
 
 /*
  * Intersect the mask of 'set' with 'mask' and push the result down to
  * every descendant.  No emptiness or permission checks are made here.
  * Must be called with cpuset_lock held.
  */
 static void
 cpuset_update(struct cpuset *set, cpuset_t *mask)
 {
 	struct cpuset *child;
 
 	mtx_assert(&cpuset_lock, MA_OWNED);
 	CPU_AND(&set->cs_mask, mask);
 	/* Each child is restricted by this set's freshly narrowed mask. */
 	LIST_FOREACH(child, &set->cs_children, cs_siblings) {
 		cpuset_update(child, &set->cs_mask);
 	}
 }
 
 /*
  * Modify the set 'set' to use a copy of the mask provided.  Apply this new
  * mask to restrict all children in the tree.  Checks for validity before
  * applying the changes.
  *
  * Returns the priv_check() error when the caller lacks PRIV_SCHED_CPUSET,
  * EPERM when a jailed caller targets a CPU_SET_ROOT set, EINVAL when
  * 'mask' is not a subset of the root's mask, or the error reported by
  * cpuset_testupdate().  Nothing is changed unless 0 is returned.
  */
 static int
 cpuset_modify(struct cpuset *set, cpuset_t *mask)
 {
 	struct cpuset *root;
 	int error;
 
 	error = priv_check(curthread, PRIV_SCHED_CPUSET);
 	if (error)
 		return (error);
 	/*
 	 * In case we are called from within the jail
 	 * we do not allow modifying the dedicated root
 	 * cpuset of the jail but may still allow to
 	 * change child sets.
 	 */
 	if (jailed(curthread->td_ucred) &&
 	    set->cs_flags & CPU_SET_ROOT)
 		return (EPERM);
 	/*
 	 * Verify that we have access to this set of
 	 * cpus.
 	 */
 	root = cpuset_getroot(set);
 	mtx_lock_spin(&cpuset_lock);
 	if (root && !CPU_SUBSET(&root->cs_mask, mask)) {
 		error = EINVAL;
 		goto out;
 	}
 	/* Dry run over the whole subtree before touching anything. */
 	error = cpuset_testupdate(set, mask, 0);
 	if (error)
 		goto out;
 	/* Replace this set's mask, then narrow all descendants. */
 	CPU_COPY(mask, &set->cs_mask);
 	cpuset_update(set, mask);
 out:
 	mtx_unlock_spin(&cpuset_lock);
 
 	return (error);
 }
 
 /*
  * Recursively check for errors that would occur from applying the
  * domainset 'dset' to the tree of sets starting at 'set'.  Checks for
  * sets that would be left without a usable domain as well as RDONLY
  * flags.
  *
  * 'orig' is the domainset the parent had before the change.  A set whose
  * domain differs from 'orig' must be restricted rather than simply
  * inheriting; every such set increments '*count' so the caller knows how
  * many new domainset structures the update will need.  Returns EPERM for
  * a CPU_SET_RDONLY set, EDEADLK when domainset_restrict() fails, and 0
  * otherwise.  Must be called with cpuset_lock held.
  *
  * NOTE(review): 'check_mask' is accepted but never read in this function.
  */
 static int
 cpuset_testupdate_domain(struct cpuset *set, struct domainset *dset,
     struct domainset *orig, int *count, int check_mask)
 {
 	struct cpuset *nset;
 	struct domainset *domain;
 	struct domainset newset;
 	int error;
 
 	mtx_assert(&cpuset_lock, MA_OWNED);
 	if (set->cs_flags & CPU_SET_RDONLY)
 		return (EPERM);
 	domain = set->cs_domain;
 	domainset_copy(domain, &newset);
 	if (!domainset_equal(domain, orig)) {
 		if (!domainset_restrict(domain, dset))
 			return (EDEADLK);
 		DOMAINSET_AND(&newset.ds_mask, &dset->ds_mask);
 		/* Count the number of domains that are changing. */
 		(*count)++;
 	}
 	error = 0;
 	LIST_FOREACH(nset, &set->cs_children, cs_siblings) 
 		if ((error = cpuset_testupdate_domain(nset, &newset, domain,
 		    count, 1)) != 0)
 			break;
 	return (error);
 }
 
 /*
  * Applies the domainset 'domain' to 'set' and its descendants without
  * checking for empty sets or permissions.
  *
  * 'orig' is the domainset the parent had before the update and 'domains'
  * is a free list that supplies storage for any shadow domainsets that
  * have to be created.  Must be called with cpuset_lock held.
  */
 static void
 cpuset_update_domain(struct cpuset *set, struct domainset *domain,
     struct domainset *orig, struct domainlist *domains)
 {
 	struct cpuset *nset;
 
 	mtx_assert(&cpuset_lock, MA_OWNED);
 	/*
 	 * If this domainset has changed from the parent we must calculate
 	 * a new set.  Otherwise it simply inherits from the parent.  When
 	 * we inherit from the parent we get a new mask and policy.  If the
 	 * set is modified from the parent we keep the policy and only
 	 * update the mask.
 	 */
 	if (set->cs_domain != orig) {
 		orig = set->cs_domain;
 		set->cs_domain = domainset_shadow(domain, orig, domains);
 	} else
 		set->cs_domain = domain;
 	/* Children compare against this set's previous domain. */
 	LIST_FOREACH(nset, &set->cs_children, cs_siblings) 
 		cpuset_update_domain(nset, set->cs_domain, orig, domains);
 
 	return;
 }
 
 /*
  * Modify the set 'set' to use a copy the domainset provided.  Apply this new
  * mask to restrict all children in the tree.  Checks for validity before
  * applying the changes.
  */
 static int
 cpuset_modify_domain(struct cpuset *set, struct domainset *domain)
 {
 	struct domainlist domains;
 	struct domainset temp;
 	struct domainset *dset;
 	struct cpuset *root;
 	int ndomains, needed;
 	int error;
 
 	error = priv_check(curthread, PRIV_SCHED_CPUSET);
 	if (error)
 		return (error);
 	/*
 	 * In case we are called from within the jail
 	 * we do not allow modifying the dedicated root
 	 * cpuset of the jail but may still allow to
 	 * change child sets.
 	 */
 	if (jailed(curthread->td_ucred) &&
 	    set->cs_flags & CPU_SET_ROOT)
 		return (EPERM);
 	domainset_freelist_init(&domains, 0);
 	domain = domainset_create(domain);
 	ndomains = needed = 0;
 	do {
 		if (ndomains < needed) {
 			domainset_freelist_add(&domains, needed - ndomains);
 			ndomains = needed;
 		}
 		root = cpuset_getroot(set);
 		mtx_lock_spin(&cpuset_lock);
 		dset = root->cs_domain;
 		/*
 		 * Verify that we have access to this set of domains.
 		 */
 		if (root && !domainset_valid(dset, domain)) {
 			error = EINVAL;
 			goto out;
 		}
 		/*
 		 * If applying prefer we keep the current set as the fallback.
 		 */
 		if (domain->ds_policy == DOMAINSET_POLICY_PREFER)
 			DOMAINSET_COPY(&set->cs_domain->ds_mask,
 			    &domain->ds_mask);
 		/*
 		 * Determine whether we can apply this set of domains and
 		 * how many new domain structures it will require.
 		 */
 		domainset_copy(domain, &temp);
 		needed = 0;
 		error = cpuset_testupdate_domain(set, &temp, set->cs_domain,
 		    &needed, 0);
 		if (error)
 			goto out;
 	} while (ndomains < needed);
 	dset = set->cs_domain;
 	cpuset_update_domain(set, domain, dset, &domains);
 out:
 	mtx_unlock_spin(&cpuset_lock);
 	domainset_freelist_free(&domains);
 	if (error == 0)
 		domainset_notify();
 
 	return (error);
 }
 
 /*
  * Resolve the 'which' parameter of several cpuset apis.
  *
  * For WHICH_PID and WHICH_TID return a locked proc and valid proc/tid.  Also
  * checks for permission via p_cansched().
  *
  * For WHICH_SET returns a valid set with a new reference.
  *
  * For WHICH_JAIL returns the cpuset of the child prison 'id' with a new
  * reference.  WHICH_IRQ and WHICH_DOMAIN succeed without producing any
  * output.  Any other 'which' yields EINVAL.
  *
  * -1 may be supplied for any argument to mean the current proc/thread or
  * the base set of the current thread.  May fail with ESRCH/EPERM.
  *
  * All three output pointers are set to NULL on entry, so outputs that do
  * not apply to the chosen 'which' are NULL on return.
  */
 int
 cpuset_which(cpuwhich_t which, id_t id, struct proc **pp, struct thread **tdp,
     struct cpuset **setp)
 {
 	struct cpuset *set;
 	struct thread *td;
 	struct proc *p;
 	int error;
 
 	*pp = p = NULL;
 	*tdp = td = NULL;
 	*setp = set = NULL;
 	switch (which) {
 	case CPU_WHICH_PID:
 		if (id == -1) {
 			PROC_LOCK(curproc);
 			p = curproc;
 			break;
 		}
 		if ((p = pfind(id)) == NULL)
 			return (ESRCH);
 		break;
 	case CPU_WHICH_TID:
 		if (id == -1) {
 			PROC_LOCK(curproc);
 			p = curproc;
 			td = curthread;
 			break;
 		}
 		td = tdfind(id, -1);
 		if (td == NULL)
 			return (ESRCH);
 		p = td->td_proc;
 		break;
 	case CPU_WHICH_CPUSET:
 		if (id == -1) {
 			thread_lock(curthread);
 			set = cpuset_refbase(curthread->td_cpuset);
 			thread_unlock(curthread);
 		} else
 			set = cpuset_lookup(id, curthread);
 		if (set) {
 			*setp = set;
 			return (0);
 		}
 		return (ESRCH);
 	case CPU_WHICH_JAIL:
 	{
 		/* Find `set' for prison with given id. */
 		struct prison *pr;
 
 		sx_slock(&allprison_lock);
 		pr = prison_find_child(curthread->td_ucred->cr_prison, id);
 		sx_sunlock(&allprison_lock);
 		if (pr == NULL)
 			return (ESRCH);
 		cpuset_ref(pr->pr_cpuset);
 		*setp = pr->pr_cpuset;
 		/* presumably prison_find_child() returned with pr_mtx held */
 		mtx_unlock(&pr->pr_mtx);
 		return (0);
 	}
 	case CPU_WHICH_IRQ:
 	case CPU_WHICH_DOMAIN:
 		return (0);
 	default:
 		return (EINVAL);
 	}
 	/* Only the PID and TID cases reach this point, with 'p' locked. */
 	error = p_cansched(curthread, p);
 	if (error) {
 		PROC_UNLOCK(p);
 		return (error);
 	}
 	/* A PID lookup reports the first thread of the process. */
 	if (td == NULL)
 		td = FIRST_THREAD_IN_PROC(p);
 	*pp = p;
 	*tdp = td;
 	return (0);
 }
 
 /*
  * Check whether an anonymous set with the given 'mask' and 'domain' could
  * be created on top of the base of 'set'.  Either argument may be NULL to
  * skip its check.  Returns EINVAL when a check fails and 0 otherwise;
  * nothing is modified.
  */
 static int
 cpuset_testshadow(struct cpuset *set, const cpuset_t *mask,
     const struct domainset *domain)
 {
 	struct cpuset *parent;
 	struct domainset *dset;
 
 	parent = cpuset_getbase(set);
 	/*
 	 * If we are restricting a cpu mask it must be a subset of the
 	 * parent or invalid CPUs have been specified.
 	 */
 	if (mask != NULL && !CPU_SUBSET(&parent->cs_mask, mask))
 		return (EINVAL);
 
 	/*
 	 * If we are restricting a domain mask it must be a subset of the
 	 * parent or invalid domains have been specified.
 	 */
 	dset = parent->cs_domain;
 	if (domain != NULL && !domainset_valid(dset, domain))
 		return (EINVAL);
 
 	return (0);
 }
 
 /*
  * Create an anonymous set with the provided mask in the space provided by
  * 'nset'.  If the passed in set is anonymous we use its parent otherwise
  * the new set is a child of 'set'.
  *
  * Storage for the new set is taken from the head of 'cpusets' and storage
  * for a new domainset, when 'domain' is not NULL, from 'domains'; neither
  * list is checked for emptiness.  A NULL 'mask' or 'domain' means "keep
  * what 'set' has".  On success the new set is removed from 'cpusets' and
  * returned through 'nsetp'; on failure it stays on the list.
  */
 static int
 cpuset_shadow(struct cpuset *set, struct cpuset **nsetp,
    const cpuset_t *mask, const struct domainset *domain,
    struct setlist *cpusets, struct domainlist *domains)
 {
 	struct cpuset *parent;
 	struct cpuset *nset;
 	struct domainset *dset;
 	struct domainset *d;
 	int error;
 
 	error = cpuset_testshadow(set, mask, domain);
 	if (error)
 		return (error);
 
 	parent = cpuset_getbase(set);
 	dset = parent->cs_domain;
 	if (mask == NULL)
 		mask = &set->cs_mask;
 	if (domain != NULL)
 		d = domainset_shadow(dset, domain, domains);
 	else
 		d = set->cs_domain;
 	nset = LIST_FIRST(cpusets);
 	error = _cpuset_create(nset, parent, mask, d, CPUSET_INVALID);
 	if (error == 0) {
 		LIST_REMOVE(nset, cs_link);
 		*nsetp = nset;
 	}
 	return (error);
 }
 
 /*
  * Point 'td' at the cpuset 'nset', refresh its domain policy pointer from
  * the new set and call sched_affinity() on the thread.  Returns the
  * cpuset the thread used before; no reference on it is released here.
  */
 static struct cpuset *
 cpuset_update_thread(struct thread *td, struct cpuset *nset)
 {
 	struct cpuset *tdset;
 
 	tdset = td->td_cpuset;
 	td->td_cpuset = nset;
 	td->td_domain.dr_policy = nset->cs_domain;
 	sched_affinity(td);
 
 	return (tdset);
 }
 
 /*
  * Dry-run counterpart of cpuset_setproc_maskthread(): report whether a
  * new anonymous set with 'mask' and 'domain' could be created for a thread
  * currently using 'tdset'.  A NULL 'mask' or 'domain' is replaced by the
  * thread's current one.  Returns the result of cpuset_testshadow().
  */
 static int
 cpuset_setproc_test_maskthread(struct cpuset *tdset, cpuset_t *mask,
     struct domainset *domain)
 {
 	struct cpuset *parent;
 
 	parent = cpuset_getbase(tdset);
 	if (mask == NULL)
 		mask = &tdset->cs_mask;
 	if (domain == NULL)
 		domain = tdset->cs_domain;
 	return cpuset_testshadow(parent, mask, domain);
 }
 
 /*
  * Build a new anonymous set for a thread currently using 'tdset', parented
  * on the thread's base set.  A NULL 'mask' or 'domain' is replaced by the
  * thread's current one.  Storage comes from 'freelist' and 'domainlist';
  * the new set is returned through 'nsetp'.  Returns the result of
  * cpuset_shadow().
  */
 static int
 cpuset_setproc_maskthread(struct cpuset *tdset, cpuset_t *mask,
     struct domainset *domain, struct cpuset **nsetp,
     struct setlist *freelist, struct domainlist *domainlist)
 {
 	struct cpuset *parent;
 
 	parent = cpuset_getbase(tdset);
 	if (mask == NULL)
 		mask = &tdset->cs_mask;
 	if (domain == NULL)
 		domain = tdset->cs_domain;
 	return cpuset_shadow(parent, nsetp, mask, domain, freelist,
 	    domainlist);
 }
 
 /*
  * Compute the cpu mask and domainset a thread using the anonymous set
  * 'tdset' would get if its process were moved to 'set'.  The results are
  * written to 'mask' and 'domain'.  Returns EDEADLK when either result
  * would be empty and 0 otherwise.
  */
 static int
 cpuset_setproc_setthread_mask(struct cpuset *tdset, struct cpuset *set,
     cpuset_t *mask, struct domainset *domain)
 {
 	struct cpuset *parent;
 
 	parent = cpuset_getbase(tdset);
 
 	/*
 	 * If the thread restricted its mask then apply that same
 	 * restriction to the new set, otherwise take it wholesale.
 	 */
 	if (CPU_CMP(&tdset->cs_mask, &parent->cs_mask) != 0) {
 		CPU_COPY(&tdset->cs_mask, mask);
 		CPU_AND(mask, &set->cs_mask);
 	} else
 		CPU_COPY(&set->cs_mask, mask);
 
 	/*
 	 * If the thread restricted the domain then we apply the
 	 * restriction to the new set but retain the policy.
 	 */
 	if (tdset->cs_domain != parent->cs_domain) {
 		domainset_copy(tdset->cs_domain, domain);
 		DOMAINSET_AND(&domain->ds_mask, &set->cs_domain->ds_mask);
 	} else
 		domainset_copy(set->cs_domain, domain);
 
 	if (CPU_EMPTY(mask) || DOMAINSET_EMPTY(&domain->ds_mask))
 		return (EDEADLK);
 
 	return (0);
 }
 
 /*
  * Dry-run counterpart of cpuset_setproc_setthread(): report whether a
  * thread using 'tdset' can be moved to 'set'.  A thread on a numbered set
  * can always be moved; for an anonymous set the combined mask and domain
  * are computed into scratch storage and only the verdict is returned.
  */
 static int
 cpuset_setproc_test_setthread(struct cpuset *tdset, struct cpuset *set)
 {
 	struct domainset domain;
 	cpuset_t mask;
 
 	if (tdset->cs_id != CPUSET_INVALID)
 		return (0);
 	return cpuset_setproc_setthread_mask(tdset, set, &mask, &domain);
 }
 
 /*
  * Produce the cpuset a thread currently using 'tdset' should switch to
  * when its process is moved to 'set'.  The result is returned through
  * 'nsetp': either 'set' itself with a new reference, or a new anonymous
  * set built from 'freelist' and 'domainlist'.
  */
 static int
 cpuset_setproc_setthread(struct cpuset *tdset, struct cpuset *set,
     struct cpuset **nsetp, struct setlist *freelist,
     struct domainlist *domainlist)
 {
 	struct domainset domain;
 	cpuset_t mask;
 	int error;
 
 	/*
 	 * If we're replacing on a thread that has not constrained the
 	 * original set we can simply accept the new set.
 	 */
 	if (tdset->cs_id != CPUSET_INVALID) {
 		*nsetp = cpuset_ref(set);
 		return (0);
 	}
 	error = cpuset_setproc_setthread_mask(tdset, set, &mask, &domain);
 	if (error)
 		return (error);
 
 	return cpuset_shadow(tdset, nsetp, &mask, &domain, freelist,
 	    domainlist);
 }
 
 /*
  * Handle three cases for updating an entire process.
  *
  * 1) Set is non-null.  This reparents all anonymous sets to the provided
  *    set and replaces all non-anonymous td_cpusets with the provided set.
  * 2) Mask is non-null.  This replaces or creates anonymous sets for every
  *    thread with the existing base as a parent.
  * 3) domain is non-null.  This creates anonymous sets for every thread
  *    and replaces the domain set.
  *
  * This is overly complicated because we can't allocate while holding a 
  * spinlock and spinlocks must be held while changing and examining thread
  * state.
  *
  * Returns the error from cpuset_which() when the process cannot be
  * resolved, or the first error reported by the per-thread test or apply
  * helpers, and 0 on success.  All spare cpusets and domainsets, and every
  * set released by the update, are freed before returning.
  */
 static int
 cpuset_setproc(pid_t pid, struct cpuset *set, cpuset_t *mask,
     struct domainset *domain)
 {
 	struct setlist freelist;
 	struct setlist droplist;
 	struct domainlist domainlist;
 	struct cpuset *nset;
 	struct thread *td;
 	struct proc *p;
 	int threads;
 	int nfree;
 	int error;
 
 	/*
 	 * The algorithm requires two passes due to locking considerations.
 	 * 
 	 * 1) Lookup the process and acquire the locks in the required order.
 	 * 2) If enough cpusets have not been allocated release the locks and
 	 *    allocate them.  Loop.
 	 */
 	cpuset_freelist_init(&freelist, 1);
 	domainset_freelist_init(&domainlist, 1);
 	/*
 	 * NOTE(review): nfree is set to 1 and then straight back to 0, so
 	 * the entry preallocated above is never counted and the loop always
 	 * allocates at least once more.  Confirm which value is intended.
 	 */
 	nfree = 1;
 	LIST_INIT(&droplist);
 	nfree = 0;
 	for (;;) {
 		error = cpuset_which(CPU_WHICH_PID, pid, &p, &td, &nset);
 		if (error)
 			goto out;
 		/* Leave the loop with the proc lock still held. */
 		if (nfree >= p->p_numthreads)
 			break;
 		threads = p->p_numthreads;
 		PROC_UNLOCK(p);
 		if (nfree < threads) {
 			cpuset_freelist_add(&freelist, threads - nfree);
 			domainset_freelist_add(&domainlist, threads - nfree);
 			nfree = threads;
 		}
 	}
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	/*
 	 * Now that the appropriate locks are held and we have enough cpusets,
 	 * make sure the operation will succeed before applying changes. The
 	 * proc lock prevents td_cpuset from changing between calls.
 	 */
 	error = 0;
 	FOREACH_THREAD_IN_PROC(p, td) {
 		thread_lock(td);
 		if (set != NULL)
 			error = cpuset_setproc_test_setthread(td->td_cpuset,
 			    set);
 		else
 			error = cpuset_setproc_test_maskthread(td->td_cpuset,
 			    mask, domain);
 		thread_unlock(td);
 		if (error)
 			goto unlock_out;
 	}
 	/*
 	 * Replace each thread's cpuset while using deferred release.  We
 	 * must do this because the thread lock must be held while operating
 	 * on the thread and this limits the type of operations allowed.
 	 */
 	FOREACH_THREAD_IN_PROC(p, td) {
 		thread_lock(td);
 		if (set != NULL)
 			error = cpuset_setproc_setthread(td->td_cpuset, set,
 			    &nset, &freelist, &domainlist);
 		else
 			error = cpuset_setproc_maskthread(td->td_cpuset, mask,
 			    domain, &nset, &freelist, &domainlist);
 		if (error) {
 			thread_unlock(td);
 			break;
 		}
 		/* The old set is queued on droplist and freed at 'out'. */
 		cpuset_rel_defer(&droplist, cpuset_update_thread(td, nset));
 		thread_unlock(td);
 	}
 unlock_out:
 	PROC_UNLOCK(p);
 out:
 	while ((nset = LIST_FIRST(&droplist)) != NULL)
 		cpuset_rel_complete(nset);
 	cpuset_freelist_free(&freelist);
 	domainset_freelist_free(&domainlist);
 	return (error);
 }
 
 /*
  * Format the words of a bitset as comma-separated hexadecimal values.
  *
  * Words are emitted in index order, starting with word 0.  Returns the
  * number of characters stored in buf, or 0 as soon as the remaining space
  * is judged too small for another separator or word.
  */
 static int
 bitset_strprint(char *buf, size_t bufsiz, const struct bitset *set, int setlen)
 {
 	char *cur;
 	size_t written;
 	int word;
 
 	cur = buf;
 	for (word = 0; word < __bitset_words(setlen); word++) {
 		/* Every word after the first is preceded by a comma. */
 		if (word > 0) {
 			if (bufsiz < 1)
 				return (0);
 			*cur++ = ',';
 			bufsiz--;
 		}
 		/* Conservative room check made before formatting the word. */
 		if (bufsiz < sizeof(__STRING(ULONG_MAX)))
 			return (0);
 		written = snprintf(cur, bufsiz, "%lx", set->__bits[word]);
 		cur += written;
 		bufsiz -= written;
 	}
 	return (cur - buf);
 }
 
 /*
  * Parse a bitset from the textual form produced by bitset_strprint():
  * hexadecimal words separated by commas, word 0 first.
  *
  * The set is zeroed first, so words missing from the string stay clear.
  * Parsing stops at the first position that does not hold a hexadecimal
  * word.  Returns the number of characters consumed from buf; callers
  * inspect buf[return value] to decide whether the whole string was valid.
  */
 static int
 bitset_strscan(struct bitset *set, int setlen, const char *buf)
 {
 	int i, ret;
 	const char *p;
 
 	BIT_ZERO(setlen, set);
 	p = buf;
 	for (i = 0; i < __bitset_words(setlen); i++) {
 		/*
 		 * A comma only separates two words.  Step over it without
 		 * advancing the word index: consuming an index here would
 		 * store every second word in the wrong slot and leave the
 		 * tail of a multi-word string unparsed.
 		 */
 		if (i != 0 && *p == ',')
 			p++;
 		ret = sscanf(p, "%lx", &set->__bits[i]);
 		if (ret == 0 || ret == -1)
 			break;
 		/* Advance past the digits that sscanf() just converted. */
 		while (isxdigit(*p))
 			p++;
 	}
 	return (p - buf);
 }
 
 /*
  * Render a cpuset_t into buf as comma-separated hexadecimal words and
  * return buf.  The caller must supply a buffer of at least CPUSETBUFSIZ
  * bytes.
  */
 char *
 cpusetobj_strprint(char *buf, const cpuset_t *set)
 {
 	const struct bitset *bits;
 
 	bits = (const struct bitset *)set;
 	(void)bitset_strprint(buf, CPUSETBUFSIZ, bits, CPU_SETSIZE);
 	return (buf);
 }
 
 /*
  * Fill in a cpuset_t from its string representation.
  *
  * Strings of CPUSETBUFSIZ characters or more are rejected.  Returns 0 when
  * the whole string was consumed by the parser and -1 otherwise.
  */
 int
 cpusetobj_strscan(cpuset_t *set, const char *buf)
 {
 	int consumed;
 
 	if (strlen(buf) > CPUSETBUFSIZ - 1)
 		return (-1);
 
 	consumed = bitset_strscan((struct bitset *)set, CPU_SETSIZE, buf);
 	/* Anything left over after the last word makes the string invalid. */
 	return (buf[consumed] == '\0' ? 0 : -1);
 }
 
 /*
  * Sysctl handler for a domainset specifier.  arg1 is a pointer to a
  * pointer to a domainset; when the user supplies a valid specifier the
  * pointed-to pointer is replaced with the matching domainset.
  *
  * The textual format is:
  * hex mask word 0,hex mask word 1,...:decimal policy:decimal preferred
  *
  * A NULL domainset is reported as "<NULL>".  Returns 0 on success, the
  * error from sysctl_handle_string(), or EINVAL for a malformed or rejected
  * specifier.
  */
 int
 sysctl_handle_domainset(SYSCTL_HANDLER_ARGS)
 {
 	char buf[DOMAINSETBUFSIZ];
 	struct domainset **dsetp;
 	struct domainset *created;
 	struct domainset *cur;
 	struct domainset key;
 	char *tail;
 	int error, policy, prefer;
 
 	dsetp = (struct domainset **)arg1;
 	cur = *dsetp;
 
 	/* Render the current value for the read side of the request. */
 	if (cur == NULL)
 		sprintf(buf, "<NULL>");
 	else {
 		tail = buf + bitset_strprint(buf, sizeof(buf),
 		    (const struct bitset *)&cur->ds_mask, DOMAINSET_SETSIZE);
 		sprintf(tail, ":%d:%d", cur->ds_policy, cur->ds_prefer);
 	}
 	error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
 	if (error != 0 || req->newptr == NULL)
 		return (error);
 
 	/*
 	 * A new value was written: parse the mask words first, then the
 	 * policy and preferred domain that follow them.
 	 */
 	memset(&key, 0, sizeof(key));
 	tail = buf + bitset_strscan((struct bitset *)&key.ds_mask,
 	    DOMAINSET_SETSIZE, buf);
 	if (tail == buf)
 		return (EINVAL);
 	if (sscanf(tail, ":%d:%d", &policy, &prefer) != 2)
 		return (EINVAL);
 	key.ds_policy = policy;
 	key.ds_prefer = prefer;
 
 	/* domainset_create() validates the policy. */
 	created = domainset_create(&key);
 	if (created == NULL)
 		return (EINVAL);
 	*dsetp = created;
 
 	return (0);
 }
 
 /*
  * Give a single thread a new anonymous cpu mask and/or domain policy.
  *
  * The thread is looked up by id, a shadow set is built from its current
  * cpuset under the thread lock, and the set it replaces is released once
  * the thread and process locks have been dropped.  Returns 0 or the error
  * from the lookup or from cpuset_shadow().
  */
 static int
 _cpuset_setthread(lwpid_t id, cpuset_t *mask, struct domainset *domain)
 {
 	struct domainlist domainlist;
 	struct setlist freelist;
 	struct cpuset *newset;
 	struct cpuset *oldset;
 	struct thread *td;
 	struct proc *p;
 	int error;
 
 	/* Preallocate everything that may be needed while locks are held. */
 	cpuset_freelist_init(&freelist, 1);
 	domainset_freelist_init(&domainlist, domain != NULL);
 	error = cpuset_which(CPU_WHICH_TID, id, &p, &td, &oldset);
 	if (error != 0)
 		goto out;
 	oldset = NULL;
 	thread_lock(td);
 	error = cpuset_shadow(td->td_cpuset, &newset, mask, domain,
 	    &freelist, &domainlist);
 	if (error == 0)
 		oldset = cpuset_update_thread(td, newset);
 	thread_unlock(td);
 	PROC_UNLOCK(p);
 	if (oldset != NULL)
 		cpuset_rel(oldset);
 out:
 	cpuset_freelist_free(&freelist);
 	domainset_freelist_free(&domainlist);
 	return (error);
 }
 
 /*
  * Give a single thread a new anonymous cpu mask, leaving its domain
  * policy untouched.
  */
 int
 cpuset_setthread(lwpid_t id, cpuset_t *mask)
 {
 	int error;
 
 	error = _cpuset_setthread(id, mask, NULL);
 	return (error);
 }
 
 /*
  * Bind an interrupt thread to one cpu, or to the whole root mask when
  * cpu is NOCPU.
  */
 int
 cpuset_setithread(lwpid_t id, int cpu)
 {
 	cpuset_t mask;
 
 	CPU_ZERO(&mask);
 	if (cpu != NOCPU)
 		CPU_SET(cpu, &mask);
 	else
 		CPU_COPY(cpuset_root, &mask);
 	return (_cpuset_setthread(id, &mask, NULL));
 }
 
 /*
  * Set up the static round-robin and per-domain "prefer" domainsets.  This
  * runs very early during boot, once NUMA information is available.
  *
  * Every set starts out covering all domains; the prefer sets additionally
  * record the index of the domain they favour.
  */
 void
 domainset_init(void)
 {
 	struct domainset *pref;
 	struct domainset *rr;
 	int domain;
 
 	rr = &domainset_roundrobin;
 	DOMAINSET_COPY(&all_domains, &rr->ds_mask);
 	rr->ds_policy = DOMAINSET_POLICY_ROUNDROBIN;
 	rr->ds_prefer = -1;
 	_domainset_create(rr, NULL);
 
 	for (domain = 0; domain < vm_ndomains; domain++) {
 		pref = &domainset_prefer[domain];
 		DOMAINSET_COPY(&all_domains, &pref->ds_mask);
 		pref->ds_policy = DOMAINSET_POLICY_PREFER;
 		pref->ds_prefer = domain;
 		_domainset_create(pref, NULL);
 	}
 }
 
 /*
  * Create the domainsets used by cpuset 0, 1 and cpuset 2.
  *
  * domainset0 is a first-touch policy over all domains and becomes the
  * policy of the current thread; domainset2 is an interleave policy over
  * the same mask and becomes the policy of the kernel object.  The cpuset
  * spin lock is initialized here as well.
  */
 void
 domainset_zero(void)
 {
 	struct domainset *dset;
 
 	/* The lock must exist before any domainset is registered below. */
 	mtx_init(&cpuset_lock, "cpuset", NULL, MTX_SPIN | MTX_RECURSE);
 
 	dset = &domainset0;
 	DOMAINSET_COPY(&all_domains, &dset->ds_mask);
 	dset->ds_policy = DOMAINSET_POLICY_FIRSTTOUCH;
 	dset->ds_prefer = -1;
 	curthread->td_domain.dr_policy = _domainset_create(dset, NULL);
 
 	/* domainset2 starts as a copy of domainset0 with another policy. */
 	domainset_copy(dset, &domainset2);
 	domainset2.ds_policy = DOMAINSET_POLICY_INTERLEAVE;
 	kernel_object->domain.dr_policy = _domainset_create(&domainset2, NULL);
 
 	/* Remove empty domains from the global policies. */
 	LIST_FOREACH(dset, &cpuset_domains, ds_link)
 		(void)domainset_empty_vm(dset);
 }
 
 /*
  * Creates system-wide cpusets and the cpuset for thread0 including three
  * sets:
  *
  * 0 - The root set which should represent all valid processors in the
  *     system.  It is initially created with a mask of all processors
  *     because we don't know what processors are valid until cpuset_init()
  *     runs.  This set is immutable.
  * 1 - The default set which all processes are a member of until changed.
  *     This allows an administrator to move all threads off of given cpus to
  *     dedicate them to high priority tasks or save power etc.
  * 2 - The kernel set which allows restriction and policy to be applied only
  *     to kernel threads and the kernel_object.
  *
  * The UMA zones backing cpusets and domainsets and the cpuset id allocator
  * are created here too.  Returns the default set (1).
  */
 struct cpuset *
 cpuset_thread0(void)
 {
 	struct cpuset *set;
 	int i;
 	int error __unused;
 
 	cpuset_zone = uma_zcreate("cpuset", sizeof(struct cpuset), NULL, NULL,
 	    NULL, NULL, UMA_ALIGN_CACHE, 0);
 	domainset_zone = uma_zcreate("domainset", sizeof(struct domainset),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_CACHE, 0);
 
 	/*
 	 * Create the root system set (0) for the whole machine.  Doesn't use
 	 * cpuset_create() due to NULL parent.
 	 */
 	set = uma_zalloc(cpuset_zone, M_WAITOK | M_ZERO);
 	CPU_COPY(&all_cpus, &set->cs_mask);
 	LIST_INIT(&set->cs_children);
 	LIST_INSERT_HEAD(&cpuset_ids, set, cs_link);
 	set->cs_ref = 1;
 	set->cs_flags = CPU_SET_ROOT | CPU_SET_RDONLY;
 	set->cs_domain = &domainset0;
 	cpuset_zero = set;
 	cpuset_root = &set->cs_mask;
 
 	/*
 	 * Now derive a default (1), modifiable set from that to give out.
 	 */
 	set = uma_zalloc(cpuset_zone, M_WAITOK | M_ZERO);
 	error = _cpuset_create(set, cpuset_zero, NULL, NULL, 1);
 	KASSERT(error == 0, ("Error creating default set: %d\n", error));
 	cpuset_default = set;
 	/*
 	 * Create the kernel set (2).  Its domain policy is switched to
 	 * domainset2 after creation.
 	 */
 	set = uma_zalloc(cpuset_zone, M_WAITOK | M_ZERO);
 	error = _cpuset_create(set, cpuset_zero, NULL, NULL, 2);
 	KASSERT(error == 0, ("Error creating kernel set: %d\n", error));
 	set->cs_domain = &domainset2;
 	cpuset_kernel = set;
 
 	/*
 	 * Initialize the unit allocator. 0 and 1 are allocated above.
 	 *
 	 * NOTE(review): the kernel set above was created with id 2, yet the
 	 * allocator's range also starts at 2 -- confirm that ids handed out
 	 * later cannot collide with the kernel set.
 	 */
 	cpuset_unr = new_unrhdr(2, INT_MAX, NULL);
 
 	/*
 	 * If MD code has not initialized per-domain cpusets, place all
 	 * CPUs in domain 0.
 	 */
 	for (i = 0; i < MAXMEMDOM; i++)
 		if (!CPU_EMPTY(&cpuset_domain[i]))
 			goto domains_set;
 	CPU_COPY(&all_cpus, &cpuset_domain[0]);
 domains_set:
 
 	return (cpuset_default);
 }
 
 /*
  * Move a thread into the kernel cpuset, dropping the reference on the set
  * it belonged to before.
  */
 void
 cpuset_kernthread(struct thread *td)
 {
 	struct cpuset *previous;
 
 	thread_lock(td);
 	previous = td->td_cpuset;
 	td->td_cpuset = cpuset_ref(cpuset_kernel);
 	thread_unlock(td);
 
 	/* The old set is released only after the thread lock is dropped. */
 	cpuset_rel(previous);
 }
 
 /*
  * Create a new cpuset below the prison's cpuset, using the prison set's
  * mask, and flag the new set as a root.
  *
  * The calling thread is not reparented to the new set; use
  * cpuset_setproc_update_set() for that.
  *
  * Returns 0 with the new set stored in *setp, or the error reported by
  * cpuset_create().
  */
 int
 cpuset_create_root(struct prison *pr, struct cpuset **setp)
 {
 	int error;
 
 	KASSERT(pr != NULL, ("[%s:%d] invalid pr", __func__, __LINE__));
 	KASSERT(setp != NULL, ("[%s:%d] invalid setp", __func__, __LINE__));
 
 	error = cpuset_create(setp, pr->pr_cpuset, &pr->pr_cpuset->cs_mask);
 	if (error != 0)
 		return (error);
 
 	KASSERT(*setp != NULL, ("[%s:%d] cpuset_create returned invalid data",
 	    __func__, __LINE__));
 
 	/* Mark the freshly created set as root. */
 	(*setp)->cs_flags |= CPU_SET_ROOT;
 
 	return (0);
 }
 
 /*
  * Move every thread of process p into the given cpuset.
  *
  * A reference on set is held for the duration of the call and dropped
  * again before returning, on success and on failure alike.  Returns 0 or
  * the error from cpuset_setproc().
  */
 int
 cpuset_setproc_update_set(struct proc *p, struct cpuset *set)
 {
 	int error;
 
 	KASSERT(p != NULL, ("[%s:%d] invalid proc", __func__, __LINE__));
 	KASSERT(set != NULL, ("[%s:%d] invalid set", __func__, __LINE__));
 
 	cpuset_ref(set);
 	error = cpuset_setproc(p->p_pid, set, NULL, NULL);
 	/*
 	 * Drop our temporary reference unconditionally; returning early on
 	 * error would leak it.  Other callers of cpuset_setproc() in this
 	 * file release their reference the same way.
 	 */
 	cpuset_rel(set);
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct cpuset_args {
 	cpusetid_t	*setid;
 };
 #endif
 int
 sys_cpuset(struct thread *td, struct cpuset_args *uap)
 {
 	struct cpuset *root;
 	struct cpuset *set;
 	int error;
 
 	thread_lock(td);
 	root = cpuset_refroot(td->td_cpuset);
 	thread_unlock(td);
 	error = cpuset_create(&set, root, &root->cs_mask);
 	cpuset_rel(root);
 	if (error)
 		return (error);
 	error = copyout(&set->cs_id, uap->setid, sizeof(set->cs_id));
 	if (error == 0)
 		error = cpuset_setproc(-1, set, NULL, NULL);
 	cpuset_rel(set);
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct cpuset_setid_args {
 	cpuwhich_t	which;
 	id_t		id;
 	cpusetid_t	setid;
 };
 #endif
 int
 sys_cpuset_setid(struct thread *td, struct cpuset_setid_args *uap)
 {
 
 	return (kern_cpuset_setid(td, uap->which, uap->id, uap->setid));
 }
 
 /*
  * Assign the process named by id to the cpuset with the given setid.
  *
  * Only CPU_WHICH_PID is accepted; any other selector yields EINVAL.
  * Returns ESRCH when the set cannot be looked up, otherwise the result of
  * cpuset_setproc().
  */
 int
 kern_cpuset_setid(struct thread *td, cpuwhich_t which,
     id_t id, cpusetid_t setid)
 {
 	struct cpuset *target;
 	int rv;
 
 	/* Presently we only support per-process sets. */
 	if (which != CPU_WHICH_PID)
 		return (EINVAL);
 
 	target = cpuset_lookup(setid, td);
 	if (target == NULL)
 		return (ESRCH);
 
 	rv = cpuset_setproc(id, target, NULL, NULL);
 	/* Drop the reference obtained from the lookup. */
 	cpuset_rel(target);
 	return (rv);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct cpuset_getid_args {
 	cpulevel_t	level;
 	cpuwhich_t	which;
 	id_t		id;
 	cpusetid_t	*setid;
 };
 #endif
 int
 sys_cpuset_getid(struct thread *td, struct cpuset_getid_args *uap)
 {
 
 	return (kern_cpuset_getid(td, uap->level, uap->which, uap->id,
 	    uap->setid));
 }
 
 /*
  * Look up the id of the cpuset selected by (level, which, id) and copy it
  * out to setid.
  *
  * CPU_LEVEL_WHICH is only meaningful together with CPU_WHICH_CPUSET.
  * Selectors that do not name a thread, process, cpuset or jail have no
  * cpuset to report and yield EINVAL.  Returns 0, the error from the
  * lookup, or the error from copyout().
  */
 int
 kern_cpuset_getid(struct thread *td, cpulevel_t level, cpuwhich_t which,
     id_t id, cpusetid_t *setid)
 {
 	struct cpuset *nset;
 	struct cpuset *set;
 	struct thread *ttd;
 	struct proc *p;
 	cpusetid_t tmpid;
 	int error;
 
 	if (level == CPU_LEVEL_WHICH && which != CPU_WHICH_CPUSET)
 		return (EINVAL);
 	error = cpuset_which(which, id, &p, &ttd, &set);
 	if (error)
 		return (error);
 	switch (which) {
 	case CPU_WHICH_TID:
 	case CPU_WHICH_PID:
 		/* Report the base set of the thread's current cpuset. */
 		thread_lock(ttd);
 		set = cpuset_refbase(ttd->td_cpuset);
 		thread_unlock(ttd);
 		PROC_UNLOCK(p);
 		break;
 	case CPU_WHICH_CPUSET:
 	case CPU_WHICH_JAIL:
 		break;
 	case CPU_WHICH_IRQ:
 	case CPU_WHICH_INTRHANDLER:
 	case CPU_WHICH_ITHREAD:
 	case CPU_WHICH_DOMAIN:
 	default:
 		/*
 		 * No cpuset is associated with these selectors; bail out
 		 * here rather than dereferencing a NULL set below.
 		 */
 		return (EINVAL);
 	}
 	switch (level) {
 	case CPU_LEVEL_ROOT:
 		/* Swap our reference for one on the root of the set. */
 		nset = cpuset_refroot(set);
 		cpuset_rel(set);
 		set = nset;
 		break;
 	case CPU_LEVEL_CPUSET:
 		break;
 	case CPU_LEVEL_WHICH:
 		break;
 	}
 	/* Snapshot the id so the reference can be dropped before copyout. */
 	tmpid = set->cs_id;
 	cpuset_rel(set);
 	return (copyout(&tmpid, setid, sizeof(tmpid)));
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct cpuset_getaffinity_args {
 	cpulevel_t	level;
 	cpuwhich_t	which;
 	id_t		id;
 	size_t		cpusetsize;
 	cpuset_t	*mask;
 };
 #endif
 int
 sys_cpuset_getaffinity(struct thread *td, struct cpuset_getaffinity_args *uap)
 {
 
 	return (kern_cpuset_getaffinity(td, uap->level, uap->which,
 	    uap->id, uap->cpusetsize, uap->mask));
 }
 
 /*
  * Fetch the cpu mask selected by (level, which, id) and copy cpusetsize
  * bytes of it out to maskp.
  *
  * cpusetsize must be at least sizeof(cpuset_t) and at most
  * CPU_MAXSIZE / NBBY, otherwise ERANGE is returned.  In capability mode
  * only CPU_LEVEL_WHICH on the caller's own thread or process (id -1) is
  * allowed.
  *
  * For CPU_LEVEL_ROOT and CPU_LEVEL_CPUSET the mask of the root or base set
  * is reported.  For CPU_LEVEL_WHICH the mask comes from the object itself:
  * a thread's set, the union over all threads of a process, a cpuset or
  * jail set, an interrupt's affinity, or the cpus of a memory domain.
  */
 int
 kern_cpuset_getaffinity(struct thread *td, cpulevel_t level, cpuwhich_t which,
     id_t id, size_t cpusetsize, cpuset_t *maskp)
 {
 	struct thread *ttd;
 	struct cpuset *nset;
 	struct cpuset *set;
 	struct proc *p;
 	cpuset_t *mask;
 	int error;
 	size_t size;
 
 	if (cpusetsize < sizeof(cpuset_t) || cpusetsize > CPU_MAXSIZE / NBBY)
 		return (ERANGE);
 	/* In Capability mode, you can only get your own CPU set. */
 	if (IN_CAPABILITY_MODE(td)) {
 		if (level != CPU_LEVEL_WHICH)
 			return (ECAPMODE);
 		if (which != CPU_WHICH_TID && which != CPU_WHICH_PID)
 			return (ECAPMODE);
 		if (id != -1)
 			return (ECAPMODE);
 	}
 	size = cpusetsize;
 	/* Zeroed, so any bytes beyond sizeof(cpuset_t) are copied out as 0. */
 	mask = malloc(size, M_TEMP, M_WAITOK | M_ZERO);
 	error = cpuset_which(which, id, &p, &ttd, &set);
 	if (error)
 		goto out;
 	switch (level) {
 	case CPU_LEVEL_ROOT:
 	case CPU_LEVEL_CPUSET:
 		switch (which) {
 		case CPU_WHICH_TID:
 		case CPU_WHICH_PID:
 			thread_lock(ttd);
 			set = cpuset_ref(ttd->td_cpuset);
 			thread_unlock(ttd);
 			break;
 		case CPU_WHICH_CPUSET:
 		case CPU_WHICH_JAIL:
 			break;
 		case CPU_WHICH_IRQ:
 		case CPU_WHICH_INTRHANDLER:
 		case CPU_WHICH_ITHREAD:
 		case CPU_WHICH_DOMAIN:
 			/*
 			 * NOTE(review): this jump skips the set release and
 			 * process unlock below; presumably neither is held
 			 * for these selectors -- confirm in cpuset_which().
 			 */
 			error = EINVAL;
 			goto out;
 		}
 		if (level == CPU_LEVEL_ROOT)
 			nset = cpuset_refroot(set);
 		else
 			nset = cpuset_refbase(set);
 		CPU_COPY(&nset->cs_mask, mask);
 		cpuset_rel(nset);
 		break;
 	case CPU_LEVEL_WHICH:
 		switch (which) {
 		case CPU_WHICH_TID:
 			thread_lock(ttd);
 			CPU_COPY(&ttd->td_cpuset->cs_mask, mask);
 			thread_unlock(ttd);
 			break;
 		case CPU_WHICH_PID:
 			/* Accumulate the masks of every thread in the proc. */
 			FOREACH_THREAD_IN_PROC(p, ttd) {
 				thread_lock(ttd);
 				CPU_OR(mask, &ttd->td_cpuset->cs_mask);
 				thread_unlock(ttd);
 			}
 			break;
 		case CPU_WHICH_CPUSET:
 		case CPU_WHICH_JAIL:
 			CPU_COPY(&set->cs_mask, mask);
 			break;
 		case CPU_WHICH_IRQ:
 		case CPU_WHICH_INTRHANDLER:
 		case CPU_WHICH_ITHREAD:
 			error = intr_getaffinity(id, which, mask);
 			break;
 		case CPU_WHICH_DOMAIN:
 			if (id < 0 || id >= MAXMEMDOM)
 				error = ESRCH;
 			else
 				CPU_COPY(&cpuset_domain[id], mask);
 			break;
 		}
 		break;
 	default:
 		error = EINVAL;
 		break;
 	}
 	/* Undo whatever the lookup (or the reference above) left held. */
 	if (set)
 		cpuset_rel(set);
 	if (p)
 		PROC_UNLOCK(p);
 	if (error == 0)
 		error = copyout(mask, maskp, size);
 out:
 	free(mask, M_TEMP);
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct cpuset_setaffinity_args {
 	cpulevel_t	level;
 	cpuwhich_t	which;
 	id_t		id;
 	size_t		cpusetsize;
 	const cpuset_t	*mask;
 };
 #endif
 int
 sys_cpuset_setaffinity(struct thread *td, struct cpuset_setaffinity_args *uap)
 {
 
 	return (kern_cpuset_setaffinity(td, uap->level, uap->which,
 	    uap->id, uap->cpusetsize, uap->mask));
 }
 
 /*
  * Apply the cpu mask read from maskp to the object selected by
  * (level, which, id).
  *
  * cpusetsize must be at least sizeof(cpuset_t) and at most
  * CPU_MAXSIZE / NBBY, otherwise ERANGE is returned; bytes supplied beyond
  * sizeof(cpuset_t) must all be zero or EINVAL is returned.  In capability
  * mode only CPU_LEVEL_WHICH on the caller's own thread or process (id -1)
  * is allowed.
  *
  * For CPU_LEVEL_ROOT and CPU_LEVEL_CPUSET the root or base set of the
  * selected object is modified.  For CPU_LEVEL_WHICH the mask is applied to
  * the thread, process, cpuset, jail or interrupt itself.
  */
 int
 kern_cpuset_setaffinity(struct thread *td, cpulevel_t level, cpuwhich_t which,
     id_t id, size_t cpusetsize, const cpuset_t *maskp)
 {
 	struct cpuset *nset;
 	struct cpuset *set;
 	struct thread *ttd;
 	struct proc *p;
 	cpuset_t *mask;
 	int error;
 
 	if (cpusetsize < sizeof(cpuset_t) || cpusetsize > CPU_MAXSIZE / NBBY)
 		return (ERANGE);
 	/* In Capability mode, you can only set your own CPU set. */
 	if (IN_CAPABILITY_MODE(td)) {
 		if (level != CPU_LEVEL_WHICH)
 			return (ECAPMODE);
 		if (which != CPU_WHICH_TID && which != CPU_WHICH_PID)
 			return (ECAPMODE);
 		if (id != -1)
 			return (ECAPMODE);
 	}
 	mask = malloc(cpusetsize, M_TEMP, M_WAITOK | M_ZERO);
 	error = copyin(maskp, mask, cpusetsize);
 	if (error)
 		goto out;
 	/*
 	 * Verify that no high bits are set.
 	 */
 	if (cpusetsize > sizeof(cpuset_t)) {
 		char *end;
 		char *cp;
 
 		/* Scan the bytes that lie past the kernel's cpuset_t. */
 		end = cp = (char *)&mask->__bits;
 		end += cpusetsize;
 		cp += sizeof(cpuset_t);
 		while (cp != end)
 			if (*cp++ != 0) {
 				error = EINVAL;
 				goto out;
 			}
 
 	}
 	switch (level) {
 	case CPU_LEVEL_ROOT:
 	case CPU_LEVEL_CPUSET:
 		error = cpuset_which(which, id, &p, &ttd, &set);
 		if (error)
 			break;
 		switch (which) {
 		case CPU_WHICH_TID:
 		case CPU_WHICH_PID:
 			/* Hold the thread's set, then let go of the proc. */
 			thread_lock(ttd);
 			set = cpuset_ref(ttd->td_cpuset);
 			thread_unlock(ttd);
 			PROC_UNLOCK(p);
 			break;
 		case CPU_WHICH_CPUSET:
 		case CPU_WHICH_JAIL:
 			break;
 		case CPU_WHICH_IRQ:
 		case CPU_WHICH_INTRHANDLER:
 		case CPU_WHICH_ITHREAD:
 		case CPU_WHICH_DOMAIN:
 			error = EINVAL;
 			goto out;
 		}
 		if (level == CPU_LEVEL_ROOT)
 			nset = cpuset_refroot(set);
 		else
 			nset = cpuset_refbase(set);
 		error = cpuset_modify(nset, mask);
 		cpuset_rel(nset);
 		cpuset_rel(set);
 		break;
 	case CPU_LEVEL_WHICH:
 		switch (which) {
 		case CPU_WHICH_TID:
 			error = cpuset_setthread(id, mask);
 			break;
 		case CPU_WHICH_PID:
 			error = cpuset_setproc(id, NULL, mask, NULL);
 			break;
 		case CPU_WHICH_CPUSET:
 		case CPU_WHICH_JAIL:
 			error = cpuset_which(which, id, &p, &ttd, &set);
 			if (error == 0) {
 				error = cpuset_modify(set, mask);
 				cpuset_rel(set);
 			}
 			break;
 		case CPU_WHICH_IRQ:
 		case CPU_WHICH_INTRHANDLER:
 		case CPU_WHICH_ITHREAD:
 			error = intr_setaffinity(id, which, mask);
 			break;
 		default:
 			error = EINVAL;
 			break;
 		}
 		break;
 	default:
 		error = EINVAL;
 		break;
 	}
 out:
 	free(mask, M_TEMP);
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct cpuset_getdomain_args {
 	cpulevel_t	level;
 	cpuwhich_t	which;
 	id_t		id;
 	size_t		domainsetsize;
 	domainset_t	*mask;
 	int 		*policy;
 };
 #endif
 int
 sys_cpuset_getdomain(struct thread *td, struct cpuset_getdomain_args *uap)
 {
 
 	return (kern_cpuset_getdomain(td, uap->level, uap->which,
 	    uap->id, uap->domainsetsize, uap->mask, uap->policy));
 }
 
 /*
  * Fetch the domain mask and policy selected by (level, which, id); the
  * mask is copied out to maskp (domainsetsize bytes) and the policy is
  * stored through policyp.
  *
  * domainsetsize must be at least sizeof(domainset_t) and at most
  * DOMAINSET_MAXSIZE / NBBY, otherwise ERANGE is returned.  In capability
  * mode only CPU_LEVEL_WHICH on the caller's own thread or process (id -1)
  * is allowed.  Interrupt and domain selectors are rejected with EINVAL.
  *
  * A "prefer" policy is reported as a mask holding only the preferred
  * domain.
  */
 int
 kern_cpuset_getdomain(struct thread *td, cpulevel_t level, cpuwhich_t which,
     id_t id, size_t domainsetsize, domainset_t *maskp, int *policyp)
 {
 	struct domainset outset;
 	struct thread *ttd;
 	struct cpuset *nset;
 	struct cpuset *set;
 	struct domainset *dset;
 	struct proc *p;
 	domainset_t *mask;
 	int error;
 
 	if (domainsetsize < sizeof(domainset_t) ||
 	    domainsetsize > DOMAINSET_MAXSIZE / NBBY)
 		return (ERANGE);
 	/* In Capability mode, you can only get your own domain set. */
 	if (IN_CAPABILITY_MODE(td)) {
 		if (level != CPU_LEVEL_WHICH)
 			return (ECAPMODE);
 		if (which != CPU_WHICH_TID && which != CPU_WHICH_PID)
 			return (ECAPMODE);
 		if (id != -1)
 			return (ECAPMODE);
 	}
 	/* Zeroed, so bytes beyond sizeof(domainset_t) are copied out as 0. */
 	mask = malloc(domainsetsize, M_TEMP, M_WAITOK | M_ZERO);
 	bzero(&outset, sizeof(outset));
 	error = cpuset_which(which, id, &p, &ttd, &set);
 	if (error)
 		goto out;
 	switch (level) {
 	case CPU_LEVEL_ROOT:
 	case CPU_LEVEL_CPUSET:
 		switch (which) {
 		case CPU_WHICH_TID:
 		case CPU_WHICH_PID:
 			thread_lock(ttd);
 			set = cpuset_ref(ttd->td_cpuset);
 			thread_unlock(ttd);
 			break;
 		case CPU_WHICH_CPUSET:
 		case CPU_WHICH_JAIL:
 			break;
 		case CPU_WHICH_IRQ:
 		case CPU_WHICH_INTRHANDLER:
 		case CPU_WHICH_ITHREAD:
 		case CPU_WHICH_DOMAIN:
 			/*
 			 * NOTE(review): this jump skips the set release and
 			 * process unlock below; presumably neither is held
 			 * for these selectors -- confirm in cpuset_which().
 			 */
 			error = EINVAL;
 			goto out;
 		}
 		if (level == CPU_LEVEL_ROOT)
 			nset = cpuset_refroot(set);
 		else
 			nset = cpuset_refbase(set);
 		domainset_copy(nset->cs_domain, &outset);
 		cpuset_rel(nset);
 		break;
 	case CPU_LEVEL_WHICH:
 		switch (which) {
 		case CPU_WHICH_TID:
 			thread_lock(ttd);
 			domainset_copy(ttd->td_cpuset->cs_domain, &outset);
 			thread_unlock(ttd);
 			break;
 		case CPU_WHICH_PID:
 			FOREACH_THREAD_IN_PROC(p, ttd) {
 				thread_lock(ttd);
 				dset = ttd->td_cpuset->cs_domain;
 				/* Show all domains in the proc. */
 				DOMAINSET_OR(&outset.ds_mask, &dset->ds_mask);
 				/* Last policy wins. */
 				outset.ds_policy = dset->ds_policy;
 				outset.ds_prefer = dset->ds_prefer;
 				thread_unlock(ttd);
 			}
 			break;
 		case CPU_WHICH_CPUSET:
 		case CPU_WHICH_JAIL:
 			domainset_copy(set->cs_domain, &outset);
 			break;
 		case CPU_WHICH_IRQ:
 		case CPU_WHICH_INTRHANDLER:
 		case CPU_WHICH_ITHREAD:
 		case CPU_WHICH_DOMAIN:
 			error = EINVAL;
 			break;
 		}
 		break;
 	default:
 		error = EINVAL;
 		break;
 	}
 	/* Undo whatever the lookup (or the reference above) left held. */
 	if (set)
 		cpuset_rel(set);
 	if (p)
 		PROC_UNLOCK(p);
 	/*
 	 * Translate prefer into a set containing only the preferred domain,
 	 * not the entire fallback set.
 	 */
 	if (outset.ds_policy == DOMAINSET_POLICY_PREFER) {
 		DOMAINSET_ZERO(&outset.ds_mask);
 		DOMAINSET_SET(outset.ds_prefer, &outset.ds_mask);
 	}
 	DOMAINSET_COPY(&outset.ds_mask, mask);
 	if (error == 0)
 		error = copyout(mask, maskp, domainsetsize);
 	if (error == 0)
 		if (suword32(policyp, outset.ds_policy) != 0)
 			error = EFAULT;
 out:
 	free(mask, M_TEMP);
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct cpuset_setdomain_args {
 	cpulevel_t	level;
 	cpuwhich_t	which;
 	id_t		id;
 	size_t		domainsetsize;
 	domainset_t	*mask;
 	int 		policy;
 };
 #endif
 int
 sys_cpuset_setdomain(struct thread *td, struct cpuset_setdomain_args *uap)
 {
 
 	return (kern_cpuset_setdomain(td, uap->level, uap->which,
 	    uap->id, uap->domainsetsize, uap->mask, uap->policy));
 }
 
 /*
  * Apply the domain mask read from maskp, together with the given policy,
  * to the object selected by (level, which, id).
  *
  * domainsetsize must be at least sizeof(domainset_t) and at most
  * DOMAINSET_MAXSIZE / NBBY, otherwise ERANGE is returned.  The policy must
  * lie above DOMAINSET_POLICY_INVALID and not exceed DOMAINSET_POLICY_MAX,
  * and bytes supplied beyond sizeof(domainset_t) must all be zero; either
  * violation yields EINVAL.  In capability mode only CPU_LEVEL_WHICH on the
  * caller's own thread or process (id -1) is allowed.
  *
  * A "prefer" request must name exactly one domain; it is stored as that
  * preferred domain plus a full fallback mask.
  */
 int
 kern_cpuset_setdomain(struct thread *td, cpulevel_t level, cpuwhich_t which,
     id_t id, size_t domainsetsize, const domainset_t *maskp, int policy)
 {
 	struct cpuset *nset;
 	struct cpuset *set;
 	struct thread *ttd;
 	struct proc *p;
 	struct domainset domain;
 	domainset_t *mask;
 	int error;
 
 	if (domainsetsize < sizeof(domainset_t) ||
 	    domainsetsize > DOMAINSET_MAXSIZE / NBBY)
 		return (ERANGE);
 	if (policy <= DOMAINSET_POLICY_INVALID ||
 	    policy > DOMAINSET_POLICY_MAX)
 		return (EINVAL);
 	/* In Capability mode, you can only set your own domain set. */
 	if (IN_CAPABILITY_MODE(td)) {
 		if (level != CPU_LEVEL_WHICH)
 			return (ECAPMODE);
 		if (which != CPU_WHICH_TID && which != CPU_WHICH_PID)
 			return (ECAPMODE);
 		if (id != -1)
 			return (ECAPMODE);
 	}
 	memset(&domain, 0, sizeof(domain));
 	mask = malloc(domainsetsize, M_TEMP, M_WAITOK | M_ZERO);
 	error = copyin(maskp, mask, domainsetsize);
 	if (error)
 		goto out;
 	/*
 	 * Verify that no high bits are set.
 	 */
 	if (domainsetsize > sizeof(domainset_t)) {
 		char *end;
 		char *cp;
 
 		/* Scan the bytes that lie past the kernel's domainset_t. */
 		end = cp = (char *)&mask->__bits;
 		end += domainsetsize;
 		cp += sizeof(domainset_t);
 		while (cp != end)
 			if (*cp++ != 0) {
 				error = EINVAL;
 				goto out;
 			}
 
 	}
 	DOMAINSET_COPY(mask, &domain.ds_mask);
 	domain.ds_policy = policy;
 
 	/* Translate preferred policy into a mask and fallback. */
 	if (policy == DOMAINSET_POLICY_PREFER) {
 		/* Only support a single preferred domain. */
 		if (DOMAINSET_COUNT(&domain.ds_mask) != 1) {
 			error = EINVAL;
 			goto out;
 		}
 		/* DOMAINSET_FFS() is 1-based; ds_prefer is a 0-based index. */
 		domain.ds_prefer = DOMAINSET_FFS(&domain.ds_mask) - 1;
 		/* This will be constrained by domainset_shadow(). */
 		DOMAINSET_FILL(&domain.ds_mask);
 	}
 
 	/*
 	 *  When given an impossible policy, fall back to interleaving
 	 *  across all domains
 	 */
 	if (domainset_empty_vm(&domain))
 		domainset_copy(&domainset2, &domain);
 
 	switch (level) {
 	case CPU_LEVEL_ROOT:
 	case CPU_LEVEL_CPUSET:
 		error = cpuset_which(which, id, &p, &ttd, &set);
 		if (error)
 			break;
 		switch (which) {
 		case CPU_WHICH_TID:
 		case CPU_WHICH_PID:
 			/* Hold the thread's set, then let go of the proc. */
 			thread_lock(ttd);
 			set = cpuset_ref(ttd->td_cpuset);
 			thread_unlock(ttd);
 			PROC_UNLOCK(p);
 			break;
 		case CPU_WHICH_CPUSET:
 		case CPU_WHICH_JAIL:
 			break;
 		case CPU_WHICH_IRQ:
 		case CPU_WHICH_INTRHANDLER:
 		case CPU_WHICH_ITHREAD:
 		case CPU_WHICH_DOMAIN:
 			error = EINVAL;
 			goto out;
 		}
 		if (level == CPU_LEVEL_ROOT)
 			nset = cpuset_refroot(set);
 		else
 			nset = cpuset_refbase(set);
 		error = cpuset_modify_domain(nset, &domain);
 		cpuset_rel(nset);
 		cpuset_rel(set);
 		break;
 	case CPU_LEVEL_WHICH:
 		switch (which) {
 		case CPU_WHICH_TID:
 			error = _cpuset_setthread(id, NULL, &domain);
 			break;
 		case CPU_WHICH_PID:
 			error = cpuset_setproc(id, NULL, NULL, &domain);
 			break;
 		case CPU_WHICH_CPUSET:
 		case CPU_WHICH_JAIL:
 			error = cpuset_which(which, id, &p, &ttd, &set);
 			if (error == 0) {
 				error = cpuset_modify_domain(set, &domain);
 				cpuset_rel(set);
 			}
 			break;
 		case CPU_WHICH_IRQ:
 		case CPU_WHICH_INTRHANDLER:
 		case CPU_WHICH_ITHREAD:
 		default:
 			error = EINVAL;
 			break;
 		}
 		break;
 	default:
 		error = EINVAL;
 		break;
 	}
 out:
 	free(mask, M_TEMP);
 	return (error);
 }
 
 #ifdef DDB
 
 /*
  * Print the indices of the bits set among the first 'size' bits of a
  * bitset as a comma-separated list, or "<none>" when no bit is set.
  */
 static void
 ddb_display_bitset(const struct bitset *set, int size)
 {
 	int any, bit;
 
 	any = 0;
 	for (bit = 0; bit < size; bit++) {
 		if (!CPU_ISSET(bit, set))
 			continue;
 		/* Only entries after the first carry a leading comma. */
 		if (any != 0)
 			db_printf(",%d", bit);
 		else {
 			db_printf("%d", bit);
 			any = 1;
 		}
 	}
 	if (any == 0)
 		db_printf("<none>");
 }
 
 /*
  * Print the cpus contained in a cpuset_t from the kernel debugger.
  */
 void
 ddb_display_cpuset(const cpuset_t *set)
 {
 	const struct bitset *bits;
 
 	bits = (const struct bitset *)set;
 	ddb_display_bitset(bits, CPU_SETSIZE);
 }
 
 /*
  * Print the domains contained in a domainset_t from the kernel debugger.
  */
 static void
 ddb_display_domainset(const domainset_t *set)
 {
 	const struct bitset *bits;
 
 	bits = (const struct bitset *)set;
 	ddb_display_bitset(bits, DOMAINSET_SETSIZE);
 }
 
 /*
  * "show cpusets": list every set on cpuset_ids with its id, reference
  * count, flags, parent id, cpu mask and domain policy.  The listing stops
  * early once the pager has been quit.
  */
 DB_SHOW_COMMAND(cpusets, db_show_cpusets)
 {
 	struct domainset *dom;
 	struct cpuset *cs;
 
 	LIST_FOREACH(cs, &cpuset_ids, cs_link) {
 		db_printf("set=%p id=%-6u ref=%-6d flags=0x%04x parent id=%d\n",
 		    cs, cs->cs_id, cs->cs_ref, cs->cs_flags,
 		    (cs->cs_parent != NULL) ? cs->cs_parent->cs_id : 0);
 
 		db_printf("  cpu mask=");
 		ddb_display_cpuset(&cs->cs_mask);
 		db_printf("\n");
 
 		dom = cs->cs_domain;
 		db_printf("  domain policy %d prefer %d mask=",
 		    dom->ds_policy, dom->ds_prefer);
 		ddb_display_domainset(&dom->ds_mask);
 		db_printf("\n");
 
 		if (db_pager_quit)
 			break;
 	}
 }
 
 /*
  * "show domainsets": list every set on cpuset_domains with its policy,
  * preferred domain, count and domain mask.  The listing stops early once
  * the pager has been quit, as "show cpusets" does.
  */
 DB_SHOW_COMMAND(domainsets, db_show_domainsets)
 {
 	struct domainset *set;
 
 	LIST_FOREACH(set, &cpuset_domains, ds_link) {
 		db_printf("set=%p policy %d prefer %d cnt %d\n",
 		    set, set->ds_policy, set->ds_prefer, set->ds_cnt);
 		db_printf("  mask =");
 		ddb_display_domainset(&set->ds_mask);
 		db_printf("\n");
 		/* Honour a quit request from the debugger's pager. */
 		if (db_pager_quit)
 			break;
 	}
 }
 #endif /* DDB */
Index: head/sys/vm/vm_phys.c
===================================================================
--- head/sys/vm/vm_phys.c	(revision 339615)
+++ head/sys/vm/vm_phys.c	(revision 339616)
@@ -1,1432 +1,1443 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2002-2006 Rice University
  * Copyright (c) 2007 Alan L. Cox <alc@cs.rice.edu>
  * All rights reserved.
  *
  * This software was developed for the FreeBSD Project by Alan L. Cox,
  * Olivier Crameri, Peter Druschel, Sitaram Iyer, and Juan Navarro.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT
  * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
  * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
  * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
 /*
  *	Physical memory system implementation
  *
  * Any external functions defined by this module are only to be used by the
  * virtual memory system.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ddb.h"
 #include "opt_vm.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/domainset.h>
 #include <sys/lock.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/queue.h>
 #include <sys/rwlock.h>
 #include <sys/sbuf.h>
 #include <sys/sysctl.h>
 #include <sys/tree.h>
 #include <sys/vmmeter.h>
 #include <sys/seq.h>
 
 #include <ddb/ddb.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/vm_phys.h>
 #include <vm/vm_pagequeue.h>
 
 _Static_assert(sizeof(long) * NBBY >= VM_PHYSSEG_MAX,
     "Too many physsegs.");
 
 #ifdef NUMA
 struct mem_affinity __read_mostly *mem_affinity;
 int __read_mostly *mem_locality;
 #endif
 
 int __read_mostly vm_ndomains = 1;
 domainset_t __read_mostly all_domains = DOMAINSET_T_INITIALIZER(0x1);
 
 struct vm_phys_seg __read_mostly vm_phys_segs[VM_PHYSSEG_MAX];
 int __read_mostly vm_phys_nsegs;
 
 struct vm_phys_fictitious_seg;
 static int vm_phys_fictitious_cmp(struct vm_phys_fictitious_seg *,
     struct vm_phys_fictitious_seg *);
 
 RB_HEAD(fict_tree, vm_phys_fictitious_seg) vm_phys_fictitious_tree =
     RB_INITIALIZER(_vm_phys_fictitious_tree);
 
 /*
  * One registered range of fictitious pages, linked into
  * vm_phys_fictitious_tree.  A structure with "end" set to 0 is also used as
  * the key when looking up the range that contains a single address.
  */
 struct vm_phys_fictitious_seg {
 	/* Red-black tree linkage. */
 	RB_ENTRY(vm_phys_fictitious_seg) node;
 	/* Memory region data */
 	/* First physical address covered by the range. */
 	vm_paddr_t	start;
 	/* Physical address just past the range (exclusive). */
 	vm_paddr_t	end;
 	/* Page structure that corresponds to "start". */
 	vm_page_t	first_page;
 };
 
 RB_GENERATE_STATIC(fict_tree, vm_phys_fictitious_seg, node,
     vm_phys_fictitious_cmp);
 
 static struct rwlock_padalign vm_phys_fictitious_reg_lock;
 MALLOC_DEFINE(M_FICT_PAGES, "vm_fictitious", "Fictitious VM pages");
 
 static struct vm_freelist __aligned(CACHE_LINE_SIZE)
     vm_phys_free_queues[MAXMEMDOM][VM_NFREELIST][VM_NFREEPOOL][VM_NFREEORDER];
 
 static int __read_mostly vm_nfreelists;
 
 /*
  * Provides the mapping from VM_FREELIST_* to free list indices (flind).
  */
 static int __read_mostly vm_freelist_to_flind[VM_NFREELIST];
 
 CTASSERT(VM_FREELIST_DEFAULT == 0);
 
 #ifdef VM_FREELIST_DMA32
 #define	VM_DMA32_BOUNDARY	((vm_paddr_t)1 << 32)
 #endif
 
 /*
  * Enforce the assumptions made by vm_phys_add_seg() and vm_phys_init() about
  * the ordering of the free list boundaries.
  */
 #if defined(VM_LOWMEM_BOUNDARY) && defined(VM_DMA32_BOUNDARY)
 CTASSERT(VM_LOWMEM_BOUNDARY < VM_DMA32_BOUNDARY);
 #endif
 
 static int sysctl_vm_phys_free(SYSCTL_HANDLER_ARGS);
 SYSCTL_OID(_vm, OID_AUTO, phys_free, CTLTYPE_STRING | CTLFLAG_RD,
     NULL, 0, sysctl_vm_phys_free, "A", "Phys Free Info");
 
 static int sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS);
 SYSCTL_OID(_vm, OID_AUTO, phys_segs, CTLTYPE_STRING | CTLFLAG_RD,
     NULL, 0, sysctl_vm_phys_segs, "A", "Phys Seg Info");
 
 #ifdef NUMA
 static int sysctl_vm_phys_locality(SYSCTL_HANDLER_ARGS);
 SYSCTL_OID(_vm, OID_AUTO, phys_locality, CTLTYPE_STRING | CTLFLAG_RD,
     NULL, 0, sysctl_vm_phys_locality, "A", "Phys Locality Info");
 #endif
 
 SYSCTL_INT(_vm, OID_AUTO, ndomains, CTLFLAG_RD,
     &vm_ndomains, 0, "Number of physical memory domains available.");
 
 static vm_page_t vm_phys_alloc_seg_contig(struct vm_phys_seg *seg,
     u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment,
     vm_paddr_t boundary);
 static void _vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int domain);
 static void vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end);
 static void vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl,
     int order, int tail);
 
 /*
  * Red-black tree helpers for vm fictitious range management.
  */
 
 /*
  * Position the lookup key "p" (only its "start" is examined) relative to
  * "range": 1 if it lies at or beyond the range's end, -1 if it lies before
  * the range's start, and 0 if it falls inside the range.
  */
 static inline int
 vm_phys_fictitious_in_range(struct vm_phys_fictitious_seg *p,
     struct vm_phys_fictitious_seg *range)
 {
 
 	KASSERT(range->start != 0 && range->end != 0,
 	    ("Invalid range passed on search for vm_fictitious page"));
 	if (p->start >= range->end)
 		return (1);
 	return (p->start < range->start ? -1 : 0);
 }
 
 /*
  * Comparison function for the fictitious range tree.
  *
  * When "p1" has an "end" of 0 it is a lookup key for a single address and
  * is positioned relative to "p2" by vm_phys_fictitious_in_range().
  * Otherwise both arguments are complete ranges: -1 is returned when "p1"
  * ends at or before the start of "p2", 1 when it starts at or after the end
  * of "p2", and overlapping ranges cause a panic.
  */
 static int
 vm_phys_fictitious_cmp(struct vm_phys_fictitious_seg *p1,
     struct vm_phys_fictitious_seg *p2)
 {
 
 	/* Check if this is a search for a page */
 	if (p1->end == 0)
 		return (vm_phys_fictitious_in_range(p1, p2));
 
 	KASSERT(p2->end != 0,
     ("Invalid range passed as second parameter to vm fictitious comparison"));
 
 	/* Searching to add a new range */
 	if (p1->end > p2->start && p1->start < p2->end)
 		panic("Trying to add overlapping vm fictitious ranges:\n"
 		    "[%#jx:%#jx] and [%#jx:%#jx]", (uintmax_t)p1->start,
 		    (uintmax_t)p1->end, (uintmax_t)p2->start,
 		    (uintmax_t)p2->end);
 
 	return (p1->end <= p2->start ? -1 : 1);
 }
 
 /*
  * Choose a memory domain that has memory overlapping the physical address
  * range bounded by "low" and "high".
  *
  * Returns 0 when the kernel is built without NUMA, when there is only one
  * domain, or when no affinity table has been registered.  Otherwise returns
  * "prefer" if it is not -1 and that domain overlaps the range, else the
  * lowest-numbered overlapping domain.  Panics if no domain overlaps.
  */
 int
 vm_phys_domain_match(int prefer, vm_paddr_t low, vm_paddr_t high)
 {
 #ifdef NUMA
 	domainset_t overlap;
 	struct mem_affinity *ma;
 
 	if (vm_ndomains == 1 || mem_affinity == NULL)
 		return (0);
 
 	/*
 	 * Collect every domain owning memory that overlaps low, high.  The
 	 * affinity table is terminated by an entry whose end is 0.
 	 */
 	DOMAINSET_ZERO(&overlap);
 	for (ma = mem_affinity; ma->end != 0; ma++) {
 		if (ma->start > high || ma->end < low)
 			continue;
 		DOMAINSET_SET(ma->domain, &overlap);
 	}
 
 	if (prefer != -1 && DOMAINSET_ISSET(prefer, &overlap))
 		return (prefer);
 	if (DOMAINSET_EMPTY(&overlap))
 		panic("vm_phys_domain_match:  Impossible constraint");
 	return (DOMAINSET_FFS(&overlap) - 1);
 #else
 	return (0);
 #endif
 }
 
 /*
  * Outputs the state of the physical memory allocator, specifically,
  * the amount of physical memory in each free list.
  */
 static int
 sysctl_vm_phys_free(SYSCTL_HANDLER_ARGS)
 {
 	struct sbuf sbuf;
 	struct vm_freelist *fl;
 	int dom, error, flind, oind, pind;
 
 	error = sysctl_wire_old_buffer(req, 0);
 	if (error != 0)
 		return (error);
 	sbuf_new_for_sysctl(&sbuf, NULL, 128 * vm_ndomains, req);
 	for (dom = 0; dom < vm_ndomains; dom++) {
 		sbuf_printf(&sbuf,"\nDOMAIN %d:\n", dom);
 		for (flind = 0; flind < vm_nfreelists; flind++) {
 			sbuf_printf(&sbuf, "\nFREE LIST %d:\n"
 			    "\n  ORDER (SIZE)  |  NUMBER"
 			    "\n              ", flind);
 			for (pind = 0; pind < VM_NFREEPOOL; pind++)
 				sbuf_printf(&sbuf, "  |  POOL %d", pind);
 			sbuf_printf(&sbuf, "\n--            ");
 			for (pind = 0; pind < VM_NFREEPOOL; pind++)
 				sbuf_printf(&sbuf, "-- --      ");
 			sbuf_printf(&sbuf, "--\n");
 			for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) {
 				sbuf_printf(&sbuf, "  %2d (%6dK)", oind,
 				    1 << (PAGE_SHIFT - 10 + oind));
 				for (pind = 0; pind < VM_NFREEPOOL; pind++) {
 				fl = vm_phys_free_queues[dom][flind][pind];
 					sbuf_printf(&sbuf, "  |  %6d",
 					    fl[oind].lcnt);
 				}
 				sbuf_printf(&sbuf, "\n");
 			}
 		}
 	}
 	error = sbuf_finish(&sbuf);
 	sbuf_delete(&sbuf);
 	return (error);
 }
 
 /*
  * Outputs the set of physical memory segments.
  */
 static int
 sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS)
 {
 	struct sbuf sbuf;
 	struct vm_phys_seg *seg;
 	int error, segind;
 
 	error = sysctl_wire_old_buffer(req, 0);
 	if (error != 0)
 		return (error);
 	sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
 	for (segind = 0; segind < vm_phys_nsegs; segind++) {
 		sbuf_printf(&sbuf, "\nSEGMENT %d:\n\n", segind);
 		seg = &vm_phys_segs[segind];
 		sbuf_printf(&sbuf, "start:     %#jx\n",
 		    (uintmax_t)seg->start);
 		sbuf_printf(&sbuf, "end:       %#jx\n",
 		    (uintmax_t)seg->end);
 		sbuf_printf(&sbuf, "domain:    %d\n", seg->domain);
 		sbuf_printf(&sbuf, "free list: %p\n", seg->free_queues);
 	}
 	error = sbuf_finish(&sbuf);
 	sbuf_delete(&sbuf);
 	return (error);
 }
 
 /*
  * Return the locality table entry for the domain pair ("f", "t"), or -1 if
  * there is no affinity information.
  *
  * -1 is returned when the kernel is built without NUMA, when no locality
  * table has been registered, or when either index lies outside
  * [0, vm_ndomains).  Negative indices are rejected explicitly so that they
  * can never be used to index before the start of the table.
  */
 int
 vm_phys_mem_affinity(int f, int t)
 {
 
 #ifdef NUMA
 	if (mem_locality == NULL)
 		return (-1);
 	if (f < 0 || f >= vm_ndomains || t < 0 || t >= vm_ndomains)
 		return (-1);
 	/* The table is a row-major vm_ndomains x vm_ndomains matrix. */
 	return (mem_locality[f * vm_ndomains + t]);
 #else
 	return (-1);
 #endif
 }
 
 #ifdef NUMA
 /*
  * Outputs the VM locality table.
  */
 static int
 sysctl_vm_phys_locality(SYSCTL_HANDLER_ARGS)
 {
 	struct sbuf sbuf;
 	int error, i, j;
 
 	error = sysctl_wire_old_buffer(req, 0);
 	if (error != 0)
 		return (error);
 	sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
 
 	sbuf_printf(&sbuf, "\n");
 
 	for (i = 0; i < vm_ndomains; i++) {
 		sbuf_printf(&sbuf, "%d: ", i);
 		for (j = 0; j < vm_ndomains; j++) {
 			sbuf_printf(&sbuf, "%d ", vm_phys_mem_affinity(i, j));
 		}
 		sbuf_printf(&sbuf, "\n");
 	}
 	error = sbuf_finish(&sbuf);
 	sbuf_delete(&sbuf);
 	return (error);
 }
 #endif
 
 /*
  * Record "order" in page "m" and insert the page into the queue for that
  * order in free list "fl", at the tail when "tail" is non-zero and at the
  * head otherwise.  The queue's counter is incremented.
  */
 static void
 vm_freelist_add(struct vm_freelist *fl, vm_page_t m, int order, int tail)
 {
 	struct vm_freelist *queue;
 
 	queue = &fl[order];
 	m->order = order;
 	if (tail == 0) {
 		TAILQ_INSERT_HEAD(&queue->pl, m, listq);
 	} else {
 		TAILQ_INSERT_TAIL(&queue->pl, m, listq);
 	}
 	queue->lcnt++;
 }
 
 /*
  * Remove page "m" from the queue for "order" in free list "fl", decrement
  * that queue's counter, and set the page's order to VM_NFREEORDER.
  */
 static void
 vm_freelist_rem(struct vm_freelist *fl, vm_page_t m, int order)
 {
 	struct vm_freelist *queue;
 
 	queue = &fl[order];
 	TAILQ_REMOVE(&queue->pl, m, listq);
 	queue->lcnt--;
 	m->order = VM_NFREEORDER;
 }
 
 /*
  * Create a physical memory segment covering [start, end) in "domain".
  *
  * A new slot is taken at the end of vm_phys_segs[], and existing entries
  * whose start is at or above "end" are shifted up by one so that the new
  * segment is placed in front of them.
  */
 static void
 _vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int domain)
 {
 	struct vm_phys_seg *slot;
 
 	KASSERT(vm_phys_nsegs < VM_PHYSSEG_MAX,
 	    ("vm_phys_create_seg: increase VM_PHYSSEG_MAX"));
 	KASSERT(domain >= 0 && domain < vm_ndomains,
 	    ("vm_phys_create_seg: invalid domain provided"));
 	slot = &vm_phys_segs[vm_phys_nsegs++];
 	for (; slot > vm_phys_segs && slot[-1].start >= end; slot--)
 		slot[0] = slot[-1];
 	slot->start = start;
 	slot->end = end;
 	slot->domain = domain;
 }
 
 /*
  * Create the segment(s) for the physical range [start, end).
  *
  * Without NUMA, or when no affinity table is registered, a single segment
  * in domain 0 is created.  Otherwise the range is cut at the end of each
  * affinity entry it crosses, so that every created segment belongs to one
  * domain.  Panics if the affinity table does not cover the range.
  */
 static void
 vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end)
 {
 #ifdef NUMA
 	struct mem_affinity *ma;
 
 	if (mem_affinity == NULL) {
 		_vm_phys_create_seg(start, end, 0);
 		return;
 	}
 
 	for (ma = mem_affinity;; ma++) {
 		if (ma->end == 0)
 			panic("Reached end of affinity info");
 		/* Skip entries that end at or before the current start. */
 		if (ma->end <= start)
 			continue;
 		if (ma->start > start)
 			panic("No affinity info for start %jx",
 			    (uintmax_t)start);
 		if (ma->end < end) {
 			/* Emit the part in this entry, then carry on. */
 			_vm_phys_create_seg(start, ma->end, ma->domain);
 			start = ma->end;
 			continue;
 		}
 		/* This entry covers the rest of the range. */
 		_vm_phys_create_seg(start, end, ma->domain);
 		return;
 	}
 #else
 	_vm_phys_create_seg(start, end, 0);
 #endif
 }
 
 /*
  * Add a physical memory segment covering [start, end).
  *
  * Both "start" and "end" must be page aligned.  The range is cut at
  * VM_LOWMEM_BOUNDARY and VM_DMA32_BOUNDARY (when the corresponding free
  * lists are configured) so that no created segment straddles a free list
  * boundary; each piece is handed to vm_phys_create_seg().
  */
 void
 vm_phys_add_seg(vm_paddr_t start, vm_paddr_t end)
 {
 	vm_paddr_t paddr;
 
 	KASSERT((start & PAGE_MASK) == 0,
 	    ("vm_phys_add_seg: start is not page aligned"));
 	KASSERT((end & PAGE_MASK) == 0,
 	    ("vm_phys_add_seg: end is not page aligned"));
 
 	/*
 	 * Split the physical memory segment if it spans two or more free
 	 * list boundaries.
 	 */
 	paddr = start;
 #ifdef	VM_FREELIST_LOWMEM
 	if (paddr < VM_LOWMEM_BOUNDARY && end > VM_LOWMEM_BOUNDARY) {
 		vm_phys_create_seg(paddr, VM_LOWMEM_BOUNDARY);
 		paddr = VM_LOWMEM_BOUNDARY;
 	}
 #endif
 #ifdef	VM_FREELIST_DMA32
 	if (paddr < VM_DMA32_BOUNDARY && end > VM_DMA32_BOUNDARY) {
 		vm_phys_create_seg(paddr, VM_DMA32_BOUNDARY);
 		paddr = VM_DMA32_BOUNDARY;
 	}
 #endif
 	/* Whatever remains lies on one side of every boundary. */
 	vm_phys_create_seg(paddr, end);
 }
 
 /*
  * Initialize the physical memory allocator.
  *
  * Requires that vm_page_array is initialized!
  *
  * The work is done in four passes over the state built by
  * vm_phys_add_seg():
  *  1. decide which free lists exist and build vm_freelist_to_flind[];
  *  2. set the first_page and free_queues fields of every segment;
  *  3. merge adjacent segments that share the same free queues;
  *  4. initialize every free queue head and the fictitious range lock.
  */
 void
 vm_phys_init(void)
 {
 	struct vm_freelist *fl;
 	struct vm_phys_seg *end_seg, *prev_seg, *seg, *tmp_seg;
 	u_long npages;
 	int dom, flind, freelist, oind, pind, segind;
 
 	/*
 	 * Compute the number of free lists, and generate the mapping from the
 	 * manifest constants VM_FREELIST_* to the free list indices.
 	 *
 	 * Initially, the entries of vm_freelist_to_flind[] are set to either
 	 * 0 or 1 to indicate which free lists should be created.
 	 */
 	npages = 0;
 	/*
 	 * The segments are visited from the last array entry to the first;
 	 * "npages" accumulates the size of the segments that fall into the
 	 * default free list before the DMA32 threshold test below is made
 	 * for earlier entries.
 	 */
 	for (segind = vm_phys_nsegs - 1; segind >= 0; segind--) {
 		seg = &vm_phys_segs[segind];
 #ifdef	VM_FREELIST_LOWMEM
 		if (seg->end <= VM_LOWMEM_BOUNDARY)
 			vm_freelist_to_flind[VM_FREELIST_LOWMEM] = 1;
 		else
 #endif
 #ifdef	VM_FREELIST_DMA32
 		if (
 #ifdef	VM_DMA32_NPAGES_THRESHOLD
 		    /*
 		     * Create the DMA32 free list only if the amount of
 		     * physical memory above physical address 4G exceeds the
 		     * given threshold.
 		     */
 		    npages > VM_DMA32_NPAGES_THRESHOLD &&
 #endif
 		    seg->end <= VM_DMA32_BOUNDARY)
 			vm_freelist_to_flind[VM_FREELIST_DMA32] = 1;
 		else
 #endif
 		{
 			npages += atop(seg->end - seg->start);
 			vm_freelist_to_flind[VM_FREELIST_DEFAULT] = 1;
 		}
 	}
 	/* Change each entry into a running total of the free lists. */
 	for (freelist = 1; freelist < VM_NFREELIST; freelist++) {
 		vm_freelist_to_flind[freelist] +=
 		    vm_freelist_to_flind[freelist - 1];
 	}
 	vm_nfreelists = vm_freelist_to_flind[VM_NFREELIST - 1];
 	KASSERT(vm_nfreelists > 0, ("vm_phys_init: no free lists"));
 	/* Change each entry into a free list index. */
 	for (freelist = 0; freelist < VM_NFREELIST; freelist++)
 		vm_freelist_to_flind[freelist]--;
 
 	/*
 	 * Initialize the first_page and free_queues fields of each physical
 	 * memory segment.
 	 */
 #ifdef VM_PHYSSEG_SPARSE
 	npages = 0;
 #endif
 	for (segind = 0; segind < vm_phys_nsegs; segind++) {
 		seg = &vm_phys_segs[segind];
 #ifdef VM_PHYSSEG_SPARSE
 		/*
 		 * In the sparse layout the segments' pages are packed back
 		 * to back in vm_page_array; "npages" is the running offset.
 		 */
 		seg->first_page = &vm_page_array[npages];
 		npages += atop(seg->end - seg->start);
 #else
 		seg->first_page = PHYS_TO_VM_PAGE(seg->start);
 #endif
 #ifdef	VM_FREELIST_LOWMEM
 		if (seg->end <= VM_LOWMEM_BOUNDARY) {
 			flind = vm_freelist_to_flind[VM_FREELIST_LOWMEM];
 			KASSERT(flind >= 0,
 			    ("vm_phys_init: LOWMEM flind < 0"));
 		} else
 #endif
 #ifdef	VM_FREELIST_DMA32
 		if (seg->end <= VM_DMA32_BOUNDARY) {
 			flind = vm_freelist_to_flind[VM_FREELIST_DMA32];
 			KASSERT(flind >= 0,
 			    ("vm_phys_init: DMA32 flind < 0"));
 		} else
 #endif
 		{
 			flind = vm_freelist_to_flind[VM_FREELIST_DEFAULT];
 			KASSERT(flind >= 0,
 			    ("vm_phys_init: DEFAULT flind < 0"));
 		}
 		seg->free_queues = &vm_phys_free_queues[seg->domain][flind];
 	}
 
 	/*
 	 * Coalesce physical memory segments that are contiguous and share the
 	 * same per-domain free queues.
 	 */
 	prev_seg = vm_phys_segs;
 	seg = &vm_phys_segs[1];
 	end_seg = &vm_phys_segs[vm_phys_nsegs];
 	while (seg < end_seg) {
 		if (prev_seg->end == seg->start &&
 		    prev_seg->free_queues == seg->free_queues) {
 			/*
 			 * Extend the previous segment over this one, then
 			 * close the gap by shifting the later entries down.
 			 * "seg" is not advanced, so the entry that moved
 			 * into its slot is examined next.
 			 */
 			prev_seg->end = seg->end;
 			KASSERT(prev_seg->domain == seg->domain,
 			    ("vm_phys_init: free queues cannot span domains"));
 			vm_phys_nsegs--;
 			end_seg--;
 			for (tmp_seg = seg; tmp_seg < end_seg; tmp_seg++)
 				*tmp_seg = *(tmp_seg + 1);
 		} else {
 			prev_seg = seg;
 			seg++;
 		}
 	}
 
 	/*
 	 * Initialize the free queues.
 	 */
 	for (dom = 0; dom < vm_ndomains; dom++) {
 		for (flind = 0; flind < vm_nfreelists; flind++) {
 			for (pind = 0; pind < VM_NFREEPOOL; pind++) {
 				fl = vm_phys_free_queues[dom][flind][pind];
 				for (oind = 0; oind < VM_NFREEORDER; oind++)
 					TAILQ_INIT(&fl[oind].pl);
 			}
 		}
 	}
 
 	rw_init(&vm_phys_fictitious_reg_lock, "vmfctr");
 }
 
 /*
  * Register info about the NUMA topology of the system.
  *
  * Invoked by platform-dependent code prior to vm_phys_init().
  *
  * In a NUMA kernel the tunable "vm.numa.disabled" is consulted first; a
  * non-zero value forces the domain count to 1.  The supplied domain count,
  * affinity table and locality table are recorded only when more than one
  * domain remains; otherwise vm_ndomains, mem_affinity and mem_locality are
  * left untouched.  A bit is then set in all_domains for every domain index
  * below vm_ndomains and domainset_init() is called.
  *
  * In a kernel built without NUMA the arguments are ignored.
  */
 void
 vm_phys_register_domains(int ndomains, struct mem_affinity *affinity,
     int *locality)
 {
 #ifdef NUMA
-	int i;
+	int d, i;
 
-	vm_ndomains = ndomains;
-	mem_affinity = affinity;
-	mem_locality = locality;
+	/*
+	 * For now the only override value that we support is 1, which
+	 * effectively disables NUMA-awareness in the allocators.
+	 */
+	d = 0;
+	TUNABLE_INT_FETCH("vm.numa.disabled", &d);
+	if (d)
+		ndomains = 1;
+
+	if (ndomains > 1) {
+		vm_ndomains = ndomains;
+		mem_affinity = affinity;
+		mem_locality = locality;
+	}
 
 	for (i = 0; i < vm_ndomains; i++)
 		DOMAINSET_SET(i, &all_domains);
 
 	domainset_init();
 #else
 	(void)ndomains;
 	(void)affinity;
 	(void)locality;
 #endif
 }
 
 /*
  * Split a contiguous, power of two-sized set of physical pages.
  *
  * The block of order "oind" that starts at "m" is halved repeatedly until a
  * block of order "order" remains at "m"; the upper half produced by each
  * split is added to free list "fl".
  *
  * When this function is called by a page allocation function, the caller
  * should request insertion at the head unless the order [order, oind) queues
  * are known to be empty.  The objective being to reduce the likelihood of
  * long-term fragmentation by promoting contemporaneous allocation and
  * (hopefully) deallocation.
  */
 static __inline void
 vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl, int order,
     int tail)
 {
 	vm_page_t upper;
 	int cur;
 
 	for (cur = oind - 1; cur >= order; cur--) {
 		upper = m + (1 << cur);
 		KASSERT(upper->order == VM_NFREEORDER,
 		    ("vm_phys_split_pages: page %p has unexpected order %d",
 		    upper, upper->order));
 		vm_freelist_add(fl, upper, cur, tail);
 	}
 }
 
 /*
  * Add the physical pages [m, m + npages) at the end of a power-of-two aligned
  * and sized set to the specified free list.
  *
  * The range is carved from its start into blocks whose order is given by
  * the lowest set bit of the remaining page count.
  *
  * When this function is called by a page allocation function, the caller
  * should request insertion at the head unless the lower-order queues are
  * known to be empty.  The objective being to reduce the likelihood of long-
  * term fragmentation by promoting contemporaneous allocation and (hopefully)
  * deallocation.
  *
  * The physical page m's buddy must not be free.
  */
 static void
 vm_phys_enq_range(vm_page_t m, u_int npages, struct vm_freelist *fl, int tail)
 {
 	u_int chunk;
 	int order;
 
 	KASSERT(npages > 0, ("vm_phys_enq_range: npages is 0"));
 	KASSERT(((VM_PAGE_TO_PHYS(m) + npages * PAGE_SIZE) &
 	    ((PAGE_SIZE << (fls(npages) - 1)) - 1)) == 0,
 	    ("vm_phys_enq_range: page %p and npages %u are misaligned",
 	    m, npages));
 	for (;;) {
 		KASSERT(m->order == VM_NFREEORDER,
 		    ("vm_phys_enq_range: page %p has unexpected order %d",
 		    m, m->order));
 		order = ffs(npages) - 1;
 		KASSERT(order < VM_NFREEORDER,
 		    ("vm_phys_enq_range: order %d is out of range", order));
 		vm_freelist_add(fl, m, order, tail);
 		chunk = 1 << order;
 		m += chunk;
 		npages -= chunk;
 		if (npages == 0)
 			break;
 	}
 }
 
 /*
  * Tries to allocate the specified number of pages from the specified pool
  * within the specified domain.  Returns the actual number of allocated pages
  * and a pointer to each page through the array ma[].
  *
  * The returned pages may not be physically contiguous.  However, in contrast
  * to performing multiple, back-to-back calls to vm_phys_alloc_pages(..., 0),
  * calling this function once to allocate the desired number of pages will
  * avoid wasted time in vm_phys_split_pages().
  *
  * The free page queues for the specified domain must be locked.
  *
  * The return value is less than "npages" only when every queue of every
  * free list in the domain has been emptied.
  */
 int
 vm_phys_alloc_npages(int domain, int pool, int npages, vm_page_t ma[])
 {
 	struct vm_freelist *alt, *fl;
 	vm_page_t m;
 	int avail, end, flind, freelist, i, need, oind, pind;
 
 	KASSERT(domain >= 0 && domain < vm_ndomains,
 	    ("vm_phys_alloc_npages: domain %d is out of range", domain));
 	KASSERT(pool < VM_NFREEPOOL,
 	    ("vm_phys_alloc_npages: pool %d is out of range", pool));
 	KASSERT(npages <= 1 << (VM_NFREEORDER - 1),
 	    ("vm_phys_alloc_npages: npages %d is out of range", npages));
 	vm_domain_free_assert_locked(VM_DOMAIN(domain));
 	/* "i" counts the pages stored in ma[] so far. */
 	i = 0;
 	for (freelist = 0; freelist < VM_NFREELIST; freelist++) {
 		flind = vm_freelist_to_flind[freelist];
 		/* A negative index means this free list was not created. */
 		if (flind < 0)
 			continue;
 		fl = vm_phys_free_queues[domain][flind][pool];
 		/*
 		 * First drain the requested pool's own queues, starting
 		 * with the smallest blocks.
 		 */
 		for (oind = 0; oind < VM_NFREEORDER; oind++) {
 			while ((m = TAILQ_FIRST(&fl[oind].pl)) != NULL) {
 				vm_freelist_rem(fl, m, oind);
 				avail = 1 << oind;
 				need = imin(npages - i, avail);
 				for (end = i + need; i < end;)
 					ma[i++] = m++;
 				if (need < avail) {
 					/*
 					 * Return excess pages to fl.  Its
 					 * order [0, oind) queues are empty.
 					 */
 					vm_phys_enq_range(m, avail - need, fl,
 					    1);
 					return (npages);
 				} else if (i == npages)
 					return (npages);
 			}
 		}
 		/*
 		 * The requested pool is now empty in this free list.  Take
 		 * blocks from any pool, largest order first, and assign
 		 * them to the requested pool.
 		 */
 		for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) {
 			for (pind = 0; pind < VM_NFREEPOOL; pind++) {
 				alt = vm_phys_free_queues[domain][flind][pind];
 				while ((m = TAILQ_FIRST(&alt[oind].pl)) !=
 				    NULL) {
 					vm_freelist_rem(alt, m, oind);
 					vm_phys_set_pool(pool, m, oind);
 					avail = 1 << oind;
 					need = imin(npages - i, avail);
 					for (end = i + need; i < end;)
 						ma[i++] = m++;
 					if (need < avail) {
 						/*
 						 * Return excess pages to fl.
 						 * Its order [0, oind) queues
 						 * are empty.
 						 */
 						vm_phys_enq_range(m, avail -
 						    need, fl, 1);
 						return (npages);
 					} else if (i == npages)
 						return (npages);
 				}
 			}
 		}
 	}
 	return (i);
 }
 
 /*
  * Allocate a contiguous, power of two-sized set of physical pages
  * from the free lists.
  *
  * Each VM_FREELIST_* index is tried in ascending order until one of them
  * satisfies the request; NULL is returned if none does.
  *
  * The free page queues must be locked.
  */
 vm_page_t
 vm_phys_alloc_pages(int domain, int pool, int order)
 {
 	vm_page_t m;
 	int freelist;
 
 	m = NULL;
 	for (freelist = 0; m == NULL && freelist < VM_NFREELIST; freelist++)
 		m = vm_phys_alloc_freelist_pages(domain, freelist, pool,
 		    order);
 	return (m);
 }
 
 /*
  * Allocate a contiguous, power of two-sized set of physical pages from the
  * specified free list.  The free list must be specified using one of the
  * manifest constants VM_FREELIST_*.
  *
  * Returns the first page of the allocated block, or NULL if the free list
  * is not present or holds no block of at least the requested order.
  *
  * The free page queues must be locked.
  */
 vm_page_t
 vm_phys_alloc_freelist_pages(int domain, int freelist, int pool, int order)
 {
 	struct vm_freelist *donor, *fl;
 	vm_page_t m;
 	int flind, oind, pind;
 
 	KASSERT(domain >= 0 && domain < vm_ndomains,
 	    ("vm_phys_alloc_freelist_pages: domain %d is out of range",
 	    domain));
 	KASSERT(freelist < VM_NFREELIST,
 	    ("vm_phys_alloc_freelist_pages: freelist %d is out of range",
 	    freelist));
 	KASSERT(pool < VM_NFREEPOOL,
 	    ("vm_phys_alloc_freelist_pages: pool %d is out of range", pool));
 	KASSERT(order < VM_NFREEORDER,
 	    ("vm_phys_alloc_freelist_pages: order %d is out of range", order));
 
 	/* A negative index means that the free list is not present. */
 	flind = vm_freelist_to_flind[freelist];
 	if (flind < 0)
 		return (NULL);
 
 	vm_domain_free_assert_locked(VM_DOMAIN(domain));
 
 	/* Look for the smallest sufficient block in the given pool. */
 	fl = vm_phys_free_queues[domain][flind][pool];
 	for (oind = order; oind < VM_NFREEORDER; oind++) {
 		m = TAILQ_FIRST(&fl[oind].pl);
 		if (m == NULL)
 			continue;
 		vm_freelist_rem(fl, m, oind);
 		/* The order [order, oind) queues are empty. */
 		vm_phys_split_pages(m, oind, fl, order, 1);
 		return (m);
 	}
 
 	/*
 	 * The given pool was empty.  Find the largest
 	 * contiguous, power-of-two-sized set of pages in any
 	 * pool.  Transfer these pages to the given pool, and
 	 * use them to satisfy the allocation.
 	 */
 	for (oind = VM_NFREEORDER - 1; oind >= order; oind--) {
 		for (pind = 0; pind < VM_NFREEPOOL; pind++) {
 			donor = vm_phys_free_queues[domain][flind][pind];
 			m = TAILQ_FIRST(&donor[oind].pl);
 			if (m == NULL)
 				continue;
 			vm_freelist_rem(donor, m, oind);
 			vm_phys_set_pool(pool, m, oind);
 			/* The order [order, oind) queues are empty. */
 			vm_phys_split_pages(m, oind, fl, order, 1);
 			return (m);
 		}
 	}
 	return (NULL);
 }
 
 /*
  * Find the vm_page corresponding to the given physical address.
  *
  * The segments are searched in array order for one whose [start, end)
  * range contains "pa"; NULL is returned if there is none.
  */
 vm_page_t
 vm_phys_paddr_to_vm_page(vm_paddr_t pa)
 {
 	struct vm_phys_seg *limit, *seg;
 
 	limit = &vm_phys_segs[vm_phys_nsegs];
 	for (seg = vm_phys_segs; seg < limit; seg++) {
 		if (pa < seg->start || pa >= seg->end)
 			continue;
 		return (&seg->first_page[atop(pa - seg->start)]);
 	}
 	return (NULL);
 }
 
 /*
  * Find the fictitious vm_page corresponding to the given physical address
  * by looking up the registered range that contains it.  Returns NULL if no
  * registered range in the tree contains "pa".
  */
 vm_page_t
 vm_phys_fictitious_to_vm_page(vm_paddr_t pa)
 {
 	struct vm_phys_fictitious_seg key, *found;
 	vm_page_t m;
 
 	/* An "end" of 0 marks the key as a single-address lookup. */
 	key.start = pa;
 	key.end = 0;
 
 	rw_rlock(&vm_phys_fictitious_reg_lock);
 	found = RB_FIND(fict_tree, &vm_phys_fictitious_tree, &key);
 	rw_runlock(&vm_phys_fictitious_reg_lock);
 	if (found == NULL)
 		return (NULL);
 
 	m = found->first_page + atop(pa - found->start);
 	KASSERT((m->flags & PG_FICTITIOUS) != 0, ("%p not fictitious", m));
 
 	return (m);
 }
 
 /*
  * Initialize "page_count" page structures starting at "range" as fake
  * pages for consecutive physical pages beginning at "start", using memory
  * attribute "memattr".  The structures are zeroed first; afterwards each
  * has VPO_UNMANAGED cleared and its busy lock set to VPB_UNBUSIED.
  */
 static inline void
 vm_phys_fictitious_init_range(vm_page_t range, vm_paddr_t start,
     long page_count, vm_memattr_t memattr)
 {
 	vm_page_t pg;
 	long idx;
 
 	bzero(range, page_count * sizeof(*range));
 	for (idx = 0, pg = range; idx < page_count; idx++, pg++) {
 		vm_page_initfake(pg, start + PAGE_SIZE * idx, memattr);
 		pg->oflags &= ~VPO_UNMANAGED;
 		pg->busy_lock = VPB_UNBUSIED;
 	}
 }
 
 /*
  * Register the physical range [start, end) as fictitious pages with the
  * given memory attribute.
  *
  * With VM_PHYSSEG_DENSE, the part of the range that falls inside
  * vm_page_array is initialized in place and is not entered into the tree;
  * a range that lies completely inside the array is handled that way in
  * full.  Page structures for whatever lies outside the array are allocated
  * with malloc() and the range is inserted into the fictitious range tree.
  *
  * Returns 0 on success, or EINVAL (VM_PHYSSEG_DENSE only) when the range
  * extends both before and after vm_page_array.
  */
 int
 vm_phys_fictitious_reg_range(vm_paddr_t start, vm_paddr_t end,
     vm_memattr_t memattr)
 {
 	struct vm_phys_fictitious_seg *seg;
 	vm_page_t fp;
 	long page_count;
 #ifdef VM_PHYSSEG_DENSE
 	long pi, pe;
 	long dpage_count;
 #endif
 
 	KASSERT(start < end,
 	    ("Start of segment isn't less than end (start: %jx end: %jx)",
 	    (uintmax_t)start, (uintmax_t)end));
 
 	page_count = (end - start) / PAGE_SIZE;
 
 #ifdef VM_PHYSSEG_DENSE
 	/* Page indices of the start and (exclusive) end of the range. */
 	pi = atop(start);
 	pe = atop(end);
 	if (pi >= first_page && (pi - first_page) < vm_page_array_size) {
 		fp = &vm_page_array[pi - first_page];
 		if ((pe - first_page) > vm_page_array_size) {
 			/*
 			 * We have a segment that starts inside
 			 * of vm_page_array, but ends outside of it.
 			 *
 			 * Use vm_page_array pages for those that are
 			 * inside of the vm_page_array range, and
 			 * allocate the remaining ones.
 			 */
 			dpage_count = vm_page_array_size - (pi - first_page);
 			vm_phys_fictitious_init_range(fp, start, dpage_count,
 			    memattr);
 			/* Only the tail outside the array is left to do. */
 			page_count -= dpage_count;
 			start += ptoa(dpage_count);
 			goto alloc;
 		}
 		/*
 		 * We can allocate the full range from vm_page_array,
 		 * so there's no need to register the range in the tree.
 		 */
 		vm_phys_fictitious_init_range(fp, start, page_count, memattr);
 		return (0);
 	} else if (pe > first_page && (pe - first_page) < vm_page_array_size) {
 		/*
 		 * We have a segment that ends inside of vm_page_array,
 		 * but starts outside of it.
 		 */
 		fp = &vm_page_array[0];
 		dpage_count = pe - first_page;
 		vm_phys_fictitious_init_range(fp, ptoa(first_page), dpage_count,
 		    memattr);
 		/* Only the head outside the array is left to do. */
 		end -= ptoa(dpage_count);
 		page_count -= dpage_count;
 		goto alloc;
 	} else if (pi < first_page && pe > (first_page + vm_page_array_size)) {
 		/*
 		 * Trying to register a fictitious range that expands before
 		 * and after vm_page_array.
 		 */
 		return (EINVAL);
 	} else {
 alloc:
 #endif
 		/* Page structures for the part outside vm_page_array. */
 		fp = malloc(page_count * sizeof(struct vm_page), M_FICT_PAGES,
 		    M_WAITOK);
 #ifdef VM_PHYSSEG_DENSE
 	}
 #endif
 	vm_phys_fictitious_init_range(fp, start, page_count, memattr);
 
 	seg = malloc(sizeof(*seg), M_FICT_PAGES, M_WAITOK | M_ZERO);
 	seg->start = start;
 	seg->end = end;
 	seg->first_page = fp;
 
 	rw_wlock(&vm_phys_fictitious_reg_lock);
 	RB_INSERT(fict_tree, &vm_phys_fictitious_tree, seg);
 	rw_wunlock(&vm_phys_fictitious_reg_lock);
 
 	return (0);
 }
 
 /*
  * Unregister the fictitious range [start, end) that was registered with
  * vm_phys_fictitious_reg_range().
  *
  * With VM_PHYSSEG_DENSE, the part of the range that lies inside
  * vm_page_array was never entered into the tree, so the range is first
  * trimmed to the part outside the array; nothing is done when the whole
  * range lies inside it.  The remaining range must match a tree entry
  * exactly; that entry is removed and its page structures are freed.
  *
  * Panics if the range is not registered.  This includes the case in which
  * the lookup finds no entry at all, which is checked before the entry is
  * dereferenced.
  */
 void
 vm_phys_fictitious_unreg_range(vm_paddr_t start, vm_paddr_t end)
 {
 	struct vm_phys_fictitious_seg *seg, tmp;
 #ifdef VM_PHYSSEG_DENSE
 	long pi, pe;
 #endif
 
 	KASSERT(start < end,
 	    ("Start of segment isn't less than end (start: %jx end: %jx)",
 	    (uintmax_t)start, (uintmax_t)end));
 
 #ifdef VM_PHYSSEG_DENSE
 	pi = atop(start);
 	pe = atop(end);
 	if (pi >= first_page && (pi - first_page) < vm_page_array_size) {
 		if ((pe - first_page) <= vm_page_array_size) {
 			/*
 			 * This segment was allocated using vm_page_array
 			 * only, there's nothing to do since those pages
 			 * were never added to the tree.
 			 */
 			return;
 		}
 		/*
 		 * We have a segment that starts inside
 		 * of vm_page_array, but ends outside of it.
 		 *
 		 * Calculate how many pages were added to the
 		 * tree and free them.
 		 */
 		start = ptoa(first_page + vm_page_array_size);
 	} else if (pe > first_page && (pe - first_page) < vm_page_array_size) {
 		/*
 		 * We have a segment that ends inside of vm_page_array,
 		 * but starts outside of it.
 		 */
 		end = ptoa(first_page);
 	} else if (pi < first_page && pe > (first_page + vm_page_array_size)) {
 		/* Since it's not possible to register such a range, panic. */
 		panic(
 		    "Unregistering not registered fictitious range [%#jx:%#jx]",
 		    (uintmax_t)start, (uintmax_t)end);
 	}
 #endif
 	/* An "end" of 0 marks the key as a single-address lookup. */
 	tmp.start = start;
 	tmp.end = 0;
 
 	rw_wlock(&vm_phys_fictitious_reg_lock);
 	seg = RB_FIND(fict_tree, &vm_phys_fictitious_tree, &tmp);
 	if (seg == NULL || seg->start != start || seg->end != end) {
 		rw_wunlock(&vm_phys_fictitious_reg_lock);
 		panic(
 		    "Unregistering not registered fictitious range [%#jx:%#jx]",
 		    (uintmax_t)start, (uintmax_t)end);
 	}
 	RB_REMOVE(fict_tree, &vm_phys_fictitious_tree, seg);
 	rw_wunlock(&vm_phys_fictitious_reg_lock);
 	free(seg->first_page, M_FICT_PAGES);
 	free(seg, M_FICT_PAGES);
 }
 
 /*
  * Free a contiguous, power of two-sized set of physical pages.
  *
  * The block is merged with its buddy for as long as the buddy lies in the
  * same segment and is itself queued as a free block of the same order, up
  * to order VM_NFREEORDER - 1.  The resulting block is then appended to the
  * tail of the queue for its order in the pool recorded in "m".
  *
  * The free page queues must be locked.
  */
 void
 vm_phys_free_pages(vm_page_t m, int order)
 {
 	struct vm_freelist *fl;
 	struct vm_phys_seg *seg;
 	vm_paddr_t pa;
 	vm_page_t m_buddy;
 
 	KASSERT(m->order == VM_NFREEORDER,
 	    ("vm_phys_free_pages: page %p has unexpected order %d",
 	    m, m->order));
 	KASSERT(m->pool < VM_NFREEPOOL,
 	    ("vm_phys_free_pages: page %p has unexpected pool %d",
 	    m, m->pool));
 	KASSERT(order < VM_NFREEORDER,
 	    ("vm_phys_free_pages: order %d is out of range", order));
 	seg = &vm_phys_segs[m->segind];
 	vm_domain_free_assert_locked(VM_DOMAIN(seg->domain));
 	if (order < VM_NFREEORDER - 1) {
 		pa = VM_PAGE_TO_PHYS(m);
 		do {
 			/* Flip this order's address bit to find the buddy. */
 			pa ^= ((vm_paddr_t)1 << (PAGE_SHIFT + order));
 			if (pa < seg->start || pa >= seg->end)
 				break;
 			m_buddy = &seg->first_page[atop(pa - seg->start)];
 			/* Stop unless the buddy heads a free block of this order. */
 			if (m_buddy->order != order)
 				break;
 			fl = (*seg->free_queues)[m_buddy->pool];
 			vm_freelist_rem(fl, m_buddy, order);
 			if (m_buddy->pool != m->pool)
 				vm_phys_set_pool(m->pool, m_buddy, order);
 			order++;
 			/* Round down to the start of the merged block. */
 			pa &= ~(((vm_paddr_t)1 << (PAGE_SHIFT + order)) - 1);
 			m = &seg->first_page[atop(pa - seg->start)];
 		} while (order < VM_NFREEORDER - 1);
 	}
 	fl = (*seg->free_queues)[m->pool];
 	vm_freelist_add(fl, m, order, 1);
 }
 
 /*
  * Free a contiguous, arbitrarily sized set of physical pages.
  *
  * The run [m, m + npages) is handed to vm_phys_free_pages() in power-of-two
  * sized pieces: first pieces whose size is given by the alignment of the
  * current page's physical address (capped at order VM_NFREEORDER - 1), and
  * then, once the remainder is smaller than such a piece, pieces whose size
  * is the largest power of two not exceeding the remainder.
  *
  * The free page queues must be locked.
  */
 void
 vm_phys_free_contig(vm_page_t m, u_long npages)
 {
 	u_int n;
 	int order;
 
 	/*
 	 * Avoid unnecessary coalescing by freeing the pages in the largest
 	 * possible power-of-two-sized subsets.
 	 */
 	vm_domain_free_assert_locked(vm_pagequeue_domain(m));
 	for (;; npages -= n) {
 		/*
 		 * Unsigned "min" is used here so that "order" is assigned
 		 * "VM_NFREEORDER - 1" when "m"'s physical address is zero
 		 * or the low-order bits of its physical address are zero
 		 * because the size of a physical address exceeds the size of
 		 * a long.
 		 */
 		order = min(ffsl(VM_PAGE_TO_PHYS(m) >> PAGE_SHIFT) - 1,
 		    VM_NFREEORDER - 1);
 		n = 1 << order;
 		/* Leave anything smaller than this piece to the loop below. */
 		if (npages < n)
 			break;
 		vm_phys_free_pages(m, order);
 		m += n;
 	}
 	/* The residual "npages" is less than "1 << (VM_NFREEORDER - 1)". */
 	for (; npages > 0; npages -= n) {
 		/* Largest power of two that still fits in the remainder. */
 		order = flsl(npages) - 1;
 		n = 1 << order;
 		vm_phys_free_pages(m, order);
 		m += n;
 	}
 }
 
 /*
  * Look through the physical segments of "domain" for a run of "npages"
  * contiguous pages between the physical addresses "low" and "high" that
  * vm_page_scan_contig() accepts, and return the first page of that run, or
  * NULL when no segment yields one.
  *
  * "alignment" constrains the address of the first page of the run.  A
  * non-zero "boundary" forbids the run from spanning a physical address that
  * is a multiple of "boundary".
  *
  * "npages" must be greater than zero.  Both "alignment" and "boundary" must
  * be a power of two.
  */
 vm_page_t
 vm_phys_scan_contig(int domain, u_long npages, vm_paddr_t low, vm_paddr_t high,
     u_long alignment, vm_paddr_t boundary, int options)
 {
 	struct vm_phys_seg *seg;
 	vm_page_t first, last, found;
 	vm_paddr_t limit;
 	int segind;
 
 	KASSERT(npages > 0, ("npages is 0"));
 	KASSERT(powerof2(alignment), ("alignment is not a power of 2"));
 	KASSERT(powerof2(boundary), ("boundary is not a power of 2"));
 	if (low >= high)
 		return (NULL);
 	for (segind = 0; segind < vm_phys_nsegs; segind++) {
 		seg = &vm_phys_segs[segind];
 		if (seg->domain != domain)
 			continue;
 		/* This segment starts at or above the window: stop. */
 		if (seg->start >= high)
 			break;
 		/* This segment ends at or below the window: skip it. */
 		if (low >= seg->end)
 			continue;
 
 		/* Clip the scan window to the segment. */
 		first = (low <= seg->start) ? seg->first_page :
 		    &seg->first_page[atop(low - seg->start)];
 		limit = (high < seg->end) ? high : seg->end;
 
 		/* Too small to hold the requested run. */
 		if (limit - VM_PAGE_TO_PHYS(first) < ptoa(npages))
 			continue;
 
 		last = &seg->first_page[atop(limit - seg->start)];
 		found = vm_page_scan_contig(npages, first, last, alignment,
 		    boundary, options);
 		if (found != NULL)
 			return (found);
 	}
 	return (NULL);
 }
 
 /*
  * Assign "pool" to every page of the contiguous, power of two-sized set of
  * "1 << order" physical pages that begins at "m".
  */
 void
 vm_phys_set_pool(int pool, vm_page_t m, int order)
 {
 	vm_page_t m_end;
 
 	m_end = &m[1 << order];
 	while (m < m_end) {
 		m->pool = pool;
 		m++;
 	}
 }
 
 /*
  * Search for the given physical page "m" in the free lists.  If the search
  * succeeds, remove "m" from the free lists and return TRUE.  Otherwise, return
  * FALSE, indicating that "m" is not in the free lists.
  *
  * On success the other pages of the free block that contained "m" are put
  * back on the free lists as smaller blocks.
  *
  * The free page queues must be locked.
  */
 boolean_t
 vm_phys_unfree_page(vm_page_t m)
 {
 	struct vm_freelist *fl;
 	struct vm_phys_seg *seg;
 	vm_paddr_t pa, pa_half;
 	vm_page_t m_set, m_tmp;
 	int order;
 
 	/*
 	 * First, find the contiguous, power of two-sized set of free
 	 * physical pages containing the given physical page "m" and
 	 * assign it to "m_set".
 	 *
 	 * Each iteration looks at the first page of the next larger
 	 * naturally aligned block containing "m".  The walk stops at the
 	 * first such page whose order is not the sentinel VM_NFREEORDER,
 	 * or when the largest order has been reached.
 	 */
 	seg = &vm_phys_segs[m->segind];
 	vm_domain_free_assert_locked(VM_DOMAIN(seg->domain));
 	for (m_set = m, order = 0; m_set->order == VM_NFREEORDER &&
 	    order < VM_NFREEORDER - 1; ) {
 		order++;
 		/* Start address of the order-"order" block holding "m". */
 		pa = m->phys_addr & (~(vm_paddr_t)0 << (PAGE_SHIFT + order));
 		if (pa >= seg->start)
 			m_set = &seg->first_page[atop(pa - seg->start)];
 		else
 			/* That block would start below the segment. */
 			return (FALSE);
 	}
 	/* The block headed by "m_set" is too small to contain "m". */
 	if (m_set->order < order)
 		return (FALSE);
 	/* Even the largest candidate block carries the sentinel order. */
 	if (m_set->order == VM_NFREEORDER)
 		return (FALSE);
 	KASSERT(m_set->order < VM_NFREEORDER,
 	    ("vm_phys_unfree_page: page %p has unexpected order %d",
 	    m_set, m_set->order));
 
 	/*
 	 * Next, remove "m_set" from the free lists.  Finally, extract
 	 * "m" from "m_set" using an iterative algorithm: While "m_set"
 	 * is larger than a page, shrink "m_set" by returning the half
 	 * of "m_set" that does not contain "m" to the free lists.
 	 */
 	fl = (*seg->free_queues)[m_set->pool];
 	order = m_set->order;
 	vm_freelist_rem(fl, m_set, order);
 	while (order > 0) {
 		order--;
 		/*
 		 * Address of the upper half of the current block.
 		 * NOTE(review): the "1 << ..." here is evaluated as an int,
 		 * unlike the (vm_paddr_t)1 shifts used elsewhere in this
 		 * file; confirm PAGE_SHIFT + order always stays below the
 		 * width of int.
 		 */
 		pa_half = m_set->phys_addr ^ (1 << (PAGE_SHIFT + order));
 		if (m->phys_addr < pa_half)
 			/* "m" is in the lower half: give back the upper. */
 			m_tmp = &seg->first_page[atop(pa_half - seg->start)];
 		else {
 			/* "m" is in the upper half: give back the lower. */
 			m_tmp = m_set;
 			m_set = &seg->first_page[atop(pa_half - seg->start)];
 		}
 		vm_freelist_add(fl, m_tmp, order, 0);
 	}
 	KASSERT(m_set == m, ("vm_phys_unfree_page: fatal inconsistency"));
 	return (TRUE);
 }
 
 /*
  * Allocate "npages" contiguous physical pages from the free lists of
  * "domain".  Every page of the set lies at or above the physical address
  * "low" and below the physical address "high".  "alignment" constrains the
  * address of the first page of the set, and a non-zero "boundary" forbids
  * the set from crossing any physical address that is a multiple of
  * "boundary".  Both "alignment" and "boundary" must be a power of two.
  *
  * Segments are tried from the last one towards the first.  Returns the
  * first page of the set, or NULL if no segment could satisfy the request.
  */
 vm_page_t
 vm_phys_alloc_contig(int domain, u_long npages, vm_paddr_t low, vm_paddr_t high,
     u_long alignment, vm_paddr_t boundary)
 {
 	struct vm_phys_seg *seg;
 	vm_paddr_t lo_clip, hi_clip;
 	vm_page_t run;
 	int segind;
 
 	KASSERT(npages > 0, ("npages is 0"));
 	KASSERT(powerof2(alignment), ("alignment is not a power of 2"));
 	KASSERT(powerof2(boundary), ("boundary is not a power of 2"));
 	vm_domain_free_assert_locked(VM_DOMAIN(domain));
 	if (low >= high)
 		return (NULL);
 	run = NULL;
 	segind = vm_phys_nsegs;
 	while (--segind >= 0) {
 		seg = &vm_phys_segs[segind];
 		/* Entirely above the window, or in another domain. */
 		if (seg->start >= high || seg->domain != domain)
 			continue;
 		/* Entirely below the window: stop searching. */
 		if (low >= seg->end)
 			break;
 
 		/* Part of the segment that overlaps [low, high). */
 		lo_clip = (low <= seg->start) ? seg->start : low;
 		hi_clip = (high < seg->end) ? high : seg->end;
 		if (hi_clip - lo_clip < ptoa(npages))
 			continue;
 
 		run = vm_phys_alloc_seg_contig(seg, npages, low, high,
 		    alignment, boundary);
 		if (run != NULL)
 			break;
 	}
 	return (run);
 }
 
 /*
  * Allocate a run of contiguous physical pages from the free list for the
  * specified segment.
  *
  * The free lists of "seg" are searched, from the best-fitting order
  * upwards and across all pools, for a block (or, for requests larger than
  * the largest block size, a sequence of adjacent largest-order blocks) that
  * lies within ["low", "high"), satisfies "alignment" and does not cross a
  * multiple of "boundary".  On success the blocks are removed from the free
  * lists, moved to VM_FREEPOOL_DEFAULT, any pages beyond "npages" are given
  * back, and the first page of the run is returned.  Returns NULL if no
  * suitable run exists.
  */
 static vm_page_t
 vm_phys_alloc_seg_contig(struct vm_phys_seg *seg, u_long npages,
     vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary)
 {
 	struct vm_freelist *fl;
 	vm_paddr_t pa, pa_end, size;
 	vm_page_t m, m_ret;
 	u_long npages_end;
 	int oind, order, pind;
 
 	KASSERT(npages > 0, ("npages is 0"));
 	KASSERT(powerof2(alignment), ("alignment is not a power of 2"));
 	KASSERT(powerof2(boundary), ("boundary is not a power of 2"));
 	vm_domain_free_assert_locked(VM_DOMAIN(seg->domain));
 	/* Compute the queue that is the best fit for npages. */
 	order = flsl(npages - 1);
 	/* Search for a run satisfying the specified conditions. */
 	size = npages << PAGE_SHIFT;
 	/*
 	 * "order" may exceed the largest queue; in that case only the
 	 * largest queue is walked and runs are assembled from several of
 	 * its blocks (see below).
 	 */
 	for (oind = min(order, VM_NFREEORDER - 1); oind < VM_NFREEORDER;
 	    oind++) {
 		for (pind = 0; pind < VM_NFREEPOOL; pind++) {
 			fl = (*seg->free_queues)[pind];
 			TAILQ_FOREACH(m_ret, &fl[oind].pl, listq) {
 				/*
 				 * Is the size of this allocation request
 				 * larger than the largest block size?
 				 */
 				if (order >= VM_NFREEORDER) {
 					/*
 					 * Determine if a sufficient number of
 					 * subsequent blocks to satisfy the
 					 * allocation request are free.
 					 */
 					pa = VM_PAGE_TO_PHYS(m_ret);
 					pa_end = pa + size;
 					/* Skip on address wrap-around. */
 					if (pa_end < pa)
 						continue;
 					/*
 					 * Step block by block; leave the loop
 					 * with "pa >= pa_end" only if every
 					 * block up to the end of the run is
 					 * inside the segment and recorded at
 					 * the largest order.
 					 */
 					for (;;) {
 						pa += 1 << (PAGE_SHIFT +
 						    VM_NFREEORDER - 1);
 						if (pa >= pa_end ||
 						    pa < seg->start ||
 						    pa >= seg->end)
 							break;
 						m = &seg->first_page[atop(pa -
 						    seg->start)];
 						if (m->order != VM_NFREEORDER -
 						    1)
 							break;
 					}
 					/* If not, go to the next block. */
 					if (pa < pa_end)
 						continue;
 				}
 
 				/*
 				 * Determine if the blocks are within the
 				 * given range, satisfy the given alignment,
 				 * and do not cross the given boundary.
 				 */
 				pa = VM_PAGE_TO_PHYS(m_ret);
 				pa_end = pa + size;
 				if (pa >= low && pa_end <= high &&
 				    (pa & (alignment - 1)) == 0 &&
 				    rounddown2(pa ^ (pa_end - 1), boundary) == 0)
 					goto done;
 			}
 		}
 	}
 	return (NULL);
 done:
 	/*
 	 * Take every block that makes up the run off its free list and
 	 * retag it with the default pool.  "oind" is the order of the queue
 	 * on which the run was found.
 	 */
 	for (m = m_ret; m < &m_ret[npages]; m = &m[1 << oind]) {
 		fl = (*seg->free_queues)[m->pool];
 		vm_freelist_rem(fl, m, oind);
 		if (m->pool != VM_FREEPOOL_DEFAULT)
 			vm_phys_set_pool(VM_FREEPOOL_DEFAULT, m, oind);
 	}
 	/* Return excess pages to the free lists. */
 	npages_end = roundup2(npages, 1 << oind);
 	if (npages < npages_end) {
 		fl = (*seg->free_queues)[VM_FREEPOOL_DEFAULT];
 		vm_phys_enq_range(&m_ret[npages], npages_end - npages, fl, 0);
 	}
 	return (m_ret);
 }
 
 #ifdef DDB
 /*
  * DDB "show freepages" command: for every domain and every free list,
  * print a table with one row per order and one column per pool holding
  * that queue's "lcnt" counter.
  */
 DB_SHOW_COMMAND(freepages, db_show_freepages)
 {
 	struct vm_freelist *queue;
 	int domain, list, ord, pool;
 
 	for (domain = 0; domain < vm_ndomains; domain++) {
 		db_printf("DOMAIN: %d\n", domain);
 		for (list = 0; list < vm_nfreelists; list++) {
 			/* Table heading, one column per pool. */
 			db_printf("FREE LIST %d:\n"
 			    "\n  ORDER (SIZE)  |  NUMBER"
 			    "\n              ", list);
 			for (pool = 0; pool < VM_NFREEPOOL; pool++)
 				db_printf("  |  POOL %d", pool);
 
 			/* Separator line. */
 			db_printf("\n--            ");
 			for (pool = 0; pool < VM_NFREEPOOL; pool++)
 				db_printf("-- --      ");
 			db_printf("--\n");
 
 			/* One row per order, largest first. */
 			ord = VM_NFREEORDER;
 			while (--ord >= 0) {
 				db_printf("  %2.2d (%6.6dK)", ord,
 				    1 << (PAGE_SHIFT - 10 + ord));
 				for (pool = 0; pool < VM_NFREEPOOL; pool++) {
 					queue =
 					    vm_phys_free_queues[domain][list][pool];
 					db_printf("  |  %6.6d",
 					    queue[ord].lcnt);
 				}
 				db_printf("\n");
 			}
 			db_printf("\n");
 		}
 		db_printf("\n");
 	}
 }
 #endif
Index: head/sys/x86/acpica/srat.c
===================================================================
--- head/sys/x86/acpica/srat.c	(revision 339615)
+++ head/sys/x86/acpica/srat.c	(revision 339616)
@@ -1,582 +1,578 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2010 Hudson River Trading LLC
  * Written by: John H. Baldwin <jhb@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_vm.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bus.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/smp.h>
 #include <sys/vmmeter.h>
 #include <vm/vm.h>
 #include <vm/pmap.h>
 #include <vm/vm_param.h>
 #include <vm/vm_page.h>
 #include <vm/vm_phys.h>
 
 #include <contrib/dev/acpica/include/acpi.h>
 #include <contrib/dev/acpica/include/aclocal.h>
 #include <contrib/dev/acpica/include/actables.h>
 
 #include <machine/intr_machdep.h>
 #include <machine/md_var.h>
 #include <x86/apicvar.h>
 
 #include <dev/acpica/acpivar.h>
 
 #if MAXMEMDOM > 1
 /*
  * Per-CPU affinity record filled in from the SRAT.  The "cpus" array is
  * indexed by local APIC ID and holds max_apic_id + 1 entries.
  */
 static struct cpu_info {
 	int enabled:1;		/* an enabled SRAT entry named this APIC ID */
 	int has_memory:1;	/* some mem_info[] entry shares its domain */
 	int domain;		/* proximity domain, later the VM domain */
 } *cpus;
 
 /* Memory affinity ranges taken from the SRAT, and how many are in use. */
 struct mem_affinity mem_info[VM_PHYSSEG_MAX + 1];
 int num_mem;
 
 /* Mapped SRAT (only while it is being parsed) and its physical address. */
 static ACPI_TABLE_SRAT *srat;
 static vm_paddr_t srat_physaddr;
 
 /*
  * domain_pxm[d] is the proximity domain (_PXM) that was renumbered to VM
  * domain "d"; "ndomain" entries are valid.
  */
 static int domain_pxm[MAXMEMDOM];
 static int ndomain;
 
 /*
  * Mapped SLIT (only while it is being parsed), its physical address, and
  * the flattened locality table built from it (-1 == no information).
  */
 static ACPI_TABLE_SLIT *slit;
 static vm_paddr_t slit_physaddr;
 static int vm_locality_table[MAXMEMDOM * MAXMEMDOM];
 
 static void	srat_walk_table(acpi_subtable_handler *handler, void *arg);
 
 /*
  * SLIT parsing.
  */
 
 /*
  * Copy the locality entries of the SLIT "s" into vm_locality_table[].
  *
  * The PXM namespace may be sparse, so every row and column whose PXM does
  * not map to a VM domain is dropped.  What remains is written out back to
  * back, row by row, giving a packed 2d array of VM-domain locality
  * entries.  A SLIT value of 255 ("no locality information") is stored
  * as -1.
  */
 static void
 slit_parse_table(ACPI_TABLE_SLIT *s)
 {
 	int row, col, next;
 	uint8_t dist;
 
 	next = 0;
 	if (bootverbose)
 		printf("SLIT.Localities: %d\n", (int) s->LocalityCount);
 	for (row = 0; row < s->LocalityCount; row++) {
 		/* Row belongs to a PXM without a VM domain. */
 		if (acpi_map_pxm_to_vm_domainid(row) < 0)
 			continue;
 
 		if (bootverbose)
 			printf("%d: ", row);
 		for (col = 0; col < s->LocalityCount; col++) {
 			/* Column belongs to a PXM without a VM domain. */
 			if (acpi_map_pxm_to_vm_domainid(col) < 0)
 				continue;
 			dist = s->Entry[row * s->LocalityCount + col];
 			if (bootverbose)
 				printf("%d ", (int) dist);
 			/* 255 == "no locality information" */
 			vm_locality_table[next] = (dist == 255) ? -1 : dist;
 			next++;
 		}
 		if (bootverbose)
 			printf("\n");
 	}
 }
 
 /*
  * Look for an ACPI System Locality Distance Information Table ("SLIT")
  * and, when one is found, load its contents into vm_locality_table[].
  *
  * Returns 0 if the table was parsed, or -1 if resource_disabled() reports
  * "slit" as disabled or no SLIT could be found.
  */
 static int
 parse_slit(void)
 {
 
 	if (resource_disabled("slit", 0))
 		return (-1);
 
 	slit_physaddr = acpi_find_table(ACPI_SIG_SLIT);
 	if (slit_physaddr == 0)
 		return (-1);
 
 	/*
 	 * Keep the table mapped only for the single pass that fills in
 	 * vm_locality_table[].
 	 */
 	slit = acpi_map_table(slit_physaddr, ACPI_SIG_SLIT);
 	slit_parse_table(slit);
 	acpi_unmap_table(slit);
 	slit = NULL;
 
 	return (0);
 }
 
 /*
  * SRAT parsing.
  */
 
 /*
  * Returns 1 if the memory range ["start", "end") overlaps with at least
  * one range in phys_avail[], and 0 otherwise.
  */
 static int
 overlaps_phys_avail(vm_paddr_t start, vm_paddr_t end)
 {
 	int i;
 
 	for (i = 0; phys_avail[i] != 0 && phys_avail[i + 1] != 0; i += 2) {
 		/*
 		 * The first range that ends above "start" decides the
 		 * answer: it overlaps only if it also begins below "end".
 		 */
 		if (phys_avail[i + 1] > start)
 			return (phys_avail[i] < end ? 1 : 0);
 	}
 	return (0);
 }
 
 /*
  * Subtable callback for srat_walk_table(): record one SRAT entry.
  *
  * Enabled local APIC and x2APIC affinity entries are stored in cpus[],
  * indexed by APIC ID.  Enabled memory affinity entries that overlap
  * phys_avail[] are inserted into mem_info[], which is kept sorted by
  * address.  Entry types not listed in the switch are ignored.
  *
  * "arg" points to an int that is set to ENXIO when the table is found to
  * be inconsistent (duplicate local APIC ID, too many or overlapping
  * memory regions); it is left untouched otherwise.
  */
 static void
 srat_parse_entry(ACPI_SUBTABLE_HEADER *entry, void *arg)
 {
 	ACPI_SRAT_CPU_AFFINITY *cpu;
 	ACPI_SRAT_X2APIC_CPU_AFFINITY *x2apic;
 	ACPI_SRAT_MEM_AFFINITY *mem;
 	int domain, i, slot;
 
 	switch (entry->Type) {
 	case ACPI_SRAT_TYPE_CPU_AFFINITY:
 		cpu = (ACPI_SRAT_CPU_AFFINITY *)entry;
 		/* The 32-bit proximity domain is split over four bytes. */
 		domain = cpu->ProximityDomainLo |
 		    cpu->ProximityDomainHi[0] << 8 |
 		    cpu->ProximityDomainHi[1] << 16 |
 		    cpu->ProximityDomainHi[2] << 24;
 		if (bootverbose)
 			printf("SRAT: Found CPU APIC ID %u domain %d: %s\n",
 			    cpu->ApicId, domain,
 			    (cpu->Flags & ACPI_SRAT_CPU_ENABLED) ?
 			    "enabled" : "disabled");
 		if (!(cpu->Flags & ACPI_SRAT_CPU_ENABLED))
 			break;
 		/* cpus[] only has room for IDs up to max_apic_id. */
 		if (cpu->ApicId > max_apic_id) {
 			printf("SRAT: Ignoring local APIC ID %u (too high)\n",
 			    cpu->ApicId);
 			break;
 		}
 
 		if (cpus[cpu->ApicId].enabled) {
 			printf("SRAT: Duplicate local APIC ID %u\n",
 			    cpu->ApicId);
 			*(int *)arg = ENXIO;
 			break;
 		}
 		cpus[cpu->ApicId].domain = domain;
 		cpus[cpu->ApicId].enabled = 1;
 		break;
 	case ACPI_SRAT_TYPE_X2APIC_CPU_AFFINITY:
 		x2apic = (ACPI_SRAT_X2APIC_CPU_AFFINITY *)entry;
 		if (bootverbose)
 			printf("SRAT: Found CPU APIC ID %u domain %d: %s\n",
 			    x2apic->ApicId, x2apic->ProximityDomain,
 			    (x2apic->Flags & ACPI_SRAT_CPU_ENABLED) ?
 			    "enabled" : "disabled");
 		if (!(x2apic->Flags & ACPI_SRAT_CPU_ENABLED))
 			break;
 		if (x2apic->ApicId > max_apic_id) {
 			printf("SRAT: Ignoring local APIC ID %u (too high)\n",
 			    x2apic->ApicId);
 			break;
 		}
 
 		KASSERT(!cpus[x2apic->ApicId].enabled,
 		    ("Duplicate local APIC ID %u", x2apic->ApicId));
 		cpus[x2apic->ApicId].domain = x2apic->ProximityDomain;
 		cpus[x2apic->ApicId].enabled = 1;
 		break;
 	case ACPI_SRAT_TYPE_MEMORY_AFFINITY:
 		mem = (ACPI_SRAT_MEM_AFFINITY *)entry;
 		if (bootverbose)
 			printf(
 		    "SRAT: Found memory domain %d addr 0x%jx len 0x%jx: %s\n",
 			    mem->ProximityDomain, (uintmax_t)mem->BaseAddress,
 			    (uintmax_t)mem->Length,
 			    (mem->Flags & ACPI_SRAT_MEM_ENABLED) ?
 			    "enabled" : "disabled");
 		if (!(mem->Flags & ACPI_SRAT_MEM_ENABLED))
 			break;
 		/* Ignore ranges the kernel cannot or does not use. */
 		if (mem->BaseAddress >= cpu_getmaxphyaddr() ||
 		    !overlaps_phys_avail(mem->BaseAddress,
 		    mem->BaseAddress + mem->Length)) {
 			printf("SRAT: Ignoring memory at addr 0x%jx\n",
 			    (uintmax_t)mem->BaseAddress);
 			break;
 		}
 		if (num_mem == VM_PHYSSEG_MAX) {
 			printf("SRAT: Too many memory regions\n");
 			*(int *)arg = ENXIO;
 			break;
 		}
 		/*
 		 * Find the insertion point: the first existing region that
 		 * ends above the new base.  Since mem_info[] is sorted and
 		 * free of overlaps, that region is the only one that needs
 		 * to be checked for an overlap, and the search must stop
 		 * there; otherwise "slot" would drift to the last index and
 		 * the new region would be inserted out of order.
 		 */
 		slot = num_mem;
 		for (i = 0; i < num_mem; i++) {
 			if (mem_info[i].end <= mem->BaseAddress)
 				continue;
 			if (mem_info[i].start <
 			    (mem->BaseAddress + mem->Length)) {
 				printf("SRAT: Overlapping memory entries\n");
 				*(int *)arg = ENXIO;
 				return;
 			}
 			slot = i;
 			break;
 		}
 		/* Shift the tail up by one to open the slot. */
 		for (i = num_mem; i > slot; i--)
 			mem_info[i] = mem_info[i - 1];
 		mem_info[slot].start = mem->BaseAddress;
 		mem_info[slot].end = mem->BaseAddress + mem->Length;
 		mem_info[slot].domain = mem->ProximityDomain;
 		num_mem++;
 		break;
 	}
 }
 
 /*
  * Cross-check the CPU and memory affinity information.
  *
  * Every memory region must belong to a domain that has at least one
  * enabled CPU; otherwise ENXIO is returned.  Conversely, for every enabled
  * CPU whose domain has no memory region, an empty region (start and end
  * both zero) for that domain is appended to mem_info[].  Returns 0 on
  * success.
  */
 static int
 check_domains(void)
 {
 	int found, i, j;
 
 	/* Pass 1: each memory region needs a CPU in its domain. */
 	for (i = 0; i < num_mem; i++) {
 		found = 0;
 		for (j = 0; j <= max_apic_id; j++) {
 			if (!cpus[j].enabled ||
 			    cpus[j].domain != mem_info[i].domain)
 				continue;
 			cpus[j].has_memory = 1;
 			found = 1;
 		}
 		if (found == 0) {
 			printf("SRAT: No CPU found for memory domain %d\n",
 			    mem_info[i].domain);
 			return (ENXIO);
 		}
 	}
 
 	/* Pass 2: give memoryless CPU domains an empty region. */
 	for (i = 0; i <= max_apic_id; i++) {
 		if (!cpus[i].enabled || cpus[i].has_memory)
 			continue;
 
 		found = 0;
 		for (j = 0; j < num_mem; j++) {
 			if (mem_info[j].domain == cpus[i].domain) {
 				found = 1;
 				break;
 			}
 		}
 		if (found)
 			continue;
 
 		if (bootverbose)
 			printf("SRAT: mem dom %d is empty\n", cpus[i].domain);
 		mem_info[num_mem].start = 0;
 		mem_info[num_mem].end = 0;
 		mem_info[num_mem].domain = cpus[i].domain;
 		num_mem++;
 	}
 	return (0);
 }
 
 /*
  * Check that the SRAT memory regions cover all of the regions in
  * phys_avail[].
  *
  * Returns 0 once every phys_avail[] entry has been consumed, or ENXIO
  * (after printing the first entry that is not covered) otherwise.
  */
 static int
 check_phys_avail(void)
 {
 	vm_paddr_t address;
 	int i, j;
 
 	/* j is the current offset into phys_avail[]. */
 	/* "address" is the lowest address not yet known to be covered. */
 	address = phys_avail[0];
 	j = 0;
 	for (i = 0; i < num_mem; i++) {
 		/*
 		 * Consume as many phys_avail[] entries as fit in this
 		 * region.
 		 */
 		while (address >= mem_info[i].start &&
 		    address <= mem_info[i].end) {
 			/*
 			 * If we cover the rest of this phys_avail[] entry,
 			 * advance to the next entry.
 			 */
 			if (phys_avail[j + 1] <= mem_info[i].end) {
 				j += 2;
 				/* A pair of zeroes terminates phys_avail[]. */
 				if (phys_avail[j] == 0 &&
 				    phys_avail[j + 1] == 0) {
 					return (0);
 				}
 				address = phys_avail[j];
 			} else
 				/*
 				 * The entry extends past this region: step
 				 * just beyond it, which ends the while loop
 				 * and leaves the rest to later regions.
 				 */
 				address = mem_info[i].end + 1;
 		}
 	}
 	printf("SRAT: No memory region found for 0x%jx - 0x%jx\n",
 	    (uintmax_t)phys_avail[j], (uintmax_t)phys_avail[j + 1]);
 	return (ENXIO);
 }
 
 /*
  * Renumber the memory domains so that they are compact and zero-based.
  *
  * The distinct domains found in mem_info[] are collected into
  * domain_pxm[] in ascending order; each domain is then replaced, in both
  * mem_info[] and cpus[], by its index in that list.  Returns 0 on success
  * or EFBIG if there are more than MAXMEMDOM domains.
  */
 static int
 renumber_domains(void)
 {
 	int dom, idx, pos;
 
 	/* Build the sorted list of distinct domains. */
 	ndomain = 0;
 	for (idx = 0; idx < num_mem; idx++) {
 		/* First slot holding a value >= this region's domain. */
 		pos = 0;
 		while (pos < ndomain &&
 		    domain_pxm[pos] < mem_info[idx].domain)
 			pos++;
 
 		/* Already known: nothing to insert. */
 		if (pos < ndomain && domain_pxm[pos] == mem_info[idx].domain)
 			continue;
 
 		if (ndomain >= MAXMEMDOM) {
 			ndomain = 1;
 			printf("SRAT: Too many memory domains\n");
 			return (EFBIG);
 		}
 
 		/* Open slot 'pos' and store the new domain there. */
 		for (dom = ndomain; dom > pos; dom--)
 			domain_pxm[dom] = domain_pxm[dom - 1];
 		domain_pxm[pos] = mem_info[idx].domain;
 		ndomain++;
 	}
 
 	/* Replace every domain by its index in the sorted list. */
 	for (dom = 0; dom < ndomain; dom++) {
 		/* Only domains whose value differs from their index. */
 		if (domain_pxm[dom] != dom) {
 			for (idx = 0; idx < num_mem; idx++) {
 				if (mem_info[idx].domain == domain_pxm[dom])
 					mem_info[idx].domain = dom;
 			}
 			for (idx = 0; idx <= max_apic_id; idx++) {
 				if (cpus[idx].enabled &&
 				    cpus[idx].domain == domain_pxm[dom])
 					cpus[idx].domain = dom;
 			}
 		}
 	}
 
 	return (0);
 }
 
 /*
  * Look for an ACPI System Resource Affinity Table ("SRAT")
  *
  * When one is found, the cpus[] array is carved out of the last
  * phys_avail[] range, the table is walked to fill in cpus[] and
  * mem_info[], and the result is validated and renumbered.
  *
  * Returns 0 on success.  Returns -1 if resource_disabled() reports "srat"
  * as disabled, if no SRAT exists, or if parsing or any of the checks
  * fails; in the failure case srat_physaddr is reset to 0.
  */
 static int
 parse_srat(void)
 {
 	unsigned int idx, size;
 	vm_paddr_t addr;
 	int error;
 
 	if (resource_disabled("srat", 0))
 		return (-1);
 
 	srat_physaddr = acpi_find_table(ACPI_SIG_SRAT);
 	if (srat_physaddr == 0)
 		return (-1);
 
 	/*
 	 * Allocate data structure:
 	 *
 	 * Find the last physical memory region and steal some memory from
 	 * it. This is done because at this point in the boot process
 	 * malloc is still not usable.
 	 */
 	/* Advance to the terminating pair, then step back one pair. */
 	for (idx = 0; phys_avail[idx + 1] != 0; idx += 2);
 	KASSERT(idx != 0, ("phys_avail is empty!"));
 	idx -= 2;
 
 	/* One cpu_info per possible APIC ID, taken from the range's end. */
 	size =  sizeof(*cpus) * (max_apic_id + 1);
 	addr = trunc_page(phys_avail[idx + 1] - size);
 	KASSERT(addr >= phys_avail[idx],
 	    ("Not enough memory for SRAT table items"));
 	/* Shrink the range so that the stolen pages are no longer in it. */
 	phys_avail[idx + 1] = addr - 1;
 
 	/*
 	 * We cannot rely on PHYS_TO_DMAP because this code is also used in
 	 * i386, so use pmap_mapbios to map the memory, this will end up using
 	 * the default memory attribute (WB), and the DMAP when available.
 	 */
 	cpus = (struct cpu_info *)pmap_mapbios(addr, size);
 	bzero(cpus, size);
 
 	/*
 	 * Make a pass over the table to populate the cpus[] and
 	 * mem_info[] tables.
 	 */
 	srat = acpi_map_table(srat_physaddr, ACPI_SIG_SRAT);
 	error = 0;
 	srat_walk_table(srat_parse_entry, &error);
 	acpi_unmap_table(srat);
 	srat = NULL;
 	/*
 	 * The checks run in this order and stop at the first failure.
 	 * Clearing srat_physaddr makes srat_set_cpus() a no-op.
 	 */
 	if (error || check_domains() != 0 || check_phys_avail() != 0 ||
 	    renumber_domains() != 0) {
 		srat_physaddr = 0;
 		return (-1);
 	}
 
 	return (0);
 }
 
 /*
  * Reset every entry of vm_locality_table[] to -1, which for now stands
  * for "no locality information for this pairing".
  */
 static void
 init_mem_locality(void)
 {
 	int *entry;
 
 	for (entry = vm_locality_table;
 	    entry < &vm_locality_table[MAXMEMDOM * MAXMEMDOM]; entry++)
 		*entry = -1;
 }
 
 /*
  * SYSINIT hook: parse the SRAT and, only if that succeeds, reset the
  * locality table, parse the SLIT (its result is deliberately ignored) and
  * register the domains with vm_phys_register_domains().
  */
 static void
 parse_acpi_tables(void *dummy)
 {
 
 	if (parse_srat() >= 0) {
 		init_mem_locality();
 		(void)parse_slit();
 		vm_phys_register_domains(ndomain, mem_info,
 		    vm_locality_table);
 	}
 }
 SYSINIT(parse_acpi_tables, SI_SUB_VM - 1, SI_ORDER_FIRST, parse_acpi_tables,
     NULL);
 
 /*
  * Invoke "handler" with "arg", via acpi_walk_subtables(), over the
  * subtables of the currently mapped SRAT: from just past the fixed table
  * structure up to the length given in the table header.
  */
 static void
 srat_walk_table(acpi_subtable_handler *handler, void *arg)
 {
 	void *first, *limit;
 
 	first = srat + 1;
 	limit = (char *)srat + srat->Header.Length;
 	acpi_walk_subtables(first, limit, handler, arg);
 }
 
 /*
  * Setup per-CPU domain IDs.
  *
  * For every present CPU, look up its SRAT record by APIC ID, store the
  * resulting domain in its pcpu data and add the CPU to that domain's CPU
  * set.  Does nothing when no usable SRAT was parsed (srat_physaddr == 0).
  * The cpus[] array is unmapped once all CPUs have been processed.
  */
 static void
 srat_set_cpus(void *dummy)
 {
 	struct cpu_info *cpu;
 	struct pcpu *pc;
 	u_int i;
 
 	if (srat_physaddr == 0)
 		return;
 	for (i = 0; i < MAXCPU; i++) {
 		if (CPU_ABSENT(i))
 			continue;
 		pc = pcpu_find(i);
 		KASSERT(pc != NULL, ("no pcpu data for CPU %u", i));
 		cpu = &cpus[pc->pc_apic_id];
 		/* A running CPU that the SRAT never described is fatal. */
 		if (!cpu->enabled)
 			panic("SRAT: CPU with APIC ID %u is not known",
 			    pc->pc_apic_id);
 		/* Every CPU is placed in domain 0 unless vm_ndomains > 1. */
-#ifdef NUMA
-		pc->pc_domain = cpu->domain;
-#else
-		pc->pc_domain = 0;
-#endif
+		pc->pc_domain = vm_ndomains > 1 ? cpu->domain : 0;
 		CPU_SET(i, &cpuset_domain[pc->pc_domain]);
 		if (bootverbose)
 			printf("SRAT: CPU %u has memory domain %d\n", i,
 			    pc->pc_domain);
 	}
 
 	/* Last usage of the cpus array, unmap it. */
 	pmap_unmapbios((vm_offset_t)cpus, sizeof(*cpus) * (max_apic_id + 1));
 	cpus = NULL;
 }
 SYSINIT(srat_set_cpus, SI_SUB_CPU, SI_ORDER_ANY, srat_set_cpus, NULL);
 
 /*
  * Map a _PXM value to a VM domain ID.
  *
  * domain_pxm[] is searched for "pxm".  A match yields its index when
  * vm_ndomains > 1 and 0 otherwise.
  *
  * Returns the domain ID, or -1 if no domain ID was found.
  */
 int
 acpi_map_pxm_to_vm_domainid(int pxm)
 {
 	int i;
 
 	for (i = 0; i < ndomain; i++) {
 		if (domain_pxm[i] == pxm)
-			return (i);
+			return (vm_ndomains > 1 ? i : 0);
 	}
 
 	return (-1);
 }
 
 #else /* MAXMEMDOM == 1 */
 
 /*
  * Variant used when the kernel supports a single memory domain only: no
  * _PXM value maps to a VM domain, so the answer is always -1.
  */
 int
 acpi_map_pxm_to_vm_domainid(int pxm)
 {
 	const int no_domain = -1;
 
 	(void)pxm;
 	return (no_domain);
 }
 
 #endif /* MAXMEMDOM > 1 */