diff --git a/lib/libthr/thread/thr_attr.c b/lib/libthr/thread/thr_attr.c index 6ff23aa5a3da..5a06f793f4f8 100644 --- a/lib/libthr/thread/thr_attr.c +++ b/lib/libthr/thread/thr_attr.c @@ -1,678 +1,680 @@ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (c) 2003 Craig Rodrigues . * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Craig Rodrigues. * 4. Neither the name of the author nor the names of any co-contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY CRAIG RODRIGUES AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ /* * Copyright (c) 1998 Daniel Eischen . * Copyright (C) 2001 Jason Evans . * Copyright (c) 2002,2003 Alexey Zelkin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice(s), this list of conditions and the following disclaimer * unmodified other than the allowable addition of one or more * copyright notices. * 2. Redistributions in binary form must reproduce the above copyright * notice(s), this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the * distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 1996 John Birrell . * All rights reserved. 
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the author nor the names of any co-contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY JOHN BIRRELL AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "namespace.h"
#include <errno.h>
#include <pthread.h>
#include <pthread_np.h>
#include <stdlib.h>
#include <string.h>
#include <sys/sysctl.h>
#include "un-namespace.h"
#include "thr_private.h"

static size_t	_get_kern_cpuset_size(void);

__weak_reference(_thr_attr_destroy, _pthread_attr_destroy);
__weak_reference(_thr_attr_destroy, pthread_attr_destroy);

int
_thr_attr_destroy(pthread_attr_t *attr)
{
        int ret;

        /* Check for invalid arguments: */
        if (attr == NULL || *attr == NULL)
                /* Invalid argument: */
                ret = EINVAL;
        else {
                if ((*attr)->cpuset != NULL)
                        free((*attr)->cpuset);
                /* Free the memory allocated to the attribute object: */
                free(*attr);

                /*
                 * Leave the attribute pointer NULL now that the memory
                 * has been freed:
                 */
                *attr = NULL;
                ret = 0;
        }
        return (ret);
}

__weak_reference(_thr_attr_get_np, pthread_attr_get_np);
__weak_reference(_thr_attr_get_np, _pthread_attr_get_np);

int
_thr_attr_get_np(pthread_t pthread, pthread_attr_t *dstattr)
{
        struct pthread *curthread;
        struct pthread_attr attr, *dst;
        int ret;
        size_t kern_size;

        if (pthread == NULL || dstattr == NULL || (dst = *dstattr) == NULL)
                return (EINVAL);

        kern_size = _get_kern_cpuset_size();
        if (dst->cpuset == NULL) {
                dst->cpuset = calloc(1, kern_size);
                dst->cpusetsize = kern_size;
        }
        curthread = _get_curthread();
        if ((ret = _thr_find_thread(curthread, pthread,
            /*include dead*/0)) != 0)
                return (ret);
        attr = pthread->attr;
        if (pthread->flags & THR_FLAGS_DETACHED)
                attr.flags |= PTHREAD_DETACHED;
        ret = cpuset_getaffinity(CPU_LEVEL_WHICH, CPU_WHICH_TID, TID(pthread),
            dst->cpusetsize, dst->cpuset);
        if (ret == -1)
                ret = errno;
        THR_THREAD_UNLOCK(curthread, pthread);
        if (ret == 0) {
                memcpy(&dst->pthread_attr_start_copy,
                    &attr.pthread_attr_start_copy,
                    offsetof(struct pthread_attr, pthread_attr_end_copy) -
                    offsetof(struct pthread_attr, pthread_attr_start_copy));
        }
        return (ret);
}
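For context, a minimal, hypothetical caller of these entry points (an illustrative sketch, not part of this change): pthread_attr_get_np(3) snapshots a live thread into a caller-initialized attribute object, which is why _thr_attr_get_np() above lazily allocates dst->cpuset and _thr_attr_destroy() has to free it. Compile with -lpthread.

#include <pthread.h>
#include <pthread_np.h>         /* pthread_attr_get_np() is a FreeBSD extension */
#include <stdio.h>

int
main(void)
{
        pthread_attr_t attr;
        size_t stacksize;
        int detachstate;

        pthread_attr_init(&attr);       /* allocates the struct pthread_attr */
        /* Snapshot the calling thread, including its affinity mask. */
        if (pthread_attr_get_np(pthread_self(), &attr) != 0)
                return (1);
        pthread_attr_getstacksize(&attr, &stacksize);
        pthread_attr_getdetachstate(&attr, &detachstate);
        printf("stack %zu bytes, %s\n", stacksize,
            detachstate == PTHREAD_CREATE_DETACHED ? "detached" : "joinable");
        pthread_attr_destroy(&attr);    /* also frees the lazily allocated cpuset */
        return (0);
}

The remaining accessors follow:

__weak_reference(_thr_attr_getdetachstate, pthread_attr_getdetachstate);
__weak_reference(_thr_attr_getdetachstate, _pthread_attr_getdetachstate);

int
_thr_attr_getdetachstate(const pthread_attr_t *attr, int *detachstate)
{
        int ret;

        /* Check for invalid arguments: */
        if (attr == NULL ||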
*attr == NULL || detachstate == NULL) ret = EINVAL; else { /* Check if the detached flag is set: */ if ((*attr)->flags & PTHREAD_DETACHED) /* Return detached: */ *detachstate = PTHREAD_CREATE_DETACHED; else /* Return joinable: */ *detachstate = PTHREAD_CREATE_JOINABLE; ret = 0; } return (ret); } __weak_reference(_thr_attr_getguardsize, pthread_attr_getguardsize); __weak_reference(_thr_attr_getguardsize, _pthread_attr_getguardsize); int _thr_attr_getguardsize(const pthread_attr_t * __restrict attr, size_t * __restrict guardsize) { int ret; /* Check for invalid arguments: */ if (attr == NULL || *attr == NULL || guardsize == NULL) ret = EINVAL; else { /* Return the guard size: */ *guardsize = (*attr)->guardsize_attr; ret = 0; } return (ret); } __weak_reference(_thr_attr_getinheritsched, pthread_attr_getinheritsched); __weak_reference(_thr_attr_getinheritsched, _pthread_attr_getinheritsched); int _thr_attr_getinheritsched(const pthread_attr_t * __restrict attr, int * __restrict sched_inherit) { int ret = 0; if ((attr == NULL) || (*attr == NULL)) ret = EINVAL; else *sched_inherit = (*attr)->sched_inherit; return (ret); } __weak_reference(_thr_attr_getschedparam, pthread_attr_getschedparam); __weak_reference(_thr_attr_getschedparam, _pthread_attr_getschedparam); int _thr_attr_getschedparam(const pthread_attr_t * __restrict attr, struct sched_param * __restrict param) { int ret = 0; if ((attr == NULL) || (*attr == NULL) || (param == NULL)) ret = EINVAL; else param->sched_priority = (*attr)->prio; return (ret); } __weak_reference(_thr_attr_getschedpolicy, pthread_attr_getschedpolicy); __weak_reference(_thr_attr_getschedpolicy, _pthread_attr_getschedpolicy); int _thr_attr_getschedpolicy(const pthread_attr_t * __restrict attr, int * __restrict policy) { int ret = 0; if ((attr == NULL) || (*attr == NULL) || (policy == NULL)) ret = EINVAL; else *policy = (*attr)->sched_policy; return (ret); } __weak_reference(_thr_attr_getscope, pthread_attr_getscope); __weak_reference(_thr_attr_getscope, _pthread_attr_getscope); int _thr_attr_getscope(const pthread_attr_t * __restrict attr, int * __restrict contentionscope) { int ret = 0; if ((attr == NULL) || (*attr == NULL) || (contentionscope == NULL)) /* Return an invalid argument: */ ret = EINVAL; else *contentionscope = (*attr)->flags & PTHREAD_SCOPE_SYSTEM ? 
PTHREAD_SCOPE_SYSTEM : PTHREAD_SCOPE_PROCESS; return (ret); } __weak_reference(_pthread_attr_getstack, pthread_attr_getstack); int _pthread_attr_getstack(const pthread_attr_t * __restrict attr, void ** __restrict stackaddr, size_t * __restrict stacksize) { int ret; /* Check for invalid arguments: */ if (attr == NULL || *attr == NULL || stackaddr == NULL || stacksize == NULL ) ret = EINVAL; else { /* Return the stack address and size */ *stackaddr = (*attr)->stackaddr_attr; *stacksize = (*attr)->stacksize_attr; ret = 0; } return (ret); } __weak_reference(_thr_attr_getstackaddr, pthread_attr_getstackaddr); __weak_reference(_thr_attr_getstackaddr, _pthread_attr_getstackaddr); int _thr_attr_getstackaddr(const pthread_attr_t *attr, void **stackaddr) { int ret; /* Check for invalid arguments: */ if (attr == NULL || *attr == NULL || stackaddr == NULL) ret = EINVAL; else { /* Return the stack address: */ *stackaddr = (*attr)->stackaddr_attr; ret = 0; } return (ret); } __weak_reference(_thr_attr_getstacksize, pthread_attr_getstacksize); __weak_reference(_thr_attr_getstacksize, _pthread_attr_getstacksize); int _thr_attr_getstacksize(const pthread_attr_t * __restrict attr, size_t * __restrict stacksize) { int ret; /* Check for invalid arguments: */ if (attr == NULL || *attr == NULL || stacksize == NULL) ret = EINVAL; else { /* Return the stack size: */ *stacksize = (*attr)->stacksize_attr; ret = 0; } return (ret); } __weak_reference(_thr_attr_init, pthread_attr_init); __weak_reference(_thr_attr_init, _pthread_attr_init); int _thr_attr_init(pthread_attr_t *attr) { int ret; pthread_attr_t pattr; _thr_check_init(); /* Allocate memory for the attribute object: */ if ((pattr = (pthread_attr_t) malloc(sizeof(struct pthread_attr))) == NULL) /* Insufficient memory: */ ret = ENOMEM; else { /* Initialise the attribute object with the defaults: */ memcpy(pattr, &_pthread_attr_default, sizeof(struct pthread_attr)); /* Return a pointer to the attribute object: */ *attr = pattr; ret = 0; } return (ret); } __weak_reference(_pthread_attr_setcreatesuspend_np, pthread_attr_setcreatesuspend_np); int _pthread_attr_setcreatesuspend_np(pthread_attr_t *attr) { int ret; if (attr == NULL || *attr == NULL) { ret = EINVAL; } else { (*attr)->suspend = THR_CREATE_SUSPENDED; ret = 0; } return (ret); } __weak_reference(_thr_attr_setdetachstate, pthread_attr_setdetachstate); __weak_reference(_thr_attr_setdetachstate, _pthread_attr_setdetachstate); int _thr_attr_setdetachstate(pthread_attr_t *attr, int detachstate) { int ret; /* Check for invalid arguments: */ if (attr == NULL || *attr == NULL || (detachstate != PTHREAD_CREATE_DETACHED && detachstate != PTHREAD_CREATE_JOINABLE)) ret = EINVAL; else { /* Check if detached state: */ if (detachstate == PTHREAD_CREATE_DETACHED) /* Set the detached flag: */ (*attr)->flags |= PTHREAD_DETACHED; else /* Reset the detached flag: */ (*attr)->flags &= ~PTHREAD_DETACHED; ret = 0; } return (ret); } __weak_reference(_thr_attr_setguardsize, pthread_attr_setguardsize); __weak_reference(_thr_attr_setguardsize, _pthread_attr_setguardsize); int _thr_attr_setguardsize(pthread_attr_t *attr, size_t guardsize) { int ret; /* Check for invalid arguments. */ if (attr == NULL || *attr == NULL) ret = EINVAL; else { /* Save the stack size. 
*/ (*attr)->guardsize_attr = guardsize; ret = 0; } return (ret); } __weak_reference(_thr_attr_setinheritsched, pthread_attr_setinheritsched); __weak_reference(_thr_attr_setinheritsched, _pthread_attr_setinheritsched); int _thr_attr_setinheritsched(pthread_attr_t *attr, int sched_inherit) { int ret = 0; if ((attr == NULL) || (*attr == NULL)) ret = EINVAL; else if (sched_inherit != PTHREAD_INHERIT_SCHED && sched_inherit != PTHREAD_EXPLICIT_SCHED) ret = ENOTSUP; else (*attr)->sched_inherit = sched_inherit; return (ret); } __weak_reference(_thr_attr_setschedparam, pthread_attr_setschedparam); __weak_reference(_thr_attr_setschedparam, _pthread_attr_setschedparam); int _thr_attr_setschedparam(pthread_attr_t * __restrict attr, const struct sched_param * __restrict param) { int policy; if ((attr == NULL) || (*attr == NULL)) return (EINVAL); if (param == NULL) return (ENOTSUP); policy = (*attr)->sched_policy; if (policy == SCHED_FIFO || policy == SCHED_RR) { if (param->sched_priority < _thr_priorities[policy-1].pri_min || param->sched_priority > _thr_priorities[policy-1].pri_max) return (ENOTSUP); } else { /* * Ignore it for SCHED_OTHER now, patches for glib ports * are wrongly using M:N thread library's internal macro * THR_MIN_PRIORITY and THR_MAX_PRIORITY. */ } (*attr)->prio = param->sched_priority; return (0); } __weak_reference(_thr_attr_setschedpolicy, pthread_attr_setschedpolicy); __weak_reference(_thr_attr_setschedpolicy, _pthread_attr_setschedpolicy); int _thr_attr_setschedpolicy(pthread_attr_t *attr, int policy) { int ret = 0; if ((attr == NULL) || (*attr == NULL)) ret = EINVAL; else if ((policy < SCHED_FIFO) || (policy > SCHED_RR)) { ret = ENOTSUP; } else { (*attr)->sched_policy = policy; (*attr)->prio = _thr_priorities[policy-1].pri_default; } return (ret); } __weak_reference(_thr_attr_setscope, pthread_attr_setscope); __weak_reference(_thr_attr_setscope, _pthread_attr_setscope); int _thr_attr_setscope(pthread_attr_t *attr, int contentionscope) { int ret = 0; if ((attr == NULL) || (*attr == NULL)) { /* Return an invalid argument: */ ret = EINVAL; } else if ((contentionscope != PTHREAD_SCOPE_PROCESS) && (contentionscope != PTHREAD_SCOPE_SYSTEM)) { ret = EINVAL; } else if (contentionscope == PTHREAD_SCOPE_SYSTEM) { (*attr)->flags |= contentionscope; } else { (*attr)->flags &= ~PTHREAD_SCOPE_SYSTEM; } return (ret); } __weak_reference(_pthread_attr_setstack, pthread_attr_setstack); int _pthread_attr_setstack(pthread_attr_t *attr, void *stackaddr, size_t stacksize) { int ret; /* Check for invalid arguments: */ if (attr == NULL || *attr == NULL || stackaddr == NULL || stacksize < PTHREAD_STACK_MIN) ret = EINVAL; else { /* Save the stack address and stack size */ (*attr)->stackaddr_attr = stackaddr; (*attr)->stacksize_attr = stacksize; ret = 0; } return (ret); } __weak_reference(_thr_attr_setstackaddr, pthread_attr_setstackaddr); __weak_reference(_thr_attr_setstackaddr, _pthread_attr_setstackaddr); int _thr_attr_setstackaddr(pthread_attr_t *attr, void *stackaddr) { int ret; /* Check for invalid arguments: */ if (attr == NULL || *attr == NULL || stackaddr == NULL) ret = EINVAL; else { /* Save the stack address: */ (*attr)->stackaddr_attr = stackaddr; ret = 0; } return(ret); } __weak_reference(_thr_attr_setstacksize, pthread_attr_setstacksize); __weak_reference(_thr_attr_setstacksize, _pthread_attr_setstacksize); int _thr_attr_setstacksize(pthread_attr_t *attr, size_t stacksize) { int ret; /* Check for invalid arguments: */ if (attr == NULL || *attr == NULL || stacksize < PTHREAD_STACK_MIN) ret 
= EINVAL;
        else {
                /* Save the stack size: */
                (*attr)->stacksize_attr = stacksize;
                ret = 0;
        }
        return (ret);
}

static size_t
_get_kern_cpuset_size(void)
{
        static int kern_cpuset_size = 0;

        if (kern_cpuset_size == 0) {
                size_t len;

                len = sizeof(kern_cpuset_size);
-               if (sysctlbyname("kern.sched.cpusetsize", &kern_cpuset_size,
-                   &len, NULL, 0))
+               if (sysctlbyname("kern.sched.cpusetsizemin", &kern_cpuset_size,
+                   &len, NULL, 0) != 0 &&
+                   sysctlbyname("kern.sched.cpusetsize", &kern_cpuset_size,
+                   &len, NULL, 0) != 0)
                        PANIC("failed to get sysctl kern.sched.cpusetsize");
        }
        return (kern_cpuset_size);
}

__weak_reference(_pthread_attr_setaffinity_np, pthread_attr_setaffinity_np);

int
_pthread_attr_setaffinity_np(pthread_attr_t *pattr, size_t cpusetsize,
    const cpuset_t *cpusetp)
{
        pthread_attr_t attr;
        int ret;

        if (pattr == NULL || (attr = (*pattr)) == NULL)
                ret = EINVAL;
        else {
                if (cpusetsize == 0 || cpusetp == NULL) {
                        if (attr->cpuset != NULL) {
                                free(attr->cpuset);
                                attr->cpuset = NULL;
                                attr->cpusetsize = 0;
                        }
                        return (0);
                }
                size_t kern_size = _get_kern_cpuset_size();
                /* Kernel rejects small set, we check it here too. */
                if (cpusetsize < kern_size)
                        return (ERANGE);
                if (cpusetsize > kern_size) {
                        /* Kernel checks invalid bits, we check it here too. */
                        size_t i;

                        for (i = kern_size; i < cpusetsize; ++i) {
                                if (((const char *)cpusetp)[i])
                                        return (EINVAL);
                        }
                }
                if (attr->cpuset == NULL) {
                        attr->cpuset = calloc(1, kern_size);
                        if (attr->cpuset == NULL)
                                return (errno);
                        attr->cpusetsize = kern_size;
                }
                memcpy(attr->cpuset, cpusetp, kern_size);
                ret = 0;
        }
        return (ret);
}

__weak_reference(_pthread_attr_getaffinity_np, pthread_attr_getaffinity_np);

int
_pthread_attr_getaffinity_np(const pthread_attr_t *pattr, size_t cpusetsize,
    cpuset_t *cpusetp)
{
        pthread_attr_t attr;
        int ret = 0;

        if (pattr == NULL || (attr = (*pattr)) == NULL)
                ret = EINVAL;
        else {
                /* Kernel rejects small set, we check it here too. */
                size_t kern_size = _get_kern_cpuset_size();

                if (cpusetsize < kern_size)
                        return (ERANGE);
                if (attr->cpuset != NULL)
                        memcpy(cpusetp, attr->cpuset,
                            MIN(cpusetsize, attr->cpusetsize));
                else
                        memset(cpusetp, -1, kern_size);
                if (cpusetsize > kern_size)
                        memset(((char *)cpusetp) + kern_size, 0,
                            cpusetsize - kern_size);
        }
        return (ret);
}
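With the change above, _get_kern_cpuset_size() now asks the kernel for kern.sched.cpusetsizemin, the smallest set size the kernel will accept, and only falls back to the older kern.sched.cpusetsize OID on a kernel that predates the new sysctl. A minimal standalone sketch of the same probe-with-fallback idiom (the helper name probe_cpuset_size is hypothetical):

#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

static int
probe_cpuset_size(void)
{
        int sz = 0;
        size_t len = sizeof(sz);

        /* Prefer the new, smaller minimum; fall back on older kernels. */
        if (sysctlbyname("kern.sched.cpusetsizemin", &sz, &len, NULL, 0) != 0 &&
            sysctlbyname("kern.sched.cpusetsize", &sz, &len, NULL, 0) != 0)
                return (-1);    /* neither OID exists */
        return (sz);
}

int
main(void)
{
        printf("kernel cpuset size: %d bytes\n", probe_cpuset_size());
        return (0);
}

diff --git a/sys/kern/kern_cpuset.c b/sys/kern/kern_cpuset.c
index affc48e78862..d2bbfff1337d 100644
--- a/sys/kern/kern_cpuset.c
+++ b/sys/kern/kern_cpuset.c
@@ -1,2552 +1,2558 @@
/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2008, Jeffrey Roberson
 * All rights reserved.
 *
 * Copyright (c) 2008 Nokia Corporation
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.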
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif /* DDB */ /* * cpusets provide a mechanism for creating and manipulating sets of * processors for the purpose of constraining the scheduling of threads to * specific processors. * * Each process belongs to an identified set, by default this is set 1. Each * thread may further restrict the cpus it may run on to a subset of this * named set. This creates an anonymous set which other threads and processes * may not join by number. * * The named set is referred to herein as the 'base' set to avoid ambiguity. * This set is usually a child of a 'root' set while the anonymous set may * simply be referred to as a mask. In the syscall api these are referred to * as the ROOT, CPUSET, and MASK levels where CPUSET is called 'base' here. * * Threads inherit their set from their creator whether it be anonymous or * not. This means that anonymous sets are immutable because they may be * shared. To modify an anonymous set a new set is created with the desired * mask and the same parent as the existing anonymous set. This gives the * illusion of each thread having a private mask. * * Via the syscall apis a user may ask to retrieve or modify the root, base, * or mask that is discovered via a pid, tid, or setid. Modifying a set * modifies all numbered and anonymous child sets to comply with the new mask. * Modifying a pid or tid's mask applies only to that tid but must still * exist within the assigned parent set. * * A thread may not be assigned to a group separate from other threads in * the process. This is to remove ambiguity when the setid is queried with * a pid argument. There is no other technical limitation. * * This somewhat complex arrangement is intended to make it easy for * applications to query available processors and bind their threads to * specific processors while also allowing administrators to dynamically * reprovision by changing sets which apply to groups of processes. * * A simple application should not concern itself with sets at all and * rather apply masks to its own threads via CPU_WHICH_TID and a -1 id * meaning 'curthread'. It may query available cpus for that tid with a * getaffinity call using (CPU_LEVEL_CPUSET, CPU_WHICH_PID, -1, ...). 
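As the comment above suggests, a simple application can query the CPUs available to its base set without touching setids at all. A minimal sketch (error handling trimmed):

#include <sys/param.h>
#include <sys/cpuset.h>
#include <stdio.h>

int
main(void)
{
        cpuset_t mask;
        int i;

        /* CPUs in this process's base (CPUSET-level) set; -1 = curproc. */
        if (cpuset_getaffinity(CPU_LEVEL_CPUSET, CPU_WHICH_PID, -1,
            sizeof(mask), &mask) != 0)
                return (1);
        for (i = 0; i < CPU_SETSIZE; i++)
                if (CPU_ISSET(i, &mask))
                        printf("cpu %d available\n", i);
        return (0);
}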
*/ LIST_HEAD(domainlist, domainset); struct domainset __read_mostly domainset_firsttouch; struct domainset __read_mostly domainset_fixed[MAXMEMDOM]; struct domainset __read_mostly domainset_interleave; struct domainset __read_mostly domainset_prefer[MAXMEMDOM]; struct domainset __read_mostly domainset_roundrobin; static uma_zone_t cpuset_zone; static uma_zone_t domainset_zone; static struct mtx cpuset_lock; static struct setlist cpuset_ids; static struct domainlist cpuset_domains; static struct unrhdr *cpuset_unr; static struct cpuset *cpuset_zero, *cpuset_default, *cpuset_kernel; static struct domainset *domainset0, *domainset2; +u_int cpusetsizemin = 1; /* Return the size of cpuset_t at the kernel level */ SYSCTL_INT(_kern_sched, OID_AUTO, cpusetsize, CTLFLAG_RD | CTLFLAG_CAPRD, SYSCTL_NULL_INT_PTR, sizeof(cpuset_t), "sizeof(cpuset_t)"); +/* Return the minimum size of cpuset_t allowed by the kernel */ +SYSCTL_UINT(_kern_sched, OID_AUTO, cpusetsizemin, + CTLFLAG_RD | CTLFLAG_CAPRD, &cpusetsizemin, 0, + "The minimum size of cpuset_t allowed by the kernel"); + cpuset_t *cpuset_root; cpuset_t cpuset_domain[MAXMEMDOM]; static int domainset_valid(const struct domainset *, const struct domainset *); /* * Find the first non-anonymous set starting from 'set'. */ static struct cpuset * cpuset_getbase(struct cpuset *set) { if (set->cs_id == CPUSET_INVALID) set = set->cs_parent; return (set); } /* * Walks up the tree from 'set' to find the root. */ static struct cpuset * cpuset_getroot(struct cpuset *set) { while ((set->cs_flags & CPU_SET_ROOT) == 0 && set->cs_parent != NULL) set = set->cs_parent; return (set); } /* * Acquire a reference to a cpuset, all pointers must be tracked with refs. */ struct cpuset * cpuset_ref(struct cpuset *set) { refcount_acquire(&set->cs_ref); return (set); } /* * Walks up the tree from 'set' to find the root. Returns the root * referenced. */ static struct cpuset * cpuset_refroot(struct cpuset *set) { return (cpuset_ref(cpuset_getroot(set))); } /* * Find the first non-anonymous set starting from 'set'. Returns this set * referenced. May return the passed in set with an extra ref if it is * not anonymous. */ static struct cpuset * cpuset_refbase(struct cpuset *set) { return (cpuset_ref(cpuset_getbase(set))); } /* * Release a reference in a context where it is safe to allocate. */ void cpuset_rel(struct cpuset *set) { cpusetid_t id; if (refcount_release_if_not_last(&set->cs_ref)) return; mtx_lock_spin(&cpuset_lock); if (!refcount_release(&set->cs_ref)) { mtx_unlock_spin(&cpuset_lock); return; } LIST_REMOVE(set, cs_siblings); id = set->cs_id; if (id != CPUSET_INVALID) LIST_REMOVE(set, cs_link); mtx_unlock_spin(&cpuset_lock); cpuset_rel(set->cs_parent); uma_zfree(cpuset_zone, set); if (id != CPUSET_INVALID) free_unr(cpuset_unr, id); } /* * Deferred release must be used when in a context that is not safe to * allocate/free. This places any unreferenced sets on the list 'head'. */ static void cpuset_rel_defer(struct setlist *head, struct cpuset *set) { if (refcount_release_if_not_last(&set->cs_ref)) return; mtx_lock_spin(&cpuset_lock); if (!refcount_release(&set->cs_ref)) { mtx_unlock_spin(&cpuset_lock); return; } LIST_REMOVE(set, cs_siblings); if (set->cs_id != CPUSET_INVALID) LIST_REMOVE(set, cs_link); LIST_INSERT_HEAD(head, set, cs_link); mtx_unlock_spin(&cpuset_lock); } /* * Complete a deferred release. Removes the set from the list provided to * cpuset_rel_defer. 
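cpuset_rel() above uses a common two-stage release: refcount_release_if_not_last() drops the reference without touching the spin lock in the common case, and only a potentially final release takes cpuset_lock and re-tests, closing the race with a concurrent cpuset_lookup() that could re-reference the set. A self-contained userland analogue using C11 atomics (the lock calls are placeholders and the lookup structure is elided):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdlib.h>

struct obj {
        atomic_uint refs;
        /* ... payload, plus linkage in some locked lookup structure ... */
};

/* Drop a reference on the fast path only if we are provably not last. */
static bool
release_if_not_last(atomic_uint *refs)
{
        unsigned old = atomic_load(refs);

        while (old > 1)
                if (atomic_compare_exchange_weak(refs, &old, old - 1))
                        return (true);
        return (false);         /* might be the last reference */
}

static void
obj_rel(struct obj *o)
{
        if (release_if_not_last(&o->refs))
                return;
        /* lock(lookup_lock);  -- stands in for cpuset_lock */
        if (atomic_fetch_sub(&o->refs, 1) != 1) {
                /* A concurrent lookup re-referenced it; nothing to do. */
                /* unlock(lookup_lock); */
                return;
        }
        /* Truly last: unlink from the lookup structure, then free. */
        /* unlock(lookup_lock); */
        free(o);
}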
*/ static void cpuset_rel_complete(struct cpuset *set) { cpusetid_t id; id = set->cs_id; LIST_REMOVE(set, cs_link); cpuset_rel(set->cs_parent); uma_zfree(cpuset_zone, set); if (id != CPUSET_INVALID) free_unr(cpuset_unr, id); } /* * Find a set based on an id. Returns it with a ref. */ static struct cpuset * cpuset_lookup(cpusetid_t setid, struct thread *td) { struct cpuset *set; if (setid == CPUSET_INVALID) return (NULL); mtx_lock_spin(&cpuset_lock); LIST_FOREACH(set, &cpuset_ids, cs_link) if (set->cs_id == setid) break; if (set) cpuset_ref(set); mtx_unlock_spin(&cpuset_lock); KASSERT(td != NULL, ("[%s:%d] td is NULL", __func__, __LINE__)); if (set != NULL && jailed(td->td_ucred)) { struct cpuset *jset, *tset; jset = td->td_ucred->cr_prison->pr_cpuset; for (tset = set; tset != NULL; tset = tset->cs_parent) if (tset == jset) break; if (tset == NULL) { cpuset_rel(set); set = NULL; } } return (set); } /* * Initialize a set in the space provided in 'set' with the provided parameters. * The set is returned with a single ref. May return EDEADLK if the set * will have no valid cpu based on restrictions from the parent. */ static int cpuset_init(struct cpuset *set, struct cpuset *parent, const cpuset_t *mask, struct domainset *domain, cpusetid_t id) { if (domain == NULL) domain = parent->cs_domain; if (mask == NULL) mask = &parent->cs_mask; if (!CPU_OVERLAP(&parent->cs_mask, mask)) return (EDEADLK); /* The domain must be prepared ahead of time. */ if (!domainset_valid(parent->cs_domain, domain)) return (EDEADLK); CPU_COPY(mask, &set->cs_mask); LIST_INIT(&set->cs_children); refcount_init(&set->cs_ref, 1); set->cs_flags = 0; mtx_lock_spin(&cpuset_lock); set->cs_domain = domain; CPU_AND(&set->cs_mask, &set->cs_mask, &parent->cs_mask); set->cs_id = id; set->cs_parent = cpuset_ref(parent); LIST_INSERT_HEAD(&parent->cs_children, set, cs_siblings); if (set->cs_id != CPUSET_INVALID) LIST_INSERT_HEAD(&cpuset_ids, set, cs_link); mtx_unlock_spin(&cpuset_lock); return (0); } /* * Create a new non-anonymous set with the requested parent and mask. May * return failures if the mask is invalid or a new number can not be * allocated. * * If *setp is not NULL, then it will be used as-is. The caller must take * into account that *setp will be inserted at the head of cpuset_ids and * plan any potentially conflicting cs_link usage accordingly. 
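cpuset_init() above enforces the invariant that a child must leave at least one runnable CPU: a mask that does not overlap the parent fails with EDEADLK, and the stored mask is clamped with CPU_AND against the parent. A userland sketch of the same rule, using the three-argument CPU_AND as in this tree (the helper name child_mask is hypothetical):

#include <sys/param.h>
#include <sys/cpuset.h>
#include <errno.h>
#include <stdio.h>

static int
child_mask(const cpuset_t *parent, cpuset_t *child)
{
        cpuset_t tmp;

        CPU_AND(&tmp, parent, child);   /* clamp the child to the parent */
        if (CPU_EMPTY(&tmp))
                return (EDEADLK);       /* no runnable CPU would remain */
        CPU_COPY(&tmp, child);
        return (0);
}

int
main(void)
{
        cpuset_t parent, child;

        CPU_ZERO(&parent);
        CPU_SET(0, &parent);
        CPU_SET(1, &parent);
        CPU_ZERO(&child);
        CPU_SET(1, &child);
        CPU_SET(5, &child);             /* CPU 5 is outside the parent */
        printf("%d\n", child_mask(&parent, &child));    /* 0; child = {1} */
        return (0);
}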
*/ static int cpuset_create(struct cpuset **setp, struct cpuset *parent, const cpuset_t *mask) { struct cpuset *set; cpusetid_t id; int error; bool dofree; id = alloc_unr(cpuset_unr); if (id == -1) return (ENFILE); dofree = (*setp == NULL); if (*setp != NULL) set = *setp; else *setp = set = uma_zalloc(cpuset_zone, M_WAITOK | M_ZERO); error = cpuset_init(set, parent, mask, NULL, id); if (error == 0) return (0); free_unr(cpuset_unr, id); if (dofree) uma_zfree(cpuset_zone, set); return (error); } static void cpuset_freelist_add(struct setlist *list, int count) { struct cpuset *set; int i; for (i = 0; i < count; i++) { set = uma_zalloc(cpuset_zone, M_ZERO | M_WAITOK); LIST_INSERT_HEAD(list, set, cs_link); } } static void cpuset_freelist_init(struct setlist *list, int count) { LIST_INIT(list); cpuset_freelist_add(list, count); } static void cpuset_freelist_free(struct setlist *list) { struct cpuset *set; while ((set = LIST_FIRST(list)) != NULL) { LIST_REMOVE(set, cs_link); uma_zfree(cpuset_zone, set); } } static void domainset_freelist_add(struct domainlist *list, int count) { struct domainset *set; int i; for (i = 0; i < count; i++) { set = uma_zalloc(domainset_zone, M_ZERO | M_WAITOK); LIST_INSERT_HEAD(list, set, ds_link); } } static void domainset_freelist_init(struct domainlist *list, int count) { LIST_INIT(list); domainset_freelist_add(list, count); } static void domainset_freelist_free(struct domainlist *list) { struct domainset *set; while ((set = LIST_FIRST(list)) != NULL) { LIST_REMOVE(set, ds_link); uma_zfree(domainset_zone, set); } } /* Copy a domainset preserving mask and policy. */ static void domainset_copy(const struct domainset *from, struct domainset *to) { DOMAINSET_COPY(&from->ds_mask, &to->ds_mask); to->ds_policy = from->ds_policy; to->ds_prefer = from->ds_prefer; } /* Return 1 if mask and policy are equal, otherwise 0. */ static int domainset_equal(const struct domainset *one, const struct domainset *two) { return (DOMAINSET_CMP(&one->ds_mask, &two->ds_mask) == 0 && one->ds_policy == two->ds_policy && one->ds_prefer == two->ds_prefer); } /* Return 1 if child is a valid subset of parent. */ static int domainset_valid(const struct domainset *parent, const struct domainset *child) { if (child->ds_policy != DOMAINSET_POLICY_PREFER) return (DOMAINSET_SUBSET(&parent->ds_mask, &child->ds_mask)); return (DOMAINSET_ISSET(child->ds_prefer, &parent->ds_mask)); } static int domainset_restrict(const struct domainset *parent, const struct domainset *child) { if (child->ds_policy != DOMAINSET_POLICY_PREFER) return (DOMAINSET_OVERLAP(&parent->ds_mask, &child->ds_mask)); return (DOMAINSET_ISSET(child->ds_prefer, &parent->ds_mask)); } /* * Lookup or create a domainset. The key is provided in ds_mask and * ds_policy. If the domainset does not yet exist the storage in * 'domain' is used to insert. Otherwise this storage is freed to the * domainset_zone and the existing domainset is returned. 
*/ static struct domainset * _domainset_create(struct domainset *domain, struct domainlist *freelist) { struct domainset *ndomain; int i, j; KASSERT(domain->ds_cnt <= vm_ndomains, ("invalid domain count in domainset %p", domain)); KASSERT(domain->ds_policy != DOMAINSET_POLICY_PREFER || domain->ds_prefer < vm_ndomains, ("invalid preferred domain in domains %p", domain)); mtx_lock_spin(&cpuset_lock); LIST_FOREACH(ndomain, &cpuset_domains, ds_link) if (domainset_equal(ndomain, domain)) break; /* * If the domain does not yet exist we insert it and initialize * various iteration helpers which are not part of the key. */ if (ndomain == NULL) { LIST_INSERT_HEAD(&cpuset_domains, domain, ds_link); domain->ds_cnt = DOMAINSET_COUNT(&domain->ds_mask); for (i = 0, j = 0; i < DOMAINSET_FLS(&domain->ds_mask); i++) if (DOMAINSET_ISSET(i, &domain->ds_mask)) domain->ds_order[j++] = i; } mtx_unlock_spin(&cpuset_lock); if (ndomain == NULL) return (domain); if (freelist != NULL) LIST_INSERT_HEAD(freelist, domain, ds_link); else uma_zfree(domainset_zone, domain); return (ndomain); } /* * Are any of the domains in the mask empty? If so, silently * remove them and update the domainset accordingly. If only empty * domains are present, we must return failure. */ static bool domainset_empty_vm(struct domainset *domain) { domainset_t empty; int i, j; DOMAINSET_ZERO(&empty); for (i = 0; i < vm_ndomains; i++) if (VM_DOMAIN_EMPTY(i)) DOMAINSET_SET(i, &empty); if (DOMAINSET_SUBSET(&empty, &domain->ds_mask)) return (true); /* Remove empty domains from the set and recompute. */ DOMAINSET_ANDNOT(&domain->ds_mask, &empty); domain->ds_cnt = DOMAINSET_COUNT(&domain->ds_mask); for (i = j = 0; i < DOMAINSET_FLS(&domain->ds_mask); i++) if (DOMAINSET_ISSET(i, &domain->ds_mask)) domain->ds_order[j++] = i; /* Convert a PREFER policy referencing an empty domain to RR. */ if (domain->ds_policy == DOMAINSET_POLICY_PREFER && DOMAINSET_ISSET(domain->ds_prefer, &empty)) { domain->ds_policy = DOMAINSET_POLICY_ROUNDROBIN; domain->ds_prefer = -1; } return (false); } /* * Create or lookup a domainset based on the key held in 'domain'. */ struct domainset * domainset_create(const struct domainset *domain) { struct domainset *ndomain; /* * Validate the policy. It must specify a useable policy number with * only valid domains. Preferred must include the preferred domain * in the mask. */ if (domain->ds_policy <= DOMAINSET_POLICY_INVALID || domain->ds_policy > DOMAINSET_POLICY_MAX) return (NULL); if (domain->ds_policy == DOMAINSET_POLICY_PREFER && !DOMAINSET_ISSET(domain->ds_prefer, &domain->ds_mask)) return (NULL); if (!DOMAINSET_SUBSET(&domainset0->ds_mask, &domain->ds_mask)) return (NULL); ndomain = uma_zalloc(domainset_zone, M_WAITOK | M_ZERO); domainset_copy(domain, ndomain); return _domainset_create(ndomain, NULL); } /* * Update thread domainset pointers. */ static void domainset_notify(void) { struct thread *td; struct proc *p; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); if (p->p_state == PRS_NEW) { PROC_UNLOCK(p); continue; } FOREACH_THREAD_IN_PROC(p, td) { thread_lock(td); td->td_domain.dr_policy = td->td_cpuset->cs_domain; thread_unlock(td); } PROC_UNLOCK(p); } sx_sunlock(&allproc_lock); kernel_object->domain.dr_policy = cpuset_kernel->cs_domain; } /* * Create a new set that is a subset of a parent. 
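_domainset_create() above is a lookup-or-insert (interning) pattern: the caller always passes freshly filled storage, and either that storage is published in the global list or it is released and the pre-existing equal set is returned, so equal domainsets stay pointer-comparable. A generic sketch with a toy key type (locking elided):

#include <stdlib.h>
#include <sys/queue.h>

struct key {
        int policy;
        int prefer;
        LIST_ENTRY(key) link;
};

static LIST_HEAD(, key) interned = LIST_HEAD_INITIALIZER(interned);

/*
 * Lookup-or-insert, as _domainset_create() does: the caller's storage is
 * either inserted or released in favor of the existing equal entry.
 */
static struct key *
key_intern(struct key *k)
{
        struct key *it;

        LIST_FOREACH(it, &interned, link)
                if (it->policy == k->policy && it->prefer == k->prefer) {
                        free(k);        /* duplicate: reuse the existing one */
                        return (it);
                }
        LIST_INSERT_HEAD(&interned, k, link);
        return (k);
}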
*/ static struct domainset * domainset_shadow(const struct domainset *pdomain, const struct domainset *domain, struct domainlist *freelist) { struct domainset *ndomain; ndomain = LIST_FIRST(freelist); LIST_REMOVE(ndomain, ds_link); /* * Initialize the key from the request. */ domainset_copy(domain, ndomain); /* * Restrict the key by the parent. */ DOMAINSET_AND(&ndomain->ds_mask, &pdomain->ds_mask); return _domainset_create(ndomain, freelist); } /* * Recursively check for errors that would occur from applying mask to * the tree of sets starting at 'set'. Checks for sets that would become * empty as well as RDONLY flags. */ static int cpuset_testupdate(struct cpuset *set, cpuset_t *mask, int augment_mask) { struct cpuset *nset; cpuset_t newmask; int error; mtx_assert(&cpuset_lock, MA_OWNED); if (set->cs_flags & CPU_SET_RDONLY) return (EPERM); if (augment_mask) { CPU_AND(&newmask, &set->cs_mask, mask); } else CPU_COPY(mask, &newmask); if (CPU_EMPTY(&newmask)) return (EDEADLK); error = 0; LIST_FOREACH(nset, &set->cs_children, cs_siblings) if ((error = cpuset_testupdate(nset, &newmask, 1)) != 0) break; return (error); } /* * Applies the mask 'mask' without checking for empty sets or permissions. */ static void cpuset_update(struct cpuset *set, cpuset_t *mask) { struct cpuset *nset; mtx_assert(&cpuset_lock, MA_OWNED); CPU_AND(&set->cs_mask, &set->cs_mask, mask); LIST_FOREACH(nset, &set->cs_children, cs_siblings) cpuset_update(nset, &set->cs_mask); return; } /* * Modify the set 'set' to use a copy of the mask provided. Apply this new * mask to restrict all children in the tree. Checks for validity before * applying the changes. */ static int cpuset_modify(struct cpuset *set, cpuset_t *mask) { struct cpuset *root; int error; error = priv_check(curthread, PRIV_SCHED_CPUSET); if (error) return (error); /* * In case we are called from within the jail, * we do not allow modifying the dedicated root * cpuset of the jail but may still allow to * change child sets, including subordinate jails' * roots. */ if ((set->cs_flags & CPU_SET_ROOT) != 0 && jailed(curthread->td_ucred) && set == curthread->td_ucred->cr_prison->pr_cpuset) return (EPERM); /* * Verify that we have access to this set of * cpus. */ if ((set->cs_flags & (CPU_SET_ROOT | CPU_SET_RDONLY)) == CPU_SET_ROOT) { KASSERT(set->cs_parent != NULL, ("jail.cpuset=%d is not a proper child of parent jail's root.", set->cs_id)); /* * cpuset_getroot() cannot work here due to how top-level jail * roots are constructed. Top-level jails are parented to * thread0's cpuset (i.e. cpuset 1) rather than the system root. */ root = set->cs_parent; } else { root = cpuset_getroot(set); } mtx_lock_spin(&cpuset_lock); if (root && !CPU_SUBSET(&root->cs_mask, mask)) { error = EINVAL; goto out; } error = cpuset_testupdate(set, mask, 0); if (error) goto out; CPU_COPY(mask, &set->cs_mask); cpuset_update(set, mask); out: mtx_unlock_spin(&cpuset_lock); return (error); } /* * Recursively check for errors that would occur from applying mask to * the tree of sets starting at 'set'. Checks for sets that would become * empty as well as RDONLY flags. 
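cpuset_modify() above is deliberately two-phase: cpuset_testupdate() recurses over the whole subtree as a side-effect-free dry run, rejecting read-only sets and masks that would empty a descendant, and only when that succeeds does cpuset_update() commit, so the tree can never be left half-updated. A sketch of the same two-phase recursion on a toy tree, returning the same EPERM/EDEADLK errors:

#include <errno.h>
#include <stddef.h>

struct node {
        unsigned mask;
        struct node *child[2];  /* toy binary tree instead of a LIST */
        int rdonly;
};

/* Phase 1: would applying 'mask' empty any set in the subtree? */
static int
test_update(const struct node *n, unsigned mask)
{
        unsigned newmask;
        int i, error;

        if (n == NULL)
                return (0);
        if (n->rdonly)
                return (EPERM);
        newmask = n->mask & mask;
        if (newmask == 0)
                return (EDEADLK);
        for (i = 0; i < 2; i++)
                if ((error = test_update(n->child[i], newmask)) != 0)
                        return (error);
        return (0);
}

/* Phase 2: commit, knowing phase 1 cannot fail halfway down. */
static void
apply_update(struct node *n, unsigned mask)
{
        int i;

        if (n == NULL)
                return;
        n->mask &= mask;
        for (i = 0; i < 2; i++)
                apply_update(n->child[i], n->mask);
}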
*/ static int cpuset_testupdate_domain(struct cpuset *set, struct domainset *dset, struct domainset *orig, int *count, int augment_mask __unused) { struct cpuset *nset; struct domainset *domain; struct domainset newset; int error; mtx_assert(&cpuset_lock, MA_OWNED); if (set->cs_flags & CPU_SET_RDONLY) return (EPERM); domain = set->cs_domain; domainset_copy(domain, &newset); if (!domainset_equal(domain, orig)) { if (!domainset_restrict(domain, dset)) return (EDEADLK); DOMAINSET_AND(&newset.ds_mask, &dset->ds_mask); /* Count the number of domains that are changing. */ (*count)++; } error = 0; LIST_FOREACH(nset, &set->cs_children, cs_siblings) if ((error = cpuset_testupdate_domain(nset, &newset, domain, count, 1)) != 0) break; return (error); } /* * Applies the mask 'mask' without checking for empty sets or permissions. */ static void cpuset_update_domain(struct cpuset *set, struct domainset *domain, struct domainset *orig, struct domainlist *domains) { struct cpuset *nset; mtx_assert(&cpuset_lock, MA_OWNED); /* * If this domainset has changed from the parent we must calculate * a new set. Otherwise it simply inherits from the parent. When * we inherit from the parent we get a new mask and policy. If the * set is modified from the parent we keep the policy and only * update the mask. */ if (set->cs_domain != orig) { orig = set->cs_domain; set->cs_domain = domainset_shadow(domain, orig, domains); } else set->cs_domain = domain; LIST_FOREACH(nset, &set->cs_children, cs_siblings) cpuset_update_domain(nset, set->cs_domain, orig, domains); return; } /* * Modify the set 'set' to use a copy the domainset provided. Apply this new * mask to restrict all children in the tree. Checks for validity before * applying the changes. */ static int cpuset_modify_domain(struct cpuset *set, struct domainset *domain) { struct domainlist domains; struct domainset temp; struct domainset *dset; struct cpuset *root; int ndomains, needed; int error; error = priv_check(curthread, PRIV_SCHED_CPUSET); if (error) return (error); /* * In case we are called from within the jail * we do not allow modifying the dedicated root * cpuset of the jail but may still allow to * change child sets. */ if (jailed(curthread->td_ucred) && set->cs_flags & CPU_SET_ROOT) return (EPERM); domainset_freelist_init(&domains, 0); domain = domainset_create(domain); ndomains = 0; mtx_lock_spin(&cpuset_lock); for (;;) { root = cpuset_getroot(set); dset = root->cs_domain; /* * Verify that we have access to this set of domains. */ if (!domainset_valid(dset, domain)) { error = EINVAL; goto out; } /* * If applying prefer we keep the current set as the fallback. */ if (domain->ds_policy == DOMAINSET_POLICY_PREFER) DOMAINSET_COPY(&set->cs_domain->ds_mask, &domain->ds_mask); /* * Determine whether we can apply this set of domains and * how many new domain structures it will require. */ domainset_copy(domain, &temp); needed = 0; error = cpuset_testupdate_domain(set, &temp, set->cs_domain, &needed, 0); if (error) goto out; if (ndomains >= needed) break; /* Dropping the lock; we'll need to re-evaluate again. */ mtx_unlock_spin(&cpuset_lock); domainset_freelist_add(&domains, needed - ndomains); ndomains = needed; mtx_lock_spin(&cpuset_lock); } dset = set->cs_domain; cpuset_update_domain(set, domain, dset, &domains); out: mtx_unlock_spin(&cpuset_lock); domainset_freelist_free(&domains); if (error == 0) domainset_notify(); return (error); } /* * Resolve the 'which' parameter of several cpuset apis. 
* * For WHICH_PID and WHICH_TID return a locked proc and valid proc/tid. Also * checks for permission via p_cansched(). * * For WHICH_SET returns a valid set with a new reference. * * -1 may be supplied for any argument to mean the current proc/thread or * the base set of the current thread. May fail with ESRCH/EPERM. */ int cpuset_which(cpuwhich_t which, id_t id, struct proc **pp, struct thread **tdp, struct cpuset **setp) { struct cpuset *set; struct thread *td; struct proc *p; int error; *pp = p = NULL; *tdp = td = NULL; *setp = set = NULL; switch (which) { case CPU_WHICH_PID: if (id == -1) { PROC_LOCK(curproc); p = curproc; break; } if ((p = pfind(id)) == NULL) return (ESRCH); break; case CPU_WHICH_TID: if (id == -1) { PROC_LOCK(curproc); p = curproc; td = curthread; break; } td = tdfind(id, -1); if (td == NULL) return (ESRCH); p = td->td_proc; break; case CPU_WHICH_CPUSET: if (id == -1) { thread_lock(curthread); set = cpuset_refbase(curthread->td_cpuset); thread_unlock(curthread); } else set = cpuset_lookup(id, curthread); if (set) { *setp = set; return (0); } return (ESRCH); case CPU_WHICH_JAIL: { /* Find `set' for prison with given id. */ struct prison *pr; sx_slock(&allprison_lock); pr = prison_find_child(curthread->td_ucred->cr_prison, id); sx_sunlock(&allprison_lock); if (pr == NULL) return (ESRCH); cpuset_ref(pr->pr_cpuset); *setp = pr->pr_cpuset; mtx_unlock(&pr->pr_mtx); return (0); } case CPU_WHICH_IRQ: case CPU_WHICH_DOMAIN: return (0); default: return (EINVAL); } error = p_cansched(curthread, p); if (error) { PROC_UNLOCK(p); return (error); } if (td == NULL) td = FIRST_THREAD_IN_PROC(p); *pp = p; *tdp = td; return (0); } static int cpuset_testshadow(struct cpuset *set, const cpuset_t *mask, const struct domainset *domain) { struct cpuset *parent; struct domainset *dset; parent = cpuset_getbase(set); /* * If we are restricting a cpu mask it must be a subset of the * parent or invalid CPUs have been specified. */ if (mask != NULL && !CPU_SUBSET(&parent->cs_mask, mask)) return (EINVAL); /* * If we are restricting a domain mask it must be a subset of the * parent or invalid domains have been specified. */ dset = parent->cs_domain; if (domain != NULL && !domainset_valid(dset, domain)) return (EINVAL); return (0); } /* * Create an anonymous set with the provided mask in the space provided by * 'nset'. If the passed in set is anonymous we use its parent otherwise * the new set is a child of 'set'. 
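The id == -1 convention resolved here by cpuset_which() is what lets a thread operate on itself without knowing its tid. For example, pinning the calling thread to CPU 0 from userland (a minimal sketch):

#include <sys/param.h>
#include <sys/cpuset.h>
#include <stdio.h>

int
main(void)
{
        cpuset_t mask;

        CPU_ZERO(&mask);
        CPU_SET(0, &mask);
        /* id -1 resolves to the calling thread for CPU_WHICH_TID. */
        if (cpuset_setaffinity(CPU_LEVEL_WHICH, CPU_WHICH_TID, -1,
            sizeof(mask), &mask) != 0) {
                perror("cpuset_setaffinity");
                return (1);
        }
        return (0);
}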
*/ static int cpuset_shadow(struct cpuset *set, struct cpuset **nsetp, const cpuset_t *mask, const struct domainset *domain, struct setlist *cpusets, struct domainlist *domains) { struct cpuset *parent; struct cpuset *nset; struct domainset *dset; struct domainset *d; int error; error = cpuset_testshadow(set, mask, domain); if (error) return (error); parent = cpuset_getbase(set); dset = parent->cs_domain; if (mask == NULL) mask = &set->cs_mask; if (domain != NULL) d = domainset_shadow(dset, domain, domains); else d = set->cs_domain; nset = LIST_FIRST(cpusets); error = cpuset_init(nset, parent, mask, d, CPUSET_INVALID); if (error == 0) { LIST_REMOVE(nset, cs_link); *nsetp = nset; } return (error); } static struct cpuset * cpuset_update_thread(struct thread *td, struct cpuset *nset) { struct cpuset *tdset; tdset = td->td_cpuset; td->td_cpuset = nset; td->td_domain.dr_policy = nset->cs_domain; sched_affinity(td); return (tdset); } static int cpuset_setproc_test_maskthread(struct cpuset *tdset, cpuset_t *mask, struct domainset *domain) { struct cpuset *parent; parent = cpuset_getbase(tdset); if (mask == NULL) mask = &tdset->cs_mask; if (domain == NULL) domain = tdset->cs_domain; return cpuset_testshadow(parent, mask, domain); } static int cpuset_setproc_maskthread(struct cpuset *tdset, cpuset_t *mask, struct domainset *domain, struct cpuset **nsetp, struct setlist *freelist, struct domainlist *domainlist) { struct cpuset *parent; parent = cpuset_getbase(tdset); if (mask == NULL) mask = &tdset->cs_mask; if (domain == NULL) domain = tdset->cs_domain; return cpuset_shadow(parent, nsetp, mask, domain, freelist, domainlist); } static int cpuset_setproc_setthread_mask(struct cpuset *tdset, struct cpuset *set, cpuset_t *mask, struct domainset *domain) { struct cpuset *parent; parent = cpuset_getbase(tdset); /* * If the thread restricted its mask then apply that same * restriction to the new set, otherwise take it wholesale. */ if (CPU_CMP(&tdset->cs_mask, &parent->cs_mask) != 0) { CPU_AND(mask, &tdset->cs_mask, &set->cs_mask); } else CPU_COPY(&set->cs_mask, mask); /* * If the thread restricted the domain then we apply the * restriction to the new set but retain the policy. */ if (tdset->cs_domain != parent->cs_domain) { domainset_copy(tdset->cs_domain, domain); DOMAINSET_AND(&domain->ds_mask, &set->cs_domain->ds_mask); } else domainset_copy(set->cs_domain, domain); if (CPU_EMPTY(mask) || DOMAINSET_EMPTY(&domain->ds_mask)) return (EDEADLK); return (0); } static int cpuset_setproc_test_setthread(struct cpuset *tdset, struct cpuset *set) { struct domainset domain; cpuset_t mask; if (tdset->cs_id != CPUSET_INVALID) return (0); return cpuset_setproc_setthread_mask(tdset, set, &mask, &domain); } static int cpuset_setproc_setthread(struct cpuset *tdset, struct cpuset *set, struct cpuset **nsetp, struct setlist *freelist, struct domainlist *domainlist) { struct domainset domain; cpuset_t mask; int error; /* * If we're replacing on a thread that has not constrained the * original set we can simply accept the new set. 
*/ if (tdset->cs_id != CPUSET_INVALID) { *nsetp = cpuset_ref(set); return (0); } error = cpuset_setproc_setthread_mask(tdset, set, &mask, &domain); if (error) return (error); return cpuset_shadow(set, nsetp, &mask, &domain, freelist, domainlist); } static int cpuset_setproc_newbase(struct thread *td, struct cpuset *set, struct cpuset *nroot, struct cpuset **nsetp, struct setlist *cpusets, struct domainlist *domainlist) { struct domainset ndomain; cpuset_t nmask; struct cpuset *pbase; int error; pbase = cpuset_getbase(td->td_cpuset); /* Copy process mask, then further apply the new root mask. */ CPU_AND(&nmask, &pbase->cs_mask, &nroot->cs_mask); domainset_copy(pbase->cs_domain, &ndomain); DOMAINSET_AND(&ndomain.ds_mask, &set->cs_domain->ds_mask); /* Policy is too restrictive, will not work. */ if (CPU_EMPTY(&nmask) || DOMAINSET_EMPTY(&ndomain.ds_mask)) return (EDEADLK); /* * Remove pbase from the freelist in advance, it'll be pushed to * cpuset_ids on success. We assume here that cpuset_create() will not * touch pbase on failure, and we just enqueue it back to the freelist * to remain in a consistent state. */ pbase = LIST_FIRST(cpusets); LIST_REMOVE(pbase, cs_link); error = cpuset_create(&pbase, set, &nmask); if (error != 0) { LIST_INSERT_HEAD(cpusets, pbase, cs_link); return (error); } /* Duplicates some work from above... oh well. */ pbase->cs_domain = domainset_shadow(set->cs_domain, &ndomain, domainlist); *nsetp = pbase; return (0); } /* * Handle four cases for updating an entire process. * * 1) Set is non-null and the process is not rebasing onto a new root. This * reparents all anonymous sets to the provided set and replaces all * non-anonymous td_cpusets with the provided set. * 2) Set is non-null and the process is rebasing onto a new root. This * creates a new base set if the process previously had its own base set, * then reparents all anonymous sets either to that set or the provided set * if one was not created. Non-anonymous sets are similarly replaced. * 3) Mask is non-null. This replaces or creates anonymous sets for every * thread with the existing base as a parent. * 4) domain is non-null. This creates anonymous sets for every thread * and replaces the domain set. * * This is overly complicated because we can't allocate while holding a * spinlock and spinlocks must be held while changing and examining thread * state. */ static int cpuset_setproc(pid_t pid, struct cpuset *set, cpuset_t *mask, struct domainset *domain, bool rebase) { struct setlist freelist; struct setlist droplist; struct domainlist domainlist; struct cpuset *base, *nset, *nroot, *tdroot; struct thread *td; struct proc *p; int needed; int nfree; int error; /* * The algorithm requires two passes due to locking considerations. * * 1) Lookup the process and acquire the locks in the required order. * 2) If enough cpusets have not been allocated release the locks and * allocate them. Loop. 
*/ cpuset_freelist_init(&freelist, 1); domainset_freelist_init(&domainlist, 1); nfree = 1; LIST_INIT(&droplist); nfree = 0; base = set; nroot = NULL; if (set != NULL) nroot = cpuset_getroot(set); for (;;) { error = cpuset_which(CPU_WHICH_PID, pid, &p, &td, &nset); if (error) goto out; tdroot = cpuset_getroot(td->td_cpuset); needed = p->p_numthreads; if (set != NULL && rebase && tdroot != nroot) needed++; if (nfree >= needed) break; PROC_UNLOCK(p); if (nfree < needed) { cpuset_freelist_add(&freelist, needed - nfree); domainset_freelist_add(&domainlist, needed - nfree); nfree = needed; } } PROC_LOCK_ASSERT(p, MA_OWNED); /* * If we're changing roots and the root set is what has been specified * as the parent, then we'll check if the process was previously using * the root set and, if it wasn't, create a new base with the process's * mask applied to it. * * If the new root is incompatible with the existing mask, then we allow * the process to take on the new root if and only if they have * privilege to widen their mask anyways. Unprivileged processes get * rejected with EDEADLK. */ if (set != NULL && rebase && nroot != tdroot) { cpusetid_t base_id, root_id; root_id = td->td_ucred->cr_prison->pr_cpuset->cs_id; base_id = cpuset_getbase(td->td_cpuset)->cs_id; if (base_id != root_id) { error = cpuset_setproc_newbase(td, set, nroot, &base, &freelist, &domainlist); if (error == EDEADLK && priv_check(td, PRIV_SCHED_CPUSET) == 0) error = 0; if (error != 0) goto unlock_out; } } /* * Now that the appropriate locks are held and we have enough cpusets, * make sure the operation will succeed before applying changes. The * proc lock prevents td_cpuset from changing between calls. */ error = 0; FOREACH_THREAD_IN_PROC(p, td) { thread_lock(td); if (set != NULL) error = cpuset_setproc_test_setthread(td->td_cpuset, base); else error = cpuset_setproc_test_maskthread(td->td_cpuset, mask, domain); thread_unlock(td); if (error) goto unlock_out; } /* * Replace each thread's cpuset while using deferred release. We * must do this because the thread lock must be held while operating * on the thread and this limits the type of operations allowed. 
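*/

The retry loop above is the allocate-while-unlocked idiom the function comment describes: count what is needed under the proc lock, and if the freelists are short, unlock, allocate, and re-evaluate, since the thread count may have changed in the meantime. A stripped-down sketch (lock_all(), unlock_all() and items_needed() are hypothetical stand-ins for the proc lock and p->p_numthreads):

#include <stdlib.h>

extern void lock_all(void);             /* placeholder for PROC_LOCK */
extern void unlock_all(void);           /* placeholder for PROC_UNLOCK */
extern int items_needed(void);          /* e.g. p->p_numthreads */

struct item { struct item *next; };

static struct item *freelist;
static int nfree;

static void
prealloc_loop(void)
{
        int needed;

        for (;;) {
                lock_all();
                needed = items_needed();
                if (nfree >= needed)
                        break;          /* enough memory; stay locked */
                unlock_all();
                while (nfree < needed) {        /* allocate while unlocked */
                        struct item *it = malloc(sizeof(*it));

                        if (it == NULL)
                                abort();        /* kernel uses M_WAITOK instead */
                        it->next = freelist;
                        freelist = it;
                        nfree++;
                }
        }
        /* ... perform the update under the lock, consuming the freelist ... */
        unlock_all();
}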
*/ FOREACH_THREAD_IN_PROC(p, td) { thread_lock(td); if (set != NULL) error = cpuset_setproc_setthread(td->td_cpuset, base, &nset, &freelist, &domainlist); else error = cpuset_setproc_maskthread(td->td_cpuset, mask, domain, &nset, &freelist, &domainlist); if (error) { thread_unlock(td); break; } cpuset_rel_defer(&droplist, cpuset_update_thread(td, nset)); thread_unlock(td); } unlock_out: PROC_UNLOCK(p); out: if (base != NULL && base != set) cpuset_rel(base); while ((nset = LIST_FIRST(&droplist)) != NULL) cpuset_rel_complete(nset); cpuset_freelist_free(&freelist); domainset_freelist_free(&domainlist); return (error); } static int bitset_strprint(char *buf, size_t bufsiz, const struct bitset *set, int setlen) { size_t bytes; int i, once; char *p; once = 0; p = buf; for (i = 0; i < __bitset_words(setlen); i++) { if (once != 0) { if (bufsiz < 1) return (0); *p = ','; p++; bufsiz--; } else once = 1; if (bufsiz < sizeof(__STRING(ULONG_MAX))) return (0); bytes = snprintf(p, bufsiz, "%lx", set->__bits[i]); p += bytes; bufsiz -= bytes; } return (p - buf); } static int bitset_strscan(struct bitset *set, int setlen, const char *buf) { int i, ret; const char *p; BIT_ZERO(setlen, set); p = buf; for (i = 0; i < __bitset_words(setlen); i++) { if (*p == ',') { p++; continue; } ret = sscanf(p, "%lx", &set->__bits[i]); if (ret == 0 || ret == -1) break; while (isxdigit(*p)) p++; } return (p - buf); } /* * Return a string representing a valid layout for a cpuset_t object. * It expects an incoming buffer at least sized as CPUSETBUFSIZ. */ char * cpusetobj_strprint(char *buf, const cpuset_t *set) { bitset_strprint(buf, CPUSETBUFSIZ, (const struct bitset *)set, CPU_SETSIZE); return (buf); } /* * Build a valid cpuset_t object from a string representation. * It expects an incoming buffer at least sized as CPUSETBUFSIZ. */ int cpusetobj_strscan(cpuset_t *set, const char *buf) { char p; if (strlen(buf) > CPUSETBUFSIZ - 1) return (-1); p = buf[bitset_strscan((struct bitset *)set, CPU_SETSIZE, buf)]; if (p != '\0') return (-1); return (0); } /* * Handle a domainset specifier in the sysctl tree. A poiner to a pointer to * a domainset is in arg1. If the user specifies a valid domainset the * pointer is updated. * * Format is: * hex mask word 0,hex mask word 1,...:decimal policy:decimal preferred */ int sysctl_handle_domainset(SYSCTL_HANDLER_ARGS) { char buf[DOMAINSETBUFSIZ]; struct domainset *dset; struct domainset key; int policy, prefer, error; char *p; dset = *(struct domainset **)arg1; error = 0; if (dset != NULL) { p = buf + bitset_strprint(buf, DOMAINSETBUFSIZ, (const struct bitset *)&dset->ds_mask, DOMAINSET_SETSIZE); sprintf(p, ":%d:%d", dset->ds_policy, dset->ds_prefer); } else sprintf(buf, ""); error = sysctl_handle_string(oidp, buf, sizeof(buf), req); if (error != 0 || req->newptr == NULL) return (error); /* * Read in and validate the string. */ memset(&key, 0, sizeof(key)); p = &buf[bitset_strscan((struct bitset *)&key.ds_mask, DOMAINSET_SETSIZE, buf)]; if (p == buf) return (EINVAL); if (sscanf(p, ":%d:%d", &policy, &prefer) != 2) return (EINVAL); key.ds_policy = policy; key.ds_prefer = prefer; /* Domainset_create() validates the policy.*/ dset = domainset_create(&key); if (dset == NULL) return (EINVAL); *(struct domainset **)arg1 = dset; return (error); } /* * Apply an anonymous mask or a domain to a single thread. 
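bitset_strprint() and bitset_strscan() above fix the textual layout used by sysctl_handle_domainset(): comma-separated hexadecimal words, word 0 first. A standalone round-trip of the same layout, with the set width fixed at two words for brevity:

#include <stdio.h>
#include <string.h>

#define NWORDS  2

/* Encode like bitset_strprint(): comma-separated hex words, word 0 first. */
static void
words_print(char *buf, size_t bufsiz, const unsigned long *w)
{
        size_t off = 0;
        int i;

        for (i = 0; i < NWORDS; i++)
                off += snprintf(buf + off, bufsiz - off,
                    i ? ",%lx" : "%lx", w[i]);
}

/* Decode like bitset_strscan(): one hex word per comma-separated field. */
static void
words_scan(unsigned long *w, const char *buf)
{
        int i;

        for (i = 0; i < NWORDS; i++) {
                if (sscanf(buf, "%lx", &w[i]) != 1)
                        break;
                buf = strchr(buf, ',');
                if (buf == NULL)
                        break;
                buf++;
        }
}

int
main(void)
{
        unsigned long in[NWORDS] = { 0xffUL, 0x1UL }, out[NWORDS] = { 0, 0 };
        char buf[64];

        words_print(buf, sizeof(buf), in);      /* yields "ff,1" */
        words_scan(out, buf);
        printf("%s -> %lx,%lx\n", buf, out[0], out[1]);
        return (0);
}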
*/ static int _cpuset_setthread(lwpid_t id, cpuset_t *mask, struct domainset *domain) { struct setlist cpusets; struct domainlist domainlist; struct cpuset *nset; struct cpuset *set; struct thread *td; struct proc *p; int error; cpuset_freelist_init(&cpusets, 1); domainset_freelist_init(&domainlist, domain != NULL); error = cpuset_which(CPU_WHICH_TID, id, &p, &td, &set); if (error) goto out; set = NULL; thread_lock(td); error = cpuset_shadow(td->td_cpuset, &nset, mask, domain, &cpusets, &domainlist); if (error == 0) set = cpuset_update_thread(td, nset); thread_unlock(td); PROC_UNLOCK(p); if (set) cpuset_rel(set); out: cpuset_freelist_free(&cpusets); domainset_freelist_free(&domainlist); return (error); } /* * Apply an anonymous mask to a single thread. */ int cpuset_setthread(lwpid_t id, cpuset_t *mask) { return _cpuset_setthread(id, mask, NULL); } /* * Apply new cpumask to the ithread. */ int cpuset_setithread(lwpid_t id, int cpu) { cpuset_t mask; CPU_ZERO(&mask); if (cpu == NOCPU) CPU_COPY(cpuset_root, &mask); else CPU_SET(cpu, &mask); return _cpuset_setthread(id, &mask, NULL); } /* * Initialize static domainsets after NUMA information is available. This is * called before memory allocators are initialized. */ void domainset_init(void) { struct domainset *dset; int i; dset = &domainset_firsttouch; DOMAINSET_COPY(&all_domains, &dset->ds_mask); dset->ds_policy = DOMAINSET_POLICY_FIRSTTOUCH; dset->ds_prefer = -1; _domainset_create(dset, NULL); dset = &domainset_interleave; DOMAINSET_COPY(&all_domains, &dset->ds_mask); dset->ds_policy = DOMAINSET_POLICY_INTERLEAVE; dset->ds_prefer = -1; _domainset_create(dset, NULL); dset = &domainset_roundrobin; DOMAINSET_COPY(&all_domains, &dset->ds_mask); dset->ds_policy = DOMAINSET_POLICY_ROUNDROBIN; dset->ds_prefer = -1; _domainset_create(dset, NULL); for (i = 0; i < vm_ndomains; i++) { dset = &domainset_fixed[i]; DOMAINSET_ZERO(&dset->ds_mask); DOMAINSET_SET(i, &dset->ds_mask); dset->ds_policy = DOMAINSET_POLICY_ROUNDROBIN; _domainset_create(dset, NULL); dset = &domainset_prefer[i]; DOMAINSET_COPY(&all_domains, &dset->ds_mask); dset->ds_policy = DOMAINSET_POLICY_PREFER; dset->ds_prefer = i; _domainset_create(dset, NULL); } } /* * Define the domainsets for cpuset 0, 1 and cpuset 2. */ void domainset_zero(void) { struct domainset *dset, *tmp; mtx_init(&cpuset_lock, "cpuset", NULL, MTX_SPIN | MTX_RECURSE); domainset0 = &domainset_firsttouch; curthread->td_domain.dr_policy = domainset0; domainset2 = &domainset_interleave; kernel_object->domain.dr_policy = domainset2; /* Remove empty domains from the global policies. */ LIST_FOREACH_SAFE(dset, &cpuset_domains, ds_link, tmp) if (domainset_empty_vm(dset)) LIST_REMOVE(dset, ds_link); } /* * Creates system-wide cpusets and the cpuset for thread0 including three * sets: * * 0 - The root set which should represent all valid processors in the * system. This set is immutable. * 1 - The default set which all processes are a member of until changed. * This allows an administrator to move all threads off of given cpus to * dedicate them to high priority tasks or save power etc. * 2 - The kernel set which allows restriction and policy to be applied only * to kernel threads and the kernel_object. 
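The static policy sets built in domainset_init() above (first-touch, round-robin, interleave, and the per-domain fixed and preferred sets) back the policies userland can request. A sketch, assuming the cpuset_setdomain(2) interface with the prototype from sys/cpuset.h (verify against your headers), that asks to prefer NUMA domain 0 for the calling thread:

#include <sys/param.h>
#include <sys/cpuset.h>
#include <sys/domainset.h>
#include <stdio.h>

int
main(void)
{
        domainset_t mask;

        DOMAINSET_ZERO(&mask);
        DOMAINSET_SET(0, &mask);
        /* Prefer domain 0 for this thread, falling back elsewhere. */
        if (cpuset_setdomain(CPU_LEVEL_WHICH, CPU_WHICH_TID, -1,
            sizeof(mask), &mask, DOMAINSET_POLICY_PREFER) != 0) {
                perror("cpuset_setdomain");
                return (1);
        }
        return (0);
}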
*/ struct cpuset * cpuset_thread0(void) { struct cpuset *set; int i; int error __unused; cpuset_zone = uma_zcreate("cpuset", sizeof(struct cpuset), NULL, NULL, NULL, NULL, UMA_ALIGN_CACHE, 0); domainset_zone = uma_zcreate("domainset", sizeof(struct domainset), NULL, NULL, NULL, NULL, UMA_ALIGN_CACHE, 0); /* * Create the root system set (0) for the whole machine. Doesn't use * cpuset_create() due to NULL parent. */ set = uma_zalloc(cpuset_zone, M_WAITOK | M_ZERO); CPU_COPY(&all_cpus, &set->cs_mask); LIST_INIT(&set->cs_children); LIST_INSERT_HEAD(&cpuset_ids, set, cs_link); refcount_init(&set->cs_ref, 1); set->cs_flags = CPU_SET_ROOT | CPU_SET_RDONLY; set->cs_domain = domainset0; cpuset_zero = set; cpuset_root = &set->cs_mask; /* * Now derive a default (1), modifiable set from that to give out. */ set = uma_zalloc(cpuset_zone, M_WAITOK | M_ZERO); error = cpuset_init(set, cpuset_zero, NULL, NULL, 1); KASSERT(error == 0, ("Error creating default set: %d\n", error)); cpuset_default = set; /* * Create the kernel set (2). */ set = uma_zalloc(cpuset_zone, M_WAITOK | M_ZERO); error = cpuset_init(set, cpuset_zero, NULL, NULL, 2); KASSERT(error == 0, ("Error creating kernel set: %d\n", error)); set->cs_domain = domainset2; cpuset_kernel = set; /* * Initialize the unit allocator. 0, 1 and 2 are allocated above. */ cpuset_unr = new_unrhdr(3, INT_MAX, NULL); /* * If MD code has not initialized per-domain cpusets, place all * CPUs in domain 0. */ for (i = 0; i < MAXMEMDOM; i++) if (!CPU_EMPTY(&cpuset_domain[i])) goto domains_set; CPU_COPY(&all_cpus, &cpuset_domain[0]); domains_set: return (cpuset_default); } void cpuset_kernthread(struct thread *td) { struct cpuset *set; thread_lock(td); set = td->td_cpuset; td->td_cpuset = cpuset_ref(cpuset_kernel); thread_unlock(td); cpuset_rel(set); } /* * Create a cpuset, which would be cpuset_create() but * mark the new 'set' as root. * * We are not going to reparent the td to it. Use cpuset_setproc_update_set() * for that. * * In case of no error, returns the set in *setp locked with a reference. */ int cpuset_create_root(struct prison *pr, struct cpuset **setp) { struct cpuset *set; int error; KASSERT(pr != NULL, ("[%s:%d] invalid pr", __func__, __LINE__)); KASSERT(setp != NULL, ("[%s:%d] invalid setp", __func__, __LINE__)); set = NULL; error = cpuset_create(&set, pr->pr_cpuset, &pr->pr_cpuset->cs_mask); if (error) return (error); KASSERT(set != NULL, ("[%s:%d] cpuset_create returned invalid data", __func__, __LINE__)); /* Mark the set as root. */ set->cs_flags |= CPU_SET_ROOT; *setp = set; return (0); } int cpuset_setproc_update_set(struct proc *p, struct cpuset *set) { int error; KASSERT(p != NULL, ("[%s:%d] invalid proc", __func__, __LINE__)); KASSERT(set != NULL, ("[%s:%d] invalid set", __func__, __LINE__)); cpuset_ref(set); error = cpuset_setproc(p->p_pid, set, NULL, NULL, true); if (error) return (error); cpuset_rel(set); return (0); } /* * In Capability mode, the only accesses that are permitted are to the current * thread and process' CPU and domain sets.
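 *
 * For example (illustrative), in capability mode
 *
 *	cpuset_getaffinity(CPU_LEVEL_WHICH, CPU_WHICH_TID, -1,
 *	    sizeof(mask), &mask);
 *
 * succeeds for the calling thread, while any CPU_WHICH_CPUSET or
 * CPU_WHICH_JAIL request fails with ECAPMODE.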
*/ static int cpuset_check_capabilities(struct thread *td, cpulevel_t level, cpuwhich_t which, id_t id) { if (IN_CAPABILITY_MODE(td)) { if (level != CPU_LEVEL_WHICH) return (ECAPMODE); if (which != CPU_WHICH_TID && which != CPU_WHICH_PID) return (ECAPMODE); if (id != -1 && !(which == CPU_WHICH_TID && id == td->td_tid) && !(which == CPU_WHICH_PID && id == td->td_proc->p_pid)) return (ECAPMODE); } return (0); } #if defined(__powerpc__) /* * TODO: At least powerpc64 and powerpc64le kernels panic with * exception 0x480 (instruction segment exception) when copyin/copyout * are set as function pointers in the cpuset_copy_cb struct and called by * an external module (like pfsync). Tip: copyin/copyout have an ifunc * resolver function. * * Bisect of LLVM shows that the behavior changed on LLVM 10.0 with * https://reviews.llvm.org/rGdc06b0bc9ad055d06535462d91bfc2a744b2f589 * * This is a hack/workaround while the problem is being discussed with the * LLVM community. */ static int cpuset_copyin(const void *uaddr, void *kaddr, size_t len) { return (copyin(uaddr, kaddr, len)); } static int cpuset_copyout(const void *kaddr, void *uaddr, size_t len) { return (copyout(kaddr, uaddr, len)); } static const struct cpuset_copy_cb copy_set = { .cpuset_copyin = cpuset_copyin, .cpuset_copyout = cpuset_copyout }; #else static const struct cpuset_copy_cb copy_set = { .cpuset_copyin = copyin, .cpuset_copyout = copyout }; #endif #ifndef _SYS_SYSPROTO_H_ struct cpuset_args { cpusetid_t *setid; }; #endif int sys_cpuset(struct thread *td, struct cpuset_args *uap) { struct cpuset *root; struct cpuset *set; int error; thread_lock(td); root = cpuset_refroot(td->td_cpuset); thread_unlock(td); set = NULL; error = cpuset_create(&set, root, &root->cs_mask); cpuset_rel(root); if (error) return (error); error = copyout(&set->cs_id, uap->setid, sizeof(set->cs_id)); if (error == 0) error = cpuset_setproc(-1, set, NULL, NULL, false); cpuset_rel(set); return (error); } #ifndef _SYS_SYSPROTO_H_ struct cpuset_setid_args { cpuwhich_t which; id_t id; cpusetid_t setid; }; #endif int sys_cpuset_setid(struct thread *td, struct cpuset_setid_args *uap) { return (kern_cpuset_setid(td, uap->which, uap->id, uap->setid)); } int kern_cpuset_setid(struct thread *td, cpuwhich_t which, id_t id, cpusetid_t setid) { struct cpuset *set; int error; /* * Presently we only support per-process sets.
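 * A userland caller is therefore expected to pass CPU_WHICH_PID, e.g.
 * (illustrative) cpuset_setid(CPU_WHICH_PID, getpid(), setid); any other
 * 'which' value is rejected with EINVAL below.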
*/ if (which != CPU_WHICH_PID) return (EINVAL); set = cpuset_lookup(setid, td); if (set == NULL) return (ESRCH); error = cpuset_setproc(id, set, NULL, NULL, false); cpuset_rel(set); return (error); } #ifndef _SYS_SYSPROTO_H_ struct cpuset_getid_args { cpulevel_t level; cpuwhich_t which; id_t id; cpusetid_t *setid; }; #endif int sys_cpuset_getid(struct thread *td, struct cpuset_getid_args *uap) { return (kern_cpuset_getid(td, uap->level, uap->which, uap->id, uap->setid)); } int kern_cpuset_getid(struct thread *td, cpulevel_t level, cpuwhich_t which, id_t id, cpusetid_t *setid) { struct cpuset *nset; struct cpuset *set; struct thread *ttd; struct proc *p; cpusetid_t tmpid; int error; if (level == CPU_LEVEL_WHICH && which != CPU_WHICH_CPUSET) return (EINVAL); error = cpuset_which(which, id, &p, &ttd, &set); if (error) return (error); switch (which) { case CPU_WHICH_TID: case CPU_WHICH_PID: thread_lock(ttd); set = cpuset_refbase(ttd->td_cpuset); thread_unlock(ttd); PROC_UNLOCK(p); break; case CPU_WHICH_CPUSET: case CPU_WHICH_JAIL: break; case CPU_WHICH_IRQ: case CPU_WHICH_DOMAIN: return (EINVAL); } switch (level) { case CPU_LEVEL_ROOT: nset = cpuset_refroot(set); cpuset_rel(set); set = nset; break; case CPU_LEVEL_CPUSET: break; case CPU_LEVEL_WHICH: break; } tmpid = set->cs_id; cpuset_rel(set); if (error == 0) error = copyout(&tmpid, setid, sizeof(tmpid)); return (error); } #ifndef _SYS_SYSPROTO_H_ struct cpuset_getaffinity_args { cpulevel_t level; cpuwhich_t which; id_t id; size_t cpusetsize; cpuset_t *mask; }; #endif int sys_cpuset_getaffinity(struct thread *td, struct cpuset_getaffinity_args *uap) { return (user_cpuset_getaffinity(td, uap->level, uap->which, uap->id, uap->cpusetsize, uap->mask, &copy_set)); } int kern_cpuset_getaffinity(struct thread *td, cpulevel_t level, cpuwhich_t which, id_t id, size_t cpusetsize, cpuset_t *mask) { struct thread *ttd; struct cpuset *nset; struct cpuset *set; struct proc *p; int error; error = cpuset_check_capabilities(td, level, which, id); if (error != 0) return (error); error = cpuset_which(which, id, &p, &ttd, &set); if (error != 0) return (error); switch (level) { case CPU_LEVEL_ROOT: case CPU_LEVEL_CPUSET: switch (which) { case CPU_WHICH_TID: case CPU_WHICH_PID: thread_lock(ttd); set = cpuset_ref(ttd->td_cpuset); thread_unlock(ttd); break; case CPU_WHICH_CPUSET: case CPU_WHICH_JAIL: break; case CPU_WHICH_IRQ: case CPU_WHICH_INTRHANDLER: case CPU_WHICH_ITHREAD: case CPU_WHICH_DOMAIN: return (EINVAL); } if (level == CPU_LEVEL_ROOT) nset = cpuset_refroot(set); else nset = cpuset_refbase(set); CPU_COPY(&nset->cs_mask, mask); cpuset_rel(nset); break; case CPU_LEVEL_WHICH: switch (which) { case CPU_WHICH_TID: thread_lock(ttd); CPU_COPY(&ttd->td_cpuset->cs_mask, mask); thread_unlock(ttd); break; case CPU_WHICH_PID: FOREACH_THREAD_IN_PROC(p, ttd) { thread_lock(ttd); CPU_OR(mask, mask, &ttd->td_cpuset->cs_mask); thread_unlock(ttd); } break; case CPU_WHICH_CPUSET: case CPU_WHICH_JAIL: CPU_COPY(&set->cs_mask, mask); break; case CPU_WHICH_IRQ: case CPU_WHICH_INTRHANDLER: case CPU_WHICH_ITHREAD: error = intr_getaffinity(id, which, mask); break; case CPU_WHICH_DOMAIN: if (id < 0 || id >= MAXMEMDOM) error = ESRCH; else CPU_COPY(&cpuset_domain[id], mask); break; } break; default: error = EINVAL; break; } if (set) cpuset_rel(set); if (p) PROC_UNLOCK(p); if (error == 0) { if (cpusetsize < howmany(CPU_FLS(mask), NBBY)) return (ERANGE); #ifdef KTRACE if (KTRPOINT(td, KTR_STRUCT)) ktrcpuset(mask, cpusetsize); #endif } return (error); } int user_cpuset_getaffinity(struct
thread *td, cpulevel_t level, cpuwhich_t which, id_t id, size_t cpusetsize, cpuset_t *maskp, const struct cpuset_copy_cb *cb) { cpuset_t *mask; size_t size; int error; mask = malloc(sizeof(cpuset_t), M_TEMP, M_WAITOK | M_ZERO); size = min(cpusetsize, sizeof(cpuset_t)); error = kern_cpuset_getaffinity(td, level, which, id, size, mask); if (error == 0) { error = cb->cpuset_copyout(mask, maskp, size); if (error != 0) goto out; if (cpusetsize > size) { char *end; char *cp; int rv; end = cp = (char *)&maskp->__bits; end += cpusetsize; cp += size; while (cp != end) { rv = subyte(cp, 0); if (rv == -1) { error = EFAULT; goto out; } cp++; } } } out: free(mask, M_TEMP); return (error); } #ifndef _SYS_SYSPROTO_H_ struct cpuset_setaffinity_args { cpulevel_t level; cpuwhich_t which; id_t id; size_t cpusetsize; const cpuset_t *mask; }; #endif int sys_cpuset_setaffinity(struct thread *td, struct cpuset_setaffinity_args *uap) { return (user_cpuset_setaffinity(td, uap->level, uap->which, uap->id, uap->cpusetsize, uap->mask, &copy_set)); } int kern_cpuset_setaffinity(struct thread *td, cpulevel_t level, cpuwhich_t which, id_t id, cpuset_t *mask) { struct cpuset *nset; struct cpuset *set; struct thread *ttd; struct proc *p; int error; #ifdef KTRACE if (KTRPOINT(td, KTR_STRUCT)) ktrcpuset(mask, sizeof(cpuset_t)); #endif error = cpuset_check_capabilities(td, level, which, id); if (error != 0) return (error); if (CPU_EMPTY(mask)) return (EDEADLK); switch (level) { case CPU_LEVEL_ROOT: case CPU_LEVEL_CPUSET: error = cpuset_which(which, id, &p, &ttd, &set); if (error) break; switch (which) { case CPU_WHICH_TID: case CPU_WHICH_PID: thread_lock(ttd); set = cpuset_ref(ttd->td_cpuset); thread_unlock(ttd); PROC_UNLOCK(p); break; case CPU_WHICH_CPUSET: case CPU_WHICH_JAIL: break; case CPU_WHICH_IRQ: case CPU_WHICH_INTRHANDLER: case CPU_WHICH_ITHREAD: case CPU_WHICH_DOMAIN: return (EINVAL); } if (level == CPU_LEVEL_ROOT) nset = cpuset_refroot(set); else nset = cpuset_refbase(set); error = cpuset_modify(nset, mask); cpuset_rel(nset); cpuset_rel(set); break; case CPU_LEVEL_WHICH: switch (which) { case CPU_WHICH_TID: error = cpuset_setthread(id, mask); break; case CPU_WHICH_PID: error = cpuset_setproc(id, NULL, mask, NULL, false); break; case CPU_WHICH_CPUSET: case CPU_WHICH_JAIL: error = cpuset_which(which, id, &p, &ttd, &set); if (error == 0) { error = cpuset_modify(set, mask); cpuset_rel(set); } break; case CPU_WHICH_IRQ: case CPU_WHICH_INTRHANDLER: case CPU_WHICH_ITHREAD: error = intr_setaffinity(id, which, mask); break; default: error = EINVAL; break; } break; default: error = EINVAL; break; } return (error); } int user_cpuset_setaffinity(struct thread *td, cpulevel_t level, cpuwhich_t which, id_t id, size_t cpusetsize, const cpuset_t *maskp, const struct cpuset_copy_cb *cb) { cpuset_t *mask; int error; size_t size; size = min(cpusetsize, sizeof(cpuset_t)); mask = malloc(sizeof(cpuset_t), M_TEMP, M_WAITOK | M_ZERO); error = cb->cpuset_copyin(maskp, mask, size); if (error) goto out; /* * Verify that no high bits are set.
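 * That is, when userland hands in a mask wider than the kernel's
 * cpuset_t, every byte past sizeof(cpuset_t) must be zero. E.g.
 * (illustrative) a 1024-bit user mask checked against a 256-bit kernel
 * set has its trailing 96 bytes probed with fubyte() one at a time.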
*/ if (cpusetsize > sizeof(cpuset_t)) { const char *end, *cp; int val; end = cp = (const char *)&maskp->__bits; end += cpusetsize; cp += sizeof(cpuset_t); while (cp != end) { val = fubyte(cp); if (val == -1) { error = EFAULT; goto out; } if (val != 0) { error = EINVAL; goto out; } cp++; } } error = kern_cpuset_setaffinity(td, level, which, id, mask); out: free(mask, M_TEMP); return (error); } #ifndef _SYS_SYSPROTO_H_ struct cpuset_getdomain_args { cpulevel_t level; cpuwhich_t which; id_t id; size_t domainsetsize; domainset_t *mask; int *policy; }; #endif int sys_cpuset_getdomain(struct thread *td, struct cpuset_getdomain_args *uap) { return (kern_cpuset_getdomain(td, uap->level, uap->which, uap->id, uap->domainsetsize, uap->mask, uap->policy, &copy_set)); } int kern_cpuset_getdomain(struct thread *td, cpulevel_t level, cpuwhich_t which, id_t id, size_t domainsetsize, domainset_t *maskp, int *policyp, const struct cpuset_copy_cb *cb) { struct domainset outset; struct thread *ttd; struct cpuset *nset; struct cpuset *set; struct domainset *dset; struct proc *p; domainset_t *mask; int error; if (domainsetsize < sizeof(domainset_t) || domainsetsize > DOMAINSET_MAXSIZE / NBBY) return (ERANGE); error = cpuset_check_capabilities(td, level, which, id); if (error != 0) return (error); mask = malloc(domainsetsize, M_TEMP, M_WAITOK | M_ZERO); bzero(&outset, sizeof(outset)); error = cpuset_which(which, id, &p, &ttd, &set); if (error) goto out; switch (level) { case CPU_LEVEL_ROOT: case CPU_LEVEL_CPUSET: switch (which) { case CPU_WHICH_TID: case CPU_WHICH_PID: thread_lock(ttd); set = cpuset_ref(ttd->td_cpuset); thread_unlock(ttd); break; case CPU_WHICH_CPUSET: case CPU_WHICH_JAIL: break; case CPU_WHICH_IRQ: case CPU_WHICH_INTRHANDLER: case CPU_WHICH_ITHREAD: case CPU_WHICH_DOMAIN: error = EINVAL; goto out; } if (level == CPU_LEVEL_ROOT) nset = cpuset_refroot(set); else nset = cpuset_refbase(set); domainset_copy(nset->cs_domain, &outset); cpuset_rel(nset); break; case CPU_LEVEL_WHICH: switch (which) { case CPU_WHICH_TID: thread_lock(ttd); domainset_copy(ttd->td_cpuset->cs_domain, &outset); thread_unlock(ttd); break; case CPU_WHICH_PID: FOREACH_THREAD_IN_PROC(p, ttd) { thread_lock(ttd); dset = ttd->td_cpuset->cs_domain; /* Show all domains in the proc. */ DOMAINSET_OR(&outset.ds_mask, &dset->ds_mask); /* Last policy wins. */ outset.ds_policy = dset->ds_policy; outset.ds_prefer = dset->ds_prefer; thread_unlock(ttd); } break; case CPU_WHICH_CPUSET: case CPU_WHICH_JAIL: domainset_copy(set->cs_domain, &outset); break; case CPU_WHICH_IRQ: case CPU_WHICH_INTRHANDLER: case CPU_WHICH_ITHREAD: case CPU_WHICH_DOMAIN: error = EINVAL; break; } break; default: error = EINVAL; break; } if (set) cpuset_rel(set); if (p) PROC_UNLOCK(p); /* * Translate prefer into a set containing only the preferred domain, * not the entire fallback set.
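 *
 * E.g. a PREFER policy with ds_prefer == 1 over the fallback domains
 * {0,1,2,3} is reported to userland as the mask {1}, mirroring what
 * kern_cpuset_setdomain() accepted when the policy was installed.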
*/ if (outset.ds_policy == DOMAINSET_POLICY_PREFER) { DOMAINSET_ZERO(&outset.ds_mask); DOMAINSET_SET(outset.ds_prefer, &outset.ds_mask); } DOMAINSET_COPY(&outset.ds_mask, mask); if (error == 0) error = cb->cpuset_copyout(mask, maskp, domainsetsize); if (error == 0) if (suword32(policyp, outset.ds_policy) != 0) error = EFAULT; out: free(mask, M_TEMP); return (error); } #ifndef _SYS_SYSPROTO_H_ struct cpuset_setdomain_args { cpulevel_t level; cpuwhich_t which; id_t id; size_t domainsetsize; domainset_t *mask; int policy; }; #endif int sys_cpuset_setdomain(struct thread *td, struct cpuset_setdomain_args *uap) { return (kern_cpuset_setdomain(td, uap->level, uap->which, uap->id, uap->domainsetsize, uap->mask, uap->policy, &copy_set)); } int kern_cpuset_setdomain(struct thread *td, cpulevel_t level, cpuwhich_t which, id_t id, size_t domainsetsize, const domainset_t *maskp, int policy, const struct cpuset_copy_cb *cb) { struct cpuset *nset; struct cpuset *set; struct thread *ttd; struct proc *p; struct domainset domain; domainset_t *mask; int error; if (domainsetsize < sizeof(domainset_t) || domainsetsize > DOMAINSET_MAXSIZE / NBBY) return (ERANGE); if (policy <= DOMAINSET_POLICY_INVALID || policy > DOMAINSET_POLICY_MAX) return (EINVAL); error = cpuset_check_capabilities(td, level, which, id); if (error != 0) return (error); memset(&domain, 0, sizeof(domain)); mask = malloc(domainsetsize, M_TEMP, M_WAITOK | M_ZERO); error = cb->cpuset_copyin(maskp, mask, domainsetsize); if (error) goto out; /* * Verify that no high bits are set. */ if (domainsetsize > sizeof(domainset_t)) { char *end; char *cp; end = cp = (char *)&mask->__bits; end += domainsetsize; cp += sizeof(domainset_t); while (cp != end) if (*cp++ != 0) { error = EINVAL; goto out; } } if (DOMAINSET_EMPTY(mask)) { error = EDEADLK; goto out; } DOMAINSET_COPY(mask, &domain.ds_mask); domain.ds_policy = policy; /* * Sanitize the provided mask. */ if (!DOMAINSET_SUBSET(&all_domains, &domain.ds_mask)) { error = EINVAL; goto out; } /* Translate preferred policy into a mask and fallback. */ if (policy == DOMAINSET_POLICY_PREFER) { /* Only support a single preferred domain. */ if (DOMAINSET_COUNT(&domain.ds_mask) != 1) { error = EINVAL; goto out; } domain.ds_prefer = DOMAINSET_FFS(&domain.ds_mask) - 1; /* This will be constrained by domainset_shadow(). */ DOMAINSET_COPY(&all_domains, &domain.ds_mask); } /* * When given an impossible policy, fall back to interleaving * across all domains.
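 *
 * "Impossible" means the mask is empty once domains without usable
 * memory are removed, e.g. (illustrative) a policy naming only a
 * CPU-only NUMA domain; domainset2 is the all-domains interleave
 * policy installed by domainset_zero() above.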
*/ if (domainset_empty_vm(&domain)) domainset_copy(domainset2, &domain); switch (level) { case CPU_LEVEL_ROOT: case CPU_LEVEL_CPUSET: error = cpuset_which(which, id, &p, &ttd, &set); if (error) break; switch (which) { case CPU_WHICH_TID: case CPU_WHICH_PID: thread_lock(ttd); set = cpuset_ref(ttd->td_cpuset); thread_unlock(ttd); PROC_UNLOCK(p); break; case CPU_WHICH_CPUSET: case CPU_WHICH_JAIL: break; case CPU_WHICH_IRQ: case CPU_WHICH_INTRHANDLER: case CPU_WHICH_ITHREAD: case CPU_WHICH_DOMAIN: error = EINVAL; goto out; } if (level == CPU_LEVEL_ROOT) nset = cpuset_refroot(set); else nset = cpuset_refbase(set); error = cpuset_modify_domain(nset, &domain); cpuset_rel(nset); cpuset_rel(set); break; case CPU_LEVEL_WHICH: switch (which) { case CPU_WHICH_TID: error = _cpuset_setthread(id, NULL, &domain); break; case CPU_WHICH_PID: error = cpuset_setproc(id, NULL, NULL, &domain, false); break; case CPU_WHICH_CPUSET: case CPU_WHICH_JAIL: error = cpuset_which(which, id, &p, &ttd, &set); if (error == 0) { error = cpuset_modify_domain(set, &domain); cpuset_rel(set); } break; case CPU_WHICH_IRQ: case CPU_WHICH_INTRHANDLER: case CPU_WHICH_ITHREAD: default: error = EINVAL; break; } break; default: error = EINVAL; break; } out: free(mask, M_TEMP); return (error); } #ifdef DDB static void ddb_display_bitset(const struct bitset *set, int size) { int bit, once; for (once = 0, bit = 0; bit < size; bit++) { if (CPU_ISSET(bit, set)) { if (once == 0) { db_printf("%d", bit); once = 1; } else db_printf(",%d", bit); } } if (once == 0) db_printf(""); } void ddb_display_cpuset(const cpuset_t *set) { ddb_display_bitset((const struct bitset *)set, CPU_SETSIZE); } static void ddb_display_domainset(const domainset_t *set) { ddb_display_bitset((const struct bitset *)set, DOMAINSET_SETSIZE); } DB_SHOW_COMMAND(cpusets, db_show_cpusets) { struct cpuset *set; LIST_FOREACH(set, &cpuset_ids, cs_link) { db_printf("set=%p id=%-6u ref=%-6d flags=0x%04x parent id=%d\n", set, set->cs_id, refcount_load(&set->cs_ref), set->cs_flags, (set->cs_parent != NULL) ? set->cs_parent->cs_id : 0); db_printf(" cpu mask="); ddb_display_cpuset(&set->cs_mask); db_printf("\n"); db_printf(" domain policy %d prefer %d mask=", set->cs_domain->ds_policy, set->cs_domain->ds_prefer); ddb_display_domainset(&set->cs_domain->ds_mask); db_printf("\n"); if (db_pager_quit) break; } } DB_SHOW_COMMAND(domainsets, db_show_domainsets) { struct domainset *set; LIST_FOREACH(set, &cpuset_domains, ds_link) { db_printf("set=%p policy %d prefer %d cnt %d\n", set, set->ds_policy, set->ds_prefer, set->ds_cnt); db_printf(" mask ="); ddb_display_domainset(&set->ds_mask); db_printf("\n"); } } #endif /* DDB */ diff --git a/sys/kern/subr_smp.c b/sys/kern/subr_smp.c index d66120666f9d..c83e05a4d87a 100644 --- a/sys/kern/subr_smp.c +++ b/sys/kern/subr_smp.c @@ -1,1342 +1,1344 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2001, John Baldwin . * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * This module holds the global variables and machine independent functions * used for the kernel SMP support. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "opt_sched.h" #ifdef SMP MALLOC_DEFINE(M_TOPO, "toponodes", "SMP topology data"); volatile cpuset_t stopped_cpus; volatile cpuset_t started_cpus; volatile cpuset_t suspended_cpus; cpuset_t hlt_cpus_mask; cpuset_t logical_cpus_mask; void (*cpustop_restartfunc)(void); #endif static int sysctl_kern_smp_active(SYSCTL_HANDLER_ARGS); /* This is used in modules that need to work in both SMP and UP. */ cpuset_t all_cpus; int mp_ncpus; /* export this for libkvm consumers. */ int mp_maxcpus = MAXCPU; volatile int smp_started; u_int mp_maxid; static SYSCTL_NODE(_kern, OID_AUTO, smp, CTLFLAG_RD | CTLFLAG_CAPRD | CTLFLAG_MPSAFE, NULL, "Kernel SMP"); SYSCTL_INT(_kern_smp, OID_AUTO, maxid, CTLFLAG_RD|CTLFLAG_CAPRD, &mp_maxid, 0, "Max CPU ID."); SYSCTL_INT(_kern_smp, OID_AUTO, maxcpus, CTLFLAG_RD|CTLFLAG_CAPRD, &mp_maxcpus, 0, "Max number of CPUs that the system was compiled for."); SYSCTL_PROC(_kern_smp, OID_AUTO, active, CTLFLAG_RD|CTLTYPE_INT|CTLFLAG_MPSAFE, NULL, 0, sysctl_kern_smp_active, "I", "Indicates system is running in SMP mode"); int smp_disabled = 0; /* has smp been disabled? */ SYSCTL_INT(_kern_smp, OID_AUTO, disabled, CTLFLAG_RDTUN|CTLFLAG_CAPRD, &smp_disabled, 0, "SMP has been disabled from the loader"); int smp_cpus = 1; /* how many CPUs are running */ SYSCTL_INT(_kern_smp, OID_AUTO, cpus, CTLFLAG_RD|CTLFLAG_CAPRD, &smp_cpus, 0, "Number of CPUs online"); int smp_threads_per_core = 1; /* how many SMT threads are running per core */ SYSCTL_INT(_kern_smp, OID_AUTO, threads_per_core, CTLFLAG_RD|CTLFLAG_CAPRD, &smp_threads_per_core, 0, "Number of SMT threads online per core"); int mp_ncores = -1; /* how many physical cores are running */ SYSCTL_INT(_kern_smp, OID_AUTO, cores, CTLFLAG_RD|CTLFLAG_CAPRD, &mp_ncores, 0, "Number of physical cores online"); int smp_topology = 0; /* Which topology we're using. */ SYSCTL_INT(_kern_smp, OID_AUTO, topology, CTLFLAG_RDTUN, &smp_topology, 0, "Topology override setting; 0 is default provided by hardware."); #ifdef SMP /* Enable forwarding of a signal to a process running on a different CPU */ static int forward_signal_enabled = 1; SYSCTL_INT(_kern_smp, OID_AUTO, forward_signal_enabled, CTLFLAG_RW, &forward_signal_enabled, 0, "Forwarding of a signal to a process on a different CPU"); /* Variables needed for SMP rendezvous.
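 * smp_rv_waiters[] counts CPUs through the four phases of a rendezvous:
 * [0] entry, [1] setup complete, [2] action complete, [3] fully done;
 * see smp_rendezvous_action() below.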
*/ static volatile int smp_rv_ncpus; static void (*volatile smp_rv_setup_func)(void *arg); static void (*volatile smp_rv_action_func)(void *arg); static void (*volatile smp_rv_teardown_func)(void *arg); static void *volatile smp_rv_func_arg; static volatile int smp_rv_waiters[4]; /* * Shared mutex to restrict busywaits between smp_rendezvous() and * smp(_targeted)_tlb_shootdown(). A deadlock occurs if both of these * functions trigger at once and cause multiple CPUs to busywait with * interrupts disabled. */ struct mtx smp_ipi_mtx; /* * Let the MD SMP code initialize mp_maxid very early if it can. */ static void mp_setmaxid(void *dummy) { cpu_mp_setmaxid(); KASSERT(mp_ncpus >= 1, ("%s: CPU count < 1", __func__)); KASSERT(mp_ncpus > 1 || mp_maxid == 0, ("%s: one CPU but mp_maxid is not zero", __func__)); KASSERT(mp_maxid >= mp_ncpus - 1, ("%s: counters out of sync: max %d, count %d", __func__, mp_maxid, mp_ncpus)); + + cpusetsizemin = howmany(mp_maxid + 1, NBBY); } SYSINIT(cpu_mp_setmaxid, SI_SUB_TUNABLES, SI_ORDER_FIRST, mp_setmaxid, NULL); /* * Call the MD SMP initialization code. */ static void mp_start(void *dummy) { mtx_init(&smp_ipi_mtx, "smp rendezvous", NULL, MTX_SPIN); /* Probe for MP hardware. */ if (smp_disabled != 0 || cpu_mp_probe() == 0) { mp_ncores = 1; mp_ncpus = 1; CPU_SETOF(PCPU_GET(cpuid), &all_cpus); return; } cpu_mp_start(); printf("FreeBSD/SMP: Multiprocessor System Detected: %d CPUs\n", mp_ncpus); /* Provide a default for most architectures that don't have SMT/HTT. */ if (mp_ncores < 0) mp_ncores = mp_ncpus; cpu_mp_announce(); } SYSINIT(cpu_mp, SI_SUB_CPU, SI_ORDER_THIRD, mp_start, NULL); void forward_signal(struct thread *td) { int id; /* * signotify() has already set TDF_ASTPENDING and TDF_NEEDSIGCHECK on * this thread, so all we need to do is poke it if it is currently * executing so that it executes ast(). */ THREAD_LOCK_ASSERT(td, MA_OWNED); KASSERT(TD_IS_RUNNING(td), ("forward_signal: thread is not TDS_RUNNING")); CTR1(KTR_SMP, "forward_signal(%p)", td->td_proc); if (!smp_started || cold || KERNEL_PANICKED()) return; if (!forward_signal_enabled) return; /* No need to IPI ourself. */ if (td == curthread) return; id = td->td_oncpu; if (id == NOCPU) return; ipi_cpu(id, IPI_AST); } /* * When called, the executing CPU will send an IPI to all other CPUs * requesting that they halt execution. * * Usually (but not necessarily) called with 'other_cpus' as its arg. * * - Signals all CPUs in map to stop. * - Waits for each to stop. * * Returns: * -1: error * 0: NA * 1: ok * */ #if defined(__amd64__) || defined(__i386__) #define X86 1 #else #define X86 0 #endif static int generic_stop_cpus(cpuset_t map, u_int type) { #ifdef KTR char cpusetbuf[CPUSETBUFSIZ]; #endif static volatile u_int stopping_cpu = NOCPU; int i; volatile cpuset_t *cpus; KASSERT( type == IPI_STOP || type == IPI_STOP_HARD #if X86 || type == IPI_SUSPEND #endif , ("%s: invalid stop type", __func__)); if (!smp_started) return (0); CTR2(KTR_SMP, "stop_cpus(%s) with %u type", cpusetobj_strprint(cpusetbuf, &map), type); #if X86 /* * When suspending, ensure there are no IPIs in progress. * IPIs that have been issued, but not yet delivered (e.g. * not pending on a vCPU when running under virtualization) * will be lost, violating FreeBSD's assumption of reliable * IPI delivery.
*/ if (type == IPI_SUSPEND) mtx_lock_spin(&smp_ipi_mtx); #endif #if X86 if (!nmi_is_broadcast || nmi_kdb_lock == 0) { #endif if (stopping_cpu != PCPU_GET(cpuid)) while (atomic_cmpset_int(&stopping_cpu, NOCPU, PCPU_GET(cpuid)) == 0) while (stopping_cpu != NOCPU) cpu_spinwait(); /* spin */ /* send the stop IPI to all CPUs in map */ ipi_selected(map, type); #if X86 } #endif #if X86 if (type == IPI_SUSPEND) cpus = &suspended_cpus; else #endif cpus = &stopped_cpus; i = 0; while (!CPU_SUBSET(cpus, &map)) { /* spin */ cpu_spinwait(); i++; if (i == 100000000) { printf("timeout stopping cpus\n"); break; } } #if X86 if (type == IPI_SUSPEND) mtx_unlock_spin(&smp_ipi_mtx); #endif stopping_cpu = NOCPU; return (1); } int stop_cpus(cpuset_t map) { return (generic_stop_cpus(map, IPI_STOP)); } int stop_cpus_hard(cpuset_t map) { return (generic_stop_cpus(map, IPI_STOP_HARD)); } #if X86 int suspend_cpus(cpuset_t map) { return (generic_stop_cpus(map, IPI_SUSPEND)); } #endif /* * Called by a CPU to restart stopped CPUs. * * Usually (but not necessarily) called with 'stopped_cpus' as its arg. * * - Signals all CPUs in map to restart. * - Waits for each to restart. * * Returns: * -1: error * 0: NA * 1: ok */ static int generic_restart_cpus(cpuset_t map, u_int type) { #ifdef KTR char cpusetbuf[CPUSETBUFSIZ]; #endif volatile cpuset_t *cpus; #if X86 KASSERT(type == IPI_STOP || type == IPI_STOP_HARD || type == IPI_SUSPEND, ("%s: invalid stop type", __func__)); if (!smp_started) return (0); CTR1(KTR_SMP, "restart_cpus(%s)", cpusetobj_strprint(cpusetbuf, &map)); if (type == IPI_SUSPEND) cpus = &resuming_cpus; else cpus = &stopped_cpus; /* signal other cpus to restart */ if (type == IPI_SUSPEND) CPU_COPY_STORE_REL(&map, &toresume_cpus); else CPU_COPY_STORE_REL(&map, &started_cpus); /* * Wake up any CPUs stopped with MWAIT. From MI code we can't tell if * MONITOR/MWAIT is enabled, but the potentially redundant writes are * relatively inexpensive. */ if (type == IPI_STOP) { struct monitorbuf *mb; u_int id; CPU_FOREACH(id) { if (!CPU_ISSET(id, &map)) continue; mb = &pcpu_find(id)->pc_monitorbuf; atomic_store_int(&mb->stop_state, MONITOR_STOPSTATE_RUNNING); } } if (!nmi_is_broadcast || nmi_kdb_lock == 0) { /* wait for each to clear its bit */ while (CPU_OVERLAP(cpus, &map)) cpu_spinwait(); } #else /* !X86 */ KASSERT(type == IPI_STOP || type == IPI_STOP_HARD, ("%s: invalid stop type", __func__)); if (!smp_started) return (0); CTR1(KTR_SMP, "restart_cpus(%s)", cpusetobj_strprint(cpusetbuf, &map)); cpus = &stopped_cpus; /* signal other cpus to restart */ CPU_COPY_STORE_REL(&map, &started_cpus); /* wait for each to clear its bit */ while (CPU_OVERLAP(cpus, &map)) cpu_spinwait(); #endif return (1); } int restart_cpus(cpuset_t map) { return (generic_restart_cpus(map, IPI_STOP)); } #if X86 int resume_cpus(cpuset_t map) { return (generic_restart_cpus(map, IPI_SUSPEND)); } #endif #undef X86 /* * All-CPU rendezvous. CPUs are signalled, all execute the setup function * (if specified), rendezvous, execute the action function (if specified), * rendezvous again, execute the teardown function (if specified), and then * resume. * * Note that the supplied external functions _must_ be reentrant and aware * that they are running in parallel and in an unknown lock context. */ void smp_rendezvous_action(void) { struct thread *td; void *local_func_arg; void (*local_setup_func)(void*); void (*local_action_func)(void*); void (*local_teardown_func)(void*); #ifdef INVARIANTS int owepreempt; #endif /* Ensure we have up-to-date values. 
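 * The acquire add below pairs with the atomic_store_rel_int() that
 * resets smp_rv_waiters[0] in smp_rendezvous_cpus(), so the parameter
 * loads that follow observe the initiator's stores.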
*/ atomic_add_acq_int(&smp_rv_waiters[0], 1); while (smp_rv_waiters[0] < smp_rv_ncpus) cpu_spinwait(); /* Fetch rendezvous parameters after acquire barrier. */ local_func_arg = smp_rv_func_arg; local_setup_func = smp_rv_setup_func; local_action_func = smp_rv_action_func; local_teardown_func = smp_rv_teardown_func; /* * Use a nested critical section to prevent any preemptions * from occurring during a rendezvous action routine. * Specifically, if a rendezvous handler is invoked via an IPI * and the interrupted thread was in the critical_exit() * function after setting td_critnest to 0 but before * performing a deferred preemption, this routine can be * invoked with td_critnest set to 0 and td_owepreempt true. * In that case, a critical_exit() during the rendezvous * action would trigger a preemption which is not permitted in * a rendezvous action. To fix this, wrap all of the * rendezvous action handlers in a critical section. We * cannot use a regular critical section however as having * critical_exit() preempt from this routine would also be * problematic (the preemption must not occur before the IPI * has been acknowledged via an EOI). Instead, we * intentionally ignore td_owepreempt when leaving the * critical section. This should be harmless because we do * not permit rendezvous action routines to schedule threads, * and thus td_owepreempt should never transition from 0 to 1 * during this routine. */ td = curthread; td->td_critnest++; #ifdef INVARIANTS owepreempt = td->td_owepreempt; #endif /* * If requested, run a setup function before the main action * function. Ensure all CPUs have completed the setup * function before moving on to the action function. */ if (local_setup_func != smp_no_rendezvous_barrier) { if (local_setup_func != NULL) local_setup_func(local_func_arg); atomic_add_int(&smp_rv_waiters[1], 1); while (smp_rv_waiters[1] < smp_rv_ncpus) cpu_spinwait(); } if (local_action_func != NULL) local_action_func(local_func_arg); if (local_teardown_func != smp_no_rendezvous_barrier) { /* * Signal that the main action has been completed. If a * full exit rendezvous is requested, then all CPUs will * wait here until all CPUs have finished the main action. */ atomic_add_int(&smp_rv_waiters[2], 1); while (smp_rv_waiters[2] < smp_rv_ncpus) cpu_spinwait(); if (local_teardown_func != NULL) local_teardown_func(local_func_arg); } /* * Signal that the rendezvous is fully completed by this CPU. * This means that no member of smp_rv_* pseudo-structure will be * accessed by this target CPU after this point; in particular, * memory pointed by smp_rv_func_arg. * * The release semantic ensures that all accesses performed by * the current CPU are visible when smp_rendezvous_cpus() * returns, by synchronizing with the * atomic_load_acq_int(&smp_rv_waiters[3]). */ atomic_add_rel_int(&smp_rv_waiters[3], 1); td->td_critnest--; KASSERT(owepreempt == td->td_owepreempt, ("rendezvous action changed td_owepreempt")); } void smp_rendezvous_cpus(cpuset_t map, void (* setup_func)(void *), void (* action_func)(void *), void (* teardown_func)(void *), void *arg) { int curcpumap, i, ncpus = 0; /* See comments in the !SMP case. */ if (!smp_started) { spinlock_enter(); if (setup_func != NULL) setup_func(arg); if (action_func != NULL) action_func(arg); if (teardown_func != NULL) teardown_func(arg); spinlock_exit(); return; } /* * Make sure we come here with interrupts enabled. Otherwise we * livelock if smp_ipi_mtx is owned by a thread which sent us an IPI. 
*/ MPASS(curthread->td_md.md_spinlock_count == 0); CPU_FOREACH(i) { if (CPU_ISSET(i, &map)) ncpus++; } if (ncpus == 0) panic("ncpus is 0 with non-zero map"); mtx_lock_spin(&smp_ipi_mtx); /* Pass rendezvous parameters via global variables. */ smp_rv_ncpus = ncpus; smp_rv_setup_func = setup_func; smp_rv_action_func = action_func; smp_rv_teardown_func = teardown_func; smp_rv_func_arg = arg; smp_rv_waiters[1] = 0; smp_rv_waiters[2] = 0; smp_rv_waiters[3] = 0; atomic_store_rel_int(&smp_rv_waiters[0], 0); /* * Signal other processors, which will enter the IPI with * interrupts off. */ curcpumap = CPU_ISSET(curcpu, &map); CPU_CLR(curcpu, &map); ipi_selected(map, IPI_RENDEZVOUS); /* Check if the current CPU is in the map */ if (curcpumap != 0) smp_rendezvous_action(); /* * Ensure that the master CPU waits for all the other * CPUs to finish the rendezvous, so that smp_rv_* * pseudo-structure and the arg are guaranteed to not * be in use. * * Load acquire synchronizes with the release add in * smp_rendezvous_action(), which ensures that our caller sees * all memory actions done by the called functions on other * CPUs. */ while (atomic_load_acq_int(&smp_rv_waiters[3]) < ncpus) cpu_spinwait(); mtx_unlock_spin(&smp_ipi_mtx); } void smp_rendezvous(void (* setup_func)(void *), void (* action_func)(void *), void (* teardown_func)(void *), void *arg) { smp_rendezvous_cpus(all_cpus, setup_func, action_func, teardown_func, arg); } static struct cpu_group group[MAXCPU * MAX_CACHE_LEVELS + 1]; static void smp_topo_fill(struct cpu_group *cg) { int c; for (c = 0; c < cg->cg_children; c++) smp_topo_fill(&cg->cg_child[c]); cg->cg_first = CPU_FFS(&cg->cg_mask) - 1; cg->cg_last = CPU_FLS(&cg->cg_mask) - 1; } struct cpu_group * smp_topo(void) { char cpusetbuf[CPUSETBUFSIZ], cpusetbuf2[CPUSETBUFSIZ]; struct cpu_group *top; /* * Check for a fake topology request for debugging purposes. */ switch (smp_topology) { case 1: /* Dual core with no sharing. */ top = smp_topo_1level(CG_SHARE_NONE, 2, 0); break; case 2: /* No topology, all cpus are equal. */ top = smp_topo_none(); break; case 3: /* Dual core with shared L2. */ top = smp_topo_1level(CG_SHARE_L2, 2, 0); break; case 4: /* quad core, shared l3 among each package, private l2. */ top = smp_topo_1level(CG_SHARE_L3, 4, 0); break; case 5: /* quad core, 2 dualcore parts on each package share l2. */ top = smp_topo_2level(CG_SHARE_NONE, 2, CG_SHARE_L2, 2, 0); break; case 6: /* Single-core 2xHTT */ top = smp_topo_1level(CG_SHARE_L1, 2, CG_FLAG_HTT); break; case 7: /* quad core with a shared l3, 8 threads sharing L2. */ top = smp_topo_2level(CG_SHARE_L3, 4, CG_SHARE_L2, 8, CG_FLAG_SMT); break; default: /* Default, ask the system what it wants. */ top = cpu_topo(); break; } /* * Verify the returned topology. */ if (top->cg_count != mp_ncpus) panic("Built bad topology at %p. CPU count %d != %d", top, top->cg_count, mp_ncpus); if (CPU_CMP(&top->cg_mask, &all_cpus)) panic("Built bad topology at %p. CPU mask (%s) != (%s)", top, cpusetobj_strprint(cpusetbuf, &top->cg_mask), cpusetobj_strprint(cpusetbuf2, &all_cpus)); /* * Collapse nonsense levels that may be created out of convenience by * the MD layers. They cause extra work in the search functions. 
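 *
 * E.g. on a single-package system reported as root -> package -> cores,
 * the root level has exactly one child spanning the same CPUs, so it is
 * dropped and the package becomes the new top.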
*/ while (top->cg_children == 1) { top = &top->cg_child[0]; top->cg_parent = NULL; } smp_topo_fill(top); return (top); } struct cpu_group * smp_topo_alloc(u_int count) { static u_int index; u_int curr; curr = index; index += count; return (&group[curr]); } struct cpu_group * smp_topo_none(void) { struct cpu_group *top; top = &group[0]; top->cg_parent = NULL; top->cg_child = NULL; top->cg_mask = all_cpus; top->cg_count = mp_ncpus; top->cg_children = 0; top->cg_level = CG_SHARE_NONE; top->cg_flags = 0; return (top); } static int smp_topo_addleaf(struct cpu_group *parent, struct cpu_group *child, int share, int count, int flags, int start) { char cpusetbuf[CPUSETBUFSIZ], cpusetbuf2[CPUSETBUFSIZ]; cpuset_t mask; int i; CPU_ZERO(&mask); for (i = 0; i < count; i++, start++) CPU_SET(start, &mask); child->cg_parent = parent; child->cg_child = NULL; child->cg_children = 0; child->cg_level = share; child->cg_count = count; child->cg_flags = flags; child->cg_mask = mask; parent->cg_children++; for (; parent != NULL; parent = parent->cg_parent) { if (CPU_OVERLAP(&parent->cg_mask, &child->cg_mask)) panic("Duplicate children in %p. mask (%s) child (%s)", parent, cpusetobj_strprint(cpusetbuf, &parent->cg_mask), cpusetobj_strprint(cpusetbuf2, &child->cg_mask)); CPU_OR(&parent->cg_mask, &parent->cg_mask, &child->cg_mask); parent->cg_count += child->cg_count; } return (start); } struct cpu_group * smp_topo_1level(int share, int count, int flags) { struct cpu_group *child; struct cpu_group *top; int packages; int cpu; int i; cpu = 0; top = &group[0]; packages = mp_ncpus / count; top->cg_child = child = &group[1]; top->cg_level = CG_SHARE_NONE; for (i = 0; i < packages; i++, child++) cpu = smp_topo_addleaf(top, child, share, count, flags, cpu); return (top); } struct cpu_group * smp_topo_2level(int l2share, int l2count, int l1share, int l1count, int l1flags) { struct cpu_group *top; struct cpu_group *l1g; struct cpu_group *l2g; int cpu; int i; int j; cpu = 0; top = &group[0]; l2g = &group[1]; top->cg_child = l2g; top->cg_level = CG_SHARE_NONE; top->cg_children = mp_ncpus / (l2count * l1count); l1g = l2g + top->cg_children; for (i = 0; i < top->cg_children; i++, l2g++) { l2g->cg_parent = top; l2g->cg_child = l1g; l2g->cg_level = l2share; for (j = 0; j < l2count; j++, l1g++) cpu = smp_topo_addleaf(l2g, l1g, l1share, l1count, l1flags, cpu); } return (top); } struct cpu_group * smp_topo_find(struct cpu_group *top, int cpu) { struct cpu_group *cg; cpuset_t mask; int children; int i; CPU_SETOF(cpu, &mask); cg = top; for (;;) { if (!CPU_OVERLAP(&cg->cg_mask, &mask)) return (NULL); if (cg->cg_children == 0) return (cg); children = cg->cg_children; for (i = 0, cg = cg->cg_child; i < children; cg++, i++) if (CPU_OVERLAP(&cg->cg_mask, &mask)) break; } return (NULL); } #else /* !SMP */ void smp_rendezvous_cpus(cpuset_t map, void (*setup_func)(void *), void (*action_func)(void *), void (*teardown_func)(void *), void *arg) { /* * In the !SMP case we just need to ensure the same initial conditions * as the SMP case. */ spinlock_enter(); if (setup_func != NULL) setup_func(arg); if (action_func != NULL) action_func(arg); if (teardown_func != NULL) teardown_func(arg); spinlock_exit(); } void smp_rendezvous(void (*setup_func)(void *), void (*action_func)(void *), void (*teardown_func)(void *), void *arg) { smp_rendezvous_cpus(all_cpus, setup_func, action_func, teardown_func, arg); } /* * Provide dummy SMP support for UP kernels. Modules that need to use SMP * APIs will still work using this dummy support. 
*/ static void mp_setvariables_for_up(void *dummy) { mp_ncpus = 1; mp_ncores = 1; mp_maxid = PCPU_GET(cpuid); CPU_SETOF(mp_maxid, &all_cpus); KASSERT(PCPU_GET(cpuid) == 0, ("UP must have a CPU ID of zero")); } SYSINIT(cpu_mp_setvariables, SI_SUB_TUNABLES, SI_ORDER_FIRST, mp_setvariables_for_up, NULL); #endif /* SMP */ void smp_no_rendezvous_barrier(void *dummy) { #ifdef SMP KASSERT((!smp_started),("smp_no_rendezvous called and smp is started")); #endif } void smp_rendezvous_cpus_retry(cpuset_t map, void (* setup_func)(void *), void (* action_func)(void *), void (* teardown_func)(void *), void (* wait_func)(void *, int), struct smp_rendezvous_cpus_retry_arg *arg) { int cpu; CPU_COPY(&map, &arg->cpus); /* * Only one CPU to execute on. */ if (!smp_started) { spinlock_enter(); if (setup_func != NULL) setup_func(arg); if (action_func != NULL) action_func(arg); if (teardown_func != NULL) teardown_func(arg); spinlock_exit(); return; } /* * Execute an action on all specified CPUs while retrying until they * all acknowledge completion. */ for (;;) { smp_rendezvous_cpus( arg->cpus, setup_func, action_func, teardown_func, arg); if (CPU_EMPTY(&arg->cpus)) break; CPU_FOREACH(cpu) { if (!CPU_ISSET(cpu, &arg->cpus)) continue; wait_func(arg, cpu); } } } void smp_rendezvous_cpus_done(struct smp_rendezvous_cpus_retry_arg *arg) { CPU_CLR_ATOMIC(curcpu, &arg->cpus); } /* * If (prio & PDROP) == 0: * Wait for specified idle threads to switch once. This ensures that even * preempted threads have cycled through the switch function once, * exiting their codepaths. This allows us to change global pointers * with no other synchronization. * If (prio & PDROP) != 0: * Force the specified CPUs to switch context at least once. */ int quiesce_cpus(cpuset_t map, const char *wmesg, int prio) { struct pcpu *pcpu; u_int *gen; int error; int cpu; error = 0; if ((prio & PDROP) == 0) { gen = malloc(sizeof(u_int) * MAXCPU, M_TEMP, M_WAITOK); for (cpu = 0; cpu <= mp_maxid; cpu++) { if (!CPU_ISSET(cpu, &map) || CPU_ABSENT(cpu)) continue; pcpu = pcpu_find(cpu); gen[cpu] = pcpu->pc_idlethread->td_generation; } } for (cpu = 0; cpu <= mp_maxid; cpu++) { if (!CPU_ISSET(cpu, &map) || CPU_ABSENT(cpu)) continue; pcpu = pcpu_find(cpu); thread_lock(curthread); sched_bind(curthread, cpu); thread_unlock(curthread); if ((prio & PDROP) != 0) continue; while (gen[cpu] == pcpu->pc_idlethread->td_generation) { error = tsleep(quiesce_cpus, prio & ~PDROP, wmesg, 1); if (error != EWOULDBLOCK) goto out; error = 0; } } out: thread_lock(curthread); sched_unbind(curthread); thread_unlock(curthread); if ((prio & PDROP) == 0) free(gen, M_TEMP); return (error); } int quiesce_all_cpus(const char *wmesg, int prio) { return quiesce_cpus(all_cpus, wmesg, prio); } /* * Observe all CPUs not executing in a critical section. * We are not in one so the check for us is safe. If the found * thread changes to something else we know the section was * exited as well. */ void quiesce_all_critical(void) { struct thread *td, *newtd; struct pcpu *pcpu; int cpu; MPASS(curthread->td_critnest == 0); CPU_FOREACH(cpu) { pcpu = cpuid_to_pcpu[cpu]; td = pcpu->pc_curthread; for (;;) { if (td->td_critnest == 0) break; cpu_spinwait(); newtd = (struct thread *) atomic_load_acq_ptr((void *)pcpu->pc_curthread); if (td != newtd) break; } } } static void cpus_fence_seq_cst_issue(void *arg __unused) { atomic_thread_fence_seq_cst(); } /* * Send an IPI forcing a sequentially consistent fence. * * Allows replacement of an explicit fence with a compiler barrier.
* Trades a speedup during normal execution for a significant slowdown when * the barrier is needed. */ void cpus_fence_seq_cst(void) { #ifdef SMP smp_rendezvous( smp_no_rendezvous_barrier, cpus_fence_seq_cst_issue, smp_no_rendezvous_barrier, NULL ); #else cpus_fence_seq_cst_issue(NULL); #endif } /* Extra care is taken with this sysctl because the data type is volatile */ static int sysctl_kern_smp_active(SYSCTL_HANDLER_ARGS) { int error, active; active = smp_started; error = SYSCTL_OUT(req, &active, sizeof(active)); return (error); } #ifdef SMP void topo_init_node(struct topo_node *node) { bzero(node, sizeof(*node)); TAILQ_INIT(&node->children); } void topo_init_root(struct topo_node *root) { topo_init_node(root); root->type = TOPO_TYPE_SYSTEM; } /* * Add a child node with the given ID under the given parent. * Return the existing child if there is already one with that ID. */ struct topo_node * topo_add_node_by_hwid(struct topo_node *parent, int hwid, topo_node_type type, uintptr_t subtype) { struct topo_node *node; TAILQ_FOREACH_REVERSE(node, &parent->children, topo_children, siblings) { if (node->hwid == hwid && node->type == type && node->subtype == subtype) { return (node); } } node = malloc(sizeof(*node), M_TOPO, M_WAITOK); topo_init_node(node); node->parent = parent; node->hwid = hwid; node->type = type; node->subtype = subtype; TAILQ_INSERT_TAIL(&parent->children, node, siblings); parent->nchildren++; return (node); } /* * Find a child node with the given ID under the given parent. */ struct topo_node * topo_find_node_by_hwid(struct topo_node *parent, int hwid, topo_node_type type, uintptr_t subtype) { struct topo_node *node; TAILQ_FOREACH(node, &parent->children, siblings) { if (node->hwid == hwid && node->type == type && node->subtype == subtype) { return (node); } } return (NULL); } /* * Given a node, change the order of its parent's child nodes such * that the node becomes the first child while preserving the cyclic * order of the children. In other words, the given node is promoted * by rotation. */ void topo_promote_child(struct topo_node *child) { struct topo_node *next; struct topo_node *node; struct topo_node *parent; parent = child->parent; next = TAILQ_NEXT(child, siblings); TAILQ_REMOVE(&parent->children, child, siblings); TAILQ_INSERT_HEAD(&parent->children, child, siblings); while (next != NULL) { node = next; next = TAILQ_NEXT(node, siblings); TAILQ_REMOVE(&parent->children, node, siblings); TAILQ_INSERT_AFTER(&parent->children, child, node, siblings); child = node; } } /* * Iterate to the next node in the depth-first search (traversal) of * the topology tree. */ struct topo_node * topo_next_node(struct topo_node *top, struct topo_node *node) { struct topo_node *next; if ((next = TAILQ_FIRST(&node->children)) != NULL) return (next); if ((next = TAILQ_NEXT(node, siblings)) != NULL) return (next); while (node != top && (node = node->parent) != top) if ((next = TAILQ_NEXT(node, siblings)) != NULL) return (next); return (NULL); } /* * Iterate to the next node in the depth-first search of the topology tree, * but without descending below the current node. */ struct topo_node * topo_next_nonchild_node(struct topo_node *top, struct topo_node *node) { struct topo_node *next; if ((next = TAILQ_NEXT(node, siblings)) != NULL) return (next); while (node != top && (node = node->parent) != top) if ((next = TAILQ_NEXT(node, siblings)) != NULL) return (next); return (NULL); } /* * Assign the given ID to the given topology node that represents a logical * processor.
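 *
 * The ID also propagates upward: every ancestor of the PU node gets the
 * CPU added to its cpuset and its cpu_count bumped, so e.g. the owning
 * core and package both account for this logical processor.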
*/ void topo_set_pu_id(struct topo_node *node, cpuid_t id) { KASSERT(node->type == TOPO_TYPE_PU, ("topo_set_pu_id: wrong node type: %u", node->type)); KASSERT(CPU_EMPTY(&node->cpuset) && node->cpu_count == 0, ("topo_set_pu_id: cpuset already not empty")); node->id = id; CPU_SET(id, &node->cpuset); node->cpu_count = 1; node->subtype = 1; while ((node = node->parent) != NULL) { KASSERT(!CPU_ISSET(id, &node->cpuset), ("logical ID %u is already set in node %p", id, node)); CPU_SET(id, &node->cpuset); node->cpu_count++; } } static struct topology_spec { topo_node_type type; bool match_subtype; uintptr_t subtype; } topology_level_table[TOPO_LEVEL_COUNT] = { [TOPO_LEVEL_PKG] = { .type = TOPO_TYPE_PKG, }, [TOPO_LEVEL_GROUP] = { .type = TOPO_TYPE_GROUP, }, [TOPO_LEVEL_CACHEGROUP] = { .type = TOPO_TYPE_CACHE, .match_subtype = true, .subtype = CG_SHARE_L3, }, [TOPO_LEVEL_CORE] = { .type = TOPO_TYPE_CORE, }, [TOPO_LEVEL_THREAD] = { .type = TOPO_TYPE_PU, }, }; static bool topo_analyze_table(struct topo_node *root, int all, enum topo_level level, struct topo_analysis *results) { struct topology_spec *spec; struct topo_node *node; int count; if (level >= TOPO_LEVEL_COUNT) return (true); spec = &topology_level_table[level]; count = 0; node = topo_next_node(root, root); while (node != NULL) { if (node->type != spec->type || (spec->match_subtype && node->subtype != spec->subtype)) { node = topo_next_node(root, node); continue; } if (!all && CPU_EMPTY(&node->cpuset)) { node = topo_next_nonchild_node(root, node); continue; } count++; if (!topo_analyze_table(node, all, level + 1, results)) return (false); node = topo_next_nonchild_node(root, node); } /* No explicit subgroups is essentially one subgroup. */ if (count == 0) { count = 1; if (!topo_analyze_table(root, all, level + 1, results)) return (false); } if (results->entities[level] == -1) results->entities[level] = count; else if (results->entities[level] != count) return (false); return (true); } /* * Check if the topology is uniform, that is, each package has the same number * of cores in it and each core has the same number of threads (logical * processors) in it. If so, calculate the number of packages, the number of * groups per package, the number of cachegroups per group, and the number of * logical processors per cachegroup. 'all' parameter tells whether to include * administratively disabled logical processors into the analysis. */ int topo_analyze(struct topo_node *topo_root, int all, struct topo_analysis *results) { results->entities[TOPO_LEVEL_PKG] = -1; results->entities[TOPO_LEVEL_CORE] = -1; results->entities[TOPO_LEVEL_THREAD] = -1; results->entities[TOPO_LEVEL_GROUP] = -1; results->entities[TOPO_LEVEL_CACHEGROUP] = -1; if (!topo_analyze_table(topo_root, all, TOPO_LEVEL_PKG, results)) return (0); KASSERT(results->entities[TOPO_LEVEL_PKG] > 0, ("bug in topology or analysis")); return (1); } #endif /* SMP */ diff --git a/sys/sys/cpuset.h b/sys/sys/cpuset.h index 601da08a46a8..f8fc36b99aa7 100644 --- a/sys/sys/cpuset.h +++ b/sys/sys/cpuset.h @@ -1,190 +1,191 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2008, Jeffrey Roberson * All rights reserved. * * Copyright (c) 2008 Nokia Corporation * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _SYS_CPUSET_H_ #define _SYS_CPUSET_H_ #include #include #include #define _NCPUBITS _BITSET_BITS #define _NCPUWORDS __bitset_words(CPU_SETSIZE) #define CPUSETBUFSIZ ((2 + sizeof(long) * 2) * _NCPUWORDS) #define CPU_CLR(n, p) __BIT_CLR(CPU_SETSIZE, n, p) #define CPU_COPY(f, t) __BIT_COPY(CPU_SETSIZE, f, t) #define CPU_ISSET(n, p) __BIT_ISSET(CPU_SETSIZE, n, p) #define CPU_SET(n, p) __BIT_SET(CPU_SETSIZE, n, p) #define CPU_ZERO(p) __BIT_ZERO(CPU_SETSIZE, p) #define CPU_FILL(p) __BIT_FILL(CPU_SETSIZE, p) #define CPU_SETOF(n, p) __BIT_SETOF(CPU_SETSIZE, n, p) #define CPU_EQUAL(p, c) (__BIT_CMP(CPU_SETSIZE, p, c) == 0) #define CPU_EMPTY(p) __BIT_EMPTY(CPU_SETSIZE, p) #define CPU_ISFULLSET(p) __BIT_ISFULLSET(CPU_SETSIZE, p) #define CPU_SUBSET(p, c) __BIT_SUBSET(CPU_SETSIZE, p, c) #define CPU_OVERLAP(p, c) __BIT_OVERLAP(CPU_SETSIZE, p, c) #define CPU_CMP(p, c) __BIT_CMP(CPU_SETSIZE, p, c) #define CPU_OR(d, s1, s2) __BIT_OR2(CPU_SETSIZE, d, s1, s2) #define CPU_AND(d, s1, s2) __BIT_AND2(CPU_SETSIZE, d, s1, s2) #define CPU_ANDNOT(d, s1, s2) __BIT_ANDNOT2(CPU_SETSIZE, d, s1, s2) #define CPU_XOR(d, s1, s2) __BIT_XOR2(CPU_SETSIZE, d, s1, s2) #define CPU_CLR_ATOMIC(n, p) __BIT_CLR_ATOMIC(CPU_SETSIZE, n, p) #define CPU_SET_ATOMIC(n, p) __BIT_SET_ATOMIC(CPU_SETSIZE, n, p) #define CPU_SET_ATOMIC_ACQ(n, p) __BIT_SET_ATOMIC_ACQ(CPU_SETSIZE, n, p) #define CPU_AND_ATOMIC(n, p) __BIT_AND_ATOMIC(CPU_SETSIZE, n, p) #define CPU_OR_ATOMIC(d, s) __BIT_OR_ATOMIC(CPU_SETSIZE, d, s) #define CPU_COPY_STORE_REL(f, t) __BIT_COPY_STORE_REL(CPU_SETSIZE, f, t) #define CPU_FFS(p) __BIT_FFS(CPU_SETSIZE, p) #define CPU_FLS(p) __BIT_FLS(CPU_SETSIZE, p) #define CPU_FOREACH_ISSET(i, p) __BIT_FOREACH_ISSET(CPU_SETSIZE, i, p) #define CPU_FOREACH_ISCLR(i, p) __BIT_FOREACH_ISCLR(CPU_SETSIZE, i, p) #define CPU_COUNT(p) ((int)__BIT_COUNT(CPU_SETSIZE, p)) #define CPUSET_FSET __BITSET_FSET(_NCPUWORDS) #define CPUSET_T_INITIALIZER(x) __BITSET_T_INITIALIZER(x) #define CPU_ALLOC_SIZE(_s) __BITSET_SIZE(_s) #define CPU_ALLOC(_s) __cpuset_alloc(_s) #define CPU_FREE(p) __cpuset_free(p) #define CPU_ISSET_S(n, _s, p) __BIT_ISSET((_s) * 8, n, p) #define CPU_SET_S(n, _s, p) __BIT_SET((_s) * 8, n, p) #define CPU_CLR_S(n, _s, p) __BIT_CLR((_s) * 8, n, p) #define CPU_ZERO_S(_s, p) __BIT_ZERO((_s) * 8, p) #define CPU_OR_S(_s, d, s1, s2) __BIT_OR2((_s) * 8, d, s1, s2) #define CPU_AND_S(_s, d, s1, s2) __BIT_AND2((_s) * 8, d, s1, s2) #define CPU_XOR_S(_s, d, s1, s2) __BIT_XOR2((_s) * 8, d, s1, s2) #define CPU_COUNT_S(_s, p) ((int)__BIT_COUNT((_s) * 8, p)) #define CPU_EQUAL_S(_s, p, c) (__BIT_CMP((_s) * 8, p, c) == 0) 
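/*
 * Illustrative use of the dynamically sized *_S variants (a sketch, not a
 * normative part of this header): they take a size in bytes, as returned
 * by CPU_ALLOC_SIZE() for a bit count:
 *
 *	size_t sz = CPU_ALLOC_SIZE(2 * CPU_SETSIZE);
 *	cpuset_t *set = CPU_ALLOC(2 * CPU_SETSIZE);
 *
 *	CPU_ZERO_S(sz, set);
 *	CPU_SET_S(3, sz, set);
 *	if (CPU_ISSET_S(3, sz, set))
 *		...
 *	CPU_FREE(set);
 */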
/* * Valid cpulevel_t values. */ #define CPU_LEVEL_ROOT 1 /* All system cpus. */ #define CPU_LEVEL_CPUSET 2 /* Available cpus for which. */ #define CPU_LEVEL_WHICH 3 /* Actual mask/id for which. */ /* * Valid cpuwhich_t values. */ #define CPU_WHICH_TID 1 /* Specifies a thread id. */ #define CPU_WHICH_PID 2 /* Specifies a process id. */ #define CPU_WHICH_CPUSET 3 /* Specifies a set id. */ #define CPU_WHICH_IRQ 4 /* Specifies an irq #. */ #define CPU_WHICH_JAIL 5 /* Specifies a jail id. */ #define CPU_WHICH_DOMAIN 6 /* Specifies a NUMA domain id. */ #define CPU_WHICH_INTRHANDLER 7 /* Specifies an irq # (not ithread). */ #define CPU_WHICH_ITHREAD 8 /* Specifies an irq's ithread. */ /* * Reserved cpuset identifiers. */ #define CPUSET_INVALID -1 #define CPUSET_DEFAULT 0 #ifdef _KERNEL #include LIST_HEAD(setlist, cpuset); +extern u_int cpusetsizemin; /* * cpusets encapsulate cpu binding information for one or more threads. * * a - Accessed with atomics. * s - Set at creation, never modified. Only a ref required to read. * c - Locked internally by a cpuset lock. * * The bitmask is only modified while holding the cpuset lock. It may be * read while only a reference is held but the consumer must be prepared * to deal with inconsistent results. */ struct cpuset { volatile u_int cs_ref; /* (a) Reference count. */ int cs_flags; /* (s) Flags from below. */ LIST_ENTRY(cpuset) cs_link; /* (c) All identified sets. */ LIST_ENTRY(cpuset) cs_siblings; /* (c) Sibling set link. */ struct setlist cs_children; /* (c) List of children. */ struct domainset *cs_domain; /* (c) NUMA policy. */ cpusetid_t cs_id; /* (s) Id or INVALID. */ struct cpuset *cs_parent; /* (s) Pointer to our parent. */ cpuset_t cs_mask; /* bitmask of valid cpus. */ }; #define CPU_SET_ROOT 0x0001 /* Set is a root set. */ #define CPU_SET_RDONLY 0x0002 /* No modification allowed. */ extern cpuset_t *cpuset_root; struct prison; struct proc; struct thread; /* * Callbacks for copying in/out a cpuset or domainset. Used for alternate * ABIs, like compat32. */ struct cpuset_copy_cb { int (*cpuset_copyin)(const void *, void *, size_t); int (*cpuset_copyout)(const void *, void *, size_t); }; struct cpuset *cpuset_thread0(void); struct cpuset *cpuset_ref(struct cpuset *); void cpuset_rel(struct cpuset *); int cpuset_setthread(lwpid_t id, cpuset_t *); int cpuset_setithread(lwpid_t id, int cpu); int cpuset_create_root(struct prison *, struct cpuset **); int cpuset_setproc_update_set(struct proc *, struct cpuset *); int cpuset_which(cpuwhich_t, id_t, struct proc **, struct thread **, struct cpuset **); void cpuset_kernthread(struct thread *); char *cpusetobj_strprint(char *, const cpuset_t *); int cpusetobj_strscan(cpuset_t *, const char *); #ifdef DDB void ddb_display_cpuset(const cpuset_t *); #endif #else __BEGIN_DECLS int cpuset(cpusetid_t *); int cpuset_setid(cpuwhich_t, id_t, cpusetid_t); int cpuset_getid(cpulevel_t, cpuwhich_t, id_t, cpusetid_t *); int cpuset_getaffinity(cpulevel_t, cpuwhich_t, id_t, size_t, cpuset_t *); int cpuset_setaffinity(cpulevel_t, cpuwhich_t, id_t, size_t, const cpuset_t *); __END_DECLS #endif #endif /* !_SYS_CPUSET_H_ */
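As a usage sketch of the userland interface declared above (illustrative only; the prototypes are real, the program itself is not part of this change), querying and then shrinking the calling process's affinity could look like:

#include <sys/param.h>
#include <sys/cpuset.h>
#include <err.h>
#include <stdio.h>

int
main(void)
{
	cpuset_t mask;

	/* Fetch the anonymous mask of the current process (id -1). */
	if (cpuset_getaffinity(CPU_LEVEL_WHICH, CPU_WHICH_PID, -1,
	    sizeof(mask), &mask) != 0)
		err(1, "cpuset_getaffinity");
	printf("%d CPUs in mask\n", CPU_COUNT(&mask));

	/* Restrict the process to the lowest-numbered CPU in its mask. */
	CPU_SETOF(CPU_FFS(&mask) - 1, &mask);
	if (cpuset_setaffinity(CPU_LEVEL_WHICH, CPU_WHICH_PID, -1,
	    sizeof(mask), &mask) != 0)
		err(1, "cpuset_setaffinity");
	return (0);
}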