diff --git a/lib/libthr/thread/thr_attr.c b/lib/libthr/thread/thr_attr.c index 6ff23aa5a3da..5a06f793f4f8 100644 --- a/lib/libthr/thread/thr_attr.c +++ b/lib/libthr/thread/thr_attr.c @@ -1,678 +1,680 @@ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (c) 2003 Craig Rodrigues . * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Craig Rodrigues. * 4. Neither the name of the author nor the names of any co-contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY CRAIG RODRIGUES AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ /* * Copyright (c) 1998 Daniel Eischen . * Copyright (C) 2001 Jason Evans . * Copyright (c) 2002,2003 Alexey Zelkin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice(s), this list of conditions and the following disclaimer * unmodified other than the allowable addition of one or more * copyright notices. * 2. Redistributions in binary form must reproduce the above copyright * notice(s), this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the * distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 1996 John Birrell . * All rights reserved. 
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the author nor the names of any co-contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY JOHN BIRRELL AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "namespace.h"
#include <errno.h>
#include <pthread.h>
#include <pthread_np.h>
#include <stdlib.h>
#include <string.h>
#include <sys/sysctl.h>
#include "un-namespace.h"
#include "thr_private.h"

static size_t	_get_kern_cpuset_size(void);

__weak_reference(_thr_attr_destroy, _pthread_attr_destroy);
__weak_reference(_thr_attr_destroy, pthread_attr_destroy);

int
_thr_attr_destroy(pthread_attr_t *attr)
{
        int ret;

        /* Check for invalid arguments: */
        if (attr == NULL || *attr == NULL)
                /* Invalid argument: */
                ret = EINVAL;
        else {
                if ((*attr)->cpuset != NULL)
                        free((*attr)->cpuset);
                /* Free the memory allocated to the attribute object: */
                free(*attr);

                /*
                 * Leave the attribute pointer NULL now that the memory
                 * has been freed:
                 */
                *attr = NULL;
                ret = 0;
        }
        return (ret);
}

__weak_reference(_thr_attr_get_np, pthread_attr_get_np);
__weak_reference(_thr_attr_get_np, _pthread_attr_get_np);

int
_thr_attr_get_np(pthread_t pthread, pthread_attr_t *dstattr)
{
        struct pthread *curthread;
        struct pthread_attr attr, *dst;
        int ret;
        size_t kern_size;

        if (pthread == NULL || dstattr == NULL || (dst = *dstattr) == NULL)
                return (EINVAL);

        kern_size = _get_kern_cpuset_size();
        if (dst->cpuset == NULL) {
                dst->cpuset = calloc(1, kern_size);
                dst->cpusetsize = kern_size;
        }
        curthread = _get_curthread();
        if ((ret = _thr_find_thread(curthread, pthread,
            /*include dead*/0)) != 0)
                return (ret);
        attr = pthread->attr;
        if (pthread->flags & THR_FLAGS_DETACHED)
                attr.flags |= PTHREAD_DETACHED;
        ret = cpuset_getaffinity(CPU_LEVEL_WHICH, CPU_WHICH_TID, TID(pthread),
            dst->cpusetsize, dst->cpuset);
        if (ret == -1)
                ret = errno;
        THR_THREAD_UNLOCK(curthread, pthread);
        if (ret == 0) {
                memcpy(&dst->pthread_attr_start_copy,
                    &attr.pthread_attr_start_copy,
                    offsetof(struct pthread_attr, pthread_attr_end_copy) -
                    offsetof(struct pthread_attr, pthread_attr_start_copy));
        }
        return (ret);
}
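For context, a minimal, hypothetical caller of these entry points (an illustrative sketch, not part of this change): pthread_attr_get_np(3) snapshots a live thread into a caller-initialized attribute object, which is why _thr_attr_get_np() above lazily allocates dst->cpuset and _thr_attr_destroy() has to free it. Compile with -lpthread.

#include <pthread.h>
#include <pthread_np.h>         /* pthread_attr_get_np() is a FreeBSD extension */
#include <stdio.h>

int
main(void)
{
        pthread_attr_t attr;
        size_t stacksize;
        int detachstate;

        pthread_attr_init(&attr);       /* allocates the struct pthread_attr */
        /* Snapshot the calling thread, including its affinity mask. */
        if (pthread_attr_get_np(pthread_self(), &attr) != 0)
                return (1);
        pthread_attr_getstacksize(&attr, &stacksize);
        pthread_attr_getdetachstate(&attr, &detachstate);
        printf("stack %zu bytes, %s\n", stacksize,
            detachstate == PTHREAD_CREATE_DETACHED ? "detached" : "joinable");
        pthread_attr_destroy(&attr);    /* also frees the lazily allocated cpuset */
        return (0);
}

The remaining accessors follow:

__weak_reference(_thr_attr_getdetachstate, pthread_attr_getdetachstate);
__weak_reference(_thr_attr_getdetachstate, _pthread_attr_getdetachstate);

int
_thr_attr_getdetachstate(const pthread_attr_t *attr, int *detachstate)
{
        int ret;

        /* Check for invalid arguments: */
        if (attr == NULL ||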
*attr == NULL || detachstate == NULL) ret = EINVAL; else { /* Check if the detached flag is set: */ if ((*attr)->flags & PTHREAD_DETACHED) /* Return detached: */ *detachstate = PTHREAD_CREATE_DETACHED; else /* Return joinable: */ *detachstate = PTHREAD_CREATE_JOINABLE; ret = 0; } return (ret); } __weak_reference(_thr_attr_getguardsize, pthread_attr_getguardsize); __weak_reference(_thr_attr_getguardsize, _pthread_attr_getguardsize); int _thr_attr_getguardsize(const pthread_attr_t * __restrict attr, size_t * __restrict guardsize) { int ret; /* Check for invalid arguments: */ if (attr == NULL || *attr == NULL || guardsize == NULL) ret = EINVAL; else { /* Return the guard size: */ *guardsize = (*attr)->guardsize_attr; ret = 0; } return (ret); } __weak_reference(_thr_attr_getinheritsched, pthread_attr_getinheritsched); __weak_reference(_thr_attr_getinheritsched, _pthread_attr_getinheritsched); int _thr_attr_getinheritsched(const pthread_attr_t * __restrict attr, int * __restrict sched_inherit) { int ret = 0; if ((attr == NULL) || (*attr == NULL)) ret = EINVAL; else *sched_inherit = (*attr)->sched_inherit; return (ret); } __weak_reference(_thr_attr_getschedparam, pthread_attr_getschedparam); __weak_reference(_thr_attr_getschedparam, _pthread_attr_getschedparam); int _thr_attr_getschedparam(const pthread_attr_t * __restrict attr, struct sched_param * __restrict param) { int ret = 0; if ((attr == NULL) || (*attr == NULL) || (param == NULL)) ret = EINVAL; else param->sched_priority = (*attr)->prio; return (ret); } __weak_reference(_thr_attr_getschedpolicy, pthread_attr_getschedpolicy); __weak_reference(_thr_attr_getschedpolicy, _pthread_attr_getschedpolicy); int _thr_attr_getschedpolicy(const pthread_attr_t * __restrict attr, int * __restrict policy) { int ret = 0; if ((attr == NULL) || (*attr == NULL) || (policy == NULL)) ret = EINVAL; else *policy = (*attr)->sched_policy; return (ret); } __weak_reference(_thr_attr_getscope, pthread_attr_getscope); __weak_reference(_thr_attr_getscope, _pthread_attr_getscope); int _thr_attr_getscope(const pthread_attr_t * __restrict attr, int * __restrict contentionscope) { int ret = 0; if ((attr == NULL) || (*attr == NULL) || (contentionscope == NULL)) /* Return an invalid argument: */ ret = EINVAL; else *contentionscope = (*attr)->flags & PTHREAD_SCOPE_SYSTEM ? 
PTHREAD_SCOPE_SYSTEM : PTHREAD_SCOPE_PROCESS; return (ret); } __weak_reference(_pthread_attr_getstack, pthread_attr_getstack); int _pthread_attr_getstack(const pthread_attr_t * __restrict attr, void ** __restrict stackaddr, size_t * __restrict stacksize) { int ret; /* Check for invalid arguments: */ if (attr == NULL || *attr == NULL || stackaddr == NULL || stacksize == NULL ) ret = EINVAL; else { /* Return the stack address and size */ *stackaddr = (*attr)->stackaddr_attr; *stacksize = (*attr)->stacksize_attr; ret = 0; } return (ret); } __weak_reference(_thr_attr_getstackaddr, pthread_attr_getstackaddr); __weak_reference(_thr_attr_getstackaddr, _pthread_attr_getstackaddr); int _thr_attr_getstackaddr(const pthread_attr_t *attr, void **stackaddr) { int ret; /* Check for invalid arguments: */ if (attr == NULL || *attr == NULL || stackaddr == NULL) ret = EINVAL; else { /* Return the stack address: */ *stackaddr = (*attr)->stackaddr_attr; ret = 0; } return (ret); } __weak_reference(_thr_attr_getstacksize, pthread_attr_getstacksize); __weak_reference(_thr_attr_getstacksize, _pthread_attr_getstacksize); int _thr_attr_getstacksize(const pthread_attr_t * __restrict attr, size_t * __restrict stacksize) { int ret; /* Check for invalid arguments: */ if (attr == NULL || *attr == NULL || stacksize == NULL) ret = EINVAL; else { /* Return the stack size: */ *stacksize = (*attr)->stacksize_attr; ret = 0; } return (ret); } __weak_reference(_thr_attr_init, pthread_attr_init); __weak_reference(_thr_attr_init, _pthread_attr_init); int _thr_attr_init(pthread_attr_t *attr) { int ret; pthread_attr_t pattr; _thr_check_init(); /* Allocate memory for the attribute object: */ if ((pattr = (pthread_attr_t) malloc(sizeof(struct pthread_attr))) == NULL) /* Insufficient memory: */ ret = ENOMEM; else { /* Initialise the attribute object with the defaults: */ memcpy(pattr, &_pthread_attr_default, sizeof(struct pthread_attr)); /* Return a pointer to the attribute object: */ *attr = pattr; ret = 0; } return (ret); } __weak_reference(_pthread_attr_setcreatesuspend_np, pthread_attr_setcreatesuspend_np); int _pthread_attr_setcreatesuspend_np(pthread_attr_t *attr) { int ret; if (attr == NULL || *attr == NULL) { ret = EINVAL; } else { (*attr)->suspend = THR_CREATE_SUSPENDED; ret = 0; } return (ret); } __weak_reference(_thr_attr_setdetachstate, pthread_attr_setdetachstate); __weak_reference(_thr_attr_setdetachstate, _pthread_attr_setdetachstate); int _thr_attr_setdetachstate(pthread_attr_t *attr, int detachstate) { int ret; /* Check for invalid arguments: */ if (attr == NULL || *attr == NULL || (detachstate != PTHREAD_CREATE_DETACHED && detachstate != PTHREAD_CREATE_JOINABLE)) ret = EINVAL; else { /* Check if detached state: */ if (detachstate == PTHREAD_CREATE_DETACHED) /* Set the detached flag: */ (*attr)->flags |= PTHREAD_DETACHED; else /* Reset the detached flag: */ (*attr)->flags &= ~PTHREAD_DETACHED; ret = 0; } return (ret); } __weak_reference(_thr_attr_setguardsize, pthread_attr_setguardsize); __weak_reference(_thr_attr_setguardsize, _pthread_attr_setguardsize); int _thr_attr_setguardsize(pthread_attr_t *attr, size_t guardsize) { int ret; /* Check for invalid arguments. */ if (attr == NULL || *attr == NULL) ret = EINVAL; else { /* Save the stack size. 
*/ (*attr)->guardsize_attr = guardsize; ret = 0; } return (ret); } __weak_reference(_thr_attr_setinheritsched, pthread_attr_setinheritsched); __weak_reference(_thr_attr_setinheritsched, _pthread_attr_setinheritsched); int _thr_attr_setinheritsched(pthread_attr_t *attr, int sched_inherit) { int ret = 0; if ((attr == NULL) || (*attr == NULL)) ret = EINVAL; else if (sched_inherit != PTHREAD_INHERIT_SCHED && sched_inherit != PTHREAD_EXPLICIT_SCHED) ret = ENOTSUP; else (*attr)->sched_inherit = sched_inherit; return (ret); } __weak_reference(_thr_attr_setschedparam, pthread_attr_setschedparam); __weak_reference(_thr_attr_setschedparam, _pthread_attr_setschedparam); int _thr_attr_setschedparam(pthread_attr_t * __restrict attr, const struct sched_param * __restrict param) { int policy; if ((attr == NULL) || (*attr == NULL)) return (EINVAL); if (param == NULL) return (ENOTSUP); policy = (*attr)->sched_policy; if (policy == SCHED_FIFO || policy == SCHED_RR) { if (param->sched_priority < _thr_priorities[policy-1].pri_min || param->sched_priority > _thr_priorities[policy-1].pri_max) return (ENOTSUP); } else { /* * Ignore it for SCHED_OTHER now, patches for glib ports * are wrongly using M:N thread library's internal macro * THR_MIN_PRIORITY and THR_MAX_PRIORITY. */ } (*attr)->prio = param->sched_priority; return (0); } __weak_reference(_thr_attr_setschedpolicy, pthread_attr_setschedpolicy); __weak_reference(_thr_attr_setschedpolicy, _pthread_attr_setschedpolicy); int _thr_attr_setschedpolicy(pthread_attr_t *attr, int policy) { int ret = 0; if ((attr == NULL) || (*attr == NULL)) ret = EINVAL; else if ((policy < SCHED_FIFO) || (policy > SCHED_RR)) { ret = ENOTSUP; } else { (*attr)->sched_policy = policy; (*attr)->prio = _thr_priorities[policy-1].pri_default; } return (ret); } __weak_reference(_thr_attr_setscope, pthread_attr_setscope); __weak_reference(_thr_attr_setscope, _pthread_attr_setscope); int _thr_attr_setscope(pthread_attr_t *attr, int contentionscope) { int ret = 0; if ((attr == NULL) || (*attr == NULL)) { /* Return an invalid argument: */ ret = EINVAL; } else if ((contentionscope != PTHREAD_SCOPE_PROCESS) && (contentionscope != PTHREAD_SCOPE_SYSTEM)) { ret = EINVAL; } else if (contentionscope == PTHREAD_SCOPE_SYSTEM) { (*attr)->flags |= contentionscope; } else { (*attr)->flags &= ~PTHREAD_SCOPE_SYSTEM; } return (ret); } __weak_reference(_pthread_attr_setstack, pthread_attr_setstack); int _pthread_attr_setstack(pthread_attr_t *attr, void *stackaddr, size_t stacksize) { int ret; /* Check for invalid arguments: */ if (attr == NULL || *attr == NULL || stackaddr == NULL || stacksize < PTHREAD_STACK_MIN) ret = EINVAL; else { /* Save the stack address and stack size */ (*attr)->stackaddr_attr = stackaddr; (*attr)->stacksize_attr = stacksize; ret = 0; } return (ret); } __weak_reference(_thr_attr_setstackaddr, pthread_attr_setstackaddr); __weak_reference(_thr_attr_setstackaddr, _pthread_attr_setstackaddr); int _thr_attr_setstackaddr(pthread_attr_t *attr, void *stackaddr) { int ret; /* Check for invalid arguments: */ if (attr == NULL || *attr == NULL || stackaddr == NULL) ret = EINVAL; else { /* Save the stack address: */ (*attr)->stackaddr_attr = stackaddr; ret = 0; } return(ret); } __weak_reference(_thr_attr_setstacksize, pthread_attr_setstacksize); __weak_reference(_thr_attr_setstacksize, _pthread_attr_setstacksize); int _thr_attr_setstacksize(pthread_attr_t *attr, size_t stacksize) { int ret; /* Check for invalid arguments: */ if (attr == NULL || *attr == NULL || stacksize < PTHREAD_STACK_MIN) ret 
= EINVAL;
        else {
                /* Save the stack size: */
                (*attr)->stacksize_attr = stacksize;
                ret = 0;
        }
        return (ret);
}

static size_t
_get_kern_cpuset_size(void)
{
        static int kern_cpuset_size = 0;

        if (kern_cpuset_size == 0) {
                size_t len;

                len = sizeof(kern_cpuset_size);
-               if (sysctlbyname("kern.sched.cpusetsize", &kern_cpuset_size,
-                   &len, NULL, 0))
+               if (sysctlbyname("kern.sched.cpusetsizemin", &kern_cpuset_size,
+                   &len, NULL, 0) != 0 &&
+                   sysctlbyname("kern.sched.cpusetsize", &kern_cpuset_size,
+                   &len, NULL, 0) != 0)
                        PANIC("failed to get sysctl kern.sched.cpusetsize");
        }
        return (kern_cpuset_size);
}

__weak_reference(_pthread_attr_setaffinity_np, pthread_attr_setaffinity_np);

int
_pthread_attr_setaffinity_np(pthread_attr_t *pattr, size_t cpusetsize,
    const cpuset_t *cpusetp)
{
        pthread_attr_t attr;
        int ret;

        if (pattr == NULL || (attr = (*pattr)) == NULL)
                ret = EINVAL;
        else {
                if (cpusetsize == 0 || cpusetp == NULL) {
                        if (attr->cpuset != NULL) {
                                free(attr->cpuset);
                                attr->cpuset = NULL;
                                attr->cpusetsize = 0;
                        }
                        return (0);
                }
                size_t kern_size = _get_kern_cpuset_size();
                /* Kernel rejects small set, we check it here too. */
                if (cpusetsize < kern_size)
                        return (ERANGE);
                if (cpusetsize > kern_size) {
                        /* Kernel checks invalid bits, we check it here too. */
                        size_t i;

                        for (i = kern_size; i < cpusetsize; ++i) {
                                if (((const char *)cpusetp)[i])
                                        return (EINVAL);
                        }
                }
                if (attr->cpuset == NULL) {
                        attr->cpuset = calloc(1, kern_size);
                        if (attr->cpuset == NULL)
                                return (errno);
                        attr->cpusetsize = kern_size;
                }
                memcpy(attr->cpuset, cpusetp, kern_size);
                ret = 0;
        }
        return (ret);
}

__weak_reference(_pthread_attr_getaffinity_np, pthread_attr_getaffinity_np);

int
_pthread_attr_getaffinity_np(const pthread_attr_t *pattr, size_t cpusetsize,
    cpuset_t *cpusetp)
{
        pthread_attr_t attr;
        int ret = 0;

        if (pattr == NULL || (attr = (*pattr)) == NULL)
                ret = EINVAL;
        else {
                /* Kernel rejects small set, we check it here too. */
                size_t kern_size = _get_kern_cpuset_size();

                if (cpusetsize < kern_size)
                        return (ERANGE);
                if (attr->cpuset != NULL)
                        memcpy(cpusetp, attr->cpuset,
                            MIN(cpusetsize, attr->cpusetsize));
                else
                        memset(cpusetp, -1, kern_size);
                if (cpusetsize > kern_size)
                        memset(((char *)cpusetp) + kern_size, 0,
                            cpusetsize - kern_size);
        }
        return (ret);
}
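With the change above, _get_kern_cpuset_size() now asks the kernel for kern.sched.cpusetsizemin, the smallest set size the kernel will accept, and only falls back to the older kern.sched.cpusetsize OID on a kernel that predates the new sysctl. A minimal standalone sketch of the same probe-with-fallback idiom (the helper name probe_cpuset_size is hypothetical):

#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

static int
probe_cpuset_size(void)
{
        int sz = 0;
        size_t len = sizeof(sz);

        /* Prefer the new, smaller minimum; fall back on older kernels. */
        if (sysctlbyname("kern.sched.cpusetsizemin", &sz, &len, NULL, 0) != 0 &&
            sysctlbyname("kern.sched.cpusetsize", &sz, &len, NULL, 0) != 0)
                return (-1);    /* neither OID exists */
        return (sz);
}

int
main(void)
{
        printf("kernel cpuset size: %d bytes\n", probe_cpuset_size());
        return (0);
}

diff --git a/sys/kern/kern_cpuset.c b/sys/kern/kern_cpuset.c
index affc48e78862..d2bbfff1337d 100644
--- a/sys/kern/kern_cpuset.c
+++ b/sys/kern/kern_cpuset.c
@@ -1,2552 +1,2558 @@
/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2008, Jeffrey Roberson
 * All rights reserved.
 *
 * Copyright (c) 2008 Nokia Corporation
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.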
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif /* DDB */ /* * cpusets provide a mechanism for creating and manipulating sets of * processors for the purpose of constraining the scheduling of threads to * specific processors. * * Each process belongs to an identified set, by default this is set 1. Each * thread may further restrict the cpus it may run on to a subset of this * named set. This creates an anonymous set which other threads and processes * may not join by number. * * The named set is referred to herein as the 'base' set to avoid ambiguity. * This set is usually a child of a 'root' set while the anonymous set may * simply be referred to as a mask. In the syscall api these are referred to * as the ROOT, CPUSET, and MASK levels where CPUSET is called 'base' here. * * Threads inherit their set from their creator whether it be anonymous or * not. This means that anonymous sets are immutable because they may be * shared. To modify an anonymous set a new set is created with the desired * mask and the same parent as the existing anonymous set. This gives the * illusion of each thread having a private mask. * * Via the syscall apis a user may ask to retrieve or modify the root, base, * or mask that is discovered via a pid, tid, or setid. Modifying a set * modifies all numbered and anonymous child sets to comply with the new mask. * Modifying a pid or tid's mask applies only to that tid but must still * exist within the assigned parent set. * * A thread may not be assigned to a group separate from other threads in * the process. This is to remove ambiguity when the setid is queried with * a pid argument. There is no other technical limitation. * * This somewhat complex arrangement is intended to make it easy for * applications to query available processors and bind their threads to * specific processors while also allowing administrators to dynamically * reprovision by changing sets which apply to groups of processes. * * A simple application should not concern itself with sets at all and * rather apply masks to its own threads via CPU_WHICH_TID and a -1 id * meaning 'curthread'. It may query available cpus for that tid with a * getaffinity call using (CPU_LEVEL_CPUSET, CPU_WHICH_PID, -1, ...). 
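As the comment above suggests, a simple application can query the CPUs available to its base set without touching setids at all. A minimal sketch (error handling trimmed):

#include <sys/param.h>
#include <sys/cpuset.h>
#include <stdio.h>

int
main(void)
{
        cpuset_t mask;
        int i;

        /* CPUs in this process's base (CPUSET-level) set; -1 = curproc. */
        if (cpuset_getaffinity(CPU_LEVEL_CPUSET, CPU_WHICH_PID, -1,
            sizeof(mask), &mask) != 0)
                return (1);
        for (i = 0; i < CPU_SETSIZE; i++)
                if (CPU_ISSET(i, &mask))
                        printf("cpu %d available\n", i);
        return (0);
}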
*/ LIST_HEAD(domainlist, domainset); struct domainset __read_mostly domainset_firsttouch; struct domainset __read_mostly domainset_fixed[MAXMEMDOM]; struct domainset __read_mostly domainset_interleave; struct domainset __read_mostly domainset_prefer[MAXMEMDOM]; struct domainset __read_mostly domainset_roundrobin; static uma_zone_t cpuset_zone; static uma_zone_t domainset_zone; static struct mtx cpuset_lock; static struct setlist cpuset_ids; static struct domainlist cpuset_domains; static struct unrhdr *cpuset_unr; static struct cpuset *cpuset_zero, *cpuset_default, *cpuset_kernel; static struct domainset *domainset0, *domainset2; +u_int cpusetsizemin = 1; /* Return the size of cpuset_t at the kernel level */ SYSCTL_INT(_kern_sched, OID_AUTO, cpusetsize, CTLFLAG_RD | CTLFLAG_CAPRD, SYSCTL_NULL_INT_PTR, sizeof(cpuset_t), "sizeof(cpuset_t)"); +/* Return the minimum size of cpuset_t allowed by the kernel */ +SYSCTL_UINT(_kern_sched, OID_AUTO, cpusetsizemin, + CTLFLAG_RD | CTLFLAG_CAPRD, &cpusetsizemin, 0, + "The minimum size of cpuset_t allowed by the kernel"); + cpuset_t *cpuset_root; cpuset_t cpuset_domain[MAXMEMDOM]; static int domainset_valid(const struct domainset *, const struct domainset *); /* * Find the first non-anonymous set starting from 'set'. */ static struct cpuset * cpuset_getbase(struct cpuset *set) { if (set->cs_id == CPUSET_INVALID) set = set->cs_parent; return (set); } /* * Walks up the tree from 'set' to find the root. */ static struct cpuset * cpuset_getroot(struct cpuset *set) { while ((set->cs_flags & CPU_SET_ROOT) == 0 && set->cs_parent != NULL) set = set->cs_parent; return (set); } /* * Acquire a reference to a cpuset, all pointers must be tracked with refs. */ struct cpuset * cpuset_ref(struct cpuset *set) { refcount_acquire(&set->cs_ref); return (set); } /* * Walks up the tree from 'set' to find the root. Returns the root * referenced. */ static struct cpuset * cpuset_refroot(struct cpuset *set) { return (cpuset_ref(cpuset_getroot(set))); } /* * Find the first non-anonymous set starting from 'set'. Returns this set * referenced. May return the passed in set with an extra ref if it is * not anonymous. */ static struct cpuset * cpuset_refbase(struct cpuset *set) { return (cpuset_ref(cpuset_getbase(set))); } /* * Release a reference in a context where it is safe to allocate. */ void cpuset_rel(struct cpuset *set) { cpusetid_t id; if (refcount_release_if_not_last(&set->cs_ref)) return; mtx_lock_spin(&cpuset_lock); if (!refcount_release(&set->cs_ref)) { mtx_unlock_spin(&cpuset_lock); return; } LIST_REMOVE(set, cs_siblings); id = set->cs_id; if (id != CPUSET_INVALID) LIST_REMOVE(set, cs_link); mtx_unlock_spin(&cpuset_lock); cpuset_rel(set->cs_parent); uma_zfree(cpuset_zone, set); if (id != CPUSET_INVALID) free_unr(cpuset_unr, id); } /* * Deferred release must be used when in a context that is not safe to * allocate/free. This places any unreferenced sets on the list 'head'. */ static void cpuset_rel_defer(struct setlist *head, struct cpuset *set) { if (refcount_release_if_not_last(&set->cs_ref)) return; mtx_lock_spin(&cpuset_lock); if (!refcount_release(&set->cs_ref)) { mtx_unlock_spin(&cpuset_lock); return; } LIST_REMOVE(set, cs_siblings); if (set->cs_id != CPUSET_INVALID) LIST_REMOVE(set, cs_link); LIST_INSERT_HEAD(head, set, cs_link); mtx_unlock_spin(&cpuset_lock); } /* * Complete a deferred release. Removes the set from the list provided to * cpuset_rel_defer. 
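cpuset_rel() above uses a common two-stage release: refcount_release_if_not_last() drops the reference without touching the spin lock in the common case, and only a potentially final release takes cpuset_lock and re-tests, closing the race with a concurrent cpuset_lookup() that could re-reference the set. A self-contained userland analogue using C11 atomics (the lock calls are placeholders and the lookup structure is elided):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdlib.h>

struct obj {
        atomic_uint refs;
        /* ... payload, plus linkage in some locked lookup structure ... */
};

/* Drop a reference on the fast path only if we are provably not last. */
static bool
release_if_not_last(atomic_uint *refs)
{
        unsigned old = atomic_load(refs);

        while (old > 1)
                if (atomic_compare_exchange_weak(refs, &old, old - 1))
                        return (true);
        return (false);         /* might be the last reference */
}

static void
obj_rel(struct obj *o)
{
        if (release_if_not_last(&o->refs))
                return;
        /* lock(lookup_lock);  -- stands in for cpuset_lock */
        if (atomic_fetch_sub(&o->refs, 1) != 1) {
                /* A concurrent lookup re-referenced it; nothing to do. */
                /* unlock(lookup_lock); */
                return;
        }
        /* Truly last: unlink from the lookup structure, then free. */
        /* unlock(lookup_lock); */
        free(o);
}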
*/ static void cpuset_rel_complete(struct cpuset *set) { cpusetid_t id; id = set->cs_id; LIST_REMOVE(set, cs_link); cpuset_rel(set->cs_parent); uma_zfree(cpuset_zone, set); if (id != CPUSET_INVALID) free_unr(cpuset_unr, id); } /* * Find a set based on an id. Returns it with a ref. */ static struct cpuset * cpuset_lookup(cpusetid_t setid, struct thread *td) { struct cpuset *set; if (setid == CPUSET_INVALID) return (NULL); mtx_lock_spin(&cpuset_lock); LIST_FOREACH(set, &cpuset_ids, cs_link) if (set->cs_id == setid) break; if (set) cpuset_ref(set); mtx_unlock_spin(&cpuset_lock); KASSERT(td != NULL, ("[%s:%d] td is NULL", __func__, __LINE__)); if (set != NULL && jailed(td->td_ucred)) { struct cpuset *jset, *tset; jset = td->td_ucred->cr_prison->pr_cpuset; for (tset = set; tset != NULL; tset = tset->cs_parent) if (tset == jset) break; if (tset == NULL) { cpuset_rel(set); set = NULL; } } return (set); } /* * Initialize a set in the space provided in 'set' with the provided parameters. * The set is returned with a single ref. May return EDEADLK if the set * will have no valid cpu based on restrictions from the parent. */ static int cpuset_init(struct cpuset *set, struct cpuset *parent, const cpuset_t *mask, struct domainset *domain, cpusetid_t id) { if (domain == NULL) domain = parent->cs_domain; if (mask == NULL) mask = &parent->cs_mask; if (!CPU_OVERLAP(&parent->cs_mask, mask)) return (EDEADLK); /* The domain must be prepared ahead of time. */ if (!domainset_valid(parent->cs_domain, domain)) return (EDEADLK); CPU_COPY(mask, &set->cs_mask); LIST_INIT(&set->cs_children); refcount_init(&set->cs_ref, 1); set->cs_flags = 0; mtx_lock_spin(&cpuset_lock); set->cs_domain = domain; CPU_AND(&set->cs_mask, &set->cs_mask, &parent->cs_mask); set->cs_id = id; set->cs_parent = cpuset_ref(parent); LIST_INSERT_HEAD(&parent->cs_children, set, cs_siblings); if (set->cs_id != CPUSET_INVALID) LIST_INSERT_HEAD(&cpuset_ids, set, cs_link); mtx_unlock_spin(&cpuset_lock); return (0); } /* * Create a new non-anonymous set with the requested parent and mask. May * return failures if the mask is invalid or a new number can not be * allocated. * * If *setp is not NULL, then it will be used as-is. The caller must take * into account that *setp will be inserted at the head of cpuset_ids and * plan any potentially conflicting cs_link usage accordingly. 
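cpuset_init() above enforces the invariant that a child must leave at least one runnable CPU: a mask that does not overlap the parent fails with EDEADLK, and the stored mask is clamped with CPU_AND against the parent. A userland sketch of the same rule, using the three-argument CPU_AND as in this tree (the helper name child_mask is hypothetical):

#include <sys/param.h>
#include <sys/cpuset.h>
#include <errno.h>
#include <stdio.h>

static int
child_mask(const cpuset_t *parent, cpuset_t *child)
{
        cpuset_t tmp;

        CPU_AND(&tmp, parent, child);   /* clamp the child to the parent */
        if (CPU_EMPTY(&tmp))
                return (EDEADLK);       /* no runnable CPU would remain */
        CPU_COPY(&tmp, child);
        return (0);
}

int
main(void)
{
        cpuset_t parent, child;

        CPU_ZERO(&parent);
        CPU_SET(0, &parent);
        CPU_SET(1, &parent);
        CPU_ZERO(&child);
        CPU_SET(1, &child);
        CPU_SET(5, &child);             /* CPU 5 is outside the parent */
        printf("%d\n", child_mask(&parent, &child));    /* 0; child = {1} */
        return (0);
}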
*/ static int cpuset_create(struct cpuset **setp, struct cpuset *parent, const cpuset_t *mask) { struct cpuset *set; cpusetid_t id; int error; bool dofree; id = alloc_unr(cpuset_unr); if (id == -1) return (ENFILE); dofree = (*setp == NULL); if (*setp != NULL) set = *setp; else *setp = set = uma_zalloc(cpuset_zone, M_WAITOK | M_ZERO); error = cpuset_init(set, parent, mask, NULL, id); if (error == 0) return (0); free_unr(cpuset_unr, id); if (dofree) uma_zfree(cpuset_zone, set); return (error); } static void cpuset_freelist_add(struct setlist *list, int count) { struct cpuset *set; int i; for (i = 0; i < count; i++) { set = uma_zalloc(cpuset_zone, M_ZERO | M_WAITOK); LIST_INSERT_HEAD(list, set, cs_link); } } static void cpuset_freelist_init(struct setlist *list, int count) { LIST_INIT(list); cpuset_freelist_add(list, count); } static void cpuset_freelist_free(struct setlist *list) { struct cpuset *set; while ((set = LIST_FIRST(list)) != NULL) { LIST_REMOVE(set, cs_link); uma_zfree(cpuset_zone, set); } } static void domainset_freelist_add(struct domainlist *list, int count) { struct domainset *set; int i; for (i = 0; i < count; i++) { set = uma_zalloc(domainset_zone, M_ZERO | M_WAITOK); LIST_INSERT_HEAD(list, set, ds_link); } } static void domainset_freelist_init(struct domainlist *list, int count) { LIST_INIT(list); domainset_freelist_add(list, count); } static void domainset_freelist_free(struct domainlist *list) { struct domainset *set; while ((set = LIST_FIRST(list)) != NULL) { LIST_REMOVE(set, ds_link); uma_zfree(domainset_zone, set); } } /* Copy a domainset preserving mask and policy. */ static void domainset_copy(const struct domainset *from, struct domainset *to) { DOMAINSET_COPY(&from->ds_mask, &to->ds_mask); to->ds_policy = from->ds_policy; to->ds_prefer = from->ds_prefer; } /* Return 1 if mask and policy are equal, otherwise 0. */ static int domainset_equal(const struct domainset *one, const struct domainset *two) { return (DOMAINSET_CMP(&one->ds_mask, &two->ds_mask) == 0 && one->ds_policy == two->ds_policy && one->ds_prefer == two->ds_prefer); } /* Return 1 if child is a valid subset of parent. */ static int domainset_valid(const struct domainset *parent, const struct domainset *child) { if (child->ds_policy != DOMAINSET_POLICY_PREFER) return (DOMAINSET_SUBSET(&parent->ds_mask, &child->ds_mask)); return (DOMAINSET_ISSET(child->ds_prefer, &parent->ds_mask)); } static int domainset_restrict(const struct domainset *parent, const struct domainset *child) { if (child->ds_policy != DOMAINSET_POLICY_PREFER) return (DOMAINSET_OVERLAP(&parent->ds_mask, &child->ds_mask)); return (DOMAINSET_ISSET(child->ds_prefer, &parent->ds_mask)); } /* * Lookup or create a domainset. The key is provided in ds_mask and * ds_policy. If the domainset does not yet exist the storage in * 'domain' is used to insert. Otherwise this storage is freed to the * domainset_zone and the existing domainset is returned. 
*/ static struct domainset * _domainset_create(struct domainset *domain, struct domainlist *freelist) { struct domainset *ndomain; int i, j; KASSERT(domain->ds_cnt <= vm_ndomains, ("invalid domain count in domainset %p", domain)); KASSERT(domain->ds_policy != DOMAINSET_POLICY_PREFER || domain->ds_prefer < vm_ndomains, ("invalid preferred domain in domains %p", domain)); mtx_lock_spin(&cpuset_lock); LIST_FOREACH(ndomain, &cpuset_domains, ds_link) if (domainset_equal(ndomain, domain)) break; /* * If the domain does not yet exist we insert it and initialize * various iteration helpers which are not part of the key. */ if (ndomain == NULL) { LIST_INSERT_HEAD(&cpuset_domains, domain, ds_link); domain->ds_cnt = DOMAINSET_COUNT(&domain->ds_mask); for (i = 0, j = 0; i < DOMAINSET_FLS(&domain->ds_mask); i++) if (DOMAINSET_ISSET(i, &domain->ds_mask)) domain->ds_order[j++] = i; } mtx_unlock_spin(&cpuset_lock); if (ndomain == NULL) return (domain); if (freelist != NULL) LIST_INSERT_HEAD(freelist, domain, ds_link); else uma_zfree(domainset_zone, domain); return (ndomain); } /* * Are any of the domains in the mask empty? If so, silently * remove them and update the domainset accordingly. If only empty * domains are present, we must return failure. */ static bool domainset_empty_vm(struct domainset *domain) { domainset_t empty; int i, j; DOMAINSET_ZERO(&empty); for (i = 0; i < vm_ndomains; i++) if (VM_DOMAIN_EMPTY(i)) DOMAINSET_SET(i, &empty); if (DOMAINSET_SUBSET(&empty, &domain->ds_mask)) return (true); /* Remove empty domains from the set and recompute. */ DOMAINSET_ANDNOT(&domain->ds_mask, &empty); domain->ds_cnt = DOMAINSET_COUNT(&domain->ds_mask); for (i = j = 0; i < DOMAINSET_FLS(&domain->ds_mask); i++) if (DOMAINSET_ISSET(i, &domain->ds_mask)) domain->ds_order[j++] = i; /* Convert a PREFER policy referencing an empty domain to RR. */ if (domain->ds_policy == DOMAINSET_POLICY_PREFER && DOMAINSET_ISSET(domain->ds_prefer, &empty)) { domain->ds_policy = DOMAINSET_POLICY_ROUNDROBIN; domain->ds_prefer = -1; } return (false); } /* * Create or lookup a domainset based on the key held in 'domain'. */ struct domainset * domainset_create(const struct domainset *domain) { struct domainset *ndomain; /* * Validate the policy. It must specify a useable policy number with * only valid domains. Preferred must include the preferred domain * in the mask. */ if (domain->ds_policy <= DOMAINSET_POLICY_INVALID || domain->ds_policy > DOMAINSET_POLICY_MAX) return (NULL); if (domain->ds_policy == DOMAINSET_POLICY_PREFER && !DOMAINSET_ISSET(domain->ds_prefer, &domain->ds_mask)) return (NULL); if (!DOMAINSET_SUBSET(&domainset0->ds_mask, &domain->ds_mask)) return (NULL); ndomain = uma_zalloc(domainset_zone, M_WAITOK | M_ZERO); domainset_copy(domain, ndomain); return _domainset_create(ndomain, NULL); } /* * Update thread domainset pointers. */ static void domainset_notify(void) { struct thread *td; struct proc *p; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); if (p->p_state == PRS_NEW) { PROC_UNLOCK(p); continue; } FOREACH_THREAD_IN_PROC(p, td) { thread_lock(td); td->td_domain.dr_policy = td->td_cpuset->cs_domain; thread_unlock(td); } PROC_UNLOCK(p); } sx_sunlock(&allproc_lock); kernel_object->domain.dr_policy = cpuset_kernel->cs_domain; } /* * Create a new set that is a subset of a parent. 
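_domainset_create() above is a lookup-or-insert (interning) pattern: the caller always passes freshly filled storage, and either that storage is published in the global list or it is released and the pre-existing equal set is returned, so equal domainsets stay pointer-comparable. A generic sketch with a toy key type (locking elided):

#include <stdlib.h>
#include <sys/queue.h>

struct key {
        int policy;
        int prefer;
        LIST_ENTRY(key) link;
};

static LIST_HEAD(, key) interned = LIST_HEAD_INITIALIZER(interned);

/*
 * Lookup-or-insert, as _domainset_create() does: the caller's storage is
 * either inserted or released in favor of the existing equal entry.
 */
static struct key *
key_intern(struct key *k)
{
        struct key *it;

        LIST_FOREACH(it, &interned, link)
                if (it->policy == k->policy && it->prefer == k->prefer) {
                        free(k);        /* duplicate: reuse the existing one */
                        return (it);
                }
        LIST_INSERT_HEAD(&interned, k, link);
        return (k);
}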
*/ static struct domainset * domainset_shadow(const struct domainset *pdomain, const struct domainset *domain, struct domainlist *freelist) { struct domainset *ndomain; ndomain = LIST_FIRST(freelist); LIST_REMOVE(ndomain, ds_link); /* * Initialize the key from the request. */ domainset_copy(domain, ndomain); /* * Restrict the key by the parent. */ DOMAINSET_AND(&ndomain->ds_mask, &pdomain->ds_mask); return _domainset_create(ndomain, freelist); } /* * Recursively check for errors that would occur from applying mask to * the tree of sets starting at 'set'. Checks for sets that would become * empty as well as RDONLY flags. */ static int cpuset_testupdate(struct cpuset *set, cpuset_t *mask, int augment_mask) { struct cpuset *nset; cpuset_t newmask; int error; mtx_assert(&cpuset_lock, MA_OWNED); if (set->cs_flags & CPU_SET_RDONLY) return (EPERM); if (augment_mask) { CPU_AND(&newmask, &set->cs_mask, mask); } else CPU_COPY(mask, &newmask); if (CPU_EMPTY(&newmask)) return (EDEADLK); error = 0; LIST_FOREACH(nset, &set->cs_children, cs_siblings) if ((error = cpuset_testupdate(nset, &newmask, 1)) != 0) break; return (error); } /* * Applies the mask 'mask' without checking for empty sets or permissions. */ static void cpuset_update(struct cpuset *set, cpuset_t *mask) { struct cpuset *nset; mtx_assert(&cpuset_lock, MA_OWNED); CPU_AND(&set->cs_mask, &set->cs_mask, mask); LIST_FOREACH(nset, &set->cs_children, cs_siblings) cpuset_update(nset, &set->cs_mask); return; } /* * Modify the set 'set' to use a copy of the mask provided. Apply this new * mask to restrict all children in the tree. Checks for validity before * applying the changes. */ static int cpuset_modify(struct cpuset *set, cpuset_t *mask) { struct cpuset *root; int error; error = priv_check(curthread, PRIV_SCHED_CPUSET); if (error) return (error); /* * In case we are called from within the jail, * we do not allow modifying the dedicated root * cpuset of the jail but may still allow to * change child sets, including subordinate jails' * roots. */ if ((set->cs_flags & CPU_SET_ROOT) != 0 && jailed(curthread->td_ucred) && set == curthread->td_ucred->cr_prison->pr_cpuset) return (EPERM); /* * Verify that we have access to this set of * cpus. */ if ((set->cs_flags & (CPU_SET_ROOT | CPU_SET_RDONLY)) == CPU_SET_ROOT) { KASSERT(set->cs_parent != NULL, ("jail.cpuset=%d is not a proper child of parent jail's root.", set->cs_id)); /* * cpuset_getroot() cannot work here due to how top-level jail * roots are constructed. Top-level jails are parented to * thread0's cpuset (i.e. cpuset 1) rather than the system root. */ root = set->cs_parent; } else { root = cpuset_getroot(set); } mtx_lock_spin(&cpuset_lock); if (root && !CPU_SUBSET(&root->cs_mask, mask)) { error = EINVAL; goto out; } error = cpuset_testupdate(set, mask, 0); if (error) goto out; CPU_COPY(mask, &set->cs_mask); cpuset_update(set, mask); out: mtx_unlock_spin(&cpuset_lock); return (error); } /* * Recursively check for errors that would occur from applying mask to * the tree of sets starting at 'set'. Checks for sets that would become * empty as well as RDONLY flags. 
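cpuset_modify() above is deliberately two-phase: cpuset_testupdate() recurses over the whole subtree as a side-effect-free dry run, rejecting read-only sets and masks that would empty a descendant, and only when that succeeds does cpuset_update() commit, so the tree can never be left half-updated. A sketch of the same two-phase recursion on a toy tree, returning the same EPERM/EDEADLK errors:

#include <errno.h>
#include <stddef.h>

struct node {
        unsigned mask;
        struct node *child[2];  /* toy binary tree instead of a LIST */
        int rdonly;
};

/* Phase 1: would applying 'mask' empty any set in the subtree? */
static int
test_update(const struct node *n, unsigned mask)
{
        unsigned newmask;
        int i, error;

        if (n == NULL)
                return (0);
        if (n->rdonly)
                return (EPERM);
        newmask = n->mask & mask;
        if (newmask == 0)
                return (EDEADLK);
        for (i = 0; i < 2; i++)
                if ((error = test_update(n->child[i], newmask)) != 0)
                        return (error);
        return (0);
}

/* Phase 2: commit, knowing phase 1 cannot fail halfway down. */
static void
apply_update(struct node *n, unsigned mask)
{
        int i;

        if (n == NULL)
                return;
        n->mask &= mask;
        for (i = 0; i < 2; i++)
                apply_update(n->child[i], n->mask);
}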
*/ static int cpuset_testupdate_domain(struct cpuset *set, struct domainset *dset, struct domainset *orig, int *count, int augment_mask __unused) { struct cpuset *nset; struct domainset *domain; struct domainset newset; int error; mtx_assert(&cpuset_lock, MA_OWNED); if (set->cs_flags & CPU_SET_RDONLY) return (EPERM); domain = set->cs_domain; domainset_copy(domain, &newset); if (!domainset_equal(domain, orig)) { if (!domainset_restrict(domain, dset)) return (EDEADLK); DOMAINSET_AND(&newset.ds_mask, &dset->ds_mask); /* Count the number of domains that are changing. */ (*count)++; } error = 0; LIST_FOREACH(nset, &set->cs_children, cs_siblings) if ((error = cpuset_testupdate_domain(nset, &newset, domain, count, 1)) != 0) break; return (error); } /* * Applies the mask 'mask' without checking for empty sets or permissions. */ static void cpuset_update_domain(struct cpuset *set, struct domainset *domain, struct domainset *orig, struct domainlist *domains) { struct cpuset *nset; mtx_assert(&cpuset_lock, MA_OWNED); /* * If this domainset has changed from the parent we must calculate * a new set. Otherwise it simply inherits from the parent. When * we inherit from the parent we get a new mask and policy. If the * set is modified from the parent we keep the policy and only * update the mask. */ if (set->cs_domain != orig) { orig = set->cs_domain; set->cs_domain = domainset_shadow(domain, orig, domains); } else set->cs_domain = domain; LIST_FOREACH(nset, &set->cs_children, cs_siblings) cpuset_update_domain(nset, set->cs_domain, orig, domains); return; } /* * Modify the set 'set' to use a copy the domainset provided. Apply this new * mask to restrict all children in the tree. Checks for validity before * applying the changes. */ static int cpuset_modify_domain(struct cpuset *set, struct domainset *domain) { struct domainlist domains; struct domainset temp; struct domainset *dset; struct cpuset *root; int ndomains, needed; int error; error = priv_check(curthread, PRIV_SCHED_CPUSET); if (error) return (error); /* * In case we are called from within the jail * we do not allow modifying the dedicated root * cpuset of the jail but may still allow to * change child sets. */ if (jailed(curthread->td_ucred) && set->cs_flags & CPU_SET_ROOT) return (EPERM); domainset_freelist_init(&domains, 0); domain = domainset_create(domain); ndomains = 0; mtx_lock_spin(&cpuset_lock); for (;;) { root = cpuset_getroot(set); dset = root->cs_domain; /* * Verify that we have access to this set of domains. */ if (!domainset_valid(dset, domain)) { error = EINVAL; goto out; } /* * If applying prefer we keep the current set as the fallback. */ if (domain->ds_policy == DOMAINSET_POLICY_PREFER) DOMAINSET_COPY(&set->cs_domain->ds_mask, &domain->ds_mask); /* * Determine whether we can apply this set of domains and * how many new domain structures it will require. */ domainset_copy(domain, &temp); needed = 0; error = cpuset_testupdate_domain(set, &temp, set->cs_domain, &needed, 0); if (error) goto out; if (ndomains >= needed) break; /* Dropping the lock; we'll need to re-evaluate again. */ mtx_unlock_spin(&cpuset_lock); domainset_freelist_add(&domains, needed - ndomains); ndomains = needed; mtx_lock_spin(&cpuset_lock); } dset = set->cs_domain; cpuset_update_domain(set, domain, dset, &domains); out: mtx_unlock_spin(&cpuset_lock); domainset_freelist_free(&domains); if (error == 0) domainset_notify(); return (error); } /* * Resolve the 'which' parameter of several cpuset apis. 
* * For WHICH_PID and WHICH_TID return a locked proc and valid proc/tid. Also * checks for permission via p_cansched(). * * For WHICH_SET returns a valid set with a new reference. * * -1 may be supplied for any argument to mean the current proc/thread or * the base set of the current thread. May fail with ESRCH/EPERM. */ int cpuset_which(cpuwhich_t which, id_t id, struct proc **pp, struct thread **tdp, struct cpuset **setp) { struct cpuset *set; struct thread *td; struct proc *p; int error; *pp = p = NULL; *tdp = td = NULL; *setp = set = NULL; switch (which) { case CPU_WHICH_PID: if (id == -1) { PROC_LOCK(curproc); p = curproc; break; } if ((p = pfind(id)) == NULL) return (ESRCH); break; case CPU_WHICH_TID: if (id == -1) { PROC_LOCK(curproc); p = curproc; td = curthread; break; } td = tdfind(id, -1); if (td == NULL) return (ESRCH); p = td->td_proc; break; case CPU_WHICH_CPUSET: if (id == -1) { thread_lock(curthread); set = cpuset_refbase(curthread->td_cpuset); thread_unlock(curthread); } else set = cpuset_lookup(id, curthread); if (set) { *setp = set; return (0); } return (ESRCH); case CPU_WHICH_JAIL: { /* Find `set' for prison with given id. */ struct prison *pr; sx_slock(&allprison_lock); pr = prison_find_child(curthread->td_ucred->cr_prison, id); sx_sunlock(&allprison_lock); if (pr == NULL) return (ESRCH); cpuset_ref(pr->pr_cpuset); *setp = pr->pr_cpuset; mtx_unlock(&pr->pr_mtx); return (0); } case CPU_WHICH_IRQ: case CPU_WHICH_DOMAIN: return (0); default: return (EINVAL); } error = p_cansched(curthread, p); if (error) { PROC_UNLOCK(p); return (error); } if (td == NULL) td = FIRST_THREAD_IN_PROC(p); *pp = p; *tdp = td; return (0); } static int cpuset_testshadow(struct cpuset *set, const cpuset_t *mask, const struct domainset *domain) { struct cpuset *parent; struct domainset *dset; parent = cpuset_getbase(set); /* * If we are restricting a cpu mask it must be a subset of the * parent or invalid CPUs have been specified. */ if (mask != NULL && !CPU_SUBSET(&parent->cs_mask, mask)) return (EINVAL); /* * If we are restricting a domain mask it must be a subset of the * parent or invalid domains have been specified. */ dset = parent->cs_domain; if (domain != NULL && !domainset_valid(dset, domain)) return (EINVAL); return (0); } /* * Create an anonymous set with the provided mask in the space provided by * 'nset'. If the passed in set is anonymous we use its parent otherwise * the new set is a child of 'set'. 
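The id == -1 convention resolved here by cpuset_which() is what lets a thread operate on itself without knowing its tid. For example, pinning the calling thread to CPU 0 from userland (a minimal sketch):

#include <sys/param.h>
#include <sys/cpuset.h>
#include <stdio.h>

int
main(void)
{
        cpuset_t mask;

        CPU_ZERO(&mask);
        CPU_SET(0, &mask);
        /* id -1 resolves to the calling thread for CPU_WHICH_TID. */
        if (cpuset_setaffinity(CPU_LEVEL_WHICH, CPU_WHICH_TID, -1,
            sizeof(mask), &mask) != 0) {
                perror("cpuset_setaffinity");
                return (1);
        }
        return (0);
}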
*/ static int cpuset_shadow(struct cpuset *set, struct cpuset **nsetp, const cpuset_t *mask, const struct domainset *domain, struct setlist *cpusets, struct domainlist *domains) { struct cpuset *parent; struct cpuset *nset; struct domainset *dset; struct domainset *d; int error; error = cpuset_testshadow(set, mask, domain); if (error) return (error); parent = cpuset_getbase(set); dset = parent->cs_domain; if (mask == NULL) mask = &set->cs_mask; if (domain != NULL) d = domainset_shadow(dset, domain, domains); else d = set->cs_domain; nset = LIST_FIRST(cpusets); error = cpuset_init(nset, parent, mask, d, CPUSET_INVALID); if (error == 0) { LIST_REMOVE(nset, cs_link); *nsetp = nset; } return (error); } static struct cpuset * cpuset_update_thread(struct thread *td, struct cpuset *nset) { struct cpuset *tdset; tdset = td->td_cpuset; td->td_cpuset = nset; td->td_domain.dr_policy = nset->cs_domain; sched_affinity(td); return (tdset); } static int cpuset_setproc_test_maskthread(struct cpuset *tdset, cpuset_t *mask, struct domainset *domain) { struct cpuset *parent; parent = cpuset_getbase(tdset); if (mask == NULL) mask = &tdset->cs_mask; if (domain == NULL) domain = tdset->cs_domain; return cpuset_testshadow(parent, mask, domain); } static int cpuset_setproc_maskthread(struct cpuset *tdset, cpuset_t *mask, struct domainset *domain, struct cpuset **nsetp, struct setlist *freelist, struct domainlist *domainlist) { struct cpuset *parent; parent = cpuset_getbase(tdset); if (mask == NULL) mask = &tdset->cs_mask; if (domain == NULL) domain = tdset->cs_domain; return cpuset_shadow(parent, nsetp, mask, domain, freelist, domainlist); } static int cpuset_setproc_setthread_mask(struct cpuset *tdset, struct cpuset *set, cpuset_t *mask, struct domainset *domain) { struct cpuset *parent; parent = cpuset_getbase(tdset); /* * If the thread restricted its mask then apply that same * restriction to the new set, otherwise take it wholesale. */ if (CPU_CMP(&tdset->cs_mask, &parent->cs_mask) != 0) { CPU_AND(mask, &tdset->cs_mask, &set->cs_mask); } else CPU_COPY(&set->cs_mask, mask); /* * If the thread restricted the domain then we apply the * restriction to the new set but retain the policy. */ if (tdset->cs_domain != parent->cs_domain) { domainset_copy(tdset->cs_domain, domain); DOMAINSET_AND(&domain->ds_mask, &set->cs_domain->ds_mask); } else domainset_copy(set->cs_domain, domain); if (CPU_EMPTY(mask) || DOMAINSET_EMPTY(&domain->ds_mask)) return (EDEADLK); return (0); } static int cpuset_setproc_test_setthread(struct cpuset *tdset, struct cpuset *set) { struct domainset domain; cpuset_t mask; if (tdset->cs_id != CPUSET_INVALID) return (0); return cpuset_setproc_setthread_mask(tdset, set, &mask, &domain); } static int cpuset_setproc_setthread(struct cpuset *tdset, struct cpuset *set, struct cpuset **nsetp, struct setlist *freelist, struct domainlist *domainlist) { struct domainset domain; cpuset_t mask; int error; /* * If we're replacing on a thread that has not constrained the * original set we can simply accept the new set. 
*/ if (tdset->cs_id != CPUSET_INVALID) { *nsetp = cpuset_ref(set); return (0); } error = cpuset_setproc_setthread_mask(tdset, set, &mask, &domain); if (error) return (error); return cpuset_shadow(set, nsetp, &mask, &domain, freelist, domainlist); } static int cpuset_setproc_newbase(struct thread *td, struct cpuset *set, struct cpuset *nroot, struct cpuset **nsetp, struct setlist *cpusets, struct domainlist *domainlist) { struct domainset ndomain; cpuset_t nmask; struct cpuset *pbase; int error; pbase = cpuset_getbase(td->td_cpuset); /* Copy process mask, then further apply the new root mask. */ CPU_AND(&nmask, &pbase->cs_mask, &nroot->cs_mask); domainset_copy(pbase->cs_domain, &ndomain); DOMAINSET_AND(&ndomain.ds_mask, &set->cs_domain->ds_mask); /* Policy is too restrictive, will not work. */ if (CPU_EMPTY(&nmask) || DOMAINSET_EMPTY(&ndomain.ds_mask)) return (EDEADLK); /* * Remove pbase from the freelist in advance, it'll be pushed to * cpuset_ids on success. We assume here that cpuset_create() will not * touch pbase on failure, and we just enqueue it back to the freelist * to remain in a consistent state. */ pbase = LIST_FIRST(cpusets); LIST_REMOVE(pbase, cs_link); error = cpuset_create(&pbase, set, &nmask); if (error != 0) { LIST_INSERT_HEAD(cpusets, pbase, cs_link); return (error); } /* Duplicates some work from above... oh well. */ pbase->cs_domain = domainset_shadow(set->cs_domain, &ndomain, domainlist); *nsetp = pbase; return (0); } /* * Handle four cases for updating an entire process. * * 1) Set is non-null and the process is not rebasing onto a new root. This * reparents all anonymous sets to the provided set and replaces all * non-anonymous td_cpusets with the provided set. * 2) Set is non-null and the process is rebasing onto a new root. This * creates a new base set if the process previously had its own base set, * then reparents all anonymous sets either to that set or the provided set * if one was not created. Non-anonymous sets are similarly replaced. * 3) Mask is non-null. This replaces or creates anonymous sets for every * thread with the existing base as a parent. * 4) domain is non-null. This creates anonymous sets for every thread * and replaces the domain set. * * This is overly complicated because we can't allocate while holding a * spinlock and spinlocks must be held while changing and examining thread * state. */ static int cpuset_setproc(pid_t pid, struct cpuset *set, cpuset_t *mask, struct domainset *domain, bool rebase) { struct setlist freelist; struct setlist droplist; struct domainlist domainlist; struct cpuset *base, *nset, *nroot, *tdroot; struct thread *td; struct proc *p; int needed; int nfree; int error; /* * The algorithm requires two passes due to locking considerations. * * 1) Lookup the process and acquire the locks in the required order. * 2) If enough cpusets have not been allocated release the locks and * allocate them. Loop. 
*/ cpuset_freelist_init(&freelist, 1); domainset_freelist_init(&domainlist, 1); nfree = 1; LIST_INIT(&droplist); nfree = 0; base = set; nroot = NULL; if (set != NULL) nroot = cpuset_getroot(set); for (;;) { error = cpuset_which(CPU_WHICH_PID, pid, &p, &td, &nset); if (error) goto out; tdroot = cpuset_getroot(td->td_cpuset); needed = p->p_numthreads; if (set != NULL && rebase && tdroot != nroot) needed++; if (nfree >= needed) break; PROC_UNLOCK(p); if (nfree < needed) { cpuset_freelist_add(&freelist, needed - nfree); domainset_freelist_add(&domainlist, needed - nfree); nfree = needed; } } PROC_LOCK_ASSERT(p, MA_OWNED); /* * If we're changing roots and the root set is what has been specified * as the parent, then we'll check if the process was previously using * the root set and, if it wasn't, create a new base with the process's * mask applied to it. * * If the new root is incompatible with the existing mask, then we allow * the process to take on the new root if and only if they have * privilege to widen their mask anyways. Unprivileged processes get * rejected with EDEADLK. */ if (set != NULL && rebase && nroot != tdroot) { cpusetid_t base_id, root_id; root_id = td->td_ucred->cr_prison->pr_cpuset->cs_id; base_id = cpuset_getbase(td->td_cpuset)->cs_id; if (base_id != root_id) { error = cpuset_setproc_newbase(td, set, nroot, &base, &freelist, &domainlist); if (error == EDEADLK && priv_check(td, PRIV_SCHED_CPUSET) == 0) error = 0; if (error != 0) goto unlock_out; } } /* * Now that the appropriate locks are held and we have enough cpusets, * make sure the operation will succeed before applying changes. The * proc lock prevents td_cpuset from changing between calls. */ error = 0; FOREACH_THREAD_IN_PROC(p, td) { thread_lock(td); if (set != NULL) error = cpuset_setproc_test_setthread(td->td_cpuset, base); else error = cpuset_setproc_test_maskthread(td->td_cpuset, mask, domain); thread_unlock(td); if (error) goto unlock_out; } /* * Replace each thread's cpuset while using deferred release. We * must do this because the thread lock must be held while operating * on the thread and this limits the type of operations allowed. 
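*/

The retry loop above is the allocate-while-unlocked idiom the function comment describes: count what is needed under the proc lock, and if the freelists are short, unlock, allocate, and re-evaluate, since the thread count may have changed in the meantime. A stripped-down sketch (lock_all(), unlock_all() and items_needed() are hypothetical stand-ins for the proc lock and p->p_numthreads):

#include <stdlib.h>

extern void lock_all(void);             /* placeholder for PROC_LOCK */
extern void unlock_all(void);           /* placeholder for PROC_UNLOCK */
extern int items_needed(void);          /* e.g. p->p_numthreads */

struct item { struct item *next; };

static struct item *freelist;
static int nfree;

static void
prealloc_loop(void)
{
        int needed;

        for (;;) {
                lock_all();
                needed = items_needed();
                if (nfree >= needed)
                        break;          /* enough memory; stay locked */
                unlock_all();
                while (nfree < needed) {        /* allocate while unlocked */
                        struct item *it = malloc(sizeof(*it));

                        if (it == NULL)
                                abort();        /* kernel uses M_WAITOK instead */
                        it->next = freelist;
                        freelist = it;
                        nfree++;
                }
        }
        /* ... perform the update under the lock, consuming the freelist ... */
        unlock_all();
}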
*/ FOREACH_THREAD_IN_PROC(p, td) { thread_lock(td); if (set != NULL) error = cpuset_setproc_setthread(td->td_cpuset, base, &nset, &freelist, &domainlist); else error = cpuset_setproc_maskthread(td->td_cpuset, mask, domain, &nset, &freelist, &domainlist); if (error) { thread_unlock(td); break; } cpuset_rel_defer(&droplist, cpuset_update_thread(td, nset)); thread_unlock(td); } unlock_out: PROC_UNLOCK(p); out: if (base != NULL && base != set) cpuset_rel(base); while ((nset = LIST_FIRST(&droplist)) != NULL) cpuset_rel_complete(nset); cpuset_freelist_free(&freelist); domainset_freelist_free(&domainlist); return (error); } static int bitset_strprint(char *buf, size_t bufsiz, const struct bitset *set, int setlen) { size_t bytes; int i, once; char *p; once = 0; p = buf; for (i = 0; i < __bitset_words(setlen); i++) { if (once != 0) { if (bufsiz < 1) return (0); *p = ','; p++; bufsiz--; } else once = 1; if (bufsiz < sizeof(__STRING(ULONG_MAX))) return (0); bytes = snprintf(p, bufsiz, "%lx", set->__bits[i]); p += bytes; bufsiz -= bytes; } return (p - buf); } static int bitset_strscan(struct bitset *set, int setlen, const char *buf) { int i, ret; const char *p; BIT_ZERO(setlen, set); p = buf; for (i = 0; i < __bitset_words(setlen); i++) { if (*p == ',') { p++; continue; } ret = sscanf(p, "%lx", &set->__bits[i]); if (ret == 0 || ret == -1) break; while (isxdigit(*p)) p++; } return (p - buf); } /* * Return a string representing a valid layout for a cpuset_t object. * It expects an incoming buffer at least sized as CPUSETBUFSIZ. */ char * cpusetobj_strprint(char *buf, const cpuset_t *set) { bitset_strprint(buf, CPUSETBUFSIZ, (const struct bitset *)set, CPU_SETSIZE); return (buf); } /* * Build a valid cpuset_t object from a string representation. * It expects an incoming buffer at least sized as CPUSETBUFSIZ. */ int cpusetobj_strscan(cpuset_t *set, const char *buf) { char p; if (strlen(buf) > CPUSETBUFSIZ - 1) return (-1); p = buf[bitset_strscan((struct bitset *)set, CPU_SETSIZE, buf)]; if (p != '\0') return (-1); return (0); } /* * Handle a domainset specifier in the sysctl tree. A poiner to a pointer to * a domainset is in arg1. If the user specifies a valid domainset the * pointer is updated. * * Format is: * hex mask word 0,hex mask word 1,...:decimal policy:decimal preferred */ int sysctl_handle_domainset(SYSCTL_HANDLER_ARGS) { char buf[DOMAINSETBUFSIZ]; struct domainset *dset; struct domainset key; int policy, prefer, error; char *p; dset = *(struct domainset **)arg1; error = 0; if (dset != NULL) { p = buf + bitset_strprint(buf, DOMAINSETBUFSIZ, (const struct bitset *)&dset->ds_mask, DOMAINSET_SETSIZE); sprintf(p, ":%d:%d", dset->ds_policy, dset->ds_prefer); } else sprintf(buf, ""); error = sysctl_handle_string(oidp, buf, sizeof(buf), req); if (error != 0 || req->newptr == NULL) return (error); /* * Read in and validate the string. */ memset(&key, 0, sizeof(key)); p = &buf[bitset_strscan((struct bitset *)&key.ds_mask, DOMAINSET_SETSIZE, buf)]; if (p == buf) return (EINVAL); if (sscanf(p, ":%d:%d", &policy, &prefer) != 2) return (EINVAL); key.ds_policy = policy; key.ds_prefer = prefer; /* Domainset_create() validates the policy.*/ dset = domainset_create(&key); if (dset == NULL) return (EINVAL); *(struct domainset **)arg1 = dset; return (error); } /* * Apply an anonymous mask or a domain to a single thread. 
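bitset_strprint() and bitset_strscan() above fix the textual layout used by sysctl_handle_domainset(): comma-separated hexadecimal words, word 0 first. A standalone round-trip of the same layout, with the set width fixed at two words for brevity:

#include <stdio.h>
#include <string.h>

#define NWORDS  2

/* Encode like bitset_strprint(): comma-separated hex words, word 0 first. */
static void
words_print(char *buf, size_t bufsiz, const unsigned long *w)
{
        size_t off = 0;
        int i;

        for (i = 0; i < NWORDS; i++)
                off += snprintf(buf + off, bufsiz - off,
                    i ? ",%lx" : "%lx", w[i]);
}

/* Decode like bitset_strscan(): one hex word per comma-separated field. */
static void
words_scan(unsigned long *w, const char *buf)
{
        int i;

        for (i = 0; i < NWORDS; i++) {
                if (sscanf(buf, "%lx", &w[i]) != 1)
                        break;
                buf = strchr(buf, ',');
                if (buf == NULL)
                        break;
                buf++;
        }
}

int
main(void)
{
        unsigned long in[NWORDS] = { 0xffUL, 0x1UL }, out[NWORDS] = { 0, 0 };
        char buf[64];

        words_print(buf, sizeof(buf), in);      /* yields "ff,1" */
        words_scan(out, buf);
        printf("%s -> %lx,%lx\n", buf, out[0], out[1]);
        return (0);
}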
*/ static int _cpuset_setthread(lwpid_t id, cpuset_t *mask, struct domainset *domain) { struct setlist cpusets; struct domainlist domainlist; struct cpuset *nset; struct cpuset *set; struct thread *td; struct proc *p; int error; cpuset_freelist_init(&cpusets, 1); domainset_freelist_init(&domainlist, domain != NULL); error = cpuset_which(CPU_WHICH_TID, id, &p, &td, &set); if (error) goto out; set = NULL; thread_lock(td); error = cpuset_shadow(td->td_cpuset, &nset, mask, domain, &cpusets, &domainlist); if (error == 0) set = cpuset_update_thread(td, nset); thread_unlock(td); PROC_UNLOCK(p); if (set) cpuset_rel(set); out: cpuset_freelist_free(&cpusets); domainset_freelist_free(&domainlist); return (error); } /* * Apply an anonymous mask to a single thread. */ int cpuset_setthread(lwpid_t id, cpuset_t *mask) { return _cpuset_setthread(id, mask, NULL); } /* * Apply new cpumask to the ithread. */ int cpuset_setithread(lwpid_t id, int cpu) { cpuset_t mask; CPU_ZERO(&mask); if (cpu == NOCPU) CPU_COPY(cpuset_root, &mask); else CPU_SET(cpu, &mask); return _cpuset_setthread(id, &mask, NULL); } /* * Initialize static domainsets after NUMA information is available. This is * called before memory allocators are initialized. */ void domainset_init(void) { struct domainset *dset; int i; dset = &domainset_firsttouch; DOMAINSET_COPY(&all_domains, &dset->ds_mask); dset->ds_policy = DOMAINSET_POLICY_FIRSTTOUCH; dset->ds_prefer = -1; _domainset_create(dset, NULL); dset = &domainset_interleave; DOMAINSET_COPY(&all_domains, &dset->ds_mask); dset->ds_policy = DOMAINSET_POLICY_INTERLEAVE; dset->ds_prefer = -1; _domainset_create(dset, NULL); dset = &domainset_roundrobin; DOMAINSET_COPY(&all_domains, &dset->ds_mask); dset->ds_policy = DOMAINSET_POLICY_ROUNDROBIN; dset->ds_prefer = -1; _domainset_create(dset, NULL); for (i = 0; i < vm_ndomains; i++) { dset = &domainset_fixed[i]; DOMAINSET_ZERO(&dset->ds_mask); DOMAINSET_SET(i, &dset->ds_mask); dset->ds_policy = DOMAINSET_POLICY_ROUNDROBIN; _domainset_create(dset, NULL); dset = &domainset_prefer[i]; DOMAINSET_COPY(&all_domains, &dset->ds_mask); dset->ds_policy = DOMAINSET_POLICY_PREFER; dset->ds_prefer = i; _domainset_create(dset, NULL); } } /* * Define the domainsets for cpuset 0, 1 and cpuset 2. */ void domainset_zero(void) { struct domainset *dset, *tmp; mtx_init(&cpuset_lock, "cpuset", NULL, MTX_SPIN | MTX_RECURSE); domainset0 = &domainset_firsttouch; curthread->td_domain.dr_policy = domainset0; domainset2 = &domainset_interleave; kernel_object->domain.dr_policy = domainset2; /* Remove empty domains from the global policies. */ LIST_FOREACH_SAFE(dset, &cpuset_domains, ds_link, tmp) if (domainset_empty_vm(dset)) LIST_REMOVE(dset, ds_link); } /* * Creates system-wide cpusets and the cpuset for thread0 including three * sets: * * 0 - The root set which should represent all valid processors in the * system. This set is immutable. * 1 - The default set which all processes are a member of until changed. * This allows an administrator to move all threads off of given cpus to * dedicate them to high priority tasks or save power etc. * 2 - The kernel set which allows restriction and policy to be applied only * to kernel threads and the kernel_object. 
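The static policy sets built in domainset_init() above (first-touch, round-robin, interleave, and the per-domain fixed and preferred sets) back the policies userland can request. A sketch, assuming the cpuset_setdomain(2) interface with the prototype from sys/cpuset.h (verify against your headers), that asks to prefer NUMA domain 0 for the calling thread:

#include <sys/param.h>
#include <sys/cpuset.h>
#include <sys/domainset.h>
#include <stdio.h>

int
main(void)
{
        domainset_t mask;

        DOMAINSET_ZERO(&mask);
        DOMAINSET_SET(0, &mask);
        /* Prefer domain 0 for this thread, falling back elsewhere. */
        if (cpuset_setdomain(CPU_LEVEL_WHICH, CPU_WHICH_TID, -1,
            sizeof(mask), &mask, DOMAINSET_POLICY_PREFER) != 0) {
                perror("cpuset_setdomain");
                return (1);
        }
        return (0);
}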
*/ struct cpuset * cpuset_thread0(void) { struct cpuset *set; int i; int error __unused; cpuset_zone = uma_zcreate("cpuset", sizeof(struct cpuset), NULL, NULL, NULL, NULL, UMA_ALIGN_CACHE, 0); domainset_zone = uma_zcreate("domainset", sizeof(struct domainset), NULL, NULL, NULL, NULL, UMA_ALIGN_CACHE, 0); /* * Create the root system set (0) for the whole machine. Doesn't use * cpuset_create() due to NULL parent. */ set = uma_zalloc(cpuset_zone, M_WAITOK | M_ZERO); CPU_COPY(&all_cpus, &set->cs_mask); LIST_INIT(&set->cs_children); LIST_INSERT_HEAD(&cpuset_ids, set, cs_link); refcount_init(&set->cs_ref, 1); set->cs_flags = CPU_SET_ROOT | CPU_SET_RDONLY; set->cs_domain = domainset0; cpuset_zero = set; cpuset_root = &set->cs_mask; /* * Now derive a default (1), modifiable set from that to give out. */ set = uma_zalloc(cpuset_zone, M_WAITOK | M_ZERO); error = cpuset_init(set, cpuset_zero, NULL, NULL, 1); KASSERT(error == 0, ("Error creating default set: %d\n", error)); cpuset_default = set; /* * Create the kernel set (2). */ set = uma_zalloc(cpuset_zone, M_WAITOK | M_ZERO); error = cpuset_init(set, cpuset_zero, NULL, NULL, 2); KASSERT(error == 0, ("Error creating kernel set: %d\n", error)); set->cs_domain = domainset2; cpuset_kernel = set; /* * Initialize the unit allocator. 0, 1 and 2 are allocated above. */ cpuset_unr = new_unrhdr(3, INT_MAX, NULL); /* * If MD code has not initialized per-domain cpusets, place all * CPUs in domain 0. */ for (i = 0; i < MAXMEMDOM; i++) if (!CPU_EMPTY(&cpuset_domain[i])) goto domains_set; CPU_COPY(&all_cpus, &cpuset_domain[0]); domains_set: return (cpuset_default); } void cpuset_kernthread(struct thread *td) { struct cpuset *set; thread_lock(td); set = td->td_cpuset; td->td_cpuset = cpuset_ref(cpuset_kernel); thread_unlock(td); cpuset_rel(set); } /* * Create a cpuset, which would be cpuset_create() but * mark the new 'set' as root. * * We are not going to reparent the td to it. Use cpuset_setproc_update_set() * for that. * * In case of no error, returns the set in *setp locked with a reference. */ int cpuset_create_root(struct prison *pr, struct cpuset **setp) { struct cpuset *set; int error; KASSERT(pr != NULL, ("[%s:%d] invalid pr", __func__, __LINE__)); KASSERT(setp != NULL, ("[%s:%d] invalid setp", __func__, __LINE__)); set = NULL; error = cpuset_create(&set, pr->pr_cpuset, &pr->pr_cpuset->cs_mask); if (error) return (error); KASSERT(set != NULL, ("[%s:%d] cpuset_create returned invalid data", __func__, __LINE__)); /* Mark the set as root. */ set->cs_flags |= CPU_SET_ROOT; *setp = set; return (0); } int cpuset_setproc_update_set(struct proc *p, struct cpuset *set) { int error; KASSERT(p != NULL, ("[%s:%d] invalid proc", __func__, __LINE__)); KASSERT(set != NULL, ("[%s:%d] invalid set", __func__, __LINE__)); cpuset_ref(set); error = cpuset_setproc(p->p_pid, set, NULL, NULL, true); if (error) return (error); cpuset_rel(set); return (0); } /* * In Capability mode, the only accesses that are permitted are to the current * thread and process' CPU and domain sets.
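 *
 * For example (illustrative), in capability mode
 *
 *	cpuset_getaffinity(CPU_LEVEL_WHICH, CPU_WHICH_TID, -1,
 *	    sizeof(mask), &mask);
 *
 * succeeds for the calling thread, while any CPU_WHICH_CPUSET or
 * CPU_WHICH_JAIL request fails with ECAPMODE.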
*/ static int cpuset_check_capabilities(struct thread *td, cpulevel_t level, cpuwhich_t which, id_t id) { if (IN_CAPABILITY_MODE(td)) { if (level != CPU_LEVEL_WHICH) return (ECAPMODE); if (which != CPU_WHICH_TID && which != CPU_WHICH_PID) return (ECAPMODE); if (id != -1 && !(which == CPU_WHICH_TID && id == td->td_tid) && !(which == CPU_WHICH_PID && id == td->td_proc->p_pid)) return (ECAPMODE); } return (0); } #if defined(__powerpc__) /* * TODO: At least powerpc64 and powerpc64le kernels panic with * exception 0x480 (instruction segment exception) when copyin/copyout * are set as function pointers in the cpuset_copy_cb struct and called by * an external module (like pfsync). Tip: copyin/copyout have an ifunc * resolver function. * * Bisect of LLVM shows that the behavior changed on LLVM 10.0 with * https://reviews.llvm.org/rGdc06b0bc9ad055d06535462d91bfc2a744b2f589 * * This is a hack/workaround while the problem is being discussed with the * LLVM community. */ static int cpuset_copyin(const void *uaddr, void *kaddr, size_t len) { return (copyin(uaddr, kaddr, len)); } static int cpuset_copyout(const void *kaddr, void *uaddr, size_t len) { return (copyout(kaddr, uaddr, len)); } static const struct cpuset_copy_cb copy_set = { .cpuset_copyin = cpuset_copyin, .cpuset_copyout = cpuset_copyout }; #else static const struct cpuset_copy_cb copy_set = { .cpuset_copyin = copyin, .cpuset_copyout = copyout }; #endif #ifndef _SYS_SYSPROTO_H_ struct cpuset_args { cpusetid_t *setid; }; #endif int sys_cpuset(struct thread *td, struct cpuset_args *uap) { struct cpuset *root; struct cpuset *set; int error; thread_lock(td); root = cpuset_refroot(td->td_cpuset); thread_unlock(td); set = NULL; error = cpuset_create(&set, root, &root->cs_mask); cpuset_rel(root); if (error) return (error); error = copyout(&set->cs_id, uap->setid, sizeof(set->cs_id)); if (error == 0) error = cpuset_setproc(-1, set, NULL, NULL, false); cpuset_rel(set); return (error); } #ifndef _SYS_SYSPROTO_H_ struct cpuset_setid_args { cpuwhich_t which; id_t id; cpusetid_t setid; }; #endif int sys_cpuset_setid(struct thread *td, struct cpuset_setid_args *uap) { return (kern_cpuset_setid(td, uap->which, uap->id, uap->setid)); } int kern_cpuset_setid(struct thread *td, cpuwhich_t which, id_t id, cpusetid_t setid) { struct cpuset *set; int error; /* * Presently we only support per-process sets.
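 * A userland caller is therefore expected to pass CPU_WHICH_PID, e.g.
 * (illustrative) cpuset_setid(CPU_WHICH_PID, getpid(), setid); any other
 * 'which' value is rejected with EINVAL below.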
*/ if (which != CPU_WHICH_PID) return (EINVAL); set = cpuset_lookup(setid, td); if (set == NULL) return (ESRCH); error = cpuset_setproc(id, set, NULL, NULL, false); cpuset_rel(set); return (error); } #ifndef _SYS_SYSPROTO_H_ struct cpuset_getid_args { cpulevel_t level; cpuwhich_t which; id_t id; cpusetid_t *setid; }; #endif int sys_cpuset_getid(struct thread *td, struct cpuset_getid_args *uap) { return (kern_cpuset_getid(td, uap->level, uap->which, uap->id, uap->setid)); } int kern_cpuset_getid(struct thread *td, cpulevel_t level, cpuwhich_t which, id_t id, cpusetid_t *setid) { struct cpuset *nset; struct cpuset *set; struct thread *ttd; struct proc *p; cpusetid_t tmpid; int error; if (level == CPU_LEVEL_WHICH && which != CPU_WHICH_CPUSET) return (EINVAL); error = cpuset_which(which, id, &p, &ttd, &set); if (error) return (error); switch (which) { case CPU_WHICH_TID: case CPU_WHICH_PID: thread_lock(ttd); set = cpuset_refbase(ttd->td_cpuset); thread_unlock(ttd); PROC_UNLOCK(p); break; case CPU_WHICH_CPUSET: case CPU_WHICH_JAIL: break; case CPU_WHICH_IRQ: case CPU_WHICH_DOMAIN: return (EINVAL); } switch (level) { case CPU_LEVEL_ROOT: nset = cpuset_refroot(set); cpuset_rel(set); set = nset; break; case CPU_LEVEL_CPUSET: break; case CPU_LEVEL_WHICH: break; } tmpid = set->cs_id; cpuset_rel(set); if (error == 0) error = copyout(&tmpid, setid, sizeof(tmpid)); return (error); } #ifndef _SYS_SYSPROTO_H_ struct cpuset_getaffinity_args { cpulevel_t level; cpuwhich_t which; id_t id; size_t cpusetsize; cpuset_t *mask; }; #endif int sys_cpuset_getaffinity(struct thread *td, struct cpuset_getaffinity_args *uap) { return (user_cpuset_getaffinity(td, uap->level, uap->which, uap->id, uap->cpusetsize, uap->mask, &copy_set)); } int kern_cpuset_getaffinity(struct thread *td, cpulevel_t level, cpuwhich_t which, id_t id, size_t cpusetsize, cpuset_t *mask) { struct thread *ttd; struct cpuset *nset; struct cpuset *set; struct proc *p; int error; error = cpuset_check_capabilities(td, level, which, id); if (error != 0) return (error); error = cpuset_which(which, id, &p, &ttd, &set); if (error != 0) return (error); switch (level) { case CPU_LEVEL_ROOT: case CPU_LEVEL_CPUSET: switch (which) { case CPU_WHICH_TID: case CPU_WHICH_PID: thread_lock(ttd); set = cpuset_ref(ttd->td_cpuset); thread_unlock(ttd); break; case CPU_WHICH_CPUSET: case CPU_WHICH_JAIL: break; case CPU_WHICH_IRQ: case CPU_WHICH_INTRHANDLER: case CPU_WHICH_ITHREAD: case CPU_WHICH_DOMAIN: return (EINVAL); } if (level == CPU_LEVEL_ROOT) nset = cpuset_refroot(set); else nset = cpuset_refbase(set); CPU_COPY(&nset->cs_mask, mask); cpuset_rel(nset); break; case CPU_LEVEL_WHICH: switch (which) { case CPU_WHICH_TID: thread_lock(ttd); CPU_COPY(&ttd->td_cpuset->cs_mask, mask); thread_unlock(ttd); break; case CPU_WHICH_PID: FOREACH_THREAD_IN_PROC(p, ttd) { thread_lock(ttd); CPU_OR(mask, mask, &ttd->td_cpuset->cs_mask); thread_unlock(ttd); } break; case CPU_WHICH_CPUSET: case CPU_WHICH_JAIL: CPU_COPY(&set->cs_mask, mask); break; case CPU_WHICH_IRQ: case CPU_WHICH_INTRHANDLER: case CPU_WHICH_ITHREAD: error = intr_getaffinity(id, which, mask); break; case CPU_WHICH_DOMAIN: if (id < 0 || id >= MAXMEMDOM) error = ESRCH; else CPU_COPY(&cpuset_domain[id], mask); break; } break; default: error = EINVAL; break; } if (set) cpuset_rel(set); if (p) PROC_UNLOCK(p); if (error == 0) { if (cpusetsize < howmany(CPU_FLS(mask), NBBY)) return (ERANGE); #ifdef KTRACE if (KTRPOINT(td, KTR_STRUCT)) ktrcpuset(mask, cpusetsize); #endif } return (error); } int user_cpuset_getaffinity(struct
thread *td, cpulevel_t level, cpuwhich_t which, id_t id, size_t cpusetsize, cpuset_t *maskp, const struct cpuset_copy_cb *cb) { cpuset_t *mask; size_t size; int error; mask = malloc(sizeof(cpuset_t), M_TEMP, M_WAITOK | M_ZERO); size = min(cpusetsize, sizeof(cpuset_t)); error = kern_cpuset_getaffinity(td, level, which, id, size, mask); if (error == 0) { error = cb->cpuset_copyout(mask, maskp, size); if (error != 0) goto out; if (cpusetsize > size) { char *end; char *cp; int rv; end = cp = (char *)&maskp->__bits; end += cpusetsize; cp += size; while (cp != end) { rv = subyte(cp, 0); if (rv == -1) { error = EFAULT; goto out; } cp++; } } } out: free(mask, M_TEMP); return (error); } #ifndef _SYS_SYSPROTO_H_ struct cpuset_setaffinity_args { cpulevel_t level; cpuwhich_t which; id_t id; size_t cpusetsize; const cpuset_t *mask; }; #endif int sys_cpuset_setaffinity(struct thread *td, struct cpuset_setaffinity_args *uap) { return (user_cpuset_setaffinity(td, uap->level, uap->which, uap->id, uap->cpusetsize, uap->mask, &copy_set)); } int kern_cpuset_setaffinity(struct thread *td, cpulevel_t level, cpuwhich_t which, id_t id, cpuset_t *mask) { struct cpuset *nset; struct cpuset *set; struct thread *ttd; struct proc *p; int error; #ifdef KTRACE if (KTRPOINT(td, KTR_STRUCT)) ktrcpuset(mask, sizeof(cpuset_t)); #endif error = cpuset_check_capabilities(td, level, which, id); if (error != 0) return (error); if (CPU_EMPTY(mask)) return (EDEADLK); switch (level) { case CPU_LEVEL_ROOT: case CPU_LEVEL_CPUSET: error = cpuset_which(which, id, &p, &ttd, &set); if (error) break; switch (which) { case CPU_WHICH_TID: case CPU_WHICH_PID: thread_lock(ttd); set = cpuset_ref(ttd->td_cpuset); thread_unlock(ttd); PROC_UNLOCK(p); break; case CPU_WHICH_CPUSET: case CPU_WHICH_JAIL: break; case CPU_WHICH_IRQ: case CPU_WHICH_INTRHANDLER: case CPU_WHICH_ITHREAD: case CPU_WHICH_DOMAIN: return (EINVAL); } if (level == CPU_LEVEL_ROOT) nset = cpuset_refroot(set); else nset = cpuset_refbase(set); error = cpuset_modify(nset, mask); cpuset_rel(nset); cpuset_rel(set); break; case CPU_LEVEL_WHICH: switch (which) { case CPU_WHICH_TID: error = cpuset_setthread(id, mask); break; case CPU_WHICH_PID: error = cpuset_setproc(id, NULL, mask, NULL, false); break; case CPU_WHICH_CPUSET: case CPU_WHICH_JAIL: error = cpuset_which(which, id, &p, &ttd, &set); if (error == 0) { error = cpuset_modify(set, mask); cpuset_rel(set); } break; case CPU_WHICH_IRQ: case CPU_WHICH_INTRHANDLER: case CPU_WHICH_ITHREAD: error = intr_setaffinity(id, which, mask); break; default: error = EINVAL; break; } break; default: error = EINVAL; break; } return (error); } int user_cpuset_setaffinity(struct thread *td, cpulevel_t level, cpuwhich_t which, id_t id, size_t cpusetsize, const cpuset_t *maskp, const struct cpuset_copy_cb *cb) { cpuset_t *mask; int error; size_t size; size = min(cpusetsize, sizeof(cpuset_t)); mask = malloc(sizeof(cpuset_t), M_TEMP, M_WAITOK | M_ZERO); error = cb->cpuset_copyin(maskp, mask, size); if (error) goto out; /* * Verify that no high bits are set.
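 * That is, when userland hands in a mask wider than the kernel's
 * cpuset_t, every byte past sizeof(cpuset_t) must be zero. E.g.
 * (illustrative) a 1024-bit user mask checked against a 256-bit kernel
 * set has its trailing 96 bytes probed with fubyte() one at a time.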
*/ if (cpusetsize > sizeof(cpuset_t)) { const char *end, *cp; int val; end = cp = (const char *)&maskp->__bits; end += cpusetsize; cp += sizeof(cpuset_t); while (cp != end) { val = fubyte(cp); if (val == -1) { error = EFAULT; goto out; } if (val != 0) { error = EINVAL; goto out; } cp++; } } error = kern_cpuset_setaffinity(td, level, which, id, mask); out: free(mask, M_TEMP); return (error); } #ifndef _SYS_SYSPROTO_H_ struct cpuset_getdomain_args { cpulevel_t level; cpuwhich_t which; id_t id; size_t domainsetsize; domainset_t *mask; int *policy; }; #endif int sys_cpuset_getdomain(struct thread *td, struct cpuset_getdomain_args *uap) { return (kern_cpuset_getdomain(td, uap->level, uap->which, uap->id, uap->domainsetsize, uap->mask, uap->policy, &copy_set)); } int kern_cpuset_getdomain(struct thread *td, cpulevel_t level, cpuwhich_t which, id_t id, size_t domainsetsize, domainset_t *maskp, int *policyp, const struct cpuset_copy_cb *cb) { struct domainset outset; struct thread *ttd; struct cpuset *nset; struct cpuset *set; struct domainset *dset; struct proc *p; domainset_t *mask; int error; if (domainsetsize < sizeof(domainset_t) || domainsetsize > DOMAINSET_MAXSIZE / NBBY) return (ERANGE); error = cpuset_check_capabilities(td, level, which, id); if (error != 0) return (error); mask = malloc(domainsetsize, M_TEMP, M_WAITOK | M_ZERO); bzero(&outset, sizeof(outset)); error = cpuset_which(which, id, &p, &ttd, &set); if (error) goto out; switch (level) { case CPU_LEVEL_ROOT: case CPU_LEVEL_CPUSET: switch (which) { case CPU_WHICH_TID: case CPU_WHICH_PID: thread_lock(ttd); set = cpuset_ref(ttd->td_cpuset); thread_unlock(ttd); break; case CPU_WHICH_CPUSET: case CPU_WHICH_JAIL: break; case CPU_WHICH_IRQ: case CPU_WHICH_INTRHANDLER: case CPU_WHICH_ITHREAD: case CPU_WHICH_DOMAIN: error = EINVAL; goto out; } if (level == CPU_LEVEL_ROOT) nset = cpuset_refroot(set); else nset = cpuset_refbase(set); domainset_copy(nset->cs_domain, &outset); cpuset_rel(nset); break; case CPU_LEVEL_WHICH: switch (which) { case CPU_WHICH_TID: thread_lock(ttd); domainset_copy(ttd->td_cpuset->cs_domain, &outset); thread_unlock(ttd); break; case CPU_WHICH_PID: FOREACH_THREAD_IN_PROC(p, ttd) { thread_lock(ttd); dset = ttd->td_cpuset->cs_domain; /* Show all domains in the proc. */ DOMAINSET_OR(&outset.ds_mask, &dset->ds_mask); /* Last policy wins. */ outset.ds_policy = dset->ds_policy; outset.ds_prefer = dset->ds_prefer; thread_unlock(ttd); } break; case CPU_WHICH_CPUSET: case CPU_WHICH_JAIL: domainset_copy(set->cs_domain, &outset); break; case CPU_WHICH_IRQ: case CPU_WHICH_INTRHANDLER: case CPU_WHICH_ITHREAD: case CPU_WHICH_DOMAIN: error = EINVAL; break; } break; default: error = EINVAL; break; } if (set) cpuset_rel(set); if (p) PROC_UNLOCK(p); /* * Translate prefer into a set containing only the preferred domain, * not the entire fallback set.
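 *
 * E.g. a PREFER policy with ds_prefer == 1 over the fallback domains
 * {0,1,2,3} is reported to userland as the mask {1}, mirroring what
 * kern_cpuset_setdomain() accepted when the policy was installed.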
*/ if (outset.ds_policy == DOMAINSET_POLICY_PREFER) { DOMAINSET_ZERO(&outset.ds_mask); DOMAINSET_SET(outset.ds_prefer, &outset.ds_mask); } DOMAINSET_COPY(&outset.ds_mask, mask); if (error == 0) error = cb->cpuset_copyout(mask, maskp, domainsetsize); if (error == 0) if (suword32(policyp, outset.ds_policy) != 0) error = EFAULT; out: free(mask, M_TEMP); return (error); } #ifndef _SYS_SYSPROTO_H_ struct cpuset_setdomain_args { cpulevel_t level; cpuwhich_t which; id_t id; size_t domainsetsize; domainset_t *mask; int policy; }; #endif int sys_cpuset_setdomain(struct thread *td, struct cpuset_setdomain_args *uap) { return (kern_cpuset_setdomain(td, uap->level, uap->which, uap->id, uap->domainsetsize, uap->mask, uap->policy, &copy_set)); } int kern_cpuset_setdomain(struct thread *td, cpulevel_t level, cpuwhich_t which, id_t id, size_t domainsetsize, const domainset_t *maskp, int policy, const struct cpuset_copy_cb *cb) { struct cpuset *nset; struct cpuset *set; struct thread *ttd; struct proc *p; struct domainset domain; domainset_t *mask; int error; if (domainsetsize < sizeof(domainset_t) || domainsetsize > DOMAINSET_MAXSIZE / NBBY) return (ERANGE); if (policy <= DOMAINSET_POLICY_INVALID || policy > DOMAINSET_POLICY_MAX) return (EINVAL); error = cpuset_check_capabilities(td, level, which, id); if (error != 0) return (error); memset(&domain, 0, sizeof(domain)); mask = malloc(domainsetsize, M_TEMP, M_WAITOK | M_ZERO); error = cb->cpuset_copyin(maskp, mask, domainsetsize); if (error) goto out; /* * Verify that no high bits are set. */ if (domainsetsize > sizeof(domainset_t)) { char *end; char *cp; end = cp = (char *)&mask->__bits; end += domainsetsize; cp += sizeof(domainset_t); while (cp != end) if (*cp++ != 0) { error = EINVAL; goto out; } } if (DOMAINSET_EMPTY(mask)) { error = EDEADLK; goto out; } DOMAINSET_COPY(mask, &domain.ds_mask); domain.ds_policy = policy; /* * Sanitize the provided mask. */ if (!DOMAINSET_SUBSET(&all_domains, &domain.ds_mask)) { error = EINVAL; goto out; } /* Translate preferred policy into a mask and fallback. */ if (policy == DOMAINSET_POLICY_PREFER) { /* Only support a single preferred domain. */ if (DOMAINSET_COUNT(&domain.ds_mask) != 1) { error = EINVAL; goto out; } domain.ds_prefer = DOMAINSET_FFS(&domain.ds_mask) - 1; /* This will be constrained by domainset_shadow(). */ DOMAINSET_COPY(&all_domains, &domain.ds_mask); } /* * When given an impossible policy, fall back to interleaving * across all domains.
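 *
 * "Impossible" means the mask is empty once domains without usable
 * memory are removed, e.g. (illustrative) a policy naming only a
 * CPU-only NUMA domain; domainset2 is the all-domains interleave
 * policy installed by domainset_zero() above.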
*/ if (domainset_empty_vm(&domain)) domainset_copy(domainset2, &domain); switch (level) { case CPU_LEVEL_ROOT: case CPU_LEVEL_CPUSET: error = cpuset_which(which, id, &p, &ttd, &set); if (error) break; switch (which) { case CPU_WHICH_TID: case CPU_WHICH_PID: thread_lock(ttd); set = cpuset_ref(ttd->td_cpuset); thread_unlock(ttd); PROC_UNLOCK(p); break; case CPU_WHICH_CPUSET: case CPU_WHICH_JAIL: break; case CPU_WHICH_IRQ: case CPU_WHICH_INTRHANDLER: case CPU_WHICH_ITHREAD: case CPU_WHICH_DOMAIN: error = EINVAL; goto out; } if (level == CPU_LEVEL_ROOT) nset = cpuset_refroot(set); else nset = cpuset_refbase(set); error = cpuset_modify_domain(nset, &domain); cpuset_rel(nset); cpuset_rel(set); break; case CPU_LEVEL_WHICH: switch (which) { case CPU_WHICH_TID: error = _cpuset_setthread(id, NULL, &domain); break; case CPU_WHICH_PID: error = cpuset_setproc(id, NULL, NULL, &domain, false); break; case CPU_WHICH_CPUSET: case CPU_WHICH_JAIL: error = cpuset_which(which, id, &p, &ttd, &set); if (error == 0) { error = cpuset_modify_domain(set, &domain); cpuset_rel(set); } break; case CPU_WHICH_IRQ: case CPU_WHICH_INTRHANDLER: case CPU_WHICH_ITHREAD: default: error = EINVAL; break; } break; default: error = EINVAL; break; } out: free(mask, M_TEMP); return (error); } #ifdef DDB static void ddb_display_bitset(const struct bitset *set, int size) { int bit, once; for (once = 0, bit = 0; bit < size; bit++) { if (CPU_ISSET(bit, set)) { if (once == 0) { db_printf("%d", bit); once = 1; } else db_printf(",%d", bit); } } if (once == 0) db_printf(""); } void ddb_display_cpuset(const cpuset_t *set) { ddb_display_bitset((const struct bitset *)set, CPU_SETSIZE); } static void ddb_display_domainset(const domainset_t *set) { ddb_display_bitset((const struct bitset *)set, DOMAINSET_SETSIZE); } DB_SHOW_COMMAND(cpusets, db_show_cpusets) { struct cpuset *set; LIST_FOREACH(set, &cpuset_ids, cs_link) { db_printf("set=%p id=%-6u ref=%-6d flags=0x%04x parent id=%d\n", set, set->cs_id, refcount_load(&set->cs_ref), set->cs_flags, (set->cs_parent != NULL) ? set->cs_parent->cs_id : 0); db_printf(" cpu mask="); ddb_display_cpuset(&set->cs_mask); db_printf("\n"); db_printf(" domain policy %d prefer %d mask=", set->cs_domain->ds_policy, set->cs_domain->ds_prefer); ddb_display_domainset(&set->cs_domain->ds_mask); db_printf("\n"); if (db_pager_quit) break; } } DB_SHOW_COMMAND(domainsets, db_show_domainsets) { struct domainset *set; LIST_FOREACH(set, &cpuset_domains, ds_link) { db_printf("set=%p policy %d prefer %d cnt %d\n", set, set->ds_policy, set->ds_prefer, set->ds_cnt); db_printf(" mask ="); ddb_display_domainset(&set->ds_mask); db_printf("\n"); } } #endif /* DDB */ diff --git a/sys/kern/subr_smp.c b/sys/kern/subr_smp.c index d66120666f9d..c83e05a4d87a 100644 --- a/sys/kern/subr_smp.c +++ b/sys/kern/subr_smp.c @@ -1,1342 +1,1344 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2001, John Baldwin . * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * This module holds the global variables and machine independent functions * used for the kernel SMP support. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "opt_sched.h" #ifdef SMP MALLOC_DEFINE(M_TOPO, "toponodes", "SMP topology data"); volatile cpuset_t stopped_cpus; volatile cpuset_t started_cpus; volatile cpuset_t suspended_cpus; cpuset_t hlt_cpus_mask; cpuset_t logical_cpus_mask; void (*cpustop_restartfunc)(void); #endif static int sysctl_kern_smp_active(SYSCTL_HANDLER_ARGS); /* This is used in modules that need to work in both SMP and UP. */ cpuset_t all_cpus; int mp_ncpus; /* export this for libkvm consumers. */ int mp_maxcpus = MAXCPU; volatile int smp_started; u_int mp_maxid; static SYSCTL_NODE(_kern, OID_AUTO, smp, CTLFLAG_RD | CTLFLAG_CAPRD | CTLFLAG_MPSAFE, NULL, "Kernel SMP"); SYSCTL_INT(_kern_smp, OID_AUTO, maxid, CTLFLAG_RD|CTLFLAG_CAPRD, &mp_maxid, 0, "Max CPU ID."); SYSCTL_INT(_kern_smp, OID_AUTO, maxcpus, CTLFLAG_RD|CTLFLAG_CAPRD, &mp_maxcpus, 0, "Max number of CPUs that the system was compiled for."); SYSCTL_PROC(_kern_smp, OID_AUTO, active, CTLFLAG_RD|CTLTYPE_INT|CTLFLAG_MPSAFE, NULL, 0, sysctl_kern_smp_active, "I", "Indicates system is running in SMP mode"); int smp_disabled = 0; /* has smp been disabled? */ SYSCTL_INT(_kern_smp, OID_AUTO, disabled, CTLFLAG_RDTUN|CTLFLAG_CAPRD, &smp_disabled, 0, "SMP has been disabled from the loader"); int smp_cpus = 1; /* how many CPUs are running */ SYSCTL_INT(_kern_smp, OID_AUTO, cpus, CTLFLAG_RD|CTLFLAG_CAPRD, &smp_cpus, 0, "Number of CPUs online"); int smp_threads_per_core = 1; /* how many SMT threads are running per core */ SYSCTL_INT(_kern_smp, OID_AUTO, threads_per_core, CTLFLAG_RD|CTLFLAG_CAPRD, &smp_threads_per_core, 0, "Number of SMT threads online per core"); int mp_ncores = -1; /* how many physical cores are running */ SYSCTL_INT(_kern_smp, OID_AUTO, cores, CTLFLAG_RD|CTLFLAG_CAPRD, &mp_ncores, 0, "Number of physical cores online"); int smp_topology = 0; /* Which topology we're using. */ SYSCTL_INT(_kern_smp, OID_AUTO, topology, CTLFLAG_RDTUN, &smp_topology, 0, "Topology override setting; 0 is default provided by hardware."); #ifdef SMP /* Enable forwarding of a signal to a process running on a different CPU */ static int forward_signal_enabled = 1; SYSCTL_INT(_kern_smp, OID_AUTO, forward_signal_enabled, CTLFLAG_RW, &forward_signal_enabled, 0, "Forwarding of a signal to a process on a different CPU"); /* Variables needed for SMP rendezvous.
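 * smp_rv_waiters[] counts CPUs through the four phases of a rendezvous:
 * [0] entry, [1] setup complete, [2] action complete, [3] fully done;
 * see smp_rendezvous_action() below.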
*/ static volatile int smp_rv_ncpus; static void (*volatile smp_rv_setup_func)(void *arg); static void (*volatile smp_rv_action_func)(void *arg); static void (*volatile smp_rv_teardown_func)(void *arg); static void *volatile smp_rv_func_arg; static volatile int smp_rv_waiters[4]; /* * Shared mutex to restrict busywaits between smp_rendezvous() and * smp(_targeted)_tlb_shootdown(). A deadlock occurs if both of these * functions trigger at once and cause multiple CPUs to busywait with * interrupts disabled. */ struct mtx smp_ipi_mtx; /* * Let the MD SMP code initialize mp_maxid very early if it can. */ static void mp_setmaxid(void *dummy) { cpu_mp_setmaxid(); KASSERT(mp_ncpus >= 1, ("%s: CPU count < 1", __func__)); KASSERT(mp_ncpus > 1 || mp_maxid == 0, ("%s: one CPU but mp_maxid is not zero", __func__)); KASSERT(mp_maxid >= mp_ncpus - 1, ("%s: counters out of sync: max %d, count %d", __func__, mp_maxid, mp_ncpus)); + + cpusetsizemin = howmany(mp_maxid + 1, NBBY); } SYSINIT(cpu_mp_setmaxid, SI_SUB_TUNABLES, SI_ORDER_FIRST, mp_setmaxid, NULL); /* * Call the MD SMP initialization code. */ static void mp_start(void *dummy) { mtx_init(&smp_ipi_mtx, "smp rendezvous", NULL, MTX_SPIN); /* Probe for MP hardware. */ if (smp_disabled != 0 || cpu_mp_probe() == 0) { mp_ncores = 1; mp_ncpus = 1; CPU_SETOF(PCPU_GET(cpuid), &all_cpus); return; } cpu_mp_start(); printf("FreeBSD/SMP: Multiprocessor System Detected: %d CPUs\n", mp_ncpus); /* Provide a default for most architectures that don't have SMT/HTT. */ if (mp_ncores < 0) mp_ncores = mp_ncpus; cpu_mp_announce(); } SYSINIT(cpu_mp, SI_SUB_CPU, SI_ORDER_THIRD, mp_start, NULL); void forward_signal(struct thread *td) { int id; /* * signotify() has already set TDF_ASTPENDING and TDF_NEEDSIGCHECK on * this thread, so all we need to do is poke it if it is currently * executing so that it executes ast(). */ THREAD_LOCK_ASSERT(td, MA_OWNED); KASSERT(TD_IS_RUNNING(td), ("forward_signal: thread is not TDS_RUNNING")); CTR1(KTR_SMP, "forward_signal(%p)", td->td_proc); if (!smp_started || cold || KERNEL_PANICKED()) return; if (!forward_signal_enabled) return; /* No need to IPI ourself. */ if (td == curthread) return; id = td->td_oncpu; if (id == NOCPU) return; ipi_cpu(id, IPI_AST); } /* * When called, the executing CPU will send an IPI to all other CPUs * requesting that they halt execution. * * Usually (but not necessarily) called with 'other_cpus' as its arg. * * - Signals all CPUs in map to stop. * - Waits for each to stop. * * Returns: * -1: error * 0: NA * 1: ok * */ #if defined(__amd64__) || defined(__i386__) #define X86 1 #else #define X86 0 #endif static int generic_stop_cpus(cpuset_t map, u_int type) { #ifdef KTR char cpusetbuf[CPUSETBUFSIZ]; #endif static volatile u_int stopping_cpu = NOCPU; int i; volatile cpuset_t *cpus; KASSERT( type == IPI_STOP || type == IPI_STOP_HARD #if X86 || type == IPI_SUSPEND #endif , ("%s: invalid stop type", __func__)); if (!smp_started) return (0); CTR2(KTR_SMP, "stop_cpus(%s) with %u type", cpusetobj_strprint(cpusetbuf, &map), type); #if X86 /* * When suspending, ensure there are no IPIs in progress. * IPIs that have been issued, but not yet delivered (e.g. * not pending on a vCPU when running under virtualization) * will be lost, violating FreeBSD's assumption of reliable * IPI delivery.
*/ if (type == IPI_SUSPEND) mtx_lock_spin(&smp_ipi_mtx); #endif #if X86 if (!nmi_is_broadcast || nmi_kdb_lock == 0) { #endif if (stopping_cpu != PCPU_GET(cpuid)) while (atomic_cmpset_int(&stopping_cpu, NOCPU, PCPU_GET(cpuid)) == 0) while (stopping_cpu != NOCPU) cpu_spinwait(); /* spin */ /* send the stop IPI to all CPUs in map */ ipi_selected(map, type); #if X86 } #endif #if X86 if (type == IPI_SUSPEND) cpus = &suspended_cpus; else #endif cpus = &stopped_cpus; i = 0; while (!CPU_SUBSET(cpus, &map)) { /* spin */ cpu_spinwait(); i++; if (i == 100000000) { printf("timeout stopping cpus\n"); break; } } #if X86 if (type == IPI_SUSPEND) mtx_unlock_spin(&smp_ipi_mtx); #endif stopping_cpu = NOCPU; return (1); } int stop_cpus(cpuset_t map) { return (generic_stop_cpus(map, IPI_STOP)); } int stop_cpus_hard(cpuset_t map) { return (generic_stop_cpus(map, IPI_STOP_HARD)); } #if X86 int suspend_cpus(cpuset_t map) { return (generic_stop_cpus(map, IPI_SUSPEND)); } #endif /* * Called by a CPU to restart stopped CPUs. * * Usually (but not necessarily) called with 'stopped_cpus' as its arg. * * - Signals all CPUs in map to restart. * - Waits for each to restart. * * Returns: * -1: error * 0: NA * 1: ok */ static int generic_restart_cpus(cpuset_t map, u_int type) { #ifdef KTR char cpusetbuf[CPUSETBUFSIZ]; #endif volatile cpuset_t *cpus; #if X86 KASSERT(type == IPI_STOP || type == IPI_STOP_HARD || type == IPI_SUSPEND, ("%s: invalid stop type", __func__)); if (!smp_started) return (0); CTR1(KTR_SMP, "restart_cpus(%s)", cpusetobj_strprint(cpusetbuf, &map)); if (type == IPI_SUSPEND) cpus = &resuming_cpus; else cpus = &stopped_cpus; /* signal other cpus to restart */ if (type == IPI_SUSPEND) CPU_COPY_STORE_REL(&map, &toresume_cpus); else CPU_COPY_STORE_REL(&map, &started_cpus); /* * Wake up any CPUs stopped with MWAIT. From MI code we can't tell if * MONITOR/MWAIT is enabled, but the potentially redundant writes are * relatively inexpensive. */ if (type == IPI_STOP) { struct monitorbuf *mb; u_int id; CPU_FOREACH(id) { if (!CPU_ISSET(id, &map)) continue; mb = &pcpu_find(id)->pc_monitorbuf; atomic_store_int(&mb->stop_state, MONITOR_STOPSTATE_RUNNING); } } if (!nmi_is_broadcast || nmi_kdb_lock == 0) { /* wait for each to clear its bit */ while (CPU_OVERLAP(cpus, &map)) cpu_spinwait(); } #else /* !X86 */ KASSERT(type == IPI_STOP || type == IPI_STOP_HARD, ("%s: invalid stop type", __func__)); if (!smp_started) return (0); CTR1(KTR_SMP, "restart_cpus(%s)", cpusetobj_strprint(cpusetbuf, &map)); cpus = &stopped_cpus; /* signal other cpus to restart */ CPU_COPY_STORE_REL(&map, &started_cpus); /* wait for each to clear its bit */ while (CPU_OVERLAP(cpus, &map)) cpu_spinwait(); #endif return (1); } int restart_cpus(cpuset_t map) { return (generic_restart_cpus(map, IPI_STOP)); } #if X86 int resume_cpus(cpuset_t map) { return (generic_restart_cpus(map, IPI_SUSPEND)); } #endif #undef X86 /* * All-CPU rendezvous. CPUs are signalled, all execute the setup function * (if specified), rendezvous, execute the action function (if specified), * rendezvous again, execute the teardown function (if specified), and then * resume. * * Note that the supplied external functions _must_ be reentrant and aware * that they are running in parallel and in an unknown lock context. */ void smp_rendezvous_action(void) { struct thread *td; void *local_func_arg; void (*local_setup_func)(void*); void (*local_action_func)(void*); void (*local_teardown_func)(void*); #ifdef INVARIANTS int owepreempt; #endif /* Ensure we have up-to-date values. 
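 * The acquire add below pairs with the atomic_store_rel_int() that
 * resets smp_rv_waiters[0] in smp_rendezvous_cpus(), so the parameter
 * loads that follow observe the initiator's stores.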
*/ atomic_add_acq_int(&smp_rv_waiters[0], 1); while (smp_rv_waiters[0] < smp_rv_ncpus) cpu_spinwait(); /* Fetch rendezvous parameters after acquire barrier. */ local_func_arg = smp_rv_func_arg; local_setup_func = smp_rv_setup_func; local_action_func = smp_rv_action_func; local_teardown_func = smp_rv_teardown_func; /* * Use a nested critical section to prevent any preemptions * from occurring during a rendezvous action routine. * Specifically, if a rendezvous handler is invoked via an IPI * and the interrupted thread was in the critical_exit() * function after setting td_critnest to 0 but before * performing a deferred preemption, this routine can be * invoked with td_critnest set to 0 and td_owepreempt true. * In that case, a critical_exit() during the rendezvous * action would trigger a preemption which is not permitted in * a rendezvous action. To fix this, wrap all of the * rendezvous action handlers in a critical section. We * cannot use a regular critical section however as having * critical_exit() preempt from this routine would also be * problematic (the preemption must not occur before the IPI * has been acknowledged via an EOI). Instead, we * intentionally ignore td_owepreempt when leaving the * critical section. This should be harmless because we do * not permit rendezvous action routines to schedule threads, * and thus td_owepreempt should never transition from 0 to 1 * during this routine. */ td = curthread; td->td_critnest++; #ifdef INVARIANTS owepreempt = td->td_owepreempt; #endif /* * If requested, run a setup function before the main action * function. Ensure all CPUs have completed the setup * function before moving on to the action function. */ if (local_setup_func != smp_no_rendezvous_barrier) { if (local_setup_func != NULL) local_setup_func(local_func_arg); atomic_add_int(&smp_rv_waiters[1], 1); while (smp_rv_waiters[1] < smp_rv_ncpus) cpu_spinwait(); } if (local_action_func != NULL) local_action_func(local_func_arg); if (local_teardown_func != smp_no_rendezvous_barrier) { /* * Signal that the main action has been completed. If a * full exit rendezvous is requested, then all CPUs will * wait here until all CPUs have finished the main action. */ atomic_add_int(&smp_rv_waiters[2], 1); while (smp_rv_waiters[2] < smp_rv_ncpus) cpu_spinwait(); if (local_teardown_func != NULL) local_teardown_func(local_func_arg); } /* * Signal that the rendezvous is fully completed by this CPU. * This means that no member of smp_rv_* pseudo-structure will be * accessed by this target CPU after this point; in particular, * memory pointed by smp_rv_func_arg. * * The release semantic ensures that all accesses performed by * the current CPU are visible when smp_rendezvous_cpus() * returns, by synchronizing with the * atomic_load_acq_int(&smp_rv_waiters[3]). */ atomic_add_rel_int(&smp_rv_waiters[3], 1); td->td_critnest--; KASSERT(owepreempt == td->td_owepreempt, ("rendezvous action changed td_owepreempt")); } void smp_rendezvous_cpus(cpuset_t map, void (* setup_func)(void *), void (* action_func)(void *), void (* teardown_func)(void *), void *arg) { int curcpumap, i, ncpus = 0; /* See comments in the !SMP case. */ if (!smp_started) { spinlock_enter(); if (setup_func != NULL) setup_func(arg); if (action_func != NULL) action_func(arg); if (teardown_func != NULL) teardown_func(arg); spinlock_exit(); return; } /* * Make sure we come here with interrupts enabled. Otherwise we * livelock if smp_ipi_mtx is owned by a thread which sent us an IPI. 
*/ MPASS(curthread->td_md.md_spinlock_count == 0); CPU_FOREACH(i) { if (CPU_ISSET(i, &map)) ncpus++; } if (ncpus == 0) panic("ncpus is 0 with non-zero map"); mtx_lock_spin(&smp_ipi_mtx); /* Pass rendezvous parameters via global variables. */ smp_rv_ncpus = ncpus; smp_rv_setup_func = setup_func; smp_rv_action_func = action_func; smp_rv_teardown_func = teardown_func; smp_rv_func_arg = arg; smp_rv_waiters[1] = 0; smp_rv_waiters[2] = 0; smp_rv_waiters[3] = 0; atomic_store_rel_int(&smp_rv_waiters[0], 0); /* * Signal other processors, which will enter the IPI with * interrupts off. */ curcpumap = CPU_ISSET(curcpu, &map); CPU_CLR(curcpu, &map); ipi_selected(map, IPI_RENDEZVOUS); /* Check if the current CPU is in the map */ if (curcpumap != 0) smp_rendezvous_action(); /* * Ensure that the master CPU waits for all the other * CPUs to finish the rendezvous, so that smp_rv_* * pseudo-structure and the arg are guaranteed to not * be in use. * * Load acquire synchronizes with the release add in * smp_rendezvous_action(), which ensures that our caller sees * all memory actions done by the called functions on other * CPUs. */ while (atomic_load_acq_int(&smp_rv_waiters[3]) < ncpus) cpu_spinwait(); mtx_unlock_spin(&smp_ipi_mtx); } void smp_rendezvous(void (* setup_func)(void *), void (* action_func)(void *), void (* teardown_func)(void *), void *arg) { smp_rendezvous_cpus(all_cpus, setup_func, action_func, teardown_func, arg); } static struct cpu_group group[MAXCPU * MAX_CACHE_LEVELS + 1]; static void smp_topo_fill(struct cpu_group *cg) { int c; for (c = 0; c < cg->cg_children; c++) smp_topo_fill(&cg->cg_child[c]); cg->cg_first = CPU_FFS(&cg->cg_mask) - 1; cg->cg_last = CPU_FLS(&cg->cg_mask) - 1; } struct cpu_group * smp_topo(void) { char cpusetbuf[CPUSETBUFSIZ], cpusetbuf2[CPUSETBUFSIZ]; struct cpu_group *top; /* * Check for a fake topology request for debugging purposes. */ switch (smp_topology) { case 1: /* Dual core with no sharing. */ top = smp_topo_1level(CG_SHARE_NONE, 2, 0); break; case 2: /* No topology, all cpus are equal. */ top = smp_topo_none(); break; case 3: /* Dual core with shared L2. */ top = smp_topo_1level(CG_SHARE_L2, 2, 0); break; case 4: /* quad core, shared l3 among each package, private l2. */ top = smp_topo_1level(CG_SHARE_L3, 4, 0); break; case 5: /* quad core, 2 dualcore parts on each package share l2. */ top = smp_topo_2level(CG_SHARE_NONE, 2, CG_SHARE_L2, 2, 0); break; case 6: /* Single-core 2xHTT */ top = smp_topo_1level(CG_SHARE_L1, 2, CG_FLAG_HTT); break; case 7: /* quad core with a shared l3, 8 threads sharing L2. */ top = smp_topo_2level(CG_SHARE_L3, 4, CG_SHARE_L2, 8, CG_FLAG_SMT); break; default: /* Default, ask the system what it wants. */ top = cpu_topo(); break; } /* * Verify the returned topology. */ if (top->cg_count != mp_ncpus) panic("Built bad topology at %p. CPU count %d != %d", top, top->cg_count, mp_ncpus); if (CPU_CMP(&top->cg_mask, &all_cpus)) panic("Built bad topology at %p. CPU mask (%s) != (%s)", top, cpusetobj_strprint(cpusetbuf, &top->cg_mask), cpusetobj_strprint(cpusetbuf2, &all_cpus)); /* * Collapse nonsense levels that may be created out of convenience by * the MD layers. They cause extra work in the search functions. 
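 *
 * E.g. on a single-package system reported as root -> package -> cores,
 * the root level has exactly one child spanning the same CPUs, so it is
 * dropped and the package becomes the new top.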
*/ while (top->cg_children == 1) { top = &top->cg_child[0]; top->cg_parent = NULL; } smp_topo_fill(top); return (top); } struct cpu_group * smp_topo_alloc(u_int count) { static u_int index; u_int curr; curr = index; index += count; return (&group[curr]); } struct cpu_group * smp_topo_none(void) { struct cpu_group *top; top = &group[0]; top->cg_parent = NULL; top->cg_child = NULL; top->cg_mask = all_cpus; top->cg_count = mp_ncpus; top->cg_children = 0; top->cg_level = CG_SHARE_NONE; top->cg_flags = 0; return (top); } static int smp_topo_addleaf(struct cpu_group *parent, struct cpu_group *child, int share, int count, int flags, int start) { char cpusetbuf[CPUSETBUFSIZ], cpusetbuf2[CPUSETBUFSIZ]; cpuset_t mask; int i; CPU_ZERO(&mask); for (i = 0; i < count; i++, start++) CPU_SET(start, &mask); child->cg_parent = parent; child->cg_child = NULL; child->cg_children = 0; child->cg_level = share; child->cg_count = count; child->cg_flags = flags; child->cg_mask = mask; parent->cg_children++; for (; parent != NULL; parent = parent->cg_parent) { if (CPU_OVERLAP(&parent->cg_mask, &child->cg_mask)) panic("Duplicate children in %p. mask (%s) child (%s)", parent, cpusetobj_strprint(cpusetbuf, &parent->cg_mask), cpusetobj_strprint(cpusetbuf2, &child->cg_mask)); CPU_OR(&parent->cg_mask, &parent->cg_mask, &child->cg_mask); parent->cg_count += child->cg_count; } return (start); } struct cpu_group * smp_topo_1level(int share, int count, int flags) { struct cpu_group *child; struct cpu_group *top; int packages; int cpu; int i; cpu = 0; top = &group[0]; packages = mp_ncpus / count; top->cg_child = child = &group[1]; top->cg_level = CG_SHARE_NONE; for (i = 0; i < packages; i++, child++) cpu = smp_topo_addleaf(top, child, share, count, flags, cpu); return (top); } struct cpu_group * smp_topo_2level(int l2share, int l2count, int l1share, int l1count, int l1flags) { struct cpu_group *top; struct cpu_group *l1g; struct cpu_group *l2g; int cpu; int i; int j; cpu = 0; top = &group[0]; l2g = &group[1]; top->cg_child = l2g; top->cg_level = CG_SHARE_NONE; top->cg_children = mp_ncpus / (l2count * l1count); l1g = l2g + top->cg_children; for (i = 0; i < top->cg_children; i++, l2g++) { l2g->cg_parent = top; l2g->cg_child = l1g; l2g->cg_level = l2share; for (j = 0; j < l2count; j++, l1g++) cpu = smp_topo_addleaf(l2g, l1g, l1share, l1count, l1flags, cpu); } return (top); } struct cpu_group * smp_topo_find(struct cpu_group *top, int cpu) { struct cpu_group *cg; cpuset_t mask; int children; int i; CPU_SETOF(cpu, &mask); cg = top; for (;;) { if (!CPU_OVERLAP(&cg->cg_mask, &mask)) return (NULL); if (cg->cg_children == 0) return (cg); children = cg->cg_children; for (i = 0, cg = cg->cg_child; i < children; cg++, i++) if (CPU_OVERLAP(&cg->cg_mask, &mask)) break; } return (NULL); } #else /* !SMP */ void smp_rendezvous_cpus(cpuset_t map, void (*setup_func)(void *), void (*action_func)(void *), void (*teardown_func)(void *), void *arg) { /* * In the !SMP case we just need to ensure the same initial conditions * as the SMP case. */ spinlock_enter(); if (setup_func != NULL) setup_func(arg); if (action_func != NULL) action_func(arg); if (teardown_func != NULL) teardown_func(arg); spinlock_exit(); } void smp_rendezvous(void (*setup_func)(void *), void (*action_func)(void *), void (*teardown_func)(void *), void *arg) { smp_rendezvous_cpus(all_cpus, setup_func, action_func, teardown_func, arg); } /* * Provide dummy SMP support for UP kernels. Modules that need to use SMP * APIs will still work using this dummy support. 
*/ static void mp_setvariables_for_up(void *dummy) { mp_ncpus = 1; mp_ncores = 1; mp_maxid = PCPU_GET(cpuid); CPU_SETOF(mp_maxid, &all_cpus); KASSERT(PCPU_GET(cpuid) == 0, ("UP must have a CPU ID of zero")); } SYSINIT(cpu_mp_setvariables, SI_SUB_TUNABLES, SI_ORDER_FIRST, mp_setvariables_for_up, NULL); #endif /* SMP */ void smp_no_rendezvous_barrier(void *dummy) { #ifdef SMP KASSERT((!smp_started),("smp_no_rendezvous called and smp is started")); #endif } void smp_rendezvous_cpus_retry(cpuset_t map, void (* setup_func)(void *), void (* action_func)(void *), void (* teardown_func)(void *), void (* wait_func)(void *, int), struct smp_rendezvous_cpus_retry_arg *arg) { int cpu; CPU_COPY(&map, &arg->cpus); /* * Only one CPU to execute on. */ if (!smp_started) { spinlock_enter(); if (setup_func != NULL) setup_func(arg); if (action_func != NULL) action_func(arg); if (teardown_func != NULL) teardown_func(arg); spinlock_exit(); return; } /* * Execute an action on all specified CPUs while retrying until they * all acknowledge completion. */ for (;;) { smp_rendezvous_cpus( arg->cpus, setup_func, action_func, teardown_func, arg); if (CPU_EMPTY(&arg->cpus)) break; CPU_FOREACH(cpu) { if (!CPU_ISSET(cpu, &arg->cpus)) continue; wait_func(arg, cpu); } } } void smp_rendezvous_cpus_done(struct smp_rendezvous_cpus_retry_arg *arg) { CPU_CLR_ATOMIC(curcpu, &arg->cpus); } /* * If (prio & PDROP) == 0: * Wait for specified idle threads to switch once. This ensures that even * preempted threads have cycled through the switch function once, * exiting their codepaths. This allows us to change global pointers * with no other synchronization. * If (prio & PDROP) != 0: * Force the specified CPUs to switch context at least once. */ int quiesce_cpus(cpuset_t map, const char *wmesg, int prio) { struct pcpu *pcpu; u_int *gen; int error; int cpu; error = 0; if ((prio & PDROP) == 0) { gen = malloc(sizeof(u_int) * MAXCPU, M_TEMP, M_WAITOK); for (cpu = 0; cpu <= mp_maxid; cpu++) { if (!CPU_ISSET(cpu, &map) || CPU_ABSENT(cpu)) continue; pcpu = pcpu_find(cpu); gen[cpu] = pcpu->pc_idlethread->td_generation; } } for (cpu = 0; cpu <= mp_maxid; cpu++) { if (!CPU_ISSET(cpu, &map) || CPU_ABSENT(cpu)) continue; pcpu = pcpu_find(cpu); thread_lock(curthread); sched_bind(curthread, cpu); thread_unlock(curthread); if ((prio & PDROP) != 0) continue; while (gen[cpu] == pcpu->pc_idlethread->td_generation) { error = tsleep(quiesce_cpus, prio & ~PDROP, wmesg, 1); if (error != EWOULDBLOCK) goto out; error = 0; } } out: thread_lock(curthread); sched_unbind(curthread); thread_unlock(curthread); if ((prio & PDROP) == 0) free(gen, M_TEMP); return (error); } int quiesce_all_cpus(const char *wmesg, int prio) { return quiesce_cpus(all_cpus, wmesg, prio); } /* * Observe all CPUs not executing in a critical section. * We are not in one so the check for us is safe. If the found * thread changes to something else we know the section was * exited as well. */ void quiesce_all_critical(void) { struct thread *td, *newtd; struct pcpu *pcpu; int cpu; MPASS(curthread->td_critnest == 0); CPU_FOREACH(cpu) { pcpu = cpuid_to_pcpu[cpu]; td = pcpu->pc_curthread; for (;;) { if (td->td_critnest == 0) break; cpu_spinwait(); newtd = (struct thread *) atomic_load_acq_ptr((void *)pcpu->pc_curthread); if (td != newtd) break; } } } static void cpus_fence_seq_cst_issue(void *arg __unused) { atomic_thread_fence_seq_cst(); } /* * Send an IPI forcing a sequentially consistent fence. * * Allows replacement of an explicit fence with a compiler barrier.
* Trades a speedup during normal execution for a significant slowdown when * the barrier is needed. */ void cpus_fence_seq_cst(void) { #ifdef SMP smp_rendezvous( smp_no_rendezvous_barrier, cpus_fence_seq_cst_issue, smp_no_rendezvous_barrier, NULL ); #else cpus_fence_seq_cst_issue(NULL); #endif } /* Extra care is taken with this sysctl because the data type is volatile */ static int sysctl_kern_smp_active(SYSCTL_HANDLER_ARGS) { int error, active; active = smp_started; error = SYSCTL_OUT(req, &active, sizeof(active)); return (error); } #ifdef SMP void topo_init_node(struct topo_node *node) { bzero(node, sizeof(*node)); TAILQ_INIT(&node->children); } void topo_init_root(struct topo_node *root) { topo_init_node(root); root->type = TOPO_TYPE_SYSTEM; } /* * Add a child node with the given ID under the given parent. * Return the existing child if there is already one with that ID. */ struct topo_node * topo_add_node_by_hwid(struct topo_node *parent, int hwid, topo_node_type type, uintptr_t subtype) { struct topo_node *node; TAILQ_FOREACH_REVERSE(node, &parent->children, topo_children, siblings) { if (node->hwid == hwid && node->type == type && node->subtype == subtype) { return (node); } } node = malloc(sizeof(*node), M_TOPO, M_WAITOK); topo_init_node(node); node->parent = parent; node->hwid = hwid; node->type = type; node->subtype = subtype; TAILQ_INSERT_TAIL(&parent->children, node, siblings); parent->nchildren++; return (node); } /* * Find a child node with the given ID under the given parent. */ struct topo_node * topo_find_node_by_hwid(struct topo_node *parent, int hwid, topo_node_type type, uintptr_t subtype) { struct topo_node *node; TAILQ_FOREACH(node, &parent->children, siblings) { if (node->hwid == hwid && node->type == type && node->subtype == subtype) { return (node); } } return (NULL); } /* * Given a node, change the order of its parent's child nodes such * that the node becomes the first child while preserving the cyclic * order of the children. In other words, the given node is promoted * by rotation. */ void topo_promote_child(struct topo_node *child) { struct topo_node *next; struct topo_node *node; struct topo_node *parent; parent = child->parent; next = TAILQ_NEXT(child, siblings); TAILQ_REMOVE(&parent->children, child, siblings); TAILQ_INSERT_HEAD(&parent->children, child, siblings); while (next != NULL) { node = next; next = TAILQ_NEXT(node, siblings); TAILQ_REMOVE(&parent->children, node, siblings); TAILQ_INSERT_AFTER(&parent->children, child, node, siblings); child = node; } } /* * Iterate to the next node in the depth-first search (traversal) of * the topology tree. */ struct topo_node * topo_next_node(struct topo_node *top, struct topo_node *node) { struct topo_node *next; if ((next = TAILQ_FIRST(&node->children)) != NULL) return (next); if ((next = TAILQ_NEXT(node, siblings)) != NULL) return (next); while (node != top && (node = node->parent) != top) if ((next = TAILQ_NEXT(node, siblings)) != NULL) return (next); return (NULL); } /* * Iterate to the next node in the depth-first search of the topology tree, * but without descending below the current node. */ struct topo_node * topo_next_nonchild_node(struct topo_node *top, struct topo_node *node) { struct topo_node *next; if ((next = TAILQ_NEXT(node, siblings)) != NULL) return (next); while (node != top && (node = node->parent) != top) if ((next = TAILQ_NEXT(node, siblings)) != NULL) return (next); return (NULL); } /* * Assign the given ID to the given topology node that represents a logical * processor.
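 *
 * The ID also propagates upward: every ancestor of the PU node gets the
 * CPU added to its cpuset and its cpu_count bumped, so e.g. the owning
 * core and package both account for this logical processor.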
*/ void topo_set_pu_id(struct topo_node *node, cpuid_t id) { KASSERT(node->type == TOPO_TYPE_PU, ("topo_set_pu_id: wrong node type: %u", node->type)); KASSERT(CPU_EMPTY(&node->cpuset) && node->cpu_count == 0, ("topo_set_pu_id: cpuset already not empty")); node->id = id; CPU_SET(id, &node->cpuset); node->cpu_count = 1; node->subtype = 1; while ((node = node->parent) != NULL) { KASSERT(!CPU_ISSET(id, &node->cpuset), ("logical ID %u is already set in node %p", id, node)); CPU_SET(id, &node->cpuset); node->cpu_count++; } } static struct topology_spec { topo_node_type type; bool match_subtype; uintptr_t subtype; } topology_level_table[TOPO_LEVEL_COUNT] = { [TOPO_LEVEL_PKG] = { .type = TOPO_TYPE_PKG, }, [TOPO_LEVEL_GROUP] = { .type = TOPO_TYPE_GROUP, }, [TOPO_LEVEL_CACHEGROUP] = { .type = TOPO_TYPE_CACHE, .match_subtype = true, .subtype = CG_SHARE_L3, }, [TOPO_LEVEL_CORE] = { .type = TOPO_TYPE_CORE, }, [TOPO_LEVEL_THREAD] = { .type = TOPO_TYPE_PU, }, }; static bool topo_analyze_table(struct topo_node *root, int all, enum topo_level level, struct topo_analysis *results) { struct topology_spec *spec; struct topo_node *node; int count; if (level >= TOPO_LEVEL_COUNT) return (true); spec = &topology_level_table[level]; count = 0; node = topo_next_node(root, root); while (node != NULL) { if (node->type != spec->type || (spec->match_subtype && node->subtype != spec->subtype)) { node = topo_next_node(root, node); continue; } if (!all && CPU_EMPTY(&node->cpuset)) { node = topo_next_nonchild_node(root, node); continue; } count++; if (!topo_analyze_table(node, all, level + 1, results)) return (false); node = topo_next_nonchild_node(root, node); } /* No explicit subgroups is essentially one subgroup. */ if (count == 0) { count = 1; if (!topo_analyze_table(root, all, level + 1, results)) return (false); } if (results->entities[level] == -1) results->entities[level] = count; else if (results->entities[level] != count) return (false); return (true); } /* * Check if the topology is uniform, that is, each package has the same number * of cores in it and each core has the same number of threads (logical * processors) in it. If so, calculate the number of packages, the number of * groups per package, the number of cachegroups per group, and the number of * logical processors per cachegroup. 'all' parameter tells whether to include * administratively disabled logical processors into the analysis. */ int topo_analyze(struct topo_node *topo_root, int all, struct topo_analysis *results) { results->entities[TOPO_LEVEL_PKG] = -1; results->entities[TOPO_LEVEL_CORE] = -1; results->entities[TOPO_LEVEL_THREAD] = -1; results->entities[TOPO_LEVEL_GROUP] = -1; results->entities[TOPO_LEVEL_CACHEGROUP] = -1; if (!topo_analyze_table(topo_root, all, TOPO_LEVEL_PKG, results)) return (0); KASSERT(results->entities[TOPO_LEVEL_PKG] > 0, ("bug in topology or analysis")); return (1); } #endif /* SMP */ diff --git a/sys/sys/cpuset.h b/sys/sys/cpuset.h index 601da08a46a8..f8fc36b99aa7 100644 --- a/sys/sys/cpuset.h +++ b/sys/sys/cpuset.h @@ -1,190 +1,191 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2008, Jeffrey Roberson * All rights reserved. * * Copyright (c) 2008 Nokia Corporation * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _SYS_CPUSET_H_ #define _SYS_CPUSET_H_ #include #include #include #define _NCPUBITS _BITSET_BITS #define _NCPUWORDS __bitset_words(CPU_SETSIZE) #define CPUSETBUFSIZ ((2 + sizeof(long) * 2) * _NCPUWORDS) #define CPU_CLR(n, p) __BIT_CLR(CPU_SETSIZE, n, p) #define CPU_COPY(f, t) __BIT_COPY(CPU_SETSIZE, f, t) #define CPU_ISSET(n, p) __BIT_ISSET(CPU_SETSIZE, n, p) #define CPU_SET(n, p) __BIT_SET(CPU_SETSIZE, n, p) #define CPU_ZERO(p) __BIT_ZERO(CPU_SETSIZE, p) #define CPU_FILL(p) __BIT_FILL(CPU_SETSIZE, p) #define CPU_SETOF(n, p) __BIT_SETOF(CPU_SETSIZE, n, p) #define CPU_EQUAL(p, c) (__BIT_CMP(CPU_SETSIZE, p, c) == 0) #define CPU_EMPTY(p) __BIT_EMPTY(CPU_SETSIZE, p) #define CPU_ISFULLSET(p) __BIT_ISFULLSET(CPU_SETSIZE, p) #define CPU_SUBSET(p, c) __BIT_SUBSET(CPU_SETSIZE, p, c) #define CPU_OVERLAP(p, c) __BIT_OVERLAP(CPU_SETSIZE, p, c) #define CPU_CMP(p, c) __BIT_CMP(CPU_SETSIZE, p, c) #define CPU_OR(d, s1, s2) __BIT_OR2(CPU_SETSIZE, d, s1, s2) #define CPU_AND(d, s1, s2) __BIT_AND2(CPU_SETSIZE, d, s1, s2) #define CPU_ANDNOT(d, s1, s2) __BIT_ANDNOT2(CPU_SETSIZE, d, s1, s2) #define CPU_XOR(d, s1, s2) __BIT_XOR2(CPU_SETSIZE, d, s1, s2) #define CPU_CLR_ATOMIC(n, p) __BIT_CLR_ATOMIC(CPU_SETSIZE, n, p) #define CPU_SET_ATOMIC(n, p) __BIT_SET_ATOMIC(CPU_SETSIZE, n, p) #define CPU_SET_ATOMIC_ACQ(n, p) __BIT_SET_ATOMIC_ACQ(CPU_SETSIZE, n, p) #define CPU_AND_ATOMIC(n, p) __BIT_AND_ATOMIC(CPU_SETSIZE, n, p) #define CPU_OR_ATOMIC(d, s) __BIT_OR_ATOMIC(CPU_SETSIZE, d, s) #define CPU_COPY_STORE_REL(f, t) __BIT_COPY_STORE_REL(CPU_SETSIZE, f, t) #define CPU_FFS(p) __BIT_FFS(CPU_SETSIZE, p) #define CPU_FLS(p) __BIT_FLS(CPU_SETSIZE, p) #define CPU_FOREACH_ISSET(i, p) __BIT_FOREACH_ISSET(CPU_SETSIZE, i, p) #define CPU_FOREACH_ISCLR(i, p) __BIT_FOREACH_ISCLR(CPU_SETSIZE, i, p) #define CPU_COUNT(p) ((int)__BIT_COUNT(CPU_SETSIZE, p)) #define CPUSET_FSET __BITSET_FSET(_NCPUWORDS) #define CPUSET_T_INITIALIZER(x) __BITSET_T_INITIALIZER(x) #define CPU_ALLOC_SIZE(_s) __BITSET_SIZE(_s) #define CPU_ALLOC(_s) __cpuset_alloc(_s) #define CPU_FREE(p) __cpuset_free(p) #define CPU_ISSET_S(n, _s, p) __BIT_ISSET((_s) * 8, n, p) #define CPU_SET_S(n, _s, p) __BIT_SET((_s) * 8, n, p) #define CPU_CLR_S(n, _s, p) __BIT_CLR((_s) * 8, n, p) #define CPU_ZERO_S(_s, p) __BIT_ZERO((_s) * 8, p) #define CPU_OR_S(_s, d, s1, s2) __BIT_OR2((_s) * 8, d, s1, s2) #define CPU_AND_S(_s, d, s1, s2) __BIT_AND2((_s) * 8, d, s1, s2) #define CPU_XOR_S(_s, d, s1, s2) __BIT_XOR2((_s) * 8, d, s1, s2) #define CPU_COUNT_S(_s, p) ((int)__BIT_COUNT((_s) * 8, p)) #define CPU_EQUAL_S(_s, p, c) (__BIT_CMP((_s) * 8, p, c) == 0) 
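/*
 * Illustrative use of the dynamically sized *_S variants (a sketch, not a
 * normative part of this header): they take a size in bytes, as returned
 * by CPU_ALLOC_SIZE() for a bit count:
 *
 *	size_t sz = CPU_ALLOC_SIZE(2 * CPU_SETSIZE);
 *	cpuset_t *set = CPU_ALLOC(2 * CPU_SETSIZE);
 *
 *	CPU_ZERO_S(sz, set);
 *	CPU_SET_S(3, sz, set);
 *	if (CPU_ISSET_S(3, sz, set))
 *		...
 *	CPU_FREE(set);
 */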
/* * Valid cpulevel_t values. */ #define CPU_LEVEL_ROOT 1 /* All system cpus. */ #define CPU_LEVEL_CPUSET 2 /* Available cpus for which. */ #define CPU_LEVEL_WHICH 3 /* Actual mask/id for which. */ /* * Valid cpuwhich_t values. */ #define CPU_WHICH_TID 1 /* Specifies a thread id. */ #define CPU_WHICH_PID 2 /* Specifies a process id. */ #define CPU_WHICH_CPUSET 3 /* Specifies a set id. */ #define CPU_WHICH_IRQ 4 /* Specifies an irq #. */ #define CPU_WHICH_JAIL 5 /* Specifies a jail id. */ #define CPU_WHICH_DOMAIN 6 /* Specifies a NUMA domain id. */ #define CPU_WHICH_INTRHANDLER 7 /* Specifies an irq # (not ithread). */ #define CPU_WHICH_ITHREAD 8 /* Specifies an irq's ithread. */ /* * Reserved cpuset identifiers. */ #define CPUSET_INVALID -1 #define CPUSET_DEFAULT 0 #ifdef _KERNEL #include LIST_HEAD(setlist, cpuset); +extern u_int cpusetsizemin; /* * cpusets encapsulate cpu binding information for one or more threads. * * a - Accessed with atomics. * s - Set at creation, never modified. Only a ref required to read. * c - Locked internally by a cpuset lock. * * The bitmask is only modified while holding the cpuset lock. It may be * read while only a reference is held but the consumer must be prepared * to deal with inconsistent results. */ struct cpuset { volatile u_int cs_ref; /* (a) Reference count. */ int cs_flags; /* (s) Flags from below. */ LIST_ENTRY(cpuset) cs_link; /* (c) All identified sets. */ LIST_ENTRY(cpuset) cs_siblings; /* (c) Sibling set link. */ struct setlist cs_children; /* (c) List of children. */ struct domainset *cs_domain; /* (c) NUMA policy. */ cpusetid_t cs_id; /* (s) Id or INVALID. */ struct cpuset *cs_parent; /* (s) Pointer to our parent. */ cpuset_t cs_mask; /* bitmask of valid cpus. */ }; #define CPU_SET_ROOT 0x0001 /* Set is a root set. */ #define CPU_SET_RDONLY 0x0002 /* No modification allowed. */ extern cpuset_t *cpuset_root; struct prison; struct proc; struct thread; /* * Callbacks for copying in/out a cpuset or domainset. Used for alternate * ABIs, like compat32. */ struct cpuset_copy_cb { int (*cpuset_copyin)(const void *, void *, size_t); int (*cpuset_copyout)(const void *, void *, size_t); }; struct cpuset *cpuset_thread0(void); struct cpuset *cpuset_ref(struct cpuset *); void cpuset_rel(struct cpuset *); int cpuset_setthread(lwpid_t id, cpuset_t *); int cpuset_setithread(lwpid_t id, int cpu); int cpuset_create_root(struct prison *, struct cpuset **); int cpuset_setproc_update_set(struct proc *, struct cpuset *); int cpuset_which(cpuwhich_t, id_t, struct proc **, struct thread **, struct cpuset **); void cpuset_kernthread(struct thread *); char *cpusetobj_strprint(char *, const cpuset_t *); int cpusetobj_strscan(cpuset_t *, const char *); #ifdef DDB void ddb_display_cpuset(const cpuset_t *); #endif #else __BEGIN_DECLS int cpuset(cpusetid_t *); int cpuset_setid(cpuwhich_t, id_t, cpusetid_t); int cpuset_getid(cpulevel_t, cpuwhich_t, id_t, cpusetid_t *); int cpuset_getaffinity(cpulevel_t, cpuwhich_t, id_t, size_t, cpuset_t *); int cpuset_setaffinity(cpulevel_t, cpuwhich_t, id_t, size_t, const cpuset_t *); __END_DECLS #endif #endif /* !_SYS_CPUSET_H_ */
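As a usage sketch of the userland interface declared above (illustrative only; the prototypes are real, the program itself is not part of this change), querying and then shrinking the calling process's affinity could look like:

#include <sys/param.h>
#include <sys/cpuset.h>
#include <err.h>
#include <stdio.h>

int
main(void)
{
	cpuset_t mask;

	/* Fetch the anonymous mask of the current process (id -1). */
	if (cpuset_getaffinity(CPU_LEVEL_WHICH, CPU_WHICH_PID, -1,
	    sizeof(mask), &mask) != 0)
		err(1, "cpuset_getaffinity");
	printf("%d CPUs in mask\n", CPU_COUNT(&mask));

	/* Restrict the process to the lowest-numbered CPU in its mask. */
	CPU_SETOF(CPU_FFS(&mask) - 1, &mask);
	if (cpuset_setaffinity(CPU_LEVEL_WHICH, CPU_WHICH_PID, -1,
	    sizeof(mask), &mask) != 0)
		err(1, "cpuset_setaffinity");
	return (0);
}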