Index: compat/freebsd32/syscalls.master =================================================================== --- compat/freebsd32/syscalls.master +++ compat/freebsd32/syscalls.master @@ -1119,4 +1119,13 @@ struct kevent32 *eventlist, \ int nevents, \ const struct timespec32 *timeout); } +561 AUE_NULL STD { int cpuset_getdomain(cpulevel_t level, \ + cpuwhich_t which, id_t id, \ + size_t domainsetsize, domainset_t *mask, \ + int *policy); } +562 AUE_NULL STD { int cpuset_setdomain(cpulevel_t level, \ + cpuwhich_t which, id_t id, \ + size_t domainsetsize, domainset_t *mask, \ + int policy); } + ; vim: syntax=off Index: conf/files =================================================================== --- conf/files +++ conf/files @@ -4816,7 +4816,7 @@ vm/uma_core.c standard vm/uma_dbg.c standard vm/memguard.c optional DEBUG_MEMGUARD -vm/vm_domain.c standard +vm/vm_domainset.c standard vm/vm_fault.c standard vm/vm_glue.c standard vm/vm_init.c standard Index: kern/init_main.c =================================================================== --- kern/init_main.c +++ kern/init_main.c @@ -493,10 +493,7 @@ td->td_flags = TDF_INMEM; td->td_pflags = TDP_KTHREAD; td->td_cpuset = cpuset_thread0(); - vm_domain_policy_init(&td->td_vm_dom_policy); - vm_domain_policy_set(&td->td_vm_dom_policy, VM_POLICY_NONE, -1); - vm_domain_policy_init(&p->p_vm_dom_policy); - vm_domain_policy_set(&p->p_vm_dom_policy, VM_POLICY_NONE, -1); + td->td_domain.dr_policy = td->td_cpuset->cs_domain; prison0_init(); p->p_peers = 0; p->p_leader = p; Index: kern/init_sysent.c =================================================================== --- kern/init_sysent.c +++ kern/init_sysent.c @@ -612,4 +612,6 @@ { AS(fhstatfs_args), (sy_call_t *)sys_fhstatfs, AUE_FHSTATFS, NULL, 0, 0, 0, SY_THR_STATIC }, /* 558 = fhstatfs */ { AS(mknodat_args), (sy_call_t *)sys_mknodat, AUE_MKNODAT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 559 = mknodat */ { AS(kevent_args), (sy_call_t *)sys_kevent, AUE_KEVENT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 560 = kevent */ + { AS(cpuset_getdomain_args), (sy_call_t *)sys_cpuset_getdomain, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 561 = cpuset_getdomain */ + { AS(cpuset_setdomain_args), (sy_call_t *)sys_cpuset_setdomain, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 562 = cpuset_setdomain */ }; Index: kern/kern_cpuset.c =================================================================== --- kern/kern_cpuset.c +++ kern/kern_cpuset.c @@ -51,17 +51,20 @@ #include #include #include +#include #include #include #include #include #include #include +#include #include #include #include #include +#include #ifdef DDB #include @@ -109,8 +112,10 @@ * getaffinity call using (CPU_LEVEL_CPUSET, CPU_WHICH_PID, -1, ...). */ static uma_zone_t cpuset_zone; +static uma_zone_t domainset_zone; static struct mtx cpuset_lock; static struct setlist cpuset_ids; +static struct domainlist cpuset_domains; static struct unrhdr *cpuset_unr; static struct cpuset *cpuset_zero, *cpuset_default; @@ -122,6 +127,30 @@ cpuset_t cpuset_domain[MAXMEMDOM]; /* + * Find the first non-anonymous set starting from 'set'. + */ +static struct cpuset * +cpuset_getbase(struct cpuset *set) +{ + + if (set->cs_id == CPUSET_INVALID) + set = set->cs_parent; + return (set); +} + +/* + * Walks up the tree from 'set' to find the root. 
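+ * A root set is the topmost set with CPU_SET_ROOT, e.g. cpuset_zero or
+ * a jail's dedicated root set.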
+ */
+static struct cpuset *
+cpuset_getroot(struct cpuset *set)
+{
+
+	while ((set->cs_flags & CPU_SET_ROOT) == 0 && set->cs_parent != NULL)
+		set = set->cs_parent;
+	return (set);
+}
+
 /*
  * Acquire a reference to a cpuset, all pointers must be tracked with refs.
  */
 struct cpuset *
@@ -140,12 +169,7 @@
 cpuset_refroot(struct cpuset *set)
 {
 
-	for (; set->cs_parent != NULL; set = set->cs_parent)
-		if (set->cs_flags & CPU_SET_ROOT)
-			break;
-	cpuset_ref(set);
-
-	return (set);
+	return cpuset_ref(cpuset_getroot(set));
 }
 
 /*
@@ -157,11 +181,7 @@
 cpuset_refbase(struct cpuset *set)
 {
 
-	if (set->cs_id == CPUSET_INVALID)
-		set = set->cs_parent;
-	cpuset_ref(set);
-
-	return (set);
+	return cpuset_ref(cpuset_getbase(set));
 }
 
 /*
@@ -258,16 +278,24 @@
  */
 static int
 _cpuset_create(struct cpuset *set, struct cpuset *parent, const cpuset_t *mask,
-    cpusetid_t id)
+    struct domainset *domain, cpusetid_t id)
 {
 
+	if (domain == NULL)
+		domain = parent->cs_domain;
+	if (mask == NULL)
+		mask = &parent->cs_mask;
 	if (!CPU_OVERLAP(&parent->cs_mask, mask))
 		return (EDEADLK);
+	/* The domain must be prepared ahead of time. */
+	if (!DOMAINSET_SUBSET(&parent->cs_domain->ds_mask, &domain->ds_mask))
+		return (EDEADLK);
 	CPU_COPY(mask, &set->cs_mask);
 	LIST_INIT(&set->cs_children);
 	refcount_init(&set->cs_ref, 1);
 	set->cs_flags = 0;
 	mtx_lock_spin(&cpuset_lock);
+	set->cs_domain = domain;
 	CPU_AND(&set->cs_mask, &parent->cs_mask);
 	set->cs_id = id;
 	set->cs_parent = cpuset_ref(parent);
@@ -294,8 +322,8 @@
 	id = alloc_unr(cpuset_unr);
 	if (id == -1)
 		return (ENFILE);
-	*setp = set = uma_zalloc(cpuset_zone, M_WAITOK);
-	error = _cpuset_create(set, parent, mask, id);
+	*setp = set = uma_zalloc(cpuset_zone, M_WAITOK | M_ZERO);
+	error = _cpuset_create(set, parent, mask, NULL, id);
 	if (error == 0)
 		return (0);
 	free_unr(cpuset_unr, id);
@@ -304,7 +332,158 @@
 	return (error);
 }
 
+static void
+cpuset_freelist_add(struct setlist *list, int count)
+{
+	struct cpuset *set;
+	int i;
+
+	for (i = 0; i < count; i++) {
+		set = uma_zalloc(cpuset_zone, M_ZERO | M_WAITOK);
+		LIST_INSERT_HEAD(list, set, cs_link);
+	}
+}
+
+static void
+cpuset_freelist_init(struct setlist *list, int count)
+{
+
+	LIST_INIT(list);
+	cpuset_freelist_add(list, count);
+}
+
+static void
+cpuset_freelist_free(struct setlist *list)
+{
+	struct cpuset *set;
+
+	while ((set = LIST_FIRST(list)) != NULL) {
+		LIST_REMOVE(set, cs_link);
+		uma_zfree(cpuset_zone, set);
+	}
+}
+
+static void
+domainset_freelist_add(struct domainlist *list, int count)
+{
+	struct domainset *set;
+	int i;
+
+	for (i = 0; i < count; i++) {
+		set = uma_zalloc(domainset_zone, M_ZERO | M_WAITOK);
+		LIST_INSERT_HEAD(list, set, ds_link);
+	}
+}
+
+static void
+domainset_freelist_init(struct domainlist *list, int count)
+{
+
+	LIST_INIT(list);
+	domainset_freelist_add(list, count);
+}
+
+static void
+domainset_freelist_free(struct domainlist *list)
+{
+	struct domainset *set;
+
+	while ((set = LIST_FIRST(list)) != NULL) {
+		LIST_REMOVE(set, ds_link);
+		uma_zfree(domainset_zone, set);
+	}
+}
+
+/* Copy a domainset preserving mask and policy. */
+static void
+domainset_copy(struct domainset *from, struct domainset *to)
+{
+
+	DOMAINSET_COPY(&from->ds_mask, &to->ds_mask);
+	to->ds_policy = from->ds_policy;
+}
+
+/* Return 1 if mask and policy are equal, otherwise 0. */
+static int
+domainset_equal(struct domainset *one, struct domainset *two)
+{
+
+	return (DOMAINSET_CMP(&one->ds_mask, &two->ds_mask) == 0 &&
+	    one->ds_policy == two->ds_policy);
+}
+
+/*
+ * Lookup or create a domainset.  The key is provided in ds_mask and
+ * ds_policy.  If the domainset does not yet exist the storage in
+ * 'domain' is used to insert.  Otherwise this storage is freed to the
+ * domainset_zone and the existing domainset is returned.
+ */
+static struct domainset *
+_domainset_create(struct domainset *domain, struct domainlist *freelist)
+{
+	struct domainset *ndomain;
+
+	mtx_lock_spin(&cpuset_lock);
+	LIST_FOREACH(ndomain, &cpuset_domains, ds_link)
+		if (domainset_equal(ndomain, domain))
+			break;
+	/*
+	 * If the domain does not yet exist we insert it and initialize
+	 * various iteration helpers which are not part of the key.
+	 */
+	if (ndomain == NULL) {
+		LIST_INSERT_HEAD(&cpuset_domains, domain, ds_link);
+		domain->ds_cnt = DOMAINSET_COUNT(&domain->ds_mask);
+		domain->ds_max = DOMAINSET_FLS(&domain->ds_mask);
+	}
+	mtx_unlock_spin(&cpuset_lock);
+	if (ndomain == NULL)
+		return (domain);
+	if (freelist != NULL)
+		LIST_INSERT_HEAD(freelist, domain, ds_link);
+	else
+		uma_zfree(domainset_zone, domain);
+	return (ndomain);
+}
+
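+/*
+ * Note that domainsets are interned: for a given (ds_mask, ds_policy)
+ * pair every caller of _domainset_create() receives the same instance,
+ * and instances are never freed once published.  A plain pointer
+ * comparison is therefore sufficient to tell two policies apart, which
+ * the tree-update code below relies on.
+ */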
+/*
+ * Create or lookup a domainset based on the key held in 'domain'.
+ */
+static struct domainset *
+domainset_create(const struct domainset *domain)
+{
+	struct domainset *ndomain;
+
+	ndomain = uma_zalloc(domainset_zone, M_WAITOK | M_ZERO);
+	ndomain->ds_policy = domain->ds_policy;
+	DOMAINSET_COPY(&domain->ds_mask, &ndomain->ds_mask);
+	return _domainset_create(ndomain, NULL);
+}
+
+static struct domainset *
+domainset_shadow(struct domainset *pdomain,
+    struct domainset *domain, struct domainlist *freelist)
+{
+	struct domainset *ndomain;
+
+	ndomain = LIST_FIRST(freelist);
+	LIST_REMOVE(ndomain, ds_link);
+
+	/*
+	 * Initialize the key from the request.
+	 */
+	domainset_copy(domain, ndomain);
+
+	/*
+	 * Restrict the key by the parent.
+	 */
+	DOMAINSET_AND(&ndomain->ds_mask, &pdomain->ds_mask);
+
+	return _domainset_create(ndomain, freelist);
+}
+
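+/*
+ * For illustration (not part of this change): a caller wanting the
+ * shared instance describing a round-robin policy over domains 0 and 1
+ * builds a key on the stack and lets domainset_create() intern it:
+ *
+ *	struct domainset key;
+ *
+ *	memset(&key, 0, sizeof(key));
+ *	DOMAINSET_SET(0, &key.ds_mask);
+ *	DOMAINSET_SET(1, &key.ds_mask);
+ *	key.ds_policy = DOMAINSET_POLICY_ROUNDROBIN;
+ *	set->cs_domain = domainset_create(&key);
+ *
+ * domainset_shadow() is the variant used under cpuset_lock; it draws
+ * its storage from a caller-preallocated freelist rather than the zone.
+ */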
 /*
  * Recursively check for errors that would occur from applying mask to
  * the tree of sets starting at 'set'.  Checks for sets that would become
  * empty as well as RDONLY flags.
@@ -376,10 +555,12 @@
 	 * Verify that we have access to this set of
 	 * cpus.
 	 */
-	root = set->cs_parent;
-	if (root && !CPU_SUBSET(&root->cs_mask, mask))
-		return (EINVAL);
+	root = cpuset_getroot(set);
 	mtx_lock_spin(&cpuset_lock);
+	if (root && !CPU_SUBSET(&root->cs_mask, mask)) {
+		error = EINVAL;
+		goto out;
+	}
 	error = cpuset_testupdate(set, mask, 0);
 	if (error)
 		goto out;
@@ -392,6 +573,135 @@
 }
 
 /*
+ * Recursively check for errors that would occur from applying the
+ * domainset 'dset' to the tree of sets starting at 'set'.  Checks for
+ * sets that would become empty as well as RDONLY flags.
+ */
+static int
+cpuset_testupdate_domain(struct cpuset *set, struct domainset *dset,
+    struct domainset *orig, int *count, int check_mask)
+{
+	struct cpuset *nset;
+	struct domainset *domain;
+	struct domainset newset;
+	int error;
+
+	mtx_assert(&cpuset_lock, MA_OWNED);
+	if (set->cs_flags & CPU_SET_RDONLY)
+		return (EPERM);
+	domain = set->cs_domain;
+	domainset_copy(domain, &newset);
+	if (!domainset_equal(domain, orig)) {
+		if (!DOMAINSET_OVERLAP(&domain->ds_mask, &dset->ds_mask))
+			return (EDEADLK);
+		DOMAINSET_AND(&newset.ds_mask, &dset->ds_mask);
+		/* Count the number of domains that are changing. */
+		(*count)++;
+	}
+	error = 0;
+	LIST_FOREACH(nset, &set->cs_children, cs_siblings)
+		if ((error = cpuset_testupdate_domain(nset, &newset, domain,
+		    count, 1)) != 0)
+			break;
+	return (error);
+}
+
+/*
+ * Applies the domainset 'domain' without checking for empty sets or
+ * permissions.
+ */
+static void
+cpuset_update_domain(struct cpuset *set, struct domainset *domain,
+    struct domainset *orig, struct domainlist *domains)
+{
+	struct cpuset *nset;
+
+	mtx_assert(&cpuset_lock, MA_OWNED);
+	/*
+	 * If this domainset has changed from the parent we must calculate
+	 * a new set.  Otherwise it simply inherits from the parent.  When
+	 * we inherit from the parent we get a new mask and policy.  If the
+	 * set is modified from the parent we keep the policy and only
+	 * update the mask.
+	 */
+	if (set->cs_domain != orig) {
+		orig = set->cs_domain;
+		set->cs_domain = domainset_shadow(domain, orig, domains);
+	} else
+		set->cs_domain = domain;
+	LIST_FOREACH(nset, &set->cs_children, cs_siblings)
+		cpuset_update_domain(nset, set->cs_domain, orig, domains);
+}
+
+/*
+ * Modify the set 'set' to use a copy of the domainset provided.  Apply
+ * this new mask to restrict all children in the tree.  Checks for validity
+ * before applying the changes.
+ */
+static int
+cpuset_modify_domain(struct cpuset *set, struct domainset *domain)
+{
+	struct domainlist domains;
+	struct domainset temp;
+	struct domainset *dset;
+	struct cpuset *root;
+	int ndomains, needed;
+	int error;
+
+	error = priv_check(curthread, PRIV_SCHED_CPUSET);
+	if (error)
+		return (error);
+	/*
+	 * In case we are called from within the jail
+	 * we do not allow modifying the dedicated root
+	 * cpuset of the jail but may still allow to
+	 * change child sets.
+	 */
+	if (jailed(curthread->td_ucred) &&
+	    set->cs_flags & CPU_SET_ROOT)
+		return (EPERM);
+	domainset_freelist_init(&domains, 0);
+	domain = domainset_create(domain);
+	ndomains = needed = 0;
+	do {
+		if (ndomains < needed) {
+			domainset_freelist_add(&domains, needed - ndomains);
+			ndomains = needed;
+		}
+		root = cpuset_getroot(set);
+		mtx_lock_spin(&cpuset_lock);
+		dset = root->cs_domain;
+		/*
+		 * Verify that we have access to this set of domains.
+		 */
+		if (root &&
+		    !DOMAINSET_SUBSET(&dset->ds_mask, &domain->ds_mask)) {
+			error = EINVAL;
+			goto out;
+		}
+		/*
+		 * Determine whether we can apply this set of domains and
+		 * how many new domain structures it will require.
+		 */
+		domainset_copy(domain, &temp);
+		needed = 0;
+		error = cpuset_testupdate_domain(set, &temp, set->cs_domain,
+		    &needed, 0);
+		if (error)
+			goto out;
+		/* Drop the lock before looping to allocate more domains. */
+		if (ndomains < needed)
+			mtx_unlock_spin(&cpuset_lock);
+	} while (ndomains < needed);
+	dset = set->cs_domain;
+	set->cs_domain = domain;
+	cpuset_update_domain(set, domain, dset, &domains);
+out:
+	mtx_unlock_spin(&cpuset_lock);
+	domainset_freelist_free(&domains);
+
+	return (error);
+}
+
+/*
  * Resolve the 'which' parameter of several cpuset apis.
  *
  * For WHICH_PID and WHICH_TID return a locked proc and valid proc/tid.  Also
@@ -481,44 +791,203 @@
 	return (0);
 }
 
+static int
+cpuset_testshadow(struct cpuset *set, const cpuset_t *mask,
+    struct domainset *domain)
+{
+	struct cpuset *parent;
+	struct domainset *dset;
+
+	parent = cpuset_getbase(set);
+	/*
+	 * If we are restricting a cpu mask it must be a subset of the
+	 * parent or invalid CPUs have been specified.
+	 */
+	if (mask != NULL && !CPU_SUBSET(&parent->cs_mask, mask))
+		return (EINVAL);
+
+	/*
+	 * If we are restricting a domain mask it must be a subset of the
+	 * parent or invalid domains have been specified.
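+	 * Both checks are made against the base (non-anonymous) set, so
+	 * repeatedly restricting an anonymous thread set never compounds:
+	 * each new mask only has to fit the base set, not prior shadows.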
+ */ + dset = parent->cs_domain; + if (domain != NULL && + !DOMAINSET_SUBSET(&dset->ds_mask, &domain->ds_mask)) + return (EINVAL); + + return (0); +} + /* * Create an anonymous set with the provided mask in the space provided by - * 'fset'. If the passed in set is anonymous we use its parent otherwise + * 'nset'. If the passed in set is anonymous we use its parent otherwise * the new set is a child of 'set'. */ static int -cpuset_shadow(struct cpuset *set, struct cpuset *fset, const cpuset_t *mask) +cpuset_shadow(struct cpuset *set, struct cpuset **nsetp, const cpuset_t *mask, + struct domainset *domain, struct setlist *cpusets, + struct domainlist *domains) { struct cpuset *parent; + struct cpuset *nset; + struct domainset *dset; + int error; - if (set->cs_id == CPUSET_INVALID) - parent = set->cs_parent; + error = cpuset_testshadow(set, mask, domain); + if (error) + return (error); + + parent = cpuset_getbase(set); + dset = parent->cs_domain; + if (mask == NULL) + mask = &set->cs_mask; + if (domain != NULL) + domain = domainset_shadow(dset, domain, domains); else - parent = set; - if (!CPU_SUBSET(&parent->cs_mask, mask)) + domain = set->cs_domain; + nset = LIST_FIRST(cpusets); + error = _cpuset_create(nset, parent, mask, domain, CPUSET_INVALID); + if (error == 0) { + LIST_REMOVE(nset, cs_link); + *nsetp = nset; + } + return (error); +} + +static struct cpuset * +cpuset_update_thread(struct thread *td, struct cpuset *nset) +{ + struct cpuset *tdset; + + tdset = td->td_cpuset; + td->td_cpuset = nset; + td->td_domain.dr_policy = nset->cs_domain; + sched_affinity(td); + + return (tdset); +} + +static int +cpuset_setproc_test_maskthread(struct cpuset *tdset, cpuset_t *mask, + struct domainset *domain) +{ + struct cpuset *parent; + + parent = cpuset_getbase(tdset); + if (mask == NULL) + mask = &tdset->cs_mask; + if (domain == NULL) + domain = tdset->cs_domain; + return cpuset_testshadow(parent, mask, domain); +} + +static int +cpuset_setproc_maskthread(struct cpuset *tdset, cpuset_t *mask, + struct domainset *domain, struct cpuset **nsetp, + struct setlist *freelist, struct domainlist *domainlist) +{ + struct cpuset *parent; + + parent = cpuset_getbase(tdset); + if (mask == NULL) + mask = &tdset->cs_mask; + if (domain == NULL) + domain = tdset->cs_domain; + return cpuset_shadow(parent, nsetp, mask, domain, freelist, + domainlist); +} + +static int +cpuset_setproc_setthread_mask(struct cpuset *tdset, struct cpuset *set, + cpuset_t *mask, struct domainset *domain) +{ + struct cpuset *parent; + + parent = cpuset_getbase(tdset); + + /* + * If the thread restricted its mask then apply that same + * restriction to the new set, otherwise take it wholesale. + */ + if (CPU_CMP(&tdset->cs_mask, &parent->cs_mask) != 0) { + CPU_COPY(&tdset->cs_mask, mask); + CPU_AND(mask, &set->cs_mask); + } else + CPU_COPY(&set->cs_mask, mask); + + /* + * If the thread restricted the domain then we apply the + * restriction to the new set but retain the policy. 
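+	 * (E.g. a thread that chose first-touch keeps first-touch even
+	 * when the new set narrows which domains it may use.)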
+	 */
+	if (tdset->cs_domain != parent->cs_domain) {
+		domainset_copy(tdset->cs_domain, domain);
+		DOMAINSET_AND(&domain->ds_mask, &set->cs_domain->ds_mask);
+	} else
+		domainset_copy(set->cs_domain, domain);
+
+	if (CPU_EMPTY(mask) || DOMAINSET_EMPTY(&domain->ds_mask))
 		return (EDEADLK);
-	return (_cpuset_create(fset, parent, mask, CPUSET_INVALID));
+
+	return (0);
 }
 
+static int
+cpuset_setproc_test_setthread(struct cpuset *tdset, struct cpuset *set)
+{
+	struct domainset domain;
+	cpuset_t mask;
+
+	if (tdset->cs_id != CPUSET_INVALID)
+		return (0);
+	return cpuset_setproc_setthread_mask(tdset, set, &mask, &domain);
+}
+
+static int
+cpuset_setproc_setthread(struct cpuset *tdset, struct cpuset *set,
+    struct cpuset **nsetp, struct setlist *freelist,
+    struct domainlist *domainlist)
+{
+	struct domainset domain;
+	cpuset_t mask;
+	int error;
+
+	/*
+	 * If we're replacing on a thread that has not constrained the
+	 * original set we can simply accept the new set.
+	 */
+	if (tdset->cs_id != CPUSET_INVALID) {
+		*nsetp = cpuset_ref(set);
+		return (0);
+	}
+	error = cpuset_setproc_setthread_mask(tdset, set, &mask, &domain);
+	if (error)
+		return (error);
+
+	return cpuset_shadow(tdset, nsetp, &mask, &domain, freelist,
+	    domainlist);
+}
+
 /*
- * Handle two cases for replacing the base set or mask of an entire process.
+ * Handle three cases for updating an entire process.
  *
- * 1) Set is non-null and mask is null.  This reparents all anonymous sets
- *    to the provided set and replaces all non-anonymous td_cpusets with the
- *    provided set.
- * 2) Mask is non-null and set is null.  This replaces or creates anonymous
- *    sets for every thread with the existing base as a parent.
+ * 1) Set is non-null.  This reparents all anonymous sets to the provided
+ *    set and replaces all non-anonymous td_cpusets with the provided set.
+ * 2) Mask is non-null.  This replaces or creates anonymous sets for every
+ *    thread with the existing base as a parent.
+ * 3) domain is non-null.  This creates anonymous sets for every thread
+ *    and replaces the domain set.
  *
  * This is overly complicated because we can't allocate while holding a
  * spinlock and spinlocks must be held while changing and examining thread
  * state.
  */
 static int
-cpuset_setproc(pid_t pid, struct cpuset *set, cpuset_t *mask)
+cpuset_setproc(pid_t pid, struct cpuset *set, cpuset_t *mask,
+    struct domainset *domain)
 {
 	struct setlist freelist;
 	struct setlist droplist;
-	struct cpuset *tdset;
+	struct domainlist domainlist;
 	struct cpuset *nset;
 	struct thread *td;
 	struct proc *p;
@@ -533,7 +1002,9 @@
 	 * 2) If enough cpusets have not been allocated release the locks and
 	 *    allocate them.  Loop.
 	 */
-	LIST_INIT(&freelist);
+	cpuset_freelist_init(&freelist, 1);
+	domainset_freelist_init(&domainlist, 1);
+	nfree = 1;
 	LIST_INIT(&droplist);
-	nfree = 0;
 	for (;;) {
@@ -544,39 +1015,27 @@
 			break;
 		threads = p->p_numthreads;
 		PROC_UNLOCK(p);
-		for (; nfree < threads; nfree++) {
-			nset = uma_zalloc(cpuset_zone, M_WAITOK);
-			LIST_INSERT_HEAD(&freelist, nset, cs_link);
+		if (nfree < threads) {
+			cpuset_freelist_add(&freelist, threads - nfree);
+			domainset_freelist_add(&domainlist, threads - nfree);
+			nfree = threads;
 		}
 	}
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	/*
 	 * Now that the appropriate locks are held and we have enough cpusets,
-	 * make sure the operation will succeed before applying changes. The
+	 * make sure the operation will succeed before applying changes.  The
 	 * proc lock prevents td_cpuset from changing between calls.
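+	 * This is the usual two-pass shape: the first pass only tests, so
+	 * the second pass, which installs the preallocated sets, should
+	 * not need to allocate or sleep while thread locks are held.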
*/ error = 0; FOREACH_THREAD_IN_PROC(p, td) { thread_lock(td); - tdset = td->td_cpuset; - /* - * Verify that a new mask doesn't specify cpus outside of - * the set the thread is a member of. - */ - if (mask) { - if (tdset->cs_id == CPUSET_INVALID) - tdset = tdset->cs_parent; - if (!CPU_SUBSET(&tdset->cs_mask, mask)) - error = EDEADLK; - /* - * Verify that a new set won't leave an existing thread - * mask without a cpu to run on. It can, however, restrict - * the set. - */ - } else if (tdset->cs_id == CPUSET_INVALID) { - if (!CPU_OVERLAP(&set->cs_mask, &tdset->cs_mask)) - error = EDEADLK; - } + if (set != NULL) + error = cpuset_setproc_test_setthread(td->td_cpuset, + set); + else + error = cpuset_setproc_test_maskthread(td->td_cpuset, + mask, domain); thread_unlock(td); if (error) goto unlock_out; @@ -588,33 +1047,17 @@ */ FOREACH_THREAD_IN_PROC(p, td) { thread_lock(td); - /* - * If we presently have an anonymous set or are applying a - * mask we must create an anonymous shadow set. That is - * either parented to our existing base or the supplied set. - * - * If we have a base set with no anonymous shadow we simply - * replace it outright. - */ - tdset = td->td_cpuset; - if (tdset->cs_id == CPUSET_INVALID || mask) { - nset = LIST_FIRST(&freelist); - LIST_REMOVE(nset, cs_link); - if (mask) - error = cpuset_shadow(tdset, nset, mask); - else - error = _cpuset_create(nset, set, - &tdset->cs_mask, CPUSET_INVALID); - if (error) { - LIST_INSERT_HEAD(&freelist, nset, cs_link); - thread_unlock(td); - break; - } - } else - nset = cpuset_ref(set); - cpuset_rel_defer(&droplist, tdset); - td->td_cpuset = nset; - sched_affinity(td); + if (set != NULL) + error = cpuset_setproc_setthread(td->td_cpuset, set, + &nset, &freelist, &domainlist); + else + error = cpuset_setproc_maskthread(td->td_cpuset, mask, + domain, &nset, &freelist, &domainlist); + if (error) { + thread_unlock(td); + break; + } + cpuset_rel_defer(&droplist, cpuset_update_thread(td, nset)); thread_unlock(td); } unlock_out: @@ -622,10 +1065,8 @@ out: while ((nset = LIST_FIRST(&droplist)) != NULL) cpuset_rel_complete(nset); - while ((nset = LIST_FIRST(&freelist)) != NULL) { - LIST_REMOVE(nset, cs_link); - uma_zfree(cpuset_zone, nset); - } + cpuset_freelist_free(&freelist); + domainset_freelist_free(&domainlist); return (error); } @@ -690,46 +1131,57 @@ } /* - * Apply an anonymous mask to a single thread. + * Apply an anonymous mask or a domain to a single thread. */ -int -cpuset_setthread(lwpid_t id, cpuset_t *mask) +static int +_cpuset_setthread(lwpid_t id, cpuset_t *mask, struct domainset *domain) { + struct setlist cpusets; + struct domainlist domainlist; struct cpuset *nset; struct cpuset *set; struct thread *td; struct proc *p; int error; - nset = uma_zalloc(cpuset_zone, M_WAITOK); + cpuset_freelist_init(&cpusets, 1); + domainset_freelist_init(&domainlist, domain != NULL); error = cpuset_which(CPU_WHICH_TID, id, &p, &td, &set); if (error) goto out; set = NULL; thread_lock(td); - error = cpuset_shadow(td->td_cpuset, nset, mask); - if (error == 0) { - set = td->td_cpuset; - td->td_cpuset = nset; - sched_affinity(td); - nset = NULL; - } + error = cpuset_shadow(td->td_cpuset, &nset, mask, domain, + &cpusets, &domainlist); + if (error == 0) + set = cpuset_update_thread(td, nset); thread_unlock(td); PROC_UNLOCK(p); if (set) cpuset_rel(set); out: - if (nset) - uma_zfree(cpuset_zone, nset); + cpuset_freelist_free(&cpusets); + domainset_freelist_free(&domainlist); return (error); } /* + * Apply an anonymous mask to a single thread. 
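+ * This preserves the historical KPI; it simply wraps _cpuset_setthread()
+ * with no domain policy argument.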
+ */ +int +cpuset_setthread(lwpid_t id, cpuset_t *mask) +{ + + return _cpuset_setthread(id, mask, NULL); +} + +/* * Apply new cpumask to the ithread. */ int cpuset_setithread(lwpid_t id, int cpu) { + struct setlist cpusets; struct cpuset *nset, *rset; struct cpuset *parent, *old_set; struct thread *td; @@ -738,8 +1190,8 @@ cpuset_t mask; int error; - nset = uma_zalloc(cpuset_zone, M_WAITOK); - rset = uma_zalloc(cpuset_zone, M_WAITOK); + cpuset_freelist_init(&cpusets, 1); + rset = uma_zalloc(cpuset_zone, M_WAITOK | M_ZERO); cs_id = CPUSET_INVALID; CPU_ZERO(&mask); @@ -756,13 +1208,15 @@ old_set = td->td_cpuset; if (cpu == NOCPU) { + nset = LIST_FIRST(&cpusets); + LIST_REMOVE(nset, cs_link); /* * roll back to default set. We're not using cpuset_shadow() * here because we can fail CPU_SUBSET() check. This can happen * if default set does not contain all CPUs. */ - error = _cpuset_create(nset, cpuset_default, &mask, + error = _cpuset_create(nset, cpuset_default, &mask, NULL, CPUSET_INVALID); goto applyset; @@ -779,7 +1233,7 @@ * with any mask. */ error = _cpuset_create(rset, cpuset_zero, - &cpuset_zero->cs_mask, cs_id); + &cpuset_zero->cs_mask, NULL, cs_id); if (error != 0) { PROC_UNLOCK(p); goto out; @@ -794,22 +1248,19 @@ old_set = NULL; } - error = cpuset_shadow(parent, nset, &mask); + error = cpuset_shadow(parent, &nset, &mask, NULL, &cpusets, NULL); applyset: if (error == 0) { thread_lock(td); - td->td_cpuset = nset; - sched_affinity(td); + old_set = cpuset_update_thread(td, nset); thread_unlock(td); - nset = NULL; } else old_set = NULL; PROC_UNLOCK(p); if (old_set != NULL) cpuset_rel(old_set); out: - if (nset != NULL) - uma_zfree(cpuset_zone, nset); + cpuset_freelist_free(&cpusets); if (rset != NULL) uma_zfree(cpuset_zone, rset); if (cs_id != CPUSET_INVALID) @@ -817,7 +1268,24 @@ return (error); } +struct domainset domainset0; +void +domainset_zero(void) +{ + struct domainset *dset; + int i; + + mtx_init(&cpuset_lock, "cpuset", NULL, MTX_SPIN | MTX_RECURSE); + + dset = &domainset0; + DOMAINSET_ZERO(&dset->ds_mask); + for (i = 0; i < vm_ndomains; i++) + DOMAINSET_SET(i, &dset->ds_mask); + dset->ds_policy = DOMAINSET_POLICY_ROUNDROBIN; + curthread->td_domain.dr_policy = _domainset_create(dset, NULL); +} + /* * Creates system-wide cpusets and the cpuset for thread0 including two * sets: @@ -834,11 +1302,12 @@ cpuset_thread0(void) { struct cpuset *set; - int error, i; + int error; cpuset_zone = uma_zcreate("cpuset", sizeof(struct cpuset), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); - mtx_init(&cpuset_lock, "cpuset", NULL, MTX_SPIN | MTX_RECURSE); + domainset_zone = uma_zcreate("domainset", sizeof(struct domainset), + NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); /* * Create the root system set for the whole machine. Doesn't use @@ -850,14 +1319,15 @@ LIST_INSERT_HEAD(&cpuset_ids, set, cs_link); set->cs_ref = 1; set->cs_flags = CPU_SET_ROOT; + set->cs_domain = &domainset0; cpuset_zero = set; cpuset_root = &set->cs_mask; /* * Now derive a default, modifiable set from that to give out. */ - set = uma_zalloc(cpuset_zone, M_WAITOK); - error = _cpuset_create(set, cpuset_zero, &cpuset_zero->cs_mask, 1); + set = uma_zalloc(cpuset_zone, M_WAITOK | M_ZERO); + error = _cpuset_create(set, cpuset_zero, NULL, NULL, 1); KASSERT(error == 0, ("Error creating default set: %d\n", error)); cpuset_default = set; @@ -866,16 +1336,6 @@ */ cpuset_unr = new_unrhdr(2, INT_MAX, NULL); - /* - * If MD code has not initialized per-domain cpusets, place all - * CPUs in domain 0. 
- */ - for (i = 0; i < MAXMEMDOM; i++) - if (!CPU_EMPTY(&cpuset_domain[i])) - goto domains_set; - CPU_COPY(&all_cpus, &cpuset_domain[0]); -domains_set: - return (set); } @@ -920,7 +1380,7 @@ KASSERT(set != NULL, ("[%s:%d] invalid set", __func__, __LINE__)); cpuset_ref(set); - error = cpuset_setproc(p->p_pid, set, NULL); + error = cpuset_setproc(p->p_pid, set, NULL, NULL); if (error) return (error); cpuset_rel(set); @@ -935,11 +1395,23 @@ cpuset_init(void *arg) { cpuset_t mask; + int i; mask = all_cpus; if (cpuset_modify(cpuset_zero, &mask)) panic("Can't set initial cpuset mask.\n"); cpuset_zero->cs_flags |= CPU_SET_RDONLY; + + /* + * If MD code has not initialized per-domain cpusets, place all + * CPUs in domain 0. + */ + for (i = 0; i < MAXMEMDOM; i++) + if (!CPU_EMPTY(&cpuset_domain[i])) + goto domains_set; + CPU_COPY(&all_cpus, &cpuset_domain[0]); +domains_set: + return; } SYSINIT(cpuset, SI_SUB_SMP, SI_ORDER_ANY, cpuset_init, NULL); @@ -964,7 +1436,7 @@ return (error); error = copyout(&set->cs_id, uap->setid, sizeof(set->cs_id)); if (error == 0) - error = cpuset_setproc(-1, set, NULL); + error = cpuset_setproc(-1, set, NULL, NULL); cpuset_rel(set); return (error); } @@ -998,7 +1470,7 @@ set = cpuset_lookup(setid, td); if (set == NULL) return (ESRCH); - error = cpuset_setproc(id, set, NULL); + error = cpuset_setproc(id, set, NULL, NULL); cpuset_rel(set); return (error); } @@ -1285,7 +1757,7 @@ error = cpuset_setthread(id, mask); break; case CPU_WHICH_PID: - error = cpuset_setproc(id, NULL, mask); + error = cpuset_setproc(id, NULL, mask, NULL); break; case CPU_WHICH_CPUSET: case CPU_WHICH_JAIL: @@ -1314,25 +1786,303 @@ return (error); } +#ifndef _SYS_SYSPROTO_H_ +struct cpuset_getdomain_args { + cpulevel_t level; + cpuwhich_t which; + id_t id; + size_t domainsetsize; + domainset_t *mask; + int *policy; +}; +#endif +int +sys_cpuset_getdomain(struct thread *td, struct cpuset_getdomain_args *uap) +{ + + return (kern_cpuset_getdomain(td, uap->level, uap->which, + uap->id, uap->domainsetsize, uap->mask, uap->policy)); +} + +int +kern_cpuset_getdomain(struct thread *td, cpulevel_t level, cpuwhich_t which, + id_t id, size_t domainsetsize, domainset_t *maskp, int *policyp) +{ + struct thread *ttd; + struct cpuset *nset; + struct cpuset *set; + struct domainset *dset; + struct proc *p; + domainset_t *mask; + int policy; + int error; + size_t size; + + if (domainsetsize < sizeof(domainset_t) || + domainsetsize > DOMAINSET_MAXSIZE / NBBY) + return (ERANGE); + /* In Capability mode, you can only get your own domain set. */ + if (IN_CAPABILITY_MODE(td)) { + if (level != CPU_LEVEL_WHICH) + return (ECAPMODE); + if (which != CPU_WHICH_TID && which != CPU_WHICH_PID) + return (ECAPMODE); + if (id != -1) + return (ECAPMODE); + } + policy = 0; + size = domainsetsize; + mask = malloc(size, M_TEMP, M_WAITOK | M_ZERO); + error = cpuset_which(which, id, &p, &ttd, &set); + if (error) + goto out; + switch (level) { + case CPU_LEVEL_ROOT: + case CPU_LEVEL_CPUSET: + switch (which) { + case CPU_WHICH_TID: + case CPU_WHICH_PID: + thread_lock(ttd); + set = cpuset_ref(ttd->td_cpuset); + thread_unlock(ttd); + break; + case CPU_WHICH_CPUSET: + case CPU_WHICH_JAIL: + break; + case CPU_WHICH_IRQ: + case CPU_WHICH_INTRHANDLER: + case CPU_WHICH_ITHREAD: + case CPU_WHICH_DOMAIN: + error = EINVAL; + goto out; + } + if (level == CPU_LEVEL_ROOT) + nset = cpuset_refroot(set); + else + nset = cpuset_refbase(set); + /* Fetch once for a coherent result. 
+		 */
+		dset = nset->cs_domain;
+		DOMAINSET_COPY(&dset->ds_mask, mask);
+		policy = dset->ds_policy;
+		cpuset_rel(nset);
+		break;
+	case CPU_LEVEL_WHICH:
+		switch (which) {
+		case CPU_WHICH_TID:
+			thread_lock(ttd);
+			/* Fetch once for a coherent result. */
+			dset = ttd->td_cpuset->cs_domain;
+			DOMAINSET_COPY(&dset->ds_mask, mask);
+			policy = dset->ds_policy;
+			thread_unlock(ttd);
+			break;
+		case CPU_WHICH_PID:
+			FOREACH_THREAD_IN_PROC(p, ttd) {
+				thread_lock(ttd);
+				dset = ttd->td_cpuset->cs_domain;
+				/* Show all domains in the proc. */
+				DOMAINSET_OR(mask, &dset->ds_mask);
+				/* Last policy wins. */
+				policy = dset->ds_policy;
+				thread_unlock(ttd);
+			}
+			break;
+		case CPU_WHICH_CPUSET:
+		case CPU_WHICH_JAIL:
+			dset = set->cs_domain;
+			policy = dset->ds_policy;
+			DOMAINSET_OR(mask, &dset->ds_mask);
+			break;
+		case CPU_WHICH_IRQ:
+		case CPU_WHICH_INTRHANDLER:
+		case CPU_WHICH_ITHREAD:
+		case CPU_WHICH_DOMAIN:
+			error = EINVAL;
+			break;
+		}
+		break;
+	default:
+		error = EINVAL;
+		break;
+	}
+	if (set)
+		cpuset_rel(set);
+	if (p)
+		PROC_UNLOCK(p);
+	if (error == 0)
+		error = copyout(mask, maskp, size);
+	if (error == 0)
+		error = copyout(&policy, policyp, sizeof(*policyp));
+out:
+	free(mask, M_TEMP);
+	return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct cpuset_setdomain_args {
+	cpulevel_t	level;
+	cpuwhich_t	which;
+	id_t		id;
+	size_t		domainsetsize;
+	domainset_t	*mask;
+	int		policy;
+};
+#endif
+int
+sys_cpuset_setdomain(struct thread *td, struct cpuset_setdomain_args *uap)
+{
+
+	return (kern_cpuset_setdomain(td, uap->level, uap->which,
+	    uap->id, uap->domainsetsize, uap->mask, uap->policy));
+}
+
+int
+kern_cpuset_setdomain(struct thread *td, cpulevel_t level, cpuwhich_t which,
+    id_t id, size_t domainsetsize, const domainset_t *maskp, int policy)
+{
+	struct cpuset *nset;
+	struct cpuset *set;
+	struct thread *ttd;
+	struct proc *p;
+	struct domainset domain;
+	domainset_t *mask;
+	int error;
+
+	if (domainsetsize < sizeof(domainset_t) ||
+	    domainsetsize > DOMAINSET_MAXSIZE / NBBY)
+		return (ERANGE);
+	/* In Capability mode, you can only set your own domain set. */
+	if (IN_CAPABILITY_MODE(td)) {
+		if (level != CPU_LEVEL_WHICH)
+			return (ECAPMODE);
+		if (which != CPU_WHICH_TID && which != CPU_WHICH_PID)
+			return (ECAPMODE);
+		if (id != -1)
+			return (ECAPMODE);
+	}
+	memset(&domain, 0, sizeof(domain));
+	mask = malloc(domainsetsize, M_TEMP, M_WAITOK | M_ZERO);
+	error = copyin(maskp, mask, domainsetsize);
+	if (error)
+		goto out;
+	/*
+	 * Verify that no high bits are set.
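+	 * A user buffer may be larger than the kernel's domainset_t;
+	 * rejecting set bits beyond DOMAINSET_SETSIZE avoids silently
+	 * dropping domains the caller asked for.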
+ */ + if (domainsetsize > sizeof(domainset_t)) { + char *end; + char *cp; + + end = cp = (char *)&mask->__bits; + end += domainsetsize; + cp += sizeof(domainset_t); + while (cp != end) + if (*cp++ != 0) { + error = EINVAL; + goto out; + } + + } + DOMAINSET_COPY(mask, &domain.ds_mask); + domain.ds_policy = policy; + + switch (level) { + case CPU_LEVEL_ROOT: + case CPU_LEVEL_CPUSET: + error = cpuset_which(which, id, &p, &ttd, &set); + if (error) + break; + switch (which) { + case CPU_WHICH_TID: + case CPU_WHICH_PID: + thread_lock(ttd); + set = cpuset_ref(ttd->td_cpuset); + thread_unlock(ttd); + PROC_UNLOCK(p); + break; + case CPU_WHICH_CPUSET: + case CPU_WHICH_JAIL: + break; + case CPU_WHICH_IRQ: + case CPU_WHICH_INTRHANDLER: + case CPU_WHICH_ITHREAD: + case CPU_WHICH_DOMAIN: + error = EINVAL; + goto out; + } + if (level == CPU_LEVEL_ROOT) + nset = cpuset_refroot(set); + else + nset = cpuset_refbase(set); + error = cpuset_modify_domain(nset, &domain); + cpuset_rel(nset); + cpuset_rel(set); + break; + case CPU_LEVEL_WHICH: + switch (which) { + case CPU_WHICH_TID: + error = _cpuset_setthread(id, NULL, &domain); + break; + case CPU_WHICH_PID: + error = cpuset_setproc(id, NULL, NULL, &domain); + break; + case CPU_WHICH_CPUSET: + case CPU_WHICH_JAIL: + error = cpuset_which(which, id, &p, &ttd, &set); + if (error == 0) { + error = cpuset_modify_domain(set, &domain); + cpuset_rel(set); + } + break; + case CPU_WHICH_IRQ: + case CPU_WHICH_INTRHANDLER: + case CPU_WHICH_ITHREAD: + default: + error = EINVAL; + break; + } + break; + default: + error = EINVAL; + break; + } +out: + free(mask, M_TEMP); + return (error); +} + #ifdef DDB -void -ddb_display_cpuset(const cpuset_t *set) +BITSET_DEFINE(bitset, 1); +static void +ddb_display_bitset(const struct bitset *set, int size) { - int cpu, once; + int bit, once; - for (once = 0, cpu = 0; cpu < CPU_SETSIZE; cpu++) { - if (CPU_ISSET(cpu, set)) { + for (once = 0, bit = 0; bit < size; bit++) { + if (CPU_ISSET(bit, set)) { if (once == 0) { - db_printf("%d", cpu); + db_printf("%d", bit); once = 1; } else - db_printf(",%d", cpu); + db_printf(",%d", bit); } } if (once == 0) db_printf(""); } +void +ddb_display_cpuset(const cpuset_t *set) +{ + ddb_display_bitset((const struct bitset *)set, CPU_SETSIZE); +} + +static void +ddb_display_domainset(const domainset_t *set) +{ + ddb_display_bitset((const struct bitset *)set, DOMAINSET_SETSIZE); +} + DB_SHOW_COMMAND(cpusets, db_show_cpusets) { struct cpuset *set; @@ -1341,8 +2091,11 @@ db_printf("set=%p id=%-6u ref=%-6d flags=0x%04x parent id=%d\n", set, set->cs_id, set->cs_ref, set->cs_flags, (set->cs_parent != NULL) ? set->cs_parent->cs_id : 0); - db_printf(" mask="); + db_printf(" cpu mask="); ddb_display_cpuset(&set->cs_mask); + db_printf("\n"); + db_printf(" domain mask="); + ddb_display_domainset(&set->cs_domain->ds_mask); db_printf("\n"); if (db_pager_quit) break; Index: kern/kern_exit.c =================================================================== --- kern/kern_exit.c +++ kern/kern_exit.c @@ -931,10 +931,6 @@ #ifdef MAC mac_proc_destroy(p); #endif - /* - * Free any domain policy that's still hiding around. 
- */ - vm_domain_policy_cleanup(&p->p_vm_dom_policy); KASSERT(FIRST_THREAD_IN_PROC(p), ("proc_reap: no residual thread!")); Index: kern/kern_fork.c =================================================================== --- kern/kern_fork.c +++ kern/kern_fork.c @@ -512,14 +512,6 @@ if (p1->p_flag & P_PROFIL) startprofclock(p2); - /* - * Whilst the proc lock is held, copy the VM domain data out - * using the VM domain method. - */ - vm_domain_policy_init(&p2->p_vm_dom_policy); - vm_domain_policy_localcopy(&p2->p_vm_dom_policy, - &p1->p_vm_dom_policy); - if (fr->fr_flags & RFSIGSHARE) { p2->p_sigacts = sigacts_hold(p1->p_sigacts); } else { Index: kern/kern_numa.c =================================================================== --- kern/kern_numa.c +++ kern/kern_numa.c @@ -31,139 +31,16 @@ #include #include #include -#include #include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - int sys_numa_setaffinity(struct thread *td, struct numa_setaffinity_args *uap) { - int error; - struct vm_domain_policy vp; - struct thread *ttd; - struct proc *p; - struct cpuset *set; - - set = NULL; - p = NULL; - - /* - * Copy in just the policy information into the policy - * struct. Userland only supplies vm_domain_policy_entry. - */ - error = copyin(uap->policy, &vp.p, sizeof(vp.p)); - if (error) - goto out; - - /* - * Ensure the seq number is zero - otherwise seq.h - * may get very confused. - */ - vp.seq = 0; - - /* - * Validate policy. - */ - if (vm_domain_policy_validate(&vp) != 0) { - error = EINVAL; - goto out; - } - - /* - * Go find the desired proc/tid for this operation. - */ - error = cpuset_which(uap->which, uap->id, &p, - &ttd, &set); - if (error) - goto out; - - /* Only handle CPU_WHICH_TID and CPU_WHICH_PID */ - /* - * XXX if cpuset_which is called with WHICH_CPUSET and NULL cpuset, - * it'll return ESRCH. We should just return EINVAL. - */ - switch (uap->which) { - case CPU_WHICH_TID: - vm_domain_policy_copy(&ttd->td_vm_dom_policy, &vp); - break; - case CPU_WHICH_PID: - vm_domain_policy_copy(&p->p_vm_dom_policy, &vp); - break; - default: - error = EINVAL; - break; - } - - PROC_UNLOCK(p); -out: - if (set) - cpuset_rel(set); - return (error); + return (ENOSYS); } int sys_numa_getaffinity(struct thread *td, struct numa_getaffinity_args *uap) { - int error; - struct vm_domain_policy vp; - struct thread *ttd; - struct proc *p; - struct cpuset *set; - - set = NULL; - p = NULL; - - error = cpuset_which(uap->which, uap->id, &p, - &ttd, &set); - if (error) - goto out; - - /* Only handle CPU_WHICH_TID and CPU_WHICH_PID */ - /* - * XXX if cpuset_which is called with WHICH_CPUSET and NULL cpuset, - * it'll return ESRCH. We should just return EINVAL. - */ - switch (uap->which) { - case CPU_WHICH_TID: - vm_domain_policy_localcopy(&vp, &ttd->td_vm_dom_policy); - break; - case CPU_WHICH_PID: - vm_domain_policy_localcopy(&vp, &p->p_vm_dom_policy); - break; - default: - error = EINVAL; - break; - } - if (p) - PROC_UNLOCK(p); - /* - * Copy out only the vm_domain_policy_entry part. 
- */ - if (error == 0) - error = copyout(&vp.p, uap->policy, sizeof(vp.p)); -out: - if (set) - cpuset_rel(set); - return (error); + return (ENOSYS); } Index: kern/kern_thr.c =================================================================== --- kern/kern_thr.c +++ kern/kern_thr.c @@ -260,12 +260,6 @@ if (p->p_ptevents & PTRACE_LWP) newtd->td_dbgflags |= TDB_BORN; - /* - * Copy the existing thread VM policy into the new thread. - */ - vm_domain_policy_localcopy(&newtd->td_vm_dom_policy, - &td->td_vm_dom_policy); - PROC_UNLOCK(p); tidhash_add(newtd); Index: kern/kern_thread.c =================================================================== --- kern/kern_thread.c +++ kern/kern_thread.c @@ -78,13 +78,13 @@ * structures. */ #ifdef __amd64__ -_Static_assert(offsetof(struct thread, td_flags) == 0xf4, +_Static_assert(offsetof(struct thread, td_flags) == 0xfc, "struct thread KBI td_flags"); -_Static_assert(offsetof(struct thread, td_pflags) == 0xfc, +_Static_assert(offsetof(struct thread, td_pflags) == 0x104, "struct thread KBI td_pflags"); -_Static_assert(offsetof(struct thread, td_frame) == 0x460, +_Static_assert(offsetof(struct thread, td_frame) == 0x468, "struct thread KBI td_frame"); -_Static_assert(offsetof(struct thread, td_emuldata) == 0x508, +_Static_assert(offsetof(struct thread, td_emuldata) == 0x510, "struct thread KBI td_emuldata"); _Static_assert(offsetof(struct proc, p_flag) == 0xb0, "struct proc KBI p_flag"); @@ -98,13 +98,13 @@ "struct proc KBI p_emuldata"); #endif #ifdef __i386__ -_Static_assert(offsetof(struct thread, td_flags) == 0x9c, +_Static_assert(offsetof(struct thread, td_flags) == 0x100, "struct thread KBI td_flags"); -_Static_assert(offsetof(struct thread, td_pflags) == 0xa4, +_Static_assert(offsetof(struct thread, td_pflags) == 0xa8, "struct thread KBI td_pflags"); -_Static_assert(offsetof(struct thread, td_frame) == 0x2ec, +_Static_assert(offsetof(struct thread, td_frame) == 0x2f0, "struct thread KBI td_frame"); -_Static_assert(offsetof(struct thread, td_emuldata) == 0x338, +_Static_assert(offsetof(struct thread, td_emuldata) == 0x33c, "struct thread KBI td_emuldata"); _Static_assert(offsetof(struct proc, p_flag) == 0x68, "struct proc KBI p_flag"); @@ -413,7 +413,6 @@ return (NULL); } cpu_thread_alloc(td); - vm_domain_policy_init(&td->td_vm_dom_policy); return (td); } @@ -443,7 +442,6 @@ cpu_thread_free(td); if (td->td_kstack != 0) vm_thread_dispose(td); - vm_domain_policy_cleanup(&td->td_vm_dom_policy); callout_drain(&td->td_slpcallout); uma_zfree(thread_zone, td); } Index: kern/makesyscalls.sh =================================================================== --- kern/makesyscalls.sh +++ kern/makesyscalls.sh @@ -139,6 +139,7 @@ printf "#include \n" > sysarg printf "#include \n" > sysarg printf "#include \n" > sysarg + printf "#include \n" > sysarg printf "#include \n" > sysarg printf "#include \n" > sysarg printf "#include \n" > sysarg Index: kern/sched_4bsd.c =================================================================== --- kern/sched_4bsd.c +++ kern/sched_4bsd.c @@ -781,6 +781,7 @@ childtd->td_lastcpu = NOCPU; childtd->td_lock = &sched_lock; childtd->td_cpuset = cpuset_ref(td->td_cpuset); + child->td_domain.dr_policy = td->td_cpuset->cs_domain; childtd->td_priority = childtd->td_base_pri; ts = td_get_sched(childtd); bzero(ts, sizeof(*ts)); Index: kern/sched_ule.c =================================================================== --- kern/sched_ule.c +++ kern/sched_ule.c @@ -2131,6 +2131,7 @@ child->td_lastcpu = NOCPU; child->td_lock = TDQ_LOCKPTR(tdq); 
child->td_cpuset = cpuset_ref(td->td_cpuset); + child->td_domain.dr_policy = td->td_cpuset->cs_domain; ts2->ts_cpu = ts->ts_cpu; ts2->ts_flags = 0; /* Index: kern/syscalls.c =================================================================== --- kern/syscalls.c +++ kern/syscalls.c @@ -567,4 +567,6 @@ "fhstatfs", /* 558 = fhstatfs */ "mknodat", /* 559 = mknodat */ "kevent", /* 560 = kevent */ + "cpuset_getdomain", /* 561 = cpuset_getdomain */ + "cpuset_setdomain", /* 562 = cpuset_setdomain */ }; Index: kern/syscalls.master =================================================================== --- kern/syscalls.master +++ kern/syscalls.master @@ -1023,6 +1023,14 @@ struct kevent *changelist, int nchanges, \ struct kevent *eventlist, int nevents, \ const struct timespec *timeout); } +561 AUE_NULL STD { int cpuset_getdomain(cpulevel_t level, \ + cpuwhich_t which, id_t id, \ + size_t domainsetsize, domainset_t *mask, \ + int *policy); } +562 AUE_NULL STD { int cpuset_setdomain(cpulevel_t level, \ + cpuwhich_t which, id_t id, \ + size_t domainsetsize, domainset_t *mask, \ + int policy); } ; Please copy any additions and changes to the following compatability tables: ; sys/compat/freebsd32/syscalls.master Index: kern/systrace_args.c =================================================================== --- kern/systrace_args.c +++ kern/systrace_args.c @@ -3276,6 +3276,30 @@ *n_args = 6; break; } + /* cpuset_getdomain */ + case 561: { + struct cpuset_getdomain_args *p = params; + iarg[0] = p->level; /* cpulevel_t */ + iarg[1] = p->which; /* cpuwhich_t */ + iarg[2] = p->id; /* id_t */ + uarg[3] = p->domainsetsize; /* size_t */ + uarg[4] = (intptr_t) p->mask; /* domainset_t * */ + uarg[5] = (intptr_t) p->policy; /* int * */ + *n_args = 6; + break; + } + /* cpuset_setdomain */ + case 562: { + struct cpuset_setdomain_args *p = params; + iarg[0] = p->level; /* cpulevel_t */ + iarg[1] = p->which; /* cpuwhich_t */ + iarg[2] = p->id; /* id_t */ + uarg[3] = p->domainsetsize; /* size_t */ + uarg[4] = (intptr_t) p->mask; /* domainset_t * */ + iarg[5] = p->policy; /* int */ + *n_args = 6; + break; + } default: *n_args = 0; break; @@ -8728,6 +8752,56 @@ break; }; break; + /* cpuset_getdomain */ + case 561: + switch(ndx) { + case 0: + p = "cpulevel_t"; + break; + case 1: + p = "cpuwhich_t"; + break; + case 2: + p = "id_t"; + break; + case 3: + p = "size_t"; + break; + case 4: + p = "userland domainset_t *"; + break; + case 5: + p = "userland int *"; + break; + default: + break; + }; + break; + /* cpuset_setdomain */ + case 562: + switch(ndx) { + case 0: + p = "cpulevel_t"; + break; + case 1: + p = "cpuwhich_t"; + break; + case 2: + p = "id_t"; + break; + case 3: + p = "size_t"; + break; + case 4: + p = "userland domainset_t *"; + break; + case 5: + p = "int"; + break; + default: + break; + }; + break; default: break; }; @@ -10611,6 +10685,16 @@ break; /* kevent */ case 560: + if (ndx == 0 || ndx == 1) + p = "int"; + break; + /* cpuset_getdomain */ + case 561: + if (ndx == 0 || ndx == 1) + p = "int"; + break; + /* cpuset_setdomain */ + case 562: if (ndx == 0 || ndx == 1) p = "int"; break; Index: sys/_domainset.h =================================================================== --- sys/_domainset.h +++ sys/_domainset.h @@ -0,0 +1,62 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2017, Jeffrey Roberson + * All rights reserved. 
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _SYS__DOMAINSET_H_
+#define	_SYS__DOMAINSET_H_
+
+#include <sys/_bitset.h>
+
+#ifdef _KERNEL
+#define	DOMAINSET_SETSIZE	MAXMEMDOM
+#endif
+
+#define	DOMAINSET_MAXSIZE	256
+
+#ifndef DOMAINSET_SETSIZE
+#define	DOMAINSET_SETSIZE	DOMAINSET_MAXSIZE
+#endif
+
+BITSET_DEFINE(_domainset, DOMAINSET_SETSIZE);
+typedef struct _domainset domainset_t;
+
+#ifdef _KERNEL
+/*
+ * This structure is intended to be embedded in objects which have policy
+ * attributes.  Each object keeps its own iterator so round-robin is
+ * synchronized and accurate.
+ */
+struct domainset;
+struct domainset_ref {
+	struct domainset * volatile dr_policy;
+	int			dr_iterator;
+};
+#endif
+
+#endif /* !_SYS__DOMAINSET_H_ */
Index: sys/cpuset.h
===================================================================
--- sys/cpuset.h
+++ sys/cpuset.h
@@ -112,6 +112,7 @@
  */
 struct cpuset {
 	cpuset_t		cs_mask;	/* bitmask of valid cpus. */
+	struct domainset	*cs_domain;	/* (c) NUMA policy. */
 	volatile u_int		cs_ref;		/* (a) Reference count. */
 	int			cs_flags;	/* (s) Flags from below. */
 	cpusetid_t		cs_id;		/* (s) Id or INVALID. */
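The two new headers follow the _cpuset.h/cpuset.h split: sys/_domainset.h
carries only the domainset_t type so other headers can embed it, while
sys/domainset.h adds the BIT_*-backed manipulation macros and the
kernel-only struct domainset.  A userland sketch of the interface they
support (illustrative only; it assumes the usual libc stub for the new
syscall and a machine with a memory domain 0):

	domainset_t mask;

	DOMAINSET_ZERO(&mask);
	DOMAINSET_SET(0, &mask);
	if (cpuset_setdomain(CPU_LEVEL_WHICH, CPU_WHICH_PID, -1,
	    sizeof(mask), &mask, DOMAINSET_POLICY_FIRSTTOUCH) != 0)
		err(1, "cpuset_setdomain");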
Index: sys/domainset.h
===================================================================
--- sys/domainset.h
+++ sys/domainset.h
@@ -0,0 +1,95 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2017, Jeffrey Roberson
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _SYS_DOMAINSET_H_
+#define	_SYS_DOMAINSET_H_
+
+#include <sys/_domainset.h>
+
+#include <sys/bitset.h>
+
+#define	_NDOMAINSETBITS			_BITSET_BITS
+#define	_NDOMAINSETWORDS		__bitset_words(DOMAINSET_SETSIZE)
+
+#define	DOMAINSETBUFSIZ	((2 + sizeof(long) * 2) * _NDOMAINSETWORDS)
+
+#define	DOMAINSET_CLR(n, p)		BIT_CLR(DOMAINSET_SETSIZE, n, p)
+#define	DOMAINSET_COPY(f, t)		BIT_COPY(DOMAINSET_SETSIZE, f, t)
+#define	DOMAINSET_ISSET(n, p)		BIT_ISSET(DOMAINSET_SETSIZE, n, p)
+#define	DOMAINSET_SET(n, p)		BIT_SET(DOMAINSET_SETSIZE, n, p)
+#define	DOMAINSET_ZERO(p)		BIT_ZERO(DOMAINSET_SETSIZE, p)
+#define	DOMAINSET_FILL(p)		BIT_FILL(DOMAINSET_SETSIZE, p)
+#define	DOMAINSET_SETOF(n, p)		BIT_SETOF(DOMAINSET_SETSIZE, n, p)
+#define	DOMAINSET_EMPTY(p)		BIT_EMPTY(DOMAINSET_SETSIZE, p)
+#define	DOMAINSET_ISFULLSET(p)		BIT_ISFULLSET(DOMAINSET_SETSIZE, p)
+#define	DOMAINSET_SUBSET(p, c)		BIT_SUBSET(DOMAINSET_SETSIZE, p, c)
+#define	DOMAINSET_OVERLAP(p, c)		BIT_OVERLAP(DOMAINSET_SETSIZE, p, c)
+#define	DOMAINSET_CMP(p, c)		BIT_CMP(DOMAINSET_SETSIZE, p, c)
+#define	DOMAINSET_OR(d, s)		BIT_OR(DOMAINSET_SETSIZE, d, s)
+#define	DOMAINSET_AND(d, s)		BIT_AND(DOMAINSET_SETSIZE, d, s)
+#define	DOMAINSET_NAND(d, s)		BIT_NAND(DOMAINSET_SETSIZE, d, s)
+#define	DOMAINSET_CLR_ATOMIC(n, p)	BIT_CLR_ATOMIC(DOMAINSET_SETSIZE, n, p)
+#define	DOMAINSET_SET_ATOMIC(n, p)	BIT_SET_ATOMIC(DOMAINSET_SETSIZE, n, p)
+#define	DOMAINSET_SET_ATOMIC_ACQ(n, p)					\
+	    BIT_SET_ATOMIC_ACQ(DOMAINSET_SETSIZE, n, p)
+#define	DOMAINSET_AND_ATOMIC(n, p)	BIT_AND_ATOMIC(DOMAINSET_SETSIZE, n, p)
+#define	DOMAINSET_OR_ATOMIC(d, s)	BIT_OR_ATOMIC(DOMAINSET_SETSIZE, d, s)
+#define	DOMAINSET_COPY_STORE_REL(f, t)					\
+	    BIT_COPY_STORE_REL(DOMAINSET_SETSIZE, f, t)
+#define	DOMAINSET_FFS(p)		BIT_FFS(DOMAINSET_SETSIZE, p)
+#define	DOMAINSET_FLS(p)		BIT_FLS(DOMAINSET_SETSIZE, p)
+#define	DOMAINSET_COUNT(p)		BIT_COUNT(DOMAINSET_SETSIZE, p)
+#define	DOMAINSET_FSET			BITSET_FSET(_NDOMAINSETWORDS)
+#define	DOMAINSET_T_INITIALIZER		BITSET_T_INITIALIZER
+
+#define	DOMAINSET_POLICY_INVALID	0
+#define	DOMAINSET_POLICY_ROUNDROBIN	1
+#define	DOMAINSET_POLICY_FIRSTTOUCH	2
+#define	DOMAINSET_POLICY_MAX		DOMAINSET_POLICY_FIRSTTOUCH
+
+#ifdef _KERNEL
+#include <sys/queue.h>
+LIST_HEAD(domainlist, domainset);
+
+struct domainset {
+	LIST_ENTRY(domainset)	ds_link;
+	domainset_t	ds_mask;	/* allowed domains. */
+	uint16_t	ds_cnt;		/* popcnt from above. */
+	uint16_t	ds_max;		/* Maximum domain in set. */
+	uint16_t	ds_policy;	/* Policy type. */
+};
+
+void domainset_zero(void);
+
+#else
+__BEGIN_DECLS
+__END_DECLS
+#endif
+#endif /* !_SYS_DOMAINSET_H_ */
Index: sys/param.h
===================================================================
--- sys/param.h
+++ sys/param.h
@@ -60,7 +60,7 @@
  * in the range 5 to 9.
*/ #undef __FreeBSD_version -#define __FreeBSD_version 1200054 /* Master, propagated to newvers */ +#define __FreeBSD_version 1200055 /* Master, propagated to newvers */ /* * __FreeBSD_kernel__ indicates that this system uses the kernel of FreeBSD, Index: sys/proc.h =================================================================== --- sys/proc.h +++ sys/proc.h @@ -40,6 +40,7 @@ #ifndef _SYS_PROC_H_ #define _SYS_PROC_H_ +#include #include /* For struct callout. */ #include /* For struct klist. */ #include @@ -63,9 +64,11 @@ #else #include #endif +#include #include #include -#include +#include +#include #include /* Machine-dependent proc substruct. */ /* @@ -220,12 +223,12 @@ TAILQ_ENTRY(thread) td_lockq; /* (t) Lock queue. */ LIST_ENTRY(thread) td_hash; /* (d) Hash chain. */ struct cpuset *td_cpuset; /* (t) CPU affinity mask. */ + struct domainset_ref td_domain; /* (a) NUMA policy */ struct seltd *td_sel; /* Select queue/channel. */ struct sleepqueue *td_sleepqueue; /* (k) Associated sleep queue. */ struct turnstile *td_turnstile; /* (k) Associated turnstile. */ struct rl_q_entry *td_rlqe; /* (k) Associated range lock entry. */ struct umtx_q *td_umtxq; /* (c?) Link for when we're blocked. */ - struct vm_domain_policy td_vm_dom_policy; /* (c) current numa domain policy */ lwpid_t td_tid; /* (b) Thread ID. */ sigqueue_t td_sigqueue; /* (c) Sigs arrived, not delivered. */ #define td_siglist td_sigqueue.sq_signals @@ -284,7 +287,6 @@ pid_t td_dbg_forked; /* (c) Child pid for debugger. */ u_int td_vp_reserv; /* (k) Count of reserved vnodes. */ int td_no_sleeping; /* (k) Sleeping disabled count. */ - int td_dom_rr_idx; /* (k) RR Numa domain selection. */ void *td_su; /* (k) FFS SU private */ sbintime_t td_sleeptimo; /* (t) Sleep timeout. */ int td_rtcgen; /* (s) rtc_generation of abs. sleep */ @@ -653,7 +655,6 @@ uint64_t p_prev_runtime; /* (c) Resource usage accounting. */ struct racct *p_racct; /* (b) Resource accounting. */ int p_throttled; /* (c) Flag for racct pcpu throttling */ - struct vm_domain_policy p_vm_dom_policy; /* (c) process default VM domain, or -1 */ /* * An orphan is the child that has beed re-parented to the * debugger as a result of attaching to it. 
Need to keep Index: sys/syscall.h =================================================================== --- sys/syscall.h +++ sys/syscall.h @@ -478,4 +478,6 @@ #define SYS_fhstatfs 558 #define SYS_mknodat 559 #define SYS_kevent 560 -#define SYS_MAXSYSCALL 561 +#define SYS_cpuset_getdomain 561 +#define SYS_cpuset_setdomain 562 +#define SYS_MAXSYSCALL 563 Index: sys/syscall.mk =================================================================== --- sys/syscall.mk +++ sys/syscall.mk @@ -405,4 +405,6 @@ getfsstat.o \ fhstatfs.o \ mknodat.o \ - kevent.o + kevent.o \ + cpuset_getdomain.o \ + cpuset_setdomain.o Index: sys/syscallsubr.h =================================================================== --- sys/syscallsubr.h +++ sys/syscallsubr.h @@ -36,6 +36,7 @@ #include #include #include +#include struct file; struct filecaps; @@ -96,6 +97,12 @@ int kern_cpuset_setaffinity(struct thread *td, cpulevel_t level, cpuwhich_t which, id_t id, size_t cpusetsize, const cpuset_t *maskp); +int kern_cpuset_getdomain(struct thread *td, cpulevel_t level, + cpuwhich_t which, id_t id, size_t domainsetsize, + domainset_t *maskp, int *policyp); +int kern_cpuset_setdomain(struct thread *td, cpulevel_t level, + cpuwhich_t which, id_t id, size_t domainsetsize, + const domainset_t *maskp, int policy); int kern_cpuset_getid(struct thread *td, cpulevel_t level, cpuwhich_t which, id_t id, cpusetid_t *setid); int kern_cpuset_setid(struct thread *td, cpuwhich_t which, Index: sys/sysproto.h =================================================================== --- sys/sysproto.h +++ sys/sysproto.h @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -1761,6 +1762,22 @@ char nevents_l_[PADL_(int)]; int nevents; char nevents_r_[PADR_(int)]; char timeout_l_[PADL_(const struct timespec *)]; const struct timespec * timeout; char timeout_r_[PADR_(const struct timespec *)]; }; +struct cpuset_getdomain_args { + char level_l_[PADL_(cpulevel_t)]; cpulevel_t level; char level_r_[PADR_(cpulevel_t)]; + char which_l_[PADL_(cpuwhich_t)]; cpuwhich_t which; char which_r_[PADR_(cpuwhich_t)]; + char id_l_[PADL_(id_t)]; id_t id; char id_r_[PADR_(id_t)]; + char domainsetsize_l_[PADL_(size_t)]; size_t domainsetsize; char domainsetsize_r_[PADR_(size_t)]; + char mask_l_[PADL_(domainset_t *)]; domainset_t * mask; char mask_r_[PADR_(domainset_t *)]; + char policy_l_[PADL_(int *)]; int * policy; char policy_r_[PADR_(int *)]; +}; +struct cpuset_setdomain_args { + char level_l_[PADL_(cpulevel_t)]; cpulevel_t level; char level_r_[PADR_(cpulevel_t)]; + char which_l_[PADL_(cpuwhich_t)]; cpuwhich_t which; char which_r_[PADR_(cpuwhich_t)]; + char id_l_[PADL_(id_t)]; id_t id; char id_r_[PADR_(id_t)]; + char domainsetsize_l_[PADL_(size_t)]; size_t domainsetsize; char domainsetsize_r_[PADR_(size_t)]; + char mask_l_[PADL_(domainset_t *)]; domainset_t * mask; char mask_r_[PADR_(domainset_t *)]; + char policy_l_[PADL_(int)]; int policy; char policy_r_[PADR_(int)]; +}; int nosys(struct thread *, struct nosys_args *); void sys_sys_exit(struct thread *, struct sys_exit_args *); int sys_fork(struct thread *, struct fork_args *); @@ -2141,6 +2158,8 @@ int sys_fhstatfs(struct thread *, struct fhstatfs_args *); int sys_mknodat(struct thread *, struct mknodat_args *); int sys_kevent(struct thread *, struct kevent_args *); +int sys_cpuset_getdomain(struct thread *, struct cpuset_getdomain_args *); +int sys_cpuset_setdomain(struct thread *, struct cpuset_setdomain_args *); #ifdef COMPAT_43 @@ -3033,6 +3052,8 @@ #define SYS_AUE_fhstatfs 
Index: vm/uma_core.c
===================================================================
--- vm/uma_core.c
+++ vm/uma_core.c
@@ -78,7 +78,6 @@
 #include
 #include
-#include
 #include
 #include
 #include
Index: vm/vm_domainset.h
===================================================================
--- vm/vm_domainset.h
+++ vm/vm_domainset.h
@@ -0,0 +1,47 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2017, Jeffrey Roberson
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+#ifndef __VM_DOMAINSET_H__
+#define __VM_DOMAINSET_H__
+
+struct vm_domainset_iter {
+	struct domainset	*di_domain;
+	int			*di_iter;
+	int			di_flags;
+	int			di_n;
+};
+
+int	vm_domainset_iter_page(struct vm_domainset_iter *, int *, int *);
+void	vm_domainset_iter_page_init(struct vm_domainset_iter *,
+	    struct vm_object *, int *, int *);
+int	vm_domainset_iter_malloc(struct vm_domainset_iter *, int *, int *);
+void	vm_domainset_iter_malloc_init(struct vm_domainset_iter *,
+	    struct vm_object *, int *, int *);
+
+#endif /* __VM_DOMAINSET_H__ */
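The contract of this header is easiest to read from a call site. Every allocator converted later in this diff follows the same shape: the init function captures the caller's request, rewrites it to a non-blocking variant, and picks a starting domain; the step function walks the remaining domains and, once all have refused a non-blocking attempt, restores the original (possibly blocking) request for a final pass. A sketch of the pattern, mirroring the vm_page.c hunks below (object, pindex, req and mpred stand for the arguments of vm_page_alloc_after()):

	struct vm_domainset_iter di;
	vm_page_t m;
	int domain;

	vm_domainset_iter_page_init(&di, object, &domain, &req);
	do {
		m = vm_page_alloc_domain_after(object, pindex, domain, req,
		    mpred);
		if (m != NULL)
			break;
	} while (vm_domainset_iter_page(&di, &domain, &req) == 0);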
Index: vm/vm_domainset.c
===================================================================
--- vm/vm_domainset.c
+++ vm/vm_domainset.c
@@ -0,0 +1,214 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2017, Jeffrey Roberson
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include
+__FBSDID("$FreeBSD$");
+
+#include "opt_vm.h"
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+/*
+ * Iterators are written such that the first nowait pass has as short a
+ * codepath as possible to eliminate bloat from the allocator.  It is
+ * assumed that most allocations are successful.
+ */
+
+/*
+ * Determine which policy is to be used for this allocation.
+ */
+static void
+vm_domainset_iter_domain(struct vm_domainset_iter *di, struct vm_object *obj)
+{
+	struct domainset *domain;
+
+	/*
+	 * Object policy takes precedence over thread policy.  The policies
+	 * are immutable and unsynchronized.  Updates can race but pointer
+	 * loads are assumed to be atomic.
+	 */
+	if (obj != NULL && (domain = obj->domain.dr_policy) != NULL) {
+		di->di_domain = domain;
+		di->di_iter = &obj->domain.dr_iterator;
+	} else {
+		di->di_domain = curthread->td_domain.dr_policy;
+		di->di_iter = &curthread->td_domain.dr_iterator;
+	}
+}
+
+static void
+vm_domainset_iter_rr(struct vm_domainset_iter *di, int *domain)
+{
+	int d;
+
+	d = *di->di_iter;
+	do {
+		d = (d + 1) % di->di_domain->ds_max;
+	} while (!DOMAINSET_ISSET(d, &di->di_domain->ds_mask));
+	*di->di_iter = *domain = d;
+}
+
+static void
+vm_domainset_iter_next(struct vm_domainset_iter *di, int *domain)
+{
+
+	switch (di->di_domain->ds_policy) {
+	case DOMAINSET_POLICY_FIRSTTOUCH:
+		/* FALLTHROUGH */
+	case DOMAINSET_POLICY_ROUNDROBIN:
+		vm_domainset_iter_rr(di, domain);
+		break;
+	default:
+		panic("vm_domainset_iter_next: Unknown policy %d",
+		    di->di_domain->ds_policy);
+	}
+	di->di_n++;
+	KASSERT(*domain < vm_ndomains,
+	    ("vm_domainset_iter_next: Invalid domain %d", *domain));
+}
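A worked example of the round-robin step above: with ds_max = 4 and ds_mask = {0, 2, 3}, a cursor at 0 advances 0 -> 1 (skipped, not in the mask) -> 2, and subsequent calls yield 3, then 0. Scanning modulo ds_max rather than ds_cnt means holes in the mask only cost extra loop iterations, and writing the result back through di->di_iter persists the rotation across allocations that share the same domainset_ref. This does assume a non-empty ds_mask; an all-zero mask would loop forever, which the cpuset validation in kern_cpuset.c is presumably expected to rule out.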
+static void
+vm_domainset_iter_first(struct vm_domainset_iter *di, int *domain)
+{
+
+	di->di_n = 0;
+	switch (di->di_domain->ds_policy) {
+	case DOMAINSET_POLICY_FIRSTTOUCH:
+		*domain = PCPU_GET(domain);
+		/*
+		 * To prevent impossible allocations we convert an invalid
+		 * first-touch to round-robin.
+		 */
+		if (DOMAINSET_ISSET(*domain, &di->di_domain->ds_mask))
+			break;
+		/* FALLTHROUGH */
+	case DOMAINSET_POLICY_ROUNDROBIN:
+		vm_domainset_iter_rr(di, domain);
+		break;
+	default:
+		panic("vm_domainset_iter_first: Unknown policy %d",
+		    di->di_domain->ds_policy);
+	}
+	KASSERT(*domain < vm_ndomains,
+	    ("vm_domainset_iter_first: Invalid domain %d", *domain));
+}
+
+void
+vm_domainset_iter_page_init(struct vm_domainset_iter *di, struct vm_object *obj,
+    int *domain, int *req)
+{
+
+	vm_domainset_iter_domain(di, obj);
+	di->di_flags = *req;
+	*req = (di->di_flags & ~(VM_ALLOC_WAITOK | VM_ALLOC_WAITFAIL)) |
+	    VM_ALLOC_NOWAIT;
+	vm_domainset_iter_first(di, domain);
+}
+
+int
+vm_domainset_iter_page(struct vm_domainset_iter *di, int *domain, int *req)
+{
+
+	/*
+	 * If we exhausted all options with NOWAIT and did a WAITFAIL it
+	 * is time to return an error to the caller.
+	 */
+	if ((*req & VM_ALLOC_WAITFAIL) != 0)
+		return (ENOMEM);
+
+	/* If there are more domains to visit we run the iterator. */
+	if (++di->di_n != di->di_domain->ds_cnt) {
+		vm_domainset_iter_next(di, domain);
+		return (0);
+	}
+
+	/* If we visited all domains and this was a NOWAIT we return error. */
+	if ((di->di_flags & (VM_ALLOC_WAITOK | VM_ALLOC_WAITFAIL)) == 0)
+		return (ENOMEM);
+
+	/*
+	 * We have visited all domains with non-blocking allocations, try
+	 * from the beginning with a blocking allocation.
+	 */
+	vm_domainset_iter_first(di, domain);
+	*req = di->di_flags;
+
+	return (0);
+}
+
+
+void
+vm_domainset_iter_malloc_init(struct vm_domainset_iter *di,
+    struct vm_object *obj, int *domain, int *flags)
+{
+
+	vm_domainset_iter_domain(di, obj);
+	di->di_flags = *flags;
+	*flags = (di->di_flags & ~M_WAITOK) | M_NOWAIT;
+	vm_domainset_iter_first(di, domain);
+}
+
+int
+vm_domainset_iter_malloc(struct vm_domainset_iter *di, int *domain, int *flags)
+{
+
+	/* If there are more domains to visit we run the iterator. */
+	if (++di->di_n != di->di_domain->ds_cnt) {
+		vm_domainset_iter_next(di, domain);
+		return (0);
+	}
+
+	/* If we visited all domains and this was a NOWAIT we return error. */
+	if ((di->di_flags & M_WAITOK) == 0)
+		return (ENOMEM);
+
+	/*
+	 * We have visited all domains with non-blocking allocations, try
+	 * from the beginning with a blocking allocation.
+	 */
+	vm_domainset_iter_first(di, domain);
+	*flags = di->di_flags;
+
+	return (0);
+}
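Summarizing the flag handling just above: init strips M_WAITOK from *flags and forces M_NOWAIT, and only after every domain in the set has refused a non-blocking attempt does the iterator restore the caller's original flags and restart from the first domain. A sketch of the resulting attempt sequence for a two-domain set with M_WAITOK requested:

	/*
	 * flags = M_WAITOK, ds_cnt = 2 (sketch of the resulting sequence):
	 *
	 *   attempt 1:  domain A, M_NOWAIT  -> fails
	 *   attempt 2:  domain B, M_NOWAIT  -> fails
	 *   attempt 3+: restart at the first domain with M_WAITOK, where
	 *               the allocator may now sleep until satisfied.
	 *
	 * The page variant runs the same fallback through VM_ALLOC_WAITOK/
	 * VM_ALLOC_WAITFAIL; a WAITFAIL request gets a single blocking
	 * attempt, after which vm_domainset_iter_page() returns ENOMEM.
	 */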
Index: vm/vm_kern.c
===================================================================
--- vm/vm_kern.c
+++ vm/vm_kern.c
@@ -72,6 +72,7 @@
 #include
 #include
 #include	/* for ticks and hz */
+#include
 #include
 #include
 #include
@@ -82,8 +83,8 @@
 #include
 #include
-#include
 #include
+#include
 #include
 #include
 #include
@@ -221,28 +222,20 @@
 kmem_alloc_attr(vmem_t *vmem, vm_size_t size, int flags, vm_paddr_t low,
     vm_paddr_t high, vm_memattr_t memattr)
 {
-	struct vm_domain_iterator vi;
+	struct vm_domainset_iter di;
 	vm_offset_t addr;
-	int domain, wait;
+	int domain;

 	KASSERT(vmem == kernel_arena,
 	    ("kmem_alloc_attr: Only kernel_arena is supported."));
-	addr = 0;
-	vm_policy_iterator_init(&vi);
-	wait = flags & M_WAITOK;
-	flags &= ~M_WAITOK;
-	flags |= M_NOWAIT;
-	while (vm_domain_iterator_run(&vi, &domain) == 0) {
-		if (vm_domain_iterator_isdone(&vi) && wait) {
-			flags |= wait;
-			flags &= ~M_NOWAIT;
-		}
+
+	vm_domainset_iter_malloc_init(&di, kernel_object, &domain, &flags);
+	do {
 		addr = kmem_alloc_attr_domain(domain, size, flags, low, high,
 		    memattr);
 		if (addr != 0)
 			break;
-	}
-	vm_policy_iterator_finish(&vi);
+	} while (vm_domainset_iter_malloc(&di, &domain, &flags) == 0);

 	return (addr);
 }
@@ -317,28 +310,20 @@
     vm_paddr_t high, u_long alignment, vm_paddr_t boundary,
     vm_memattr_t memattr)
 {
-	struct vm_domain_iterator vi;
+	struct vm_domainset_iter di;
 	vm_offset_t addr;
-	int domain, wait;
+	int domain;

 	KASSERT(vmem == kernel_arena,
 	    ("kmem_alloc_contig: Only kernel_arena is supported."));
-	addr = 0;
-	vm_policy_iterator_init(&vi);
-	wait = flags & M_WAITOK;
-	flags &= ~M_WAITOK;
-	flags |= M_NOWAIT;
-	while (vm_domain_iterator_run(&vi, &domain) == 0) {
-		if (vm_domain_iterator_isdone(&vi) && wait) {
-			flags |= wait;
-			flags &= ~M_NOWAIT;
-		}
+
+	vm_domainset_iter_malloc_init(&di, kernel_object, &domain, &flags);
+	do {
 		addr = kmem_alloc_contig_domain(domain, size, flags, low, high,
 		    alignment, boundary, memattr);
 		if (addr != 0)
 			break;
-	}
-	vm_policy_iterator_finish(&vi);
+	} while (vm_domainset_iter_malloc(&di, &domain, &flags) == 0);

 	return (addr);
 }
@@ -408,27 +393,19 @@
 vm_offset_t
 kmem_malloc(struct vmem *vmem, vm_size_t size, int flags)
 {
-	struct vm_domain_iterator vi;
+	struct vm_domainset_iter di;
 	vm_offset_t addr;
-	int domain, wait;
+	int domain;

 	KASSERT(vmem == kernel_arena,
 	    ("kmem_malloc: Only kernel_arena is supported."));
-	addr = 0;
-	vm_policy_iterator_init(&vi);
-	wait = flags & M_WAITOK;
-	flags &= ~M_WAITOK;
-	flags |= M_NOWAIT;
-	while (vm_domain_iterator_run(&vi, &domain) == 0) {
-		if (vm_domain_iterator_isdone(&vi) && wait) {
-			flags |= wait;
-			flags &= ~M_NOWAIT;
-		}
+
+	vm_domainset_iter_malloc_init(&di, kernel_object, &domain, &flags);
+	do {
 		addr = kmem_malloc_domain(domain, size, flags);
 		if (addr != 0)
 			break;
-	}
-	vm_policy_iterator_finish(&vi);
+	} while (vm_domainset_iter_malloc(&di, &domain, &flags) == 0);

 	return (addr);
 }
@@ -494,26 +471,19 @@
 int
 kmem_back(vm_object_t object, vm_offset_t addr, vm_size_t size, int flags)
 {
-	struct vm_domain_iterator vi;
-	int domain, wait, ret;
+	struct vm_domainset_iter di;
+	int domain;
+	int ret;

 	KASSERT(object == kernel_object,
 	    ("kmem_back: only supports kernel object."));
-	ret = 0;
-	vm_policy_iterator_init(&vi);
-	wait = flags & M_WAITOK;
-	flags &= ~M_WAITOK;
-	flags |= M_NOWAIT;
-	while (vm_domain_iterator_run(&vi, &domain) == 0) {
-		if (vm_domain_iterator_isdone(&vi) && wait) {
-			flags |= wait;
-			flags &= ~M_NOWAIT;
-		}
+
+	vm_domainset_iter_malloc_init(&di, kernel_object, &domain, &flags);
+	do {
 		ret = kmem_back_domain(domain, object, addr, size, flags);
 		if (ret == KERN_SUCCESS)
 			break;
-	}
-	vm_policy_iterator_finish(&vi);
+	} while (vm_domainset_iter_malloc(&di, &domain, &flags) == 0);

 	return (ret);
 }
Index: vm/vm_object.h
===================================================================
--- vm/vm_object.h
+++ vm/vm_object.h
@@ -74,6 +74,7 @@
 #include
 #include
 #include
+#include
 #include
@@ -102,6 +103,7 @@
 	struct pglist memq;		/* list of resident pages */
 	struct vm_radix rtree;		/* root of the resident page radix trie*/
 	vm_pindex_t size;		/* Object size */
+	struct domainset_ref domain;	/* NUMA policy. */
 	int generation;			/* generation ID */
 	int ref_count;			/* How many refs?? */
 	int shadow_count;		/* how many objects that this is a shadow for */
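With the domain field in place, object policy overrides thread policy via vm_domainset_iter_domain() in vm_domainset.c above. Nothing in this excerpt sets the field, so a consumer would presumably do something like the following sketch; my_policy is hypothetical and stands in for a canonical, immutable domainset obtained from the lookup machinery this patch adds to kern_cpuset.c, and per the "immutable and unsynchronized" comment a plain store appears to be the intended idiom:

	/*
	 * Sketch: route future page allocations for obj through my_policy.
	 * my_policy is a placeholder for a set returned by the domainset
	 * lookup in kern_cpuset.c; it must never be freed or mutated.
	 */
	obj->domain.dr_policy = my_policy;
	obj->domain.dr_iterator = 0;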
Index: vm/vm_page.c
===================================================================
--- vm/vm_page.c
+++ vm/vm_page.c
@@ -91,6 +91,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -109,7 +110,7 @@
 #include
 #include
 #include
-#include
+#include
 #include
 #include
 #include
@@ -742,6 +743,12 @@
 	 */
 	vm_reserv_init();
 #endif
+	/*
+	 * Set an initial domain policy for thread0 so that allocations
+	 * can work.
+	 */
+	domainset_zero();
+
 	return (vaddr);
 }
@@ -1624,23 +1631,17 @@
 vm_page_alloc_after(vm_object_t object, vm_pindex_t pindex, int req,
     vm_page_t mpred)
 {
-	struct vm_domain_iterator vi;
+	struct vm_domainset_iter di;
 	vm_page_t m;
-	int domain, wait;
+	int domain;

-	m = NULL;
-	vm_policy_iterator_init(&vi);
-	wait = req & (VM_ALLOC_WAITFAIL | VM_ALLOC_WAITOK);
-	req &= ~wait;
-	while (vm_domain_iterator_run(&vi, &domain) == 0) {
-		if (vm_domain_iterator_isdone(&vi))
-			req |= wait;
+	vm_domainset_iter_page_init(&di, object, &domain, &req);
+	do {
 		m = vm_page_alloc_domain_after(object, pindex, domain, req,
 		    mpred);
 		if (m != NULL)
 			break;
-	}
-	vm_policy_iterator_finish(&vi);
+	} while (vm_domainset_iter_page(&di, &domain, &req) == 0);

 	return (m);
 }
@@ -1839,23 +1840,17 @@
     u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment,
     vm_paddr_t boundary, vm_memattr_t memattr)
 {
-	struct vm_domain_iterator vi;
+	struct vm_domainset_iter di;
 	vm_page_t m;
-	int domain, wait;
+	int domain;

-	m = NULL;
-	vm_policy_iterator_init(&vi);
-	wait = req & (VM_ALLOC_WAITFAIL | VM_ALLOC_WAITOK);
-	req &= ~wait;
-	while (vm_domain_iterator_run(&vi, &domain) == 0) {
-		if (vm_domain_iterator_isdone(&vi))
-			req |= wait;
+	vm_domainset_iter_page_init(&di, object, &domain, &req);
+	do {
 		m = vm_page_alloc_contig_domain(object, pindex, domain, req,
 		    npages, low, high, alignment, boundary, memattr);
 		if (m != NULL)
 			break;
-	}
-	vm_policy_iterator_finish(&vi);
+	} while (vm_domainset_iter_page(&di, &domain, &req) == 0);

 	return (m);
 }
@@ -2051,22 +2046,16 @@
 vm_page_t
 vm_page_alloc_freelist(int flind, int req)
 {
-	struct vm_domain_iterator vi;
+	struct vm_domainset_iter di;
 	vm_page_t m;
-	int domain, wait;
+	int domain;

-	m = NULL;
-	vm_policy_iterator_init(&vi);
-	wait = req & (VM_ALLOC_WAITFAIL | VM_ALLOC_WAITOK);
-	req &= ~wait;
-	while (vm_domain_iterator_run(&vi, &domain) == 0) {
-		if (vm_domain_iterator_isdone(&vi))
-			req |= wait;
+	vm_domainset_iter_page_init(&di, NULL, &domain, &req);
+	do {
 		m = vm_page_alloc_freelist_domain(domain, flind, req);
 		if (m != NULL)
 			break;
-	}
-	vm_policy_iterator_finish(&vi);
+	} while (vm_domainset_iter_page(&di, &domain, &req) == 0);

 	return (m);
 }
@@ -2655,19 +2644,17 @@
 vm_page_reclaim_contig(int req, u_long npages, vm_paddr_t low,
     vm_paddr_t high, u_long alignment, vm_paddr_t boundary)
 {
-	struct vm_domain_iterator vi;
+	struct vm_domainset_iter di;
 	int domain;
 	bool ret;

-	ret = false;
-	vm_policy_iterator_init(&vi);
-	while ((vm_domain_iterator_run(&vi, &domain)) == 0) {
+	vm_domainset_iter_page_init(&di, NULL, &domain, &req);
+	do {
 		ret = vm_page_reclaim_contig_domain(domain, req, npages, low,
 		    high, alignment, boundary);
 		if (ret)
 			break;
-	}
-	vm_policy_iterator_finish(&vi);
+	} while (vm_domainset_iter_page(&di, &domain, &req) == 0);

 	return (ret);
 }
Index: vm/vm_phys.c
===================================================================
--- vm/vm_phys.c
+++ vm/vm_phys.c
@@ -68,8 +68,6 @@
 #include
 #include

-#include
-
 _Static_assert(sizeof(long) * NBBY >= VM_PHYSSEG_MAX,
     "Too many physsegs.");