Index: head/share/man/man9/Makefile =================================================================== --- head/share/man/man9/Makefile +++ head/share/man/man9/Makefile @@ -118,6 +118,7 @@ disk.9 \ dnv.9 \ domain.9 \ + domainset.9 \ dpcpu.9 \ drbr.9 \ driver.9 \ Index: head/share/man/man9/domainset.9 =================================================================== --- head/share/man/man9/domainset.9 +++ head/share/man/man9/domainset.9 @@ -0,0 +1,128 @@ +.\" Copyright (c) 2018 Jeffrey Roberson +.\" All rights reserved. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" +.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' +.\" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +.\" TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +.\" PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE +.\" LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +.\" CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +.\" SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +.\" INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +.\" CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +.\" ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +.\" POSSIBILITY OF SUCH DAMAGE. 
+.\" +.\" $FreeBSD$ +.\" +.Dd March 24, 2018 +.Dt DOMAINSET 9 +.Os +.Sh NAME +.Nm domainset(9) +\(em +.Nm domainset_create , +.Nm sysctl_handle_domainset . +.Nd domainset functions and operation +.Sh SYNOPSIS +.In sys/_domainset.h +.In sys/domainset.h +.\" +.Bd -literal -offset indent +struct domainset { + domainset_t ds_mask; + uint16_t ds_policy; + domainid_t ds_prefer; + ... +}; +.Ed +.Pp +.Ft struct domainset * +.Fn domainset_create "const struct domainset *key" +.Ft int +.Fn sysctl_handle_domainset "SYSCTL_HANDLER_ARGS" +.Sh DESCRIPTION +The +.Nm +API provides memory domain allocation policy for NUMA machines. +Each +.Vt domainset +contains a bitmask of allowed domains, an integer policy, and an optional +preferred domain. +Together, these specify a search order for memory allocations as well as +the ability to restrict threads and objects to a subset of available +memory domains for system partitioning and resource management. +.Pp +Every thread in the system and optionally every +.Vt vm_object_t , +which is used to represent files and other memory sources, has +a reference to a +.Vt struct domainset . +The domainset associated with the object is consulted first and the system +falls back to the thread policy if none exists. +.Pp +The allocation policy has the following possible values: +.Bl -tag -width "foo" +.It Dv DOMAINSET_POLICY_ROUNDROBIN +Memory is allocated from each domain in the mask in a round-robin fashion. +This distributes bandwidth evenly among available domains. +This policy can specify a single domain for a fixed allocation. +.It Dv DOMAINSET_POLICY_FIRSTTOUCH +Memory is allocated from the node that it is first accessed on. +Allocation falls back to round-robin if the current domain is not in the +allowed set or is out of memory. +This policy optimizes for locality but may give pessimal results if the +memory is accessed from many CPUs that are not in the local domain. 
+.It Dv DOMAINSET_POLICY_PREFER +Memory is allocated from the node in the +.Vt ds_prefer +member. The preferred node must be set in the allowed mask. +If the preferred node is out of memory the allocation falls back to +round-robin among allowed sets. +.It Dv DOMAINSET_POLICY_INTERLEAVE +Memory is allocated in a striped fashion with multiple pages +allocated to each domain in the set according to the offset within +the object. +The stripe width is object dependent and may be as large as a +super-page (2MB on amd64). +This gives good distribution among memory domains while keeping system +efficiency higher and is preferable to round-robin for general use. +.El +.Pp +The +.Fn domainset_create +function takes a partially filled in domainset as a key and returns a +valid domainset or NULL. +It is critical that consumers not use domainsets that have not been +returned by this function. +.Vt +domainset +is an immutable type that is shared among all matching keys and must +not be modified after return. +.Pp +The +.Fn sysctl_handle_domainset +function is provided as a convenience for modifying or viewing domainsets +that are not accessible via +.Xr cpuset 2 . +It is intended for use with +.Xr sysctl 9 . +.Pp +.Sh SEE ALSO +.Xr cpuset 1 , +.Xr cpuset 2 , +.Xr cpuset_setdomain 2 , +.Xr bitset 9 +.Sh HISTORY +.In sys/domainset.h +first appeared in +.Fx 12.0 . Index: head/sys/kern/kern_cpuset.c =================================================================== --- head/sys/kern/kern_cpuset.c +++ head/sys/kern/kern_cpuset.c @@ -37,6 +37,8 @@ #include #include +#include +#include #include #include #include @@ -63,9 +65,7 @@ #include #include #include -#include -#include -#include +#include #ifdef DDB #include @@ -112,13 +112,17 @@ * meaning 'curthread'. It may query available cpus for that tid with a * getaffinity call using (CPU_LEVEL_CPUSET, CPU_WHICH_PID, -1, ...). 
*/ + +LIST_HEAD(domainlist, domainset); + static uma_zone_t cpuset_zone; static uma_zone_t domainset_zone; static struct mtx cpuset_lock; static struct setlist cpuset_ids; static struct domainlist cpuset_domains; static struct unrhdr *cpuset_unr; -static struct cpuset *cpuset_zero, *cpuset_default; +static struct cpuset *cpuset_zero, *cpuset_default, *cpuset_kernel; +static struct domainset domainset0, domainset2; /* Return the size of cpuset_t at the kernel level */ SYSCTL_INT(_kern_sched, OID_AUTO, cpusetsize, CTLFLAG_RD | CTLFLAG_CAPRD, @@ -445,6 +449,7 @@ _domainset_create(struct domainset *domain, struct domainlist *freelist) { struct domainset *ndomain; + int i, j, max; mtx_lock_spin(&cpuset_lock); LIST_FOREACH(ndomain, &cpuset_domains, ds_link) @@ -457,7 +462,10 @@ if (ndomain == NULL) { LIST_INSERT_HEAD(&cpuset_domains, domain, ds_link); domain->ds_cnt = DOMAINSET_COUNT(&domain->ds_mask); - domain->ds_max = DOMAINSET_FLS(&domain->ds_mask) + 1; + max = DOMAINSET_FLS(&domain->ds_mask) + 1; + for (i = 0, j = 0; i < max; i++) + if (DOMAINSET_ISSET(i, &domain->ds_mask)) + domain->ds_order[j++] = i; } mtx_unlock_spin(&cpuset_lock); if (ndomain == NULL) @@ -473,11 +481,24 @@ /* * Create or lookup a domainset based on the key held in 'domain'. */ -static struct domainset * +struct domainset * domainset_create(const struct domainset *domain) { struct domainset *ndomain; + /* + * Validate the policy. It must specify a useable policy number with + * only valid domains. Preferred must include the preferred domain + * in the mask. 
+ */ + if (domain->ds_policy <= DOMAINSET_POLICY_INVALID || + domain->ds_policy > DOMAINSET_POLICY_MAX) + return (NULL); + if (domain->ds_policy == DOMAINSET_POLICY_PREFER && + !DOMAINSET_ISSET(domain->ds_prefer, &domain->ds_mask)) + return (NULL); + if (!DOMAINSET_SUBSET(&domainset0.ds_mask, &domain->ds_mask)) + return (NULL); ndomain = uma_zalloc(domainset_zone, M_WAITOK | M_ZERO); domainset_copy(domain, ndomain); return _domainset_create(ndomain, NULL); @@ -507,7 +528,7 @@ PROC_UNLOCK(p); } sx_sunlock(&allproc_lock); - kernel_object->domain.dr_policy = cpuset_default->cs_domain; + kernel_object->domain.dr_policy = cpuset_kernel->cs_domain; } /* @@ -1128,6 +1149,55 @@ return (error); } +static int +bitset_strprint(char *buf, size_t bufsiz, const struct bitset *set, int setlen) +{ + size_t bytes; + int i, once; + char *p; + + once = 0; + p = buf; + for (i = 0; i < __bitset_words(setlen); i++) { + if (once != 0) { + if (bufsiz < 1) + return (0); + *p = ','; + p++; + bufsiz--; + } else + once = 1; + if (bufsiz < sizeof(__STRING(ULONG_MAX))) + return (0); + bytes = snprintf(p, bufsiz, "%lx", set->__bits[i]); + p += bytes; + bufsiz -= bytes; + } + return (p - buf); +} + +static int +bitset_strscan(struct bitset *set, int setlen, const char *buf) +{ + int i, ret; + const char *p; + + BIT_ZERO(setlen, set); + p = buf; + for (i = 0; i < __bitset_words(setlen); i++) { + if (*p == ',') { + p++; + continue; + } + ret = sscanf(p, "%lx", &set->__bits[i]); + if (ret == 0 || ret == -1) + break; + while (isxdigit(*p)) + p++; + } + return (p - buf); +} + /* * Return a string representing a valid layout for a cpuset_t object. * It expects an incoming buffer at least sized as CPUSETBUFSIZ. 
@@ -1135,19 +1205,9 @@ char * cpusetobj_strprint(char *buf, const cpuset_t *set) { - char *tbuf; - size_t i, bytesp, bufsiz; - tbuf = buf; - bytesp = 0; - bufsiz = CPUSETBUFSIZ; - - for (i = 0; i < (_NCPUWORDS - 1); i++) { - bytesp = snprintf(tbuf, bufsiz, "%lx,", set->__bits[i]); - bufsiz -= bytesp; - tbuf += bytesp; - } - snprintf(tbuf, bufsiz, "%lx", set->__bits[_NCPUWORDS - 1]); + bitset_strprint(buf, CPUSETBUFSIZ, (const struct bitset *)set, + CPU_SETSIZE); return (buf); } @@ -1158,37 +1218,71 @@ int cpusetobj_strscan(cpuset_t *set, const char *buf) { - u_int nwords; - int i, ret; + char p; if (strlen(buf) > CPUSETBUFSIZ - 1) return (-1); - /* Allow to pass a shorter version of the mask when necessary. */ - nwords = 1; - for (i = 0; buf[i] != '\0'; i++) - if (buf[i] == ',') - nwords++; - if (nwords > _NCPUWORDS) + p = buf[bitset_strscan((struct bitset *)set, CPU_SETSIZE, buf)]; + if (p != '\0') return (-1); - CPU_ZERO(set); - for (i = 0; i < (nwords - 1); i++) { - ret = sscanf(buf, "%lx,", &set->__bits[i]); - if (ret == 0 || ret == -1) - return (-1); - buf = strstr(buf, ","); - if (buf == NULL) - return (-1); - buf++; - } - ret = sscanf(buf, "%lx", &set->__bits[nwords - 1]); - if (ret == 0 || ret == -1) - return (-1); return (0); } /* + * Handle a domainset specifier in the sysctl tree. A pointer to a pointer to + * a domainset is in arg1. If the user specifies a valid domainset the + * pointer is updated. 
+ * + * Format is: + * hex mask word 0,hex mask word 1,...:decimal policy:decimal preferred + */ +int +sysctl_handle_domainset(SYSCTL_HANDLER_ARGS) +{ + char buf[DOMAINSETBUFSIZ]; + struct domainset *dset; + struct domainset key; + int policy, prefer, error; + char *p; + + dset = *(struct domainset **)arg1; + error = 0; + + if (dset != NULL) { + p = buf + bitset_strprint(buf, DOMAINSETBUFSIZ, + (const struct bitset *)&dset->ds_mask, DOMAINSET_SETSIZE); + sprintf(p, ":%d:%d", dset->ds_policy, dset->ds_prefer); + } else + sprintf(buf, ""); + error = sysctl_handle_string(oidp, buf, sizeof(buf), req); + if (error != 0 || req->newptr == NULL) + return (error); + + /* + * Read in and validate the string. + */ + memset(&key, 0, sizeof(key)); + p = &buf[bitset_strscan((struct bitset *)&key.ds_mask, + DOMAINSET_SETSIZE, buf)]; + if (p == buf) + return (EINVAL); + if (sscanf(p, ":%d:%d", &policy, &prefer) != 2) + return (EINVAL); + key.ds_policy = policy; + key.ds_prefer = prefer; + + /* Domainset_create() validates the policy.*/ + dset = domainset_create(&key); + if (dset == NULL) + return (EINVAL); + *(struct domainset **)arg1 = dset; + + return (error); +} + +/* * Apply an anonymous mask or a domain to a single thread. */ static int @@ -1239,95 +1333,19 @@ int cpuset_setithread(lwpid_t id, int cpu) { - struct setlist cpusets; - struct cpuset *nset, *rset; - struct cpuset *parent, *old_set; - struct thread *td; - struct proc *p; - cpusetid_t cs_id; cpuset_t mask; - int error; - cpuset_freelist_init(&cpusets, 1); - rset = uma_zalloc(cpuset_zone, M_WAITOK | M_ZERO); - cs_id = CPUSET_INVALID; - CPU_ZERO(&mask); if (cpu == NOCPU) CPU_COPY(cpuset_root, &mask); else CPU_SET(cpu, &mask); - - error = cpuset_which(CPU_WHICH_TID, id, &p, &td, &old_set); - if (error != 0 || ((cs_id = alloc_unr(cpuset_unr)) == CPUSET_INVALID)) - goto out; - - /* cpuset_which() returns with PROC_LOCK held. 
*/ - old_set = td->td_cpuset; - - if (cpu == NOCPU) { - nset = LIST_FIRST(&cpusets); - LIST_REMOVE(nset, cs_link); - - /* - * roll back to default set. We're not using cpuset_shadow() - * here because we can fail CPU_SUBSET() check. This can happen - * if default set does not contain all CPUs. - */ - error = _cpuset_create(nset, cpuset_default, &mask, NULL, - CPUSET_INVALID); - - goto applyset; - } - - if (old_set->cs_id == 1 || (old_set->cs_id == CPUSET_INVALID && - old_set->cs_parent->cs_id == 1)) { - - /* - * Current set is either default (1) or - * shadowed version of default set. - * - * Allocate new root set to be able to shadow it - * with any mask. - */ - error = _cpuset_create(rset, cpuset_zero, - &cpuset_zero->cs_mask, NULL, cs_id); - if (error != 0) { - PROC_UNLOCK(p); - goto out; - } - rset->cs_flags |= CPU_SET_ROOT; - parent = rset; - rset = NULL; - cs_id = CPUSET_INVALID; - } else { - /* Assume existing set was already allocated by previous call */ - parent = old_set; - old_set = NULL; - } - - error = cpuset_shadow(parent, &nset, &mask, NULL, &cpusets, NULL); -applyset: - if (error == 0) { - thread_lock(td); - old_set = cpuset_update_thread(td, nset); - thread_unlock(td); - } else - old_set = NULL; - PROC_UNLOCK(p); - if (old_set != NULL) - cpuset_rel(old_set); -out: - cpuset_freelist_free(&cpusets); - if (rset != NULL) - uma_zfree(cpuset_zone, rset); - if (cs_id != CPUSET_INVALID) - free_unr(cpuset_unr, cs_id); - return (error); + return _cpuset_setthread(id, &mask, NULL); } -static struct domainset domainset0; - +/* + * Create the domainset for cpuset 0, 1 and cpuset 2. 
+ */ void domainset_zero(void) { @@ -1340,14 +1358,17 @@ DOMAINSET_ZERO(&dset->ds_mask); for (i = 0; i < vm_ndomains; i++) DOMAINSET_SET(i, &dset->ds_mask); - dset->ds_policy = DOMAINSET_POLICY_ROUNDROBIN; + dset->ds_policy = DOMAINSET_POLICY_FIRSTTOUCH; dset->ds_prefer = -1; curthread->td_domain.dr_policy = _domainset_create(dset, NULL); - kernel_object->domain.dr_policy = curthread->td_domain.dr_policy; + + domainset_copy(dset, &domainset2); + domainset2.ds_policy = DOMAINSET_POLICY_INTERLEAVE; + kernel_object->domain.dr_policy = _domainset_create(&domainset2, NULL); } /* - * Creates system-wide cpusets and the cpuset for thread0 including two + * Creates system-wide cpusets and the cpuset for thread0 including three * sets: * * 0 - The root set which should represent all valid processors in the @@ -1357,6 +1378,8 @@ * 1 - The default set which all processes are a member of until changed. * This allows an administrator to move all threads off of given cpus to * dedicate them to high priority tasks or save power etc. + * 2 - The kernel set which allows restriction and policy to be applied only + * to kernel threads and the kernel_object. */ struct cpuset * cpuset_thread0(void) @@ -1366,12 +1389,12 @@ int i; cpuset_zone = uma_zcreate("cpuset", sizeof(struct cpuset), NULL, NULL, - NULL, NULL, UMA_ALIGN_PTR, 0); + NULL, NULL, UMA_ALIGN_CACHE, 0); domainset_zone = uma_zcreate("domainset", sizeof(struct domainset), - NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); + NULL, NULL, NULL, NULL, UMA_ALIGN_CACHE, 0); /* - * Create the root system set for the whole machine. Doesn't use + * Create the root system set (0) for the whole machine. Doesn't use * cpuset_create() due to NULL parent. */ set = uma_zalloc(cpuset_zone, M_WAITOK | M_ZERO); @@ -1385,12 +1408,20 @@ cpuset_root = &set->cs_mask; /* - * Now derive a default, modifiable set from that to give out. + * Now derive a default (1), modifiable set from that to give out. 
*/ set = uma_zalloc(cpuset_zone, M_WAITOK | M_ZERO); error = _cpuset_create(set, cpuset_zero, NULL, NULL, 1); KASSERT(error == 0, ("Error creating default set: %d\n", error)); cpuset_default = set; + /* + * Create the kernel set (2). + */ + set = uma_zalloc(cpuset_zone, M_WAITOK | M_ZERO); + error = _cpuset_create(set, cpuset_zero, NULL, NULL, 2); + KASSERT(error == 0, ("Error creating kernel set: %d\n", error)); + set->cs_domain = &domainset2; + cpuset_kernel = set; /* * Initialize the unit allocator. 0 and 1 are allocated above. @@ -1407,9 +1438,21 @@ CPU_COPY(&all_cpus, &cpuset_domain[0]); domains_set: - return (set); + return (cpuset_default); } +void +cpuset_kernthread(struct thread *td) +{ + struct cpuset *set; + + thread_lock(td); + set = td->td_cpuset; + td->td_cpuset = cpuset_ref(cpuset_kernel); + thread_unlock(td); + cpuset_rel(set); +} + /* * Create a cpuset, which would be cpuset_create() but * mark the new 'set' as root. @@ -2108,7 +2151,7 @@ } #ifdef DDB -BITSET_DEFINE(bitset, 1); + static void ddb_display_bitset(const struct bitset *set, int size) { @@ -2164,9 +2207,8 @@ struct domainset *set; LIST_FOREACH(set, &cpuset_domains, ds_link) { - db_printf("set=%p policy %d prefer %d cnt %d max %d\n", - set, set->ds_policy, set->ds_prefer, set->ds_cnt, - set->ds_max); + db_printf("set=%p policy %d prefer %d cnt %d\n", + set, set->ds_policy, set->ds_prefer, set->ds_cnt); db_printf(" mask ="); ddb_display_domainset(&set->ds_mask); db_printf("\n"); Index: head/sys/kern/kern_kthread.c =================================================================== --- head/sys/kern/kern_kthread.c +++ head/sys/kern/kern_kthread.c @@ -131,7 +131,7 @@ cpu_fork_kthread_handler(td, func, arg); /* Avoid inheriting affinity from a random parent. */ - cpuset_setthread(td->td_tid, cpuset_root); + cpuset_kernthread(td); thread_lock(td); TD_SET_CAN_RUN(td); sched_prio(td, PVM); @@ -309,7 +309,7 @@ tidhash_add(newtd); /* Avoid inheriting affinity from a random parent. 
*/ - cpuset_setthread(newtd->td_tid, cpuset_root); + cpuset_kernthread(newtd); /* Delay putting it on the run queue until now. */ if (!(flags & RFSTOPPED)) { Index: head/sys/sys/_bitset.h =================================================================== --- head/sys/sys/_bitset.h +++ head/sys/sys/_bitset.h @@ -57,4 +57,10 @@ */ #define BITSET_DEFINE_VAR(t) BITSET_DEFINE(t, 1) +/* + * Define a default type that can be used while manually specifying size + * to every call. + */ +BITSET_DEFINE(bitset, 1); + #endif /* !_SYS__BITSET_H_ */ Index: head/sys/sys/cpuset.h =================================================================== --- head/sys/sys/cpuset.h +++ head/sys/sys/cpuset.h @@ -139,6 +139,7 @@ int cpuset_setproc_update_set(struct proc *, struct cpuset *); int cpuset_which(cpuwhich_t, id_t, struct proc **, struct thread **, struct cpuset **); +void cpuset_kernthread(struct thread *); char *cpusetobj_strprint(char *, const cpuset_t *); int cpusetobj_strscan(cpuset_t *, const char *); Index: head/sys/sys/domainset.h =================================================================== --- head/sys/sys/domainset.h +++ head/sys/sys/domainset.h @@ -28,8 +28,8 @@ * $FreeBSD$ */ -#ifndef _SYS_DOMAINSETSET_H_ -#define _SYS_DOMAINSETSET_H_ +#ifndef _SYS_DOMAINSET_H_ +#define _SYS_DOMAINSET_H_ #include @@ -38,8 +38,12 @@ #define _NDOMAINSETBITS _BITSET_BITS #define _NDOMAINSETWORDS __bitset_words(DOMAINSET_SETSIZE) -#define DOMAINSETSETBUFSIZ ((2 + sizeof(long) * 2) * _NDOMAINSETWORDS) +#define DOMAINSETBUFSIZ \ + (((2 + sizeof(long) * 2) * _NDOMAINSETWORDS) + \ + sizeof("::") + sizeof(__XSTRING(DOMAINSET_POLICY_MAX)) + \ + sizeof(__XSTRING(MAXMEMDOM))) + #define DOMAINSET_CLR(n, p) BIT_CLR(DOMAINSET_SETSIZE, n, p) #define DOMAINSET_COPY(f, t) BIT_COPY(DOMAINSET_SETSIZE, f, t) #define DOMAINSET_ISSET(n, p) BIT_ISSET(DOMAINSET_SETSIZE, n, p) @@ -73,23 +77,37 @@ #define DOMAINSET_POLICY_ROUNDROBIN 1 #define DOMAINSET_POLICY_FIRSTTOUCH 2 #define DOMAINSET_POLICY_PREFER 3 
-#define DOMAINSET_POLICY_MAX DOMAINSET_POLICY_PREFER +#define DOMAINSET_POLICY_INTERLEAVE 4 +#define DOMAINSET_POLICY_MAX DOMAINSET_POLICY_INTERLEAVE #ifdef _KERNEL -#include -LIST_HEAD(domainlist, domainset); +#if MAXMEMDOM < 256 +typedef uint8_t domainid_t; +#else +typedef uint16_t domainid_t; +#endif struct domainset { LIST_ENTRY(domainset) ds_link; domainset_t ds_mask; /* allowed domains. */ uint16_t ds_policy; /* Policy type. */ - int16_t ds_prefer; /* Preferred domain or -1. */ - uint16_t ds_cnt; /* popcnt from above. */ - uint16_t ds_max; /* Maximum domain in set. */ + domainid_t ds_prefer; /* Preferred domain or -1. */ + domainid_t ds_cnt; /* popcnt from above. */ + domainid_t ds_order[MAXMEMDOM]; /* nth domain table. */ }; void domainset_zero(void); +/* + * Add a domainset to the system based on a key initializing policy, prefer, + * and mask. Do not create and directly use domainset structures. The + * returned value will not match the key pointer. + */ +struct domainset *domainset_create(const struct domainset *); +#ifdef _SYS_SYSCTL_H_ +int sysctl_handle_domainset(SYSCTL_HANDLER_ARGS); +#endif + #else __BEGIN_DECLS int cpuset_getdomain(cpulevel_t, cpuwhich_t, id_t, size_t, domainset_t *, @@ -99,4 +117,4 @@ __END_DECLS #endif -#endif /* !_SYS_DOMAINSETSET_H_ */ +#endif /* !_SYS_DOMAINSET_H_ */ Index: head/sys/sys/proc.h =================================================================== --- head/sys/sys/proc.h +++ head/sys/sys/proc.h @@ -67,7 +67,7 @@ #include #include #include -#include +#include #include /* Machine-dependent proc substruct. 
*/ #ifdef _KERNEL Index: head/sys/vm/vm_domainset.h =================================================================== --- head/sys/vm/vm_domainset.h +++ head/sys/vm/vm_domainset.h @@ -33,13 +33,15 @@ struct vm_domainset_iter { struct domainset *di_domain; int *di_iter; + vm_pindex_t di_offset; + int di_policy; int di_flags; int di_n; }; int vm_domainset_iter_page(struct vm_domainset_iter *, int *, int *); void vm_domainset_iter_page_init(struct vm_domainset_iter *, - struct vm_object *, int *, int *); + struct vm_object *, vm_pindex_t, int *, int *); int vm_domainset_iter_malloc(struct vm_domainset_iter *, int *, int *); void vm_domainset_iter_malloc_init(struct vm_domainset_iter *, struct vm_object *, int *, int *); Index: head/sys/vm/vm_domainset.c =================================================================== --- head/sys/vm/vm_domainset.c +++ head/sys/vm/vm_domainset.c @@ -56,11 +56,14 @@ * assumed that most allocations are successful. */ +static int vm_domainset_default_stride = 64; + /* * Determine which policy is to be used for this allocation. */ static void -vm_domainset_iter_domain(struct vm_domainset_iter *di, struct vm_object *obj) +vm_domainset_iter_init(struct vm_domainset_iter *di, struct vm_object *obj, + vm_pindex_t pindex) { struct domainset *domain; @@ -76,18 +79,33 @@ di->di_domain = curthread->td_domain.dr_policy; di->di_iter = &curthread->td_domain.dr_iterator; } + di->di_policy = di->di_domain->ds_policy; + if (di->di_policy == DOMAINSET_POLICY_INTERLEAVE) { + if (vm_object_reserv(obj)) { + /* + * Color the pindex so we end up on the correct + * reservation boundary. + */ + pindex += obj->pg_color; + pindex >>= VM_LEVEL_0_ORDER; + } else + pindex /= vm_domainset_default_stride; + /* + * Offset pindex so the first page of each object does + * not end up in domain 0. 
+ */ + if (obj != NULL) + pindex += (((uintptr_t)obj) / sizeof(*obj)); + di->di_offset = pindex; + } } static void vm_domainset_iter_rr(struct vm_domainset_iter *di, int *domain) { - int d; - d = *di->di_iter; - do { - d = (d + 1) % di->di_domain->ds_max; - } while (!DOMAINSET_ISSET(d, &di->di_domain->ds_mask)); - *di->di_iter = *domain = d; + *domain = di->di_domain->ds_order[ + ++(*di->di_iter) % di->di_domain->ds_cnt]; } static void @@ -95,27 +113,38 @@ { int d; - d = *di->di_iter; do { - d = (d + 1) % di->di_domain->ds_max; - } while (!DOMAINSET_ISSET(d, &di->di_domain->ds_mask) || - d == di->di_domain->ds_prefer); - *di->di_iter = *domain = d; + d = di->di_domain->ds_order[ + ++(*di->di_iter) % di->di_domain->ds_cnt]; + } while (d == di->di_domain->ds_prefer); + *domain = d; } static void +vm_domainset_iter_interleave(struct vm_domainset_iter *di, int *domain) +{ + int d; + + d = di->di_offset % di->di_domain->ds_cnt; + *di->di_iter = d; + *domain = di->di_domain->ds_order[d]; +} + +static void vm_domainset_iter_next(struct vm_domainset_iter *di, int *domain) { KASSERT(di->di_n > 0, ("vm_domainset_iter_first: Invalid n %d", di->di_n)); - switch (di->di_domain->ds_policy) { + switch (di->di_policy) { case DOMAINSET_POLICY_FIRSTTOUCH: /* * To prevent impossible allocations we convert an invalid * first-touch to round-robin. 
*/ /* FALLTHROUGH */ + case DOMAINSET_POLICY_INTERLEAVE: + /* FALLTHROUGH */ case DOMAINSET_POLICY_ROUNDROBIN: vm_domainset_iter_rr(di, domain); break; @@ -124,7 +153,7 @@ break; default: panic("vm_domainset_iter_first: Unknown policy %d", - di->di_domain->ds_policy); + di->di_policy); } KASSERT(*domain < vm_ndomains, ("vm_domainset_iter_next: Invalid domain %d", *domain)); @@ -134,11 +163,15 @@ vm_domainset_iter_first(struct vm_domainset_iter *di, int *domain) { - switch (di->di_domain->ds_policy) { + switch (di->di_policy) { case DOMAINSET_POLICY_FIRSTTOUCH: *domain = PCPU_GET(domain); if (DOMAINSET_ISSET(*domain, &di->di_domain->ds_mask)) { - di->di_n = 1; + /* + * Add an extra iteration because we will visit the + * current domain a second time in the rr iterator. + */ + di->di_n = di->di_domain->ds_cnt + 1; break; } /* @@ -154,9 +187,13 @@ *domain = di->di_domain->ds_prefer; di->di_n = di->di_domain->ds_cnt; break; + case DOMAINSET_POLICY_INTERLEAVE: + vm_domainset_iter_interleave(di, domain); + di->di_n = di->di_domain->ds_cnt; + break; default: panic("vm_domainset_iter_first: Unknown policy %d", - di->di_domain->ds_policy); + di->di_policy); } KASSERT(di->di_n > 0, ("vm_domainset_iter_first: Invalid n %d", di->di_n)); @@ -166,10 +203,10 @@ void vm_domainset_iter_page_init(struct vm_domainset_iter *di, struct vm_object *obj, - int *domain, int *req) + vm_pindex_t pindex, int *domain, int *req) { - vm_domainset_iter_domain(di, obj); + vm_domainset_iter_init(di, obj, pindex); di->di_flags = *req; *req = (di->di_flags & ~(VM_ALLOC_WAITOK | VM_ALLOC_WAITFAIL)) | VM_ALLOC_NOWAIT; @@ -213,7 +250,9 @@ struct vm_object *obj, int *domain, int *flags) { - vm_domainset_iter_domain(di, obj); + vm_domainset_iter_init(di, obj, 0); + if (di->di_policy == DOMAINSET_POLICY_INTERLEAVE) + di->di_policy = DOMAINSET_POLICY_ROUNDROBIN; di->di_flags = *flags; *flags = (di->di_flags & ~M_WAITOK) | M_NOWAIT; vm_domainset_iter_first(di, domain); @@ -253,7 +292,7 @@ void 
vm_domainset_iter_page_init(struct vm_domainset_iter *di, - struct vm_object *obj, int *domain, int *flags) + struct vm_object *obj, vm_pindex_t pindex, int *domain, int *flags) { *domain = 0; Index: head/sys/vm/vm_page.c =================================================================== --- head/sys/vm/vm_page.c +++ head/sys/vm/vm_page.c @@ -1660,7 +1660,7 @@ vm_page_t m; int domain; - vm_domainset_iter_page_init(&di, object, &domain, &req); + vm_domainset_iter_page_init(&di, object, pindex, &domain, &req); do { m = vm_page_alloc_domain_after(object, pindex, domain, req, mpred); @@ -1893,7 +1893,7 @@ vm_page_t m; int domain; - vm_domainset_iter_page_init(&di, object, &domain, &req); + vm_domainset_iter_page_init(&di, object, pindex, &domain, &req); do { m = vm_page_alloc_contig_domain(object, pindex, domain, req, npages, low, high, alignment, boundary, memattr); @@ -2092,7 +2092,7 @@ vm_page_t m; int domain; - vm_domainset_iter_page_init(&di, kernel_object, &domain, &req); + vm_domainset_iter_page_init(&di, NULL, 0, &domain, &req); do { m = vm_page_alloc_freelist_domain(domain, freelist, req); if (m != NULL) @@ -2691,7 +2691,7 @@ int domain; bool ret; - vm_domainset_iter_page_init(&di, kernel_object, &domain, &req); + vm_domainset_iter_page_init(&di, NULL, 0, &domain, &req); do { ret = vm_page_reclaim_contig_domain(domain, req, npages, low, high, alignment, boundary); Index: head/sys/vm/vnode_pager.c =================================================================== --- head/sys/vm/vnode_pager.c +++ head/sys/vm/vnode_pager.c @@ -59,6 +59,7 @@ #include #include +#include #include #include #include @@ -69,6 +70,7 @@ #include #include #include +#include #include @@ -108,6 +110,12 @@ int vnode_pbuf_freecnt; int vnode_async_pbuf_freecnt; +static struct domainset *vnode_domainset = NULL; + +SYSCTL_PROC(_debug, OID_AUTO, vnode_domainset, CTLTYPE_STRING | CTLFLAG_RW, + &vnode_domainset, 0, sysctl_handle_domainset, "A", + "Default vnode NUMA policy"); + /* Create the VM 
system backing object for this vnode */ int vnode_create_vobject(struct vnode *vp, off_t isize, struct thread *td) @@ -241,6 +249,7 @@ object->un_pager.vnp.vnp_size = size; object->un_pager.vnp.writemappings = 0; + object->domain.dr_policy = vnode_domainset; object->handle = handle; VI_LOCK(vp); Index: head/usr.bin/cpuset/cpuset.1 =================================================================== --- head/usr.bin/cpuset/cpuset.1 +++ head/usr.bin/cpuset/cpuset.1 @@ -145,13 +145,21 @@ A special list of .Dq all may be specified in which case the list includes all CPUs from the root set. -.It Fl n Ar domain-list:policy +.It Fl n Ar policy:domain-list Specifies a list of domains and allocation policy to apply to a target. Ranges may be specified as in .Fl l . -Valid policies include first-touch, ft, round-robin, rr, and prefer. The prefer -policy accepts only a single domain in the set. The parent of the set is -consulted if the preferred domain is unavailable. +Valid policies include first-touch (ft), round-robin (rr), prefer and +interleave (il). +First-touch allocates on the local domain when memory is available. +Round-robin alternates between every possible domain one page at a time. +The prefer policy accepts only a single domain in the set. +The parent of the set is consulted if the preferred domain is unavailable. +Interleave operates like round-robin with an implementation-defined stripe +width. +See +.Xr domainset 9 +for more details on policies. .It Fl p Ar pid Specifies a pid as the target of the operation. 
.It Fl s Ar setid Index: head/usr.bin/cpuset/cpuset.c =================================================================== --- head/usr.bin/cpuset/cpuset.c +++ head/usr.bin/cpuset/cpuset.c @@ -79,10 +79,11 @@ { "first-touch", DOMAINSET_POLICY_FIRSTTOUCH }, { "ft", DOMAINSET_POLICY_FIRSTTOUCH }, { "prefer", DOMAINSET_POLICY_PREFER }, + { "interleave", DOMAINSET_POLICY_INTERLEAVE}, + { "il", DOMAINSET_POLICY_INTERLEAVE}, { NULL, DOMAINSET_POLICY_INVALID } }; -BITSET_DEFINE(bitset, 1); static void printset(struct bitset *mask, int size); static void @@ -237,7 +238,7 @@ "domain" }; static const char *levelnames[] = { NULL, " root", " cpuset", "" }; static const char *policynames[] = { "invalid", "round-robin", "first-touch", - "prefer" }; + "prefer", "interleave" }; static void printaffinity(void)