Index: lib/libc/sys/Makefile.inc =================================================================== --- lib/libc/sys/Makefile.inc +++ lib/libc/sys/Makefile.inc @@ -235,6 +235,7 @@ nanosleep.2 \ nfssvc.2 \ ntp_adjtime.2 \ + numa_getaffinity.2 \ open.2 \ pathconf.2 \ pdfork.2 \ @@ -395,6 +396,7 @@ MLINKS+=mq_receive.2 mq_timedreceive.2 MLINKS+=mq_send.2 mq_timedsend.2 MLINKS+=ntp_adjtime.2 ntp_gettime.2 +MLINKS+=numa_getaffinity.2 numa_setaffinity.2 MLINKS+=open.2 openat.2 MLINKS+=pathconf.2 fpathconf.2 MLINKS+=pathconf.2 lpathconf.2 Index: lib/libc/sys/Symbol.map =================================================================== --- lib/libc/sys/Symbol.map +++ lib/libc/sys/Symbol.map @@ -400,6 +400,8 @@ futimens; ppoll; utimensat; + numa_setaffinity; + numa_getaffinity; }; FBSDprivate_1.0 { Index: lib/libc/sys/numa_getaffinity.2 =================================================================== --- /dev/null +++ lib/libc/sys/numa_getaffinity.2 @@ -0,0 +1,198 @@ +.\" Copyright (c) 2008 Christian Brueffer +.\" Copyright (c) 2008 Jeffrey Roberson +.\" Copyright (c) 2015 Adrian Chadd +.\" All rights reserved. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" +.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND +.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +.\" ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE +.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +.\" SUCH DAMAGE. +.\" +.\" $FreeBSD$ +.\" +.Dd May 7, 2015 +.Dt NUMA_GETAFFINITY 2 +.Os +.Sh NAME +.Nm numa_getaffinity , +.Nm numa_setaffinity +.Nd manage NUMA affinity +.Sh LIBRARY +.Lb libc +.Sh SYNOPSIS +.In sys/param.h +.In sys/numa.h +.Ft int +.Fn numa_getaffinity "cpuwhich_t which" "id_t id" "struct vm_domain_policy *policy" +.Ft int +.Fn numa_setaffinity "cpuwhich_t which" "id_t id" "const struct vm_domain_policy *policy" +.Sh DESCRIPTION +.Fn numa_getaffinity +and +.Fn numa_setaffinity +allow the manipulation of NUMA policies available to processes and threads. +These functions may manipulate NUMA policies that contain many processes +or affect only a single object. +.Pp +Valid values for the +.Fa which +argument are documented in +.Xr cpuset 2 . +Together with +.Fa id , +this argument specifies the object that is operated on. +Only +.Dv CPU_WHICH_TID +and +.Dv CPU_WHICH_PID +can be manipulated. +.Pp +The +.Fa policy +entry contains a vm_domain_policy with the following fields: +.Bd -literal +struct vm_domain_policy { + seq_t seq; /* sequence number, used internally by kernel */ + vm_domain_policy_type_t policy; /* VM policy */ + int domain; /* VM domain, if applicable */ +}; +.Ed +.Fa vm_domain_policy_type_t policy +is one of these: +.Bl -tag -width VM_POLICY_NONE +.It Dv VM_POLICY_NONE +Reset the domain back to none. +Any parent object NUMA domain policy will apply. +The only valid value for +.Dv domain +is -1. +.It Dv VM_POLICY_ROUND_ROBIN +Select round-robin policy. 
+Pages will be allocated round-robin from each VM domain in order. +The only valid value for +.Dv domain +is -1. +.It Dv VM_POLICY_FIXED_DOMAIN +Select fixed-domain only policy. +Pages will be allocated from the given +.Dv domain +which must be set to a valid VM domain. +Pages will not be allocated from another domain if +.Dv domain +is out of free pages. +.It Dv VM_POLICY_FIXED_DOMAIN_ROUND_ROBIN +Select fixed-domain policy with round-robin fallback. +Pages will be allocated from +.Dv domain +which must be set to a valid VM domain. +Pages will be allocated round-robin +from another domain if +.Dv domain +is out of free pages. +.It Dv VM_POLICY_FIRST_TOUCH +Select first-touch policy. +Pages will be allocated from the NUMA domain which the thread +is currently scheduled upon. +Pages will not be allocated from another domain if the current domain +is out of free pages. +The only valid value for +.Dv domain +is -1. +.It Dv VM_POLICY_FIRST_TOUCH_ROUND_ROBIN +Select first-touch policy with round-robin fallback. +Pages will be allocated from the NUMA domain which the thread +is currently scheduled upon. +Pages will be allocated round-robin from another domain if the +current domain is out of free pages. +The only valid value for +.Dv domain +is -1. +.El +.Pp +Note that the VM might assign some pages from other domains, +for example if an existing page allocation is covered by a superpage +allocation. +.Pp +.Fn numa_getaffinity +retrieves the +NUMA policy from the object specified by +.Fa which +and +.Fa id +and stores it in the space provided by +.Fa policy . +.Pp +.Fn numa_setaffinity +attempts to set the NUMA policy for the object specified by +.Fa which +and +.Fa id +to the policy in +.Fa policy . +.Sh RETURN VALUES +.Rv -std +.Sh ERRORS +.Va errno +can contain these error codes: +.Bl -tag -width Er +.It Bq Er EINVAL +The +.Fa which +argument was not a valid value. 
+.It Bq Er EINVAL +The +.Fa policy +argument specified when calling +.Fn numa_setaffinity +did not contain a valid policy. +.It Bq Er EFAULT +The policy pointer passed was invalid. +.It Bq Er ESRCH +The object specified by the +.Fa id +and +.Fa which +arguments could not be found. +.It Bq Er ERANGE +The +.Fa domain +in the given policy +was out of the range of possible VM domains available. +.It Bq Er EPERM +The calling process did not have the credentials required to complete the +operation. +.El +.Sh SEE ALSO +.Xr cpuset 1 , +.Xr numactl 1 , +.Xr cpuset 2 , +.Xr cpuset_getaffinity 2 , +.Xr cpuset_getid 2 , +.Xr cpuset_setaffinity 2 , +.Xr cpuset_setid 2 , +.Xr pthread_affinity_np 3 , +.Xr pthread_attr_affinity_np 3 , +.Xr numa 4 +.Sh HISTORY +The +.Nm +family of system calls first appeared in +.Fx 11.0 . +.Sh AUTHORS +.An Adrian Chadd Aq Mt adrian@FreeBSD.org Index: share/man/man4/Makefile =================================================================== --- share/man/man4/Makefile +++ share/man/man4/Makefile @@ -363,6 +363,7 @@ nsp.4 \ ${_ntb.4} \ null.4 \ + numa.4 \ ${_nvd.4} \ ${_nvme.4} \ ${_nvram.4} \ Index: share/man/man4/numa.4 =================================================================== --- /dev/null +++ share/man/man4/numa.4 @@ -0,0 +1,172 @@ +.\" Copyright (c) 2015 Adrian Chadd +.\" All rights reserved. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. 
+.\" +.\" THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND +.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +.\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE +.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +.\" SUCH DAMAGE. +.\" +.\" $FreeBSD$ +.\" +.Dd May 10, 2015 +.Dt NUMA 4 +.Os +.Sh NAME +.Nm NUMA +.Nd Non-Uniform Memory Access +.Sh SYNOPSIS +.Cd options SMP +.Cd options MAXMEMDOM=16 +.Pp +.In sys/numa.h +.In sys/cpuset.h +.In sys/bus.h +.Sh DESCRIPTION +Non-Uniform Memory Access is a computer architecture design which +involves unequal costs between processors, memory and IO devices +in a given system. +.Pp +In a +.Nm +architecture, the latency to access specific memory or IO devices +depends upon which processor the memory or device is attached to. +Accessing memory local to a processor is faster than accessing memory +that is connected to one of the other processors. +.Pp +.Nm +is enabled when the +.Cd MAXMEMDOM +option is used in a kernel configuration +file and is set to a value greater than 1. +.Pp +Thread and process +.Nm +policies are controlled with the +.Xr numa_setaffinity 2 +and +.Xr numa_getaffinity 2 +syscalls. +.Pp +The +.Xr numactl 1 +tool is available for starting processes with a non-default +policy, or to change the policy of an existing thread or process. +.Pp +Systems with non-uniform access to I/O devices may mark those devices +with the local VM domain identifier. 
+Drivers can find out their local domain information by calling +.Xr bus_get_domain 9 . +.Ss MIB Variables +The operation of +.Nm +is controlled, and information about it is exposed, through these +.Xr sysctl 8 +MIB variables: +.Pp +.Bl -tag -width indent -compact +.It Va vm.ndomains +The number of VM domains which have been detected. +.Pp +.It Va vm.default_policy +The default VM domain allocation policy. +Defaults to "first-touch-rr". +The valid values are "first-touch", "first-touch-rr", and +"rr", where "rr" is shorthand for "round-robin". +See +.Xr numa_setaffinity 2 +for more information about the available policies. +.Pp +.It Va vm.phys_locality +A table indicating the relative cost of each VM domain to each other. +A value of 10 indicates equal cost. +A value of -1 means the locality map is not available or no +locality information is available. +.Pp +.It Va vm.phys_segs +The map of physical memory, grouped by VM domain. +.El +.Sh IMPLEMENTATION NOTES +The current +.Nm +implementation is VM-focused. +The hardware +.Nm +domains are mapped into a contiguous, non-sparse +VM domain space, starting from 0. +Thus, VM domain information (for example, the domain identifier) is not +necessarily the same as is found in the hardware-specific information. +.Pp +The +.Nm +allocation policies are implemented as a policy and iterator in +.Pa sys/vm/vm_domain.c +and +.Pa sys/vm/vm_domain.h . +Policy information is available in both struct thread and struct proc. +Processes inherit +.Nm +policy from parent processes and threads inherit +.Nm +policy from parent threads. +Note that threads do not explicitly inherit their +.Nm +policy from processes. +Instead, if no thread policy is set, the system +will fall back to the process policy. +.Pp +For now, +.Nm +domain policies only influence physical page allocation in +.Pa sys/vm/vm_phys.c . +This is useful for userland memory allocation, but not for kernel +and driver memory allocation. +Support for kernel and driver memory allocation will be implemented in future work. 
+.Sh SEE ALSO +.Xr numactl 1 , +.Xr numa_getaffinity 2 , +.Xr numa_setaffinity 2 , +.Xr bus_get_domain 9 +.Sh HISTORY +.Nm +first appeared in +.Fx 9.0 +as a first-touch allocation policy with a fail-over to round-robin allocation +and was not configurable. +It was then modified in +.Fx 10.0 +to implement a round-robin allocation policy and was also not configurable. +.Pp +The +.Xr numa_getaffinity 2 +and +.Xr numa_setaffinity 2 +syscalls first appeared in +.Fx 11.0 . +.Pp +The +.Xr numactl 1 +tool first appeared in +.Fx 11.0 . +.Sh AUTHORS +This manual page written by +.An Adrian Chadd Aq Mt adrian@FreeBSD.org . +.Sh NOTES +No statistics are kept to indicate how often +.Nm +allocation policies succeed or fail. Index: sys/compat/freebsd32/freebsd32_syscall.h =================================================================== --- sys/compat/freebsd32/freebsd32_syscall.h +++ sys/compat/freebsd32/freebsd32_syscall.h @@ -455,4 +455,6 @@ #define FREEBSD32_SYS_freebsd32_ppoll 545 #define FREEBSD32_SYS_freebsd32_futimens 546 #define FREEBSD32_SYS_freebsd32_utimensat 547 -#define FREEBSD32_SYS_MAXSYSCALL 548 +#define FREEBSD32_SYS_numa_getaffinity 548 +#define FREEBSD32_SYS_numa_setaffinity 549 +#define FREEBSD32_SYS_MAXSYSCALL 550 Index: sys/compat/freebsd32/freebsd32_syscalls.c =================================================================== --- sys/compat/freebsd32/freebsd32_syscalls.c +++ sys/compat/freebsd32/freebsd32_syscalls.c @@ -581,4 +581,6 @@ "freebsd32_ppoll", /* 545 = freebsd32_ppoll */ "freebsd32_futimens", /* 546 = freebsd32_futimens */ "freebsd32_utimensat", /* 547 = freebsd32_utimensat */ + "numa_getaffinity", /* 548 = numa_getaffinity */ + "numa_setaffinity", /* 549 = numa_setaffinity */ }; Index: sys/compat/freebsd32/freebsd32_sysent.c =================================================================== --- sys/compat/freebsd32/freebsd32_sysent.c +++ sys/compat/freebsd32/freebsd32_sysent.c @@ -618,4 +618,6 @@ { AS(freebsd32_ppoll_args), (sy_call_t 
*)freebsd32_ppoll, AUE_POLL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 545 = freebsd32_ppoll */ { AS(freebsd32_futimens_args), (sy_call_t *)freebsd32_futimens, AUE_FUTIMES, NULL, 0, 0, 0, SY_THR_STATIC }, /* 546 = freebsd32_futimens */ { AS(freebsd32_utimensat_args), (sy_call_t *)freebsd32_utimensat, AUE_FUTIMESAT, NULL, 0, 0, 0, SY_THR_STATIC }, /* 547 = freebsd32_utimensat */ + { AS(numa_getaffinity_args), (sy_call_t *)sys_numa_getaffinity, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 548 = numa_getaffinity */ + { AS(numa_setaffinity_args), (sy_call_t *)sys_numa_setaffinity, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 549 = numa_setaffinity */ }; Index: sys/compat/freebsd32/freebsd32_systrace_args.c =================================================================== --- sys/compat/freebsd32/freebsd32_systrace_args.c +++ sys/compat/freebsd32/freebsd32_systrace_args.c @@ -3327,6 +3327,24 @@ *n_args = 4; break; } + /* numa_getaffinity */ + case 548: { + struct numa_getaffinity_args *p = params; + iarg[0] = p->which; /* cpuwhich_t */ + iarg[1] = p->id; /* id_t */ + uarg[2] = (intptr_t) p->policy; /* struct vm_domain_policy * */ + *n_args = 3; + break; + } + /* numa_setaffinity */ + case 549: { + struct numa_setaffinity_args *p = params; + iarg[0] = p->which; /* cpuwhich_t */ + iarg[1] = p->id; /* id_t */ + uarg[2] = (intptr_t) p->policy; /* const struct vm_domain_policy * */ + *n_args = 3; + break; + } default: *n_args = 0; break; @@ -8923,6 +8941,38 @@ break; }; break; + /* numa_getaffinity */ + case 548: + switch(ndx) { + case 0: + p = "cpuwhich_t"; + break; + case 1: + p = "id_t"; + break; + case 2: + p = "struct vm_domain_policy *"; + break; + default: + break; + }; + break; + /* numa_setaffinity */ + case 549: + switch(ndx) { + case 0: + p = "cpuwhich_t"; + break; + case 1: + p = "id_t"; + break; + case 2: + p = "const struct vm_domain_policy *"; + break; + default: + break; + }; + break; default: break; }; @@ -10811,6 +10861,16 @@ if (ndx == 0 || ndx == 1) p = "int"; 
break; + /* numa_getaffinity */ + case 548: + if (ndx == 0 || ndx == 1) + p = "int"; + break; + /* numa_setaffinity */ + case 549: + if (ndx == 0 || ndx == 1) + p = "int"; + break; default: break; }; Index: sys/compat/freebsd32/syscalls.master =================================================================== --- sys/compat/freebsd32/syscalls.master +++ sys/compat/freebsd32/syscalls.master @@ -1074,3 +1074,9 @@ 547 AUE_FUTIMESAT STD { int freebsd32_utimensat(int fd, \ char *path, \ struct timespec *times, int flag); } +548 AUE_NULL NOPROTO { int numa_getaffinity(cpuwhich_t which, \ + id_t id, \ + struct vm_domain_policy *policy); } +549 AUE_NULL NOPROTO { int numa_setaffinity(cpuwhich_t which, \ + id_t id, \ + const struct vm_domain_policy *policy); } Index: sys/conf/files =================================================================== --- sys/conf/files +++ sys/conf/files @@ -2984,6 +2984,7 @@ kern/kern_mtxpool.c standard kern/kern_mutex.c standard kern/kern_ntptime.c standard +kern/kern_numa.c standard kern/kern_osd.c standard kern/kern_physio.c standard kern/kern_pmc.c standard @@ -4012,6 +4013,7 @@ vm/vm_phys.c standard vm/vm_radix.c standard vm/vm_reserv.c standard +vm/vm_domain.c standard vm/vm_unix.c standard vm/vm_zeroidle.c standard vm/vnode_pager.c standard Index: sys/dev/sound/midi/sequencer.c =================================================================== --- sys/dev/sound/midi/sequencer.c +++ sys/dev/sound/midi/sequencer.c @@ -101,7 +101,7 @@ static d_open_t seq_open; static d_close_t seq_close; static d_ioctl_t seq_ioctl; -static d_read_t seq_read; +static d_read_t mseq_read; static d_write_t seq_write; static d_poll_t seq_poll; @@ -109,7 +109,7 @@ .d_version = D_VERSION, .d_open = seq_open, .d_close = seq_close, - .d_read = seq_read, + .d_read = mseq_read, .d_write = seq_write, .d_ioctl = seq_ioctl, .d_poll = seq_poll, @@ -858,7 +858,7 @@ } int -seq_read(struct cdev *i_dev, struct uio *uio, int ioflag) +mseq_read(struct cdev *i_dev, struct 
uio *uio, int ioflag) { int retval, used; struct seq_softc *scp = i_dev->si_drv1; @@ -869,12 +869,12 @@ if (scp == NULL) return ENXIO; - SEQ_DEBUG(7, printf("seq_read: unit %d, resid %zd.\n", + SEQ_DEBUG(7, printf("mseq_read: unit %d, resid %zd.\n", scp->unit, uio->uio_resid)); mtx_lock(&scp->seq_lock); if ((scp->fflags & FREAD) == 0) { - SEQ_DEBUG(2, printf("seq_read: unit %d is not for reading.\n", + SEQ_DEBUG(2, printf("mseq_read: unit %d is not for reading.\n", scp->unit)); retval = EIO; goto err1; @@ -927,7 +927,7 @@ retval = 0; err1: mtx_unlock(&scp->seq_lock); - SEQ_DEBUG(6, printf("seq_read: ret %d, resid %zd.\n", + SEQ_DEBUG(6, printf("mseq_read: ret %d, resid %zd.\n", retval, uio->uio_resid)); return retval; Index: sys/kern/init_main.c =================================================================== --- sys/kern/init_main.c +++ sys/kern/init_main.c @@ -87,6 +87,7 @@ #include #include #include +#include #include #include @@ -496,6 +497,12 @@ td->td_flags = TDF_INMEM; td->td_pflags = TDP_KTHREAD; td->td_cpuset = cpuset_thread0(); + vm_domain_policy_init(&td->td_vm_dom_policy); + vm_domain_policy_set(&td->td_vm_dom_policy, + VM_POLICY_NONE, -1); + vm_domain_policy_init(&p->p_vm_dom_policy); + vm_domain_policy_set(&p->p_vm_dom_policy, + VM_POLICY_NONE, -1); prison0_init(); p->p_peers = 0; p->p_leader = p; Index: sys/kern/init_sysent.c =================================================================== --- sys/kern/init_sysent.c +++ sys/kern/init_sysent.c @@ -588,4 +588,6 @@ { AS(ppoll_args), (sy_call_t *)sys_ppoll, AUE_POLL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 545 = ppoll */ { AS(futimens_args), (sy_call_t *)sys_futimens, AUE_FUTIMES, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 546 = futimens */ { AS(utimensat_args), (sy_call_t *)sys_utimensat, AUE_FUTIMESAT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 547 = utimensat */ + { AS(numa_getaffinity_args), (sy_call_t *)sys_numa_getaffinity, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 548 = 
numa_getaffinity */ + { AS(numa_setaffinity_args), (sy_call_t *)sys_numa_setaffinity, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 549 = numa_setaffinity */ }; Index: sys/kern/kern_cpuset.c =================================================================== --- sys/kern/kern_cpuset.c +++ sys/kern/kern_cpuset.c @@ -400,7 +400,7 @@ * -1 may be supplied for any argument to mean the current proc/thread or * the base set of the current thread. May fail with ESRCH/EPERM. */ -static int +int cpuset_which(cpuwhich_t which, id_t id, struct proc **pp, struct thread **tdp, struct cpuset **setp) { Index: sys/kern/kern_exit.c =================================================================== --- sys/kern/kern_exit.c +++ sys/kern/kern_exit.c @@ -86,6 +86,7 @@ #include #include #include +#include #ifdef KDTRACE_HOOKS #include @@ -951,6 +952,11 @@ #ifdef MAC mac_proc_destroy(p); #endif + /* + * Free any domain policy that's still hiding around. + */ + vm_domain_policy_cleanup(&p->p_vm_dom_policy); + KASSERT(FIRST_THREAD_IN_PROC(p), ("proc_reap: no residual thread!")); uma_zfree(proc_zone, p); Index: sys/kern/kern_fork.c =================================================================== --- sys/kern/kern_fork.c +++ sys/kern/kern_fork.c @@ -80,6 +80,7 @@ #include #include #include +#include #ifdef KDTRACE_HOOKS #include @@ -405,6 +406,7 @@ bcopy(&p1->p_startcopy, &p2->p_startcopy, __rangeof(struct proc, p_startcopy, p_endcopy)); pargs_hold(p2->p_args); + PROC_UNLOCK(p1); bzero(&p2->p_startzero, @@ -498,6 +500,18 @@ startprofclock(p2); td2->td_ucred = crhold(p2->p_ucred); + /* + * Whilst the proc lock is held, copy the VM domain data out + * using the VM domain method. 
+ */ + vm_domain_policy_init(&p2->p_vm_dom_policy); + if (vm_domain_policy_localcopy(&p2->p_vm_dom_policy, + &p1->p_vm_dom_policy) != 0) { + /* failed; fall through to none */ + vm_domain_policy_set(&p2->p_vm_dom_policy, + VM_POLICY_NONE, -1); + } + if (flags & RFSIGSHARE) { p2->p_sigacts = sigacts_hold(p1->p_sigacts); } else { Index: sys/kern/kern_numa.c =================================================================== --- /dev/null +++ sys/kern/kern_numa.c @@ -0,0 +1,174 @@ +/*- + * Copyright (c) 2015, Adrian Chadd + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +int +sys_numa_setaffinity(struct thread *td, struct numa_setaffinity_args *uap) +{ + int error; + struct vm_domain_policy vp; + struct thread *ttd; + struct proc *p; + struct cpuset *set = NULL; + + error = copyin(uap->policy, &vp, sizeof(vp)); + if (error) + goto out; + + /* + * Ensure the seq number is zero - otherwise seq.h + * may get very confused. + */ + vp.seq = 0; + + /* + * Validate policy. + */ + if (vm_domain_policy_validate(&vp) != 0) { + error = EINVAL; + goto out; + } + + /* + * Go find the desired proc/tid for this operation. + */ + error = cpuset_which(uap->which, uap->id, &p, + &ttd, &set); + if (error) + goto out; + + /* Only handle CPU_WHICH_TID and CPU_WHICH_PID */ + /* + * XXX if cpuset_which is called with WHICH_CPUSET and NULL cpuset, + * it'll return ESRCH. We should just return EINVAL. 
+ */ + switch (uap->which) { + case CPU_WHICH_TID: + if (vm_domain_policy_copy(&ttd->td_vm_dom_policy, + &vp) != 0) { + error = EINVAL; /* XXX */ + goto out2; + } + break; + case CPU_WHICH_PID: + if (vm_domain_policy_copy(&p->p_vm_dom_policy, + &vp) != 0) { + error = EINVAL; /* XXX */ + goto out2; + } + break; + default: + error = EINVAL; + goto out2; + } + +out2: + PROC_UNLOCK(p); +out: + if (set) + cpuset_rel(set); + return (error); +} + +/* + * numa_getaffinity(2): look up the thread or process selected by + * uap->which / uap->id, take a local snapshot of its NUMA domain policy + * and copy that snapshot out to the user buffer uap->policy. + * + * Returns 0 on success, the error from cpuset_which() if the lookup fails, + * EINVAL if 'which' is not CPU_WHICH_TID/CPU_WHICH_PID or the policy could + * not be snapshotted, or the error from copyout(). + */ +int +sys_numa_getaffinity(struct thread *td, struct numa_getaffinity_args *uap) +{ + int error; + struct vm_domain_policy vp; + struct thread *ttd; + struct proc *p; + struct cpuset *set = NULL; + + error = cpuset_which(uap->which, uap->id, &p, + &ttd, &set); + if (error) + goto out; + + /* Only handle CPU_WHICH_TID and CPU_WHICH_PID */ + /* + * XXX if cpuset_which is called with WHICH_CPUSET and NULL cpuset, + * it'll return ESRCH. We should just return EINVAL. + */ + switch (uap->which) { + case CPU_WHICH_TID: + if (vm_domain_policy_localcopy(&vp, + &ttd->td_vm_dom_policy) != 0) { + error = EINVAL; /* XXX */ + goto out2; + } + break; + case CPU_WHICH_PID: + if (vm_domain_policy_localcopy(&vp, + &p->p_vm_dom_policy) != 0) { + error = EINVAL; /* XXX */ + goto out2; + } + break; + default: + error = EINVAL; + goto out2; + } + +out2: + /* + * Drop the proc lock before touching user memory: copyout() may + * fault and sleep, so it must not run with the proc lock held. + * vp is a local snapshot, so it stays valid after the unlock. + */ + PROC_UNLOCK(p); + if (error == 0) + error = copyout(&vp, uap->policy, sizeof(vp)); +out: + if (set) + cpuset_rel(set); + return (error); +} Index: sys/kern/kern_thr.c =================================================================== --- sys/kern/kern_thr.c +++ sys/kern/kern_thr.c @@ -54,6 +54,8 @@ #include #include +#include + #include #include @@ -254,6 +256,17 @@ thread_unlock(td); if (P_SHOULDSTOP(p)) newtd->td_flags |= TDF_ASTPENDING | TDF_NEEDSUSPCHK; + + /* + * Copy the existing thread VM policy into the new thread. 
+ */ + if (vm_domain_policy_localcopy(&newtd->td_vm_dom_policy, + &td->td_vm_dom_policy) != 0) { + /* failed; fall through to none */ + vm_domain_policy_set(&newtd->td_vm_dom_policy, + VM_POLICY_NONE, -1); + } + PROC_UNLOCK(p); tidhash_add(newtd); Index: sys/kern/kern_thread.c =================================================================== --- sys/kern/kern_thread.c +++ sys/kern/kern_thread.c @@ -60,6 +60,7 @@ #include #include #include +#include #include SDT_PROVIDER_DECLARE(proc); @@ -352,6 +353,7 @@ return (NULL); } cpu_thread_alloc(td); + vm_domain_policy_init(&td->td_vm_dom_policy); return (td); } @@ -381,6 +383,7 @@ cpu_thread_free(td); if (td->td_kstack != 0) vm_thread_dispose(td); + vm_domain_policy_cleanup(&td->td_vm_dom_policy); uma_zfree(thread_zone, td); } Index: sys/kern/syscalls.c =================================================================== --- sys/kern/syscalls.c +++ sys/kern/syscalls.c @@ -555,4 +555,6 @@ "ppoll", /* 545 = ppoll */ "futimens", /* 546 = futimens */ "utimensat", /* 547 = utimensat */ + "numa_getaffinity", /* 548 = numa_getaffinity */ + "numa_setaffinity", /* 549 = numa_setaffinity */ }; Index: sys/kern/syscalls.master =================================================================== --- sys/kern/syscalls.master +++ sys/kern/syscalls.master @@ -988,5 +988,12 @@ 547 AUE_FUTIMESAT STD { int utimensat(int fd, \ char *path, \ struct timespec *times, int flag); } +548 AUE_NULL STD { int numa_getaffinity(cpuwhich_t which, \ + id_t id, \ + struct vm_domain_policy *policy); } +549 AUE_NULL STD { int numa_setaffinity(cpuwhich_t which, \ + id_t id, \ + const struct vm_domain_policy *policy); } + ; Please copy any additions and changes to the following compatability tables: ; sys/compat/freebsd32/syscalls.master Index: sys/kern/systrace_args.c =================================================================== --- sys/kern/systrace_args.c +++ sys/kern/systrace_args.c @@ -3337,6 +3337,24 @@ *n_args = 4; break; } + /* numa_getaffinity 
*/ + case 548: { + struct numa_getaffinity_args *p = params; + iarg[0] = p->which; /* cpuwhich_t */ + iarg[1] = p->id; /* id_t */ + uarg[2] = (intptr_t) p->policy; /* struct vm_domain_policy * */ + *n_args = 3; + break; + } + /* numa_setaffinity */ + case 549: { + struct numa_setaffinity_args *p = params; + iarg[0] = p->which; /* cpuwhich_t */ + iarg[1] = p->id; /* id_t */ + uarg[2] = (intptr_t) p->policy; /* const struct vm_domain_policy * */ + *n_args = 3; + break; + } default: *n_args = 0; break; @@ -8883,6 +8901,38 @@ break; }; break; + /* numa_getaffinity */ + case 548: + switch(ndx) { + case 0: + p = "cpuwhich_t"; + break; + case 1: + p = "id_t"; + break; + case 2: + p = "struct vm_domain_policy *"; + break; + default: + break; + }; + break; + /* numa_setaffinity */ + case 549: + switch(ndx) { + case 0: + p = "cpuwhich_t"; + break; + case 1: + p = "id_t"; + break; + case 2: + p = "const struct vm_domain_policy *"; + break; + default: + break; + }; + break; default: break; }; @@ -10806,6 +10856,16 @@ if (ndx == 0 || ndx == 1) p = "int"; break; + /* numa_getaffinity */ + case 548: + if (ndx == 0 || ndx == 1) + p = "int"; + break; + /* numa_setaffinity */ + case 549: + if (ndx == 0 || ndx == 1) + p = "int"; + break; default: break; }; Index: sys/sys/_vm_domain.h =================================================================== --- /dev/null +++ sys/sys/_vm_domain.h @@ -0,0 +1,63 @@ +/*- + * Copyright (c) 2015 Adrian Chadd . + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer, + * without modification. + * 2. 
Redistributions in binary form must reproduce at minimum a disclaimer + * similar to the "NO WARRANTY" disclaimer below ("Disclaimer") and any + * redistribution must be conditioned upon including a substantially + * similar Disclaimer requirement for further binary redistribution. + * + * NO WARRANTY + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF NONINFRINGEMENT, MERCHANTIBILITY + * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL + * THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, + * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGES. 
+ * + * $FreeBSD$ + */ +#ifndef ___PRIV_VM_DOMAIN_H___ +#define ___PRIV_VM_DOMAIN_H___ + +#include + +typedef enum { + VM_POLICY_NONE, + VM_POLICY_ROUND_ROBIN, + VM_POLICY_FIXED_DOMAIN, + VM_POLICY_FIXED_DOMAIN_ROUND_ROBIN, + VM_POLICY_FIRST_TOUCH, + VM_POLICY_FIRST_TOUCH_ROUND_ROBIN, + VM_POLICY_MAX +} vm_domain_policy_type_t; + +struct vm_domain_policy { + seq_t seq; + vm_domain_policy_type_t policy; + int domain; +}; + +struct vm_domain_iterator { + vm_domain_policy_type_t policy; + int domain; + int n; +}; + +#define VM_DOMAIN_POLICY_STATIC_INITIALISER(vt, vd) \ + { .policy = vt, \ + .seq = 0, \ + .domain = vd } + +#endif /* ___PRIV_VM_DOMAIN_H___ */ Index: sys/sys/cpuset.h =================================================================== --- sys/sys/cpuset.h +++ sys/sys/cpuset.h @@ -124,6 +124,9 @@ int cpuset_setithread(lwpid_t id, int cpu); int cpuset_create_root(struct prison *, struct cpuset **); int cpuset_setproc_update_set(struct proc *, struct cpuset *); +int cpuset_which(cpuwhich_t, id_t, struct proc **, + struct thread **, struct cpuset **); + char *cpusetobj_strprint(char *, const cpuset_t *); int cpusetobj_strscan(cpuset_t *, const char *); #ifdef DDB Index: sys/sys/numa.h =================================================================== --- /dev/null +++ sys/sys/numa.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2015 Adrian Chadd . + * All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Michael Fischbein. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ +#ifndef __SYS_NUMA_H__ +#define __SYS_NUMA_H__ + +#include + +extern int numa_setaffinity(cpuwhich_t which, id_t id, + struct vm_domain_policy *vd); +extern int numa_getaffinity(cpuwhich_t which, id_t id, + struct vm_domain_policy *vd); + +#endif /* __SYS_NUMA_H__ */ Index: sys/sys/proc.h =================================================================== --- sys/sys/proc.h +++ sys/sys/proc.h @@ -63,6 +63,7 @@ #endif #include #include +#include #include /* Machine-dependent proc substruct. */ /* @@ -217,6 +218,7 @@ struct turnstile *td_turnstile; /* (k) Associated turnstile. */ struct rl_q_entry *td_rlqe; /* (k) Associated range lock entry. */ struct umtx_q *td_umtxq; /* (c?) Link for when we're blocked. 
*/ + struct vm_domain_policy td_vm_dom_policy; /* (c) current numa domain policy */ lwpid_t td_tid; /* (b) Thread ID. */ sigqueue_t td_sigqueue; /* (c) Sigs arrived, not delivered. */ #define td_siglist td_sigqueue.sq_signals @@ -603,6 +605,7 @@ uint64_t p_prev_runtime; /* (c) Resource usage accounting. */ struct racct *p_racct; /* (b) Resource accounting. */ u_char p_throttled; /* (c) Flag for racct pcpu throttling */ + struct vm_domain_policy p_vm_dom_policy; /* (c) process default VM domain, or -1 */ /* * An orphan is the child that has beed re-parented to the * debugger as a result of attaching to it. Need to keep Index: sys/sys/syscall.h =================================================================== --- sys/sys/syscall.h +++ sys/sys/syscall.h @@ -465,4 +465,6 @@ #define SYS_ppoll 545 #define SYS_futimens 546 #define SYS_utimensat 547 -#define SYS_MAXSYSCALL 548 +#define SYS_numa_getaffinity 548 +#define SYS_numa_setaffinity 549 +#define SYS_MAXSYSCALL 550 Index: sys/sys/syscall.mk =================================================================== --- sys/sys/syscall.mk +++ sys/sys/syscall.mk @@ -412,4 +412,6 @@ procctl.o \ ppoll.o \ futimens.o \ - utimensat.o + utimensat.o \ + numa_getaffinity.o \ + numa_setaffinity.o Index: sys/sys/sysproto.h =================================================================== --- sys/sys/sysproto.h +++ sys/sys/sysproto.h @@ -1790,6 +1790,16 @@ char times_l_[PADL_(struct timespec *)]; struct timespec * times; char times_r_[PADR_(struct timespec *)]; char flag_l_[PADL_(int)]; int flag; char flag_r_[PADR_(int)]; }; +struct numa_getaffinity_args { + char which_l_[PADL_(cpuwhich_t)]; cpuwhich_t which; char which_r_[PADR_(cpuwhich_t)]; + char id_l_[PADL_(id_t)]; id_t id; char id_r_[PADR_(id_t)]; + char policy_l_[PADL_(struct vm_domain_policy *)]; struct vm_domain_policy * policy; char policy_r_[PADR_(struct vm_domain_policy *)]; +}; +struct numa_setaffinity_args { + char which_l_[PADL_(cpuwhich_t)]; cpuwhich_t which; char 
which_r_[PADR_(cpuwhich_t)]; + char id_l_[PADL_(id_t)]; id_t id; char id_r_[PADR_(id_t)]; + char policy_l_[PADL_(const struct vm_domain_policy *)]; const struct vm_domain_policy * policy; char policy_r_[PADR_(const struct vm_domain_policy *)]; +}; int nosys(struct thread *, struct nosys_args *); void sys_sys_exit(struct thread *, struct sys_exit_args *); int sys_fork(struct thread *, struct fork_args *); @@ -2178,6 +2188,8 @@ int sys_ppoll(struct thread *, struct ppoll_args *); int sys_futimens(struct thread *, struct futimens_args *); int sys_utimensat(struct thread *, struct utimensat_args *); +int sys_numa_getaffinity(struct thread *, struct numa_getaffinity_args *); +int sys_numa_setaffinity(struct thread *, struct numa_setaffinity_args *); #ifdef COMPAT_43 @@ -2931,6 +2943,8 @@ #define SYS_AUE_ppoll AUE_POLL #define SYS_AUE_futimens AUE_FUTIMES #define SYS_AUE_utimensat AUE_FUTIMESAT +#define SYS_AUE_numa_getaffinity AUE_NULL +#define SYS_AUE_numa_setaffinity AUE_NULL #undef PAD_ #undef PADL_ Index: sys/vm/vm_domain.h =================================================================== --- /dev/null +++ sys/vm/vm_domain.h @@ -0,0 +1,60 @@ +/*- + * Copyright (c) 2015 Adrian Chadd . + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer, + * without modification. + * 2. Redistributions in binary form must reproduce at minimum a disclaimer + * similar to the "NO WARRANTY" disclaimer below ("Disclaimer") and any + * redistribution must be conditioned upon including a substantially + * similar Disclaimer requirement for further binary redistribution. 
+ * + * NO WARRANTY + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF NONINFRINGEMENT, MERCHANTIBILITY + * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL + * THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, + * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGES. + * + * $FreeBSD$ + */ +#ifndef __VM_DOMAIN_H__ +#define __VM_DOMAIN_H__ + +#include + +/* + * TODO: check to see if these should just become inline functions + * at some point. + */ +extern int vm_domain_policy_init(struct vm_domain_policy *vp); +extern int vm_domain_policy_set(struct vm_domain_policy *vp, + vm_domain_policy_type_t vt, int domain); +extern int vm_domain_policy_cleanup(struct vm_domain_policy *vp); +extern int vm_domain_policy_localcopy(struct vm_domain_policy *dst, + const struct vm_domain_policy *src); +extern int vm_domain_policy_copy(struct vm_domain_policy *dst, + const struct vm_domain_policy *src); +extern int vm_domain_policy_validate(const struct vm_domain_policy *vp); + +extern int vm_domain_iterator_init(struct vm_domain_iterator *vi); +extern int vm_domain_iterator_set(struct vm_domain_iterator *vi, + vm_domain_policy_type_t vt, int domain); +extern int vm_domain_iterator_set_policy(struct vm_domain_iterator *vi, + const struct vm_domain_policy *vt); +extern int vm_domain_iterator_run(struct vm_domain_iterator *vi, + int *domain); +extern int vm_domain_iterator_isdone(struct vm_domain_iterator *vi); +extern int vm_domain_iterator_cleanup(struct 
vm_domain_iterator *vi); + +#endif /* __VM_DOMAIN_H__ */ Index: sys/vm/vm_domain.c =================================================================== --- /dev/null +++ sys/vm/vm_domain.c @@ -0,0 +1,399 @@ +/*- + * Copyright (c) 2015 Adrian Chadd . + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer, + * without modification. + * 2. Redistributions in binary form must reproduce at minimum a disclaimer + * similar to the "NO WARRANTY" disclaimer below ("Disclaimer") and any + * redistribution must be conditioned upon including a substantially + * similar Disclaimer requirement for further binary redistribution. + * + * NO WARRANTY + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF NONINFRINGEMENT, MERCHANTIBILITY + * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL + * THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, + * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGES. 
+ */
+
+#include
+__FBSDID("$FreeBSD$");
+
+#include "opt_vm.h"
+#include "opt_ddb.h"
+
+#include
+#include
+#include
+#include
+#include
+#include
+#if MAXMEMDOM > 1
+#include
+#endif
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+/*
+ * Advance the calling thread's round-robin cursor (td_dom_rr_idx) by one,
+ * wrapping at vm_ndomains, and return the new cursor value.
+ *
+ * Kernels built with MAXMEMDOM <= 1 always return domain 0.
+ */
+static __inline int
+vm_domain_rr_selectdomain(void)
+{
+#if MAXMEMDOM > 1
+	struct thread *td;
+
+	td = curthread;
+
+	td->td_dom_rr_idx++;
+	td->td_dom_rr_idx %= vm_ndomains;
+	return (td->td_dom_rr_idx);
+#else
+	return (0);
+#endif
+}
+
+/*
+ * This implements a very simple set of VM domain memory allocation
+ * policies and iterators.
+ */
+
+/*
+ * A VM domain policy represents a desired VM domain policy.
+ * Iterators implement searching through VM domains in a specific
+ * order.
+ */
+
+/*
+ * When setting a policy, the caller must establish their own
+ * exclusive write protection for the contents of the domain
+ * policy.
+ */
+
+/*
+ * Reset a policy to its initial state: all bytes zeroed (which also
+ * zeroes the sequence counter), policy VM_POLICY_NONE, domain -1.
+ *
+ * Always returns 0.
+ */
+int
+vm_domain_policy_init(struct vm_domain_policy *vp)
+{
+
+	bzero(vp, sizeof(*vp));
+	vp->policy = VM_POLICY_NONE;
+	vp->domain = -1;
+	return (0);
+}
+
+/*
+ * Store a new policy type and domain into vp.
+ *
+ * The two stores are bracketed by seq_write_begin()/seq_write_end() so
+ * that readers using seq_read()/seq_consistent() can detect an update
+ * in progress.  The values are not validated here.
+ *
+ * Always returns 0.
+ */
+int
+vm_domain_policy_set(struct vm_domain_policy *vp,
+    vm_domain_policy_type_t vt, int domain)
+{
+
+	seq_write_begin(&vp->seq);
+	vp->policy = vt;
+	vp->domain = domain;
+	seq_write_end(&vp->seq);
+	return (0);
+}
+
+/*
+ * Take a local copy of a policy.
+ *
+ * The destination policy isn't write-barriered; this is used
+ * for doing local copies into something that isn't shared.
+ *
+ * Returns 0 + the policy copied if OK.
+ * Returns -1 if there was a problem.
+ */
+int
+vm_domain_policy_localcopy(struct vm_domain_policy *dst,
+    const struct vm_domain_policy *src)
+{
+	int i;
+	seq_t seq;
+
+	/*
+	 * Bounded retry: snapshot the whole structure (including its seq
+	 * field) and accept it only if the source sequence number did not
+	 * change while it was being copied.  Give up after 10 attempts.
+	 */
+	for (i = 0; i < 10; i++) {
+		seq = seq_read(&src->seq);
+		*dst = *src;
+		if (seq_consistent(&src->seq, seq))
+			return (0);
+		cpu_spinwait();
+	}
+
+	/* On failure *dst holds the last, possibly torn, snapshot. */
+	return (-1);
+}
+
+/*
+ * Take a write-barrier copy of a policy.
+ * + * The destination policy iswrite -barriered; this is used + * for doing copies into policies that may be read by other + * threads. + * + * Returns 0 + the policy copied if OK. + * Returns -1 if there was a problem. + */ +int +vm_domain_policy_copy(struct vm_domain_policy *dst, + const struct vm_domain_policy *src) +{ + int i; + seq_t seq; + struct vm_domain_policy d; + + for (i = 0; i < 10; i++) { + seq = seq_read(&src->seq); + d = *src; + if (seq_consistent(&src->seq, seq)) { + seq_write_begin(&dst->seq); + dst->domain = d.domain; + dst->policy = d.policy; + seq_write_end(&dst->seq); + return (0); + } + cpu_spinwait(); + } + + return (-1); +} + +int +vm_domain_policy_validate(const struct vm_domain_policy *vp) +{ + + switch (vp->policy) { + case VM_POLICY_NONE: + case VM_POLICY_ROUND_ROBIN: + case VM_POLICY_FIRST_TOUCH: + case VM_POLICY_FIRST_TOUCH_ROUND_ROBIN: + if (vp->domain == -1) + return (0); + return (-1); + case VM_POLICY_FIXED_DOMAIN: + case VM_POLICY_FIXED_DOMAIN_ROUND_ROBIN: + if (vp->domain >= 0 & vp->domain < vm_ndomains) + return (0); + return (-1); + default: + return (-1); + } + return (-1); +} + +int +vm_domain_policy_cleanup(struct vm_domain_policy *vp) +{ + + /* For now, empty */ + return (0); +} + +int +vm_domain_iterator_init(struct vm_domain_iterator *vi) +{ + + /* Nothing to do for now */ + return (0); +} + +/* + * Manually setup an iterator with the given details. 
+ */ +int +vm_domain_iterator_set(struct vm_domain_iterator *vi, + vm_domain_policy_type_t vt, int domain) +{ + + switch (vt) { + case VM_POLICY_FIXED_DOMAIN: + vi->policy = VM_POLICY_FIXED_DOMAIN; + vi->domain = domain; + vi->n = 1; + break; + case VM_POLICY_FIXED_DOMAIN_ROUND_ROBIN: + vi->policy = VM_POLICY_FIXED_DOMAIN_ROUND_ROBIN; + vi->domain = domain; + vi->n = vm_ndomains; + break; + case VM_POLICY_FIRST_TOUCH: + vi->policy = VM_POLICY_FIRST_TOUCH; + vi->domain = PCPU_GET(domain); + vi->n = 1; + break; + case VM_POLICY_FIRST_TOUCH_ROUND_ROBIN: + vi->policy = VM_POLICY_FIRST_TOUCH_ROUND_ROBIN; + vi->domain = PCPU_GET(domain); + vi->n = vm_ndomains; + break; + case VM_POLICY_ROUND_ROBIN: + default: + vi->policy = VM_POLICY_ROUND_ROBIN; + vi->domain = -1; + vi->n = vm_ndomains; + break; + } + return (0); +} + +/* + * Setup an iterator based on the given policy. + */ +static inline int +_vm_domain_iterator_set_policy(struct vm_domain_iterator *vi, + const struct vm_domain_policy *vt) +{ + /* + * Initialise the iterator. + * + * For first-touch, the initial domain is set + * via the current thread CPU domain. + * + * For fixed-domain, it's assumed that the + * caller has initialised the specific domain + * it is after. + */ + switch (vt->policy) { + case VM_POLICY_FIXED_DOMAIN: + vi->policy = vt->policy; + vi->domain = vt->domain; + vi->n = 1; + break; + case VM_POLICY_FIXED_DOMAIN_ROUND_ROBIN: + vi->policy = vt->policy; + vi->domain = vt->domain; + vi->n = vm_ndomains; + break; + case VM_POLICY_FIRST_TOUCH: + vi->policy = vt->policy; + vi->domain = PCPU_GET(domain); + vi->n = 1; + break; + case VM_POLICY_FIRST_TOUCH_ROUND_ROBIN: + vi->policy = vt->policy; + vi->domain = PCPU_GET(domain); + vi->n = vm_ndomains; + break; + case VM_POLICY_ROUND_ROBIN: + default: + /* + * Default to round-robin policy. 
+ */ + vi->policy = VM_POLICY_ROUND_ROBIN; + vi->domain = -1; + vi->n = vm_ndomains; + break; + } + + return (0); +} + +int +vm_domain_iterator_set_policy(struct vm_domain_iterator *vi, + const struct vm_domain_policy *vt) +{ + int count; + seq_t seq; + struct vm_domain_policy vt_lcl; + + for (count = 0; count < 10; count++) { + seq = seq_read(&vt->seq); + vt_lcl = *vt; + if (seq_consistent(&vt->seq, seq)) + break; + cpu_spinwait(); + } + + if (count < 10) + return (_vm_domain_iterator_set_policy(vi, &vt_lcl)); + + /* + * Default to round-robin policy. + */ + vi->policy = VM_POLICY_ROUND_ROBIN; + vi->domain = -1; + vi->n = vm_ndomains; + + return (0); +} + +/* + * Return the next VM domain to use. + * + * Returns 0 w/ domain set to the next domain to use, or + * -1 to indicate no more domains are available. + */ +int +vm_domain_iterator_run(struct vm_domain_iterator *vi, int *domain) +{ + + /* General catch-all */ + if (vi->n <= 0) + return (-1); + + switch (vi->policy) { + case VM_POLICY_FIXED_DOMAIN: + case VM_POLICY_FIRST_TOUCH: + *domain = vi->domain; + vi->n--; + break; + case VM_POLICY_FIXED_DOMAIN_ROUND_ROBIN: + case VM_POLICY_FIRST_TOUCH_ROUND_ROBIN: + /* + * XXX TODO: skip over the rr'ed domain + * if it equals the one we started with. + */ + if (vi->n == vm_ndomains) + *domain = vi->domain; + else + *domain = vm_domain_rr_selectdomain(); + vi->n--; + break; + case VM_POLICY_ROUND_ROBIN: + default: + *domain = vm_domain_rr_selectdomain(); + vi->n--; + break; + } + + return (0); +} + +/* + * Returns 1 if the iteration is done, or 0 if it has not. + + * This can only be called after at least one loop through + * the iterator. Ie, it's designed to be used as a tail + * check of a loop, not the head check of a loop. 
+ */
+int
+vm_domain_iterator_isdone(struct vm_domain_iterator *vi)
+{
+
+	/* 'n' counts the domains still to be handed out. */
+	return (vi->n <= 0);
+}
+
+/*
+ * Release any resources held by an iterator.  Always returns 0.
+ */
+int
+vm_domain_iterator_cleanup(struct vm_domain_iterator *vi)
+{
+
+	return (0);
+}
Index: sys/vm/vm_page.c
===================================================================
--- sys/vm/vm_page.c
+++ sys/vm/vm_page.c
@@ -1625,6 +1625,7 @@
 	 * vm_page_cache().
 	 */
 	mtx_lock_flags(&vm_page_queue_free_mtx, MTX_RECURSE);
+	m = NULL;
 	if (vm_cnt.v_free_count + vm_cnt.v_cache_count > vm_cnt.v_free_reserved ||
 	    (req_class == VM_ALLOC_SYSTEM &&
 	    vm_cnt.v_free_count + vm_cnt.v_cache_count > vm_cnt.v_interrupt_free_min) ||
@@ -1669,7 +1670,19 @@
 			}
 #endif
 		}
-	} else {
+	}
+
+	/*
+	 * Can't allocate or attempted to and couldn't allocate a page
+	 * given the current VM policy. Give up.
+	 *
+	 * Note - yes, this is one of the current shortcomings of the
+	 * VM domain design - there's a global set of vm_cnt counters,
+	 * and it's quite possible things will get unhappy with this.
+	 * However, without this check the kernel would panic below -
+	 * the code previously did not test m == NULL here and carried on.
+	 */
+	if (m == NULL) {
 		/*
 		 * Not allocatable, give up.
 		 */
Index: sys/vm/vm_phys.c
===================================================================
--- sys/vm/vm_phys.c
+++ sys/vm/vm_phys.c
@@ -57,6 +57,7 @@
 #include
 #include
 #include
+#include
 #include
@@ -67,6 +68,8 @@
 #include
 #include
+#include
+
 _Static_assert(sizeof(long) * NBBY >= VM_PHYSSEG_MAX,
     "Too many physsegs.");
@@ -141,13 +144,30 @@
 SYSCTL_OID(_vm, OID_AUTO, phys_segs, CTLTYPE_STRING | CTLFLAG_RD,
     NULL, 0, sysctl_vm_phys_segs, "A", "Phys Seg Info");
+#if MAXMEMDOM > 1
 static int sysctl_vm_phys_locality(SYSCTL_HANDLER_ARGS);
 SYSCTL_OID(_vm, OID_AUTO, phys_locality, CTLTYPE_STRING | CTLFLAG_RD,
     NULL, 0, sysctl_vm_phys_locality, "A", "Phys Locality Info");
+#endif
 SYSCTL_INT(_vm, OID_AUTO, ndomains, CTLFLAG_RD,
     &vm_ndomains, 0, "Number of physical memory domains available.");
+/*
+ * Default to first-touch + round-robin.
+ */ +static struct mtx vm_default_policy_mtx; +MTX_SYSINIT(vm_default_policy, &vm_default_policy_mtx, "default policy mutex", + MTX_DEF); +#if MAXMEMDOM > 1 +static struct vm_domain_policy vm_default_policy = + VM_DOMAIN_POLICY_STATIC_INITIALISER(VM_POLICY_FIRST_TOUCH_ROUND_ROBIN, 0); +#else +/* Use round-robin so the domain policy code will only try once per allocation */ +static struct vm_domain_policy vm_default_policy = + VM_DOMAIN_POLICY_STATIC_INITIALISER(VM_POLICY_ROUND_ROBIN, 0); +#endif + static vm_page_t vm_phys_alloc_domain_pages(int domain, int flind, int pool, int order); static void _vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int domain); @@ -156,6 +176,60 @@ static void vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl, int order); +static int +sysctl_vm_default_policy(SYSCTL_HANDLER_ARGS) +{ + char policy_name[32]; + int error; + + mtx_lock(&vm_default_policy_mtx); + + /* Map policy to output string */ + switch (vm_default_policy.policy) { + case VM_POLICY_FIRST_TOUCH: + strcpy(policy_name, "first-touch"); + break; + case VM_POLICY_FIRST_TOUCH_ROUND_ROBIN: + strcpy(policy_name, "first-touch-rr"); + break; + case VM_POLICY_ROUND_ROBIN: + default: + strcpy(policy_name, "rr"); + break; + } + mtx_unlock(&vm_default_policy_mtx); + + error = sysctl_handle_string(oidp, &policy_name[0], + sizeof(policy_name), req); + if (error != 0 || req->newptr == NULL) + return (error); + + mtx_lock(&vm_default_policy_mtx); + /* Set: match on the subset of policies that make sense as a default */ + if (strcmp("first-touch-rr", policy_name) == 0) { + vm_domain_policy_set(&vm_default_policy, + VM_POLICY_FIRST_TOUCH_ROUND_ROBIN, 0); + } else if (strcmp("first-touch", policy_name) == 0) { + vm_domain_policy_set(&vm_default_policy, + VM_POLICY_FIRST_TOUCH, 0); + } else if (strcmp("rr", policy_name) == 0) { + vm_domain_policy_set(&vm_default_policy, + VM_POLICY_ROUND_ROBIN, 0); + } else { + error = EINVAL; + goto finish; + } + + error = 0; +finish: + 
mtx_unlock(&vm_default_policy_mtx); + return (error); +} + +SYSCTL_PROC(_vm, OID_AUTO, default_policy, CTLTYPE_STRING | CTLFLAG_RW, + 0, 0, sysctl_vm_default_policy, "A", + "Default policy (rr, first-touch, first-touch-rr"); + /* * Red-black tree helpers for vm fictitious range management. */ @@ -213,6 +287,54 @@ #endif } +/* + * Initialise a VM domain iterator. + * + * Check the thread policy, then the proc policy, + * then default to the system policy. + * + * Later on the various layers will have this logic + * plumbed into them and the phys code will be explicitly + * handed a VM domain policy to use. + */ +static void +vm_policy_iterator_init(struct vm_domain_iterator *vi) +{ +#if MAXMEMDOM > 1 + struct vm_domain_policy lcl; +#endif + + vm_domain_iterator_init(vi); + +#if MAXMEMDOM > 1 + /* Attempt to copy out the thread policy */ + if ((vm_domain_policy_localcopy(&lcl, + &curthread->td_vm_dom_policy) == 0) && + (lcl.policy != VM_POLICY_NONE)) { + /* Thread policy is present; use it */ + vm_domain_iterator_set_policy(vi, &lcl); + } else if ((vm_domain_policy_localcopy(&lcl, + &curthread->td_proc->p_vm_dom_policy) == 0) && + (lcl.policy != VM_POLICY_NONE)) { + /* Process policy is present; use it */ + vm_domain_iterator_set_policy(vi, &lcl); + + } else { + /* Use system default policy */ + vm_domain_iterator_set_policy(vi, &vm_default_policy); + } +#else + vm_domain_iterator_set_policy(vi, &vm_default_policy); +#endif +} + +static void +vm_policy_iterator_finish(struct vm_domain_iterator *vi) +{ + + vm_domain_iterator_cleanup(vi); +} + boolean_t vm_phys_domain_intersects(long mask, vm_paddr_t low, vm_paddr_t high) { @@ -309,13 +431,18 @@ vm_phys_mem_affinity(int f, int t) { +#if MAXMEMDOM > 1 if (mem_locality == NULL) return (-1); if (f >= vm_ndomains || t >= vm_ndomains) return (-1); return (mem_locality[f * vm_ndomains + t]); +#else + return (-1); +#endif } +#if MAXMEMDOM > 1 /* * Outputs the VM locality table. 
*/ @@ -343,6 +470,7 @@ sbuf_delete(&sbuf); return (error); } +#endif static void vm_freelist_add(struct vm_freelist *fl, vm_page_t m, int order, int tail) @@ -634,15 +762,17 @@ vm_phys_alloc_pages(int pool, int order) { vm_page_t m; - int dom, domain, flind; + int domain, flind; + struct vm_domain_iterator vi; KASSERT(pool < VM_NFREEPOOL, ("vm_phys_alloc_pages: pool %d is out of range", pool)); KASSERT(order < VM_NFREEORDER, ("vm_phys_alloc_pages: order %d is out of range", order)); - for (dom = 0; dom < vm_ndomains; dom++) { - domain = vm_rr_selectdomain(); + vm_policy_iterator_init(&vi); + + while ((vm_domain_iterator_run(&vi, &domain)) == 0) { for (flind = 0; flind < vm_nfreelists; flind++) { m = vm_phys_alloc_domain_pages(domain, flind, pool, order); @@ -650,6 +780,8 @@ return (m); } } + + vm_policy_iterator_finish(&vi); return (NULL); } @@ -664,7 +796,8 @@ vm_phys_alloc_freelist_pages(int freelist, int pool, int order) { vm_page_t m; - int dom, domain; + struct vm_domain_iterator vi; + int domain; KASSERT(freelist < VM_NFREELIST, ("vm_phys_alloc_freelist_pages: freelist %d is out of range", @@ -673,13 +806,17 @@ ("vm_phys_alloc_freelist_pages: pool %d is out of range", pool)); KASSERT(order < VM_NFREEORDER, ("vm_phys_alloc_freelist_pages: order %d is out of range", order)); - for (dom = 0; dom < vm_ndomains; dom++) { - domain = vm_rr_selectdomain(); + + vm_policy_iterator_init(&vi); + + while ((vm_domain_iterator_run(&vi, &domain)) == 0) { m = vm_phys_alloc_domain_pages(domain, vm_freelist_to_flind[freelist], pool, order); if (m != NULL) return (m); } + + vm_policy_iterator_finish(&vi); return (NULL); } @@ -1169,7 +1306,8 @@ vm_paddr_t pa, pa_last, size; vm_page_t m, m_ret; u_long npages_end; - int dom, domain, flind, oind, order, pind; + int domain, flind, oind, order, pind; + struct vm_domain_iterator vi; mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); size = npages << PAGE_SHIFT; @@ -1181,9 +1319,15 @@ ("vm_phys_alloc_contig: boundary must be a power of 
2")); /* Compute the queue that is the best fit for npages. */ for (order = 0; (1 << order) < npages; order++); - dom = 0; + + vm_policy_iterator_init(&vi); + restartdom: - domain = vm_rr_selectdomain(); + if (vm_domain_iterator_run(&vi, &domain) != 0) { + vm_policy_iterator_finish(&vi); + return (NULL); + } + for (flind = 0; flind < vm_nfreelists; flind++) { for (oind = min(order, VM_NFREEORDER - 1); oind < VM_NFREEORDER; oind++) { for (pind = 0; pind < VM_NFREEPOOL; pind++) { @@ -1241,8 +1385,9 @@ } } } - if (++dom < vm_ndomains) + if (!vm_domain_iterator_isdone(&vi)) goto restartdom; + vm_policy_iterator_finish(&vi); return (NULL); done: for (m = m_ret; m < &m_ret[npages]; m = &m[1 << oind]) { Index: usr.bin/Makefile =================================================================== --- usr.bin/Makefile +++ usr.bin/Makefile @@ -117,6 +117,7 @@ nice \ nl \ ${_nm} \ + numactl \ nohup \ opieinfo \ opiekey \ Index: usr.bin/numactl/Makefile =================================================================== --- /dev/null +++ usr.bin/numactl/Makefile @@ -0,0 +1,5 @@ +# $FreeBSD$ + +PROG= numactl + +.include Index: usr.bin/numactl/numactl.1 =================================================================== --- /dev/null +++ usr.bin/numactl/numactl.1 @@ -0,0 +1,132 @@ +.\" Copyright (c) 2015 Adrian Chadd +.\" All rights reserved. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. 
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\" $FreeBSD$
+.\"
+.Dd May 9, 2015
+.Dt NUMACTL 1
+.Os
+.Sh NAME
+.Nm numactl
+.Nd "manage NUMA policy configuration"
+.Sh SYNOPSIS
+.Nm
+.Op Fl -mempolicy Ar policy
+.Op Fl -memdomain Ar domain
+.Op Fl -cpudomain Ar domain
+.Ar cmd ...
+.Nm
+.Fl -get
+.Op Fl -tid Ar tid
+.Op Fl -pid Ar pid
+.Nm
+.Fl -set
+.Op Fl -mempolicy Ar policy
+.Op Fl -memdomain Ar domain
+.Op Fl -cpudomain Ar domain
+.Op Fl -tid Ar tid
+.Op Fl -pid Ar pid
+.Sh DESCRIPTION
+The
+.Nm
+command can be used to assign NUMA policies to processes/threads,
+run commands with a given NUMA policy, and query information
+about NUMA policies on running processes.
+.Pp
+.Nm
+requires a target to modify or query.
+The target may be specified as a command, process id or a thread id.
+Using
+.Fl -get
+the target's NUMA policy may be queried.
+Using
+.Fl -set
+the target's NUMA policy may be set.
+A target must always be given: if no command, process id or thread id
+is specified,
+.Nm
+prints a usage message and exits.
+Not all combinations of operations and targets are supported.
+For example,
+you may not set the id of an existing set or query and launch a command
+at the same time.
+.Pp
+Each process and thread has a NUMA policy.
+By default the policy is NONE.
+If a thread policy is NONE, then the policy will fall back to the process.
+If the process policy is NONE, then the policy will fall back to the
+system default.
+The policy may be queried by using
+.Fl -get .
+.Pp
+The options are as follows:
+.Bl -tag -width ".Fl -cpudomain Ar domain"
+.It Fl -cpudomain Ar domain
+Set the given CPU scheduling policy.
+Constrain the object (tid, pid, command) to run on CPUs
+that belong to the given domain.
+.It Fl -get
+Retrieve the NUMA policy for the given thread or process id.
+.It Fl -set
+Set the NUMA policy for the given thread or process id.
+.It Fl -memdomain Ar domain
+Constrain the object (tid, pid, command) to the given
+domain.
+This is only valid for fixed-domain and fixed-domain-rr.
+It must not be set for other policies.
+.It Fl -mempolicy Ar policy
+Set the given memory allocation policy.
+Valid policies are none, rr, fixed-domain, fixed-domain-rr,
+first-touch, and first-touch-rr.
+A memdomain argument is required for fixed-domain and
+fixed-domain-rr.
+.It Fl -pid Ar pid
+Operate on the given pid.
+.It Fl -tid Ar tid
+Operate on the given tid.
+.El
+.Sh EXIT STATUS
+.Ex -std
+.Sh EXAMPLES
+Create a
+.Pa /bin/sh
+process with memory coming from domain 0, but
+CPUs coming from domain 1:
+.Dl numactl --mempolicy=fixed-domain --memdomain=0 --cpudomain=1 /bin/sh
+.Pp
+Query the NUMA policy for the
+.Aq sh pid :
+.Dl numactl --get --pid=<pid>
+.Pp
+Set the NUMA policy for the given TID to round-robin:
+.Dl numactl --set --mempolicy=rr --tid=<tid>
+.Sh SEE ALSO
+.Xr cpuset 2 ,
+.Xr numa 4
+.Sh HISTORY
+The
+.Nm
+command first appeared in
+.Fx 11.0 .
+.Sh AUTHORS
+.An Adrian Chadd Aq Mt adrian@FreeBSD.org
Index: usr.bin/numactl/numactl.c
===================================================================
--- /dev/null
+++ usr.bin/numactl/numactl.c
@@ -0,0 +1,293 @@
+/*
+ * Copyright (c) 2015 Adrian Chadd
+ * All rights reserved.
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */
+
+#include
+__FBSDID("$FreeBSD$");
+
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+/* getopt_long() return values for the long-only options. */
+enum {
+	OPT_TID = 1001,
+	OPT_PID,
+	OPT_MEMDOMAIN,
+	OPT_CPUDOMAIN,
+	OPT_MEMPOLICY,
+	OPT_SET,
+	OPT_GET
+};
+
+static struct option longopts[] = {
+	{ "tid", required_argument, NULL, OPT_TID },
+	{ "pid", required_argument, NULL, OPT_PID },
+	{ "memdomain", required_argument, NULL, OPT_MEMDOMAIN },
+	{ "cpudomain", required_argument, NULL, OPT_CPUDOMAIN },
+	{ "mempolicy", required_argument, NULL, OPT_MEMPOLICY },
+	{ "set", no_argument, NULL, OPT_SET },
+	{ "get", no_argument, NULL, OPT_GET },
+	{ NULL, 0, NULL, 0 }
+};
+
+/*
+ * Map a policy type to the name used on the command line.
+ * Unrecognised values map to "unknown".
+ */
+static const char *
+policy_to_str(vm_domain_policy_type_t vt)
+{
+	switch (vt) {
+	case VM_POLICY_NONE:
+		return ("none");
+	case VM_POLICY_ROUND_ROBIN:
+		return ("rr");
+	case VM_POLICY_FIXED_DOMAIN:
+		return ("fixed-domain");
+	case VM_POLICY_FIXED_DOMAIN_ROUND_ROBIN:
+		return ("fixed-domain-rr");
+	case VM_POLICY_FIRST_TOUCH:
+		return ("first-touch");
+	case VM_POLICY_FIRST_TOUCH_ROUND_ROBIN:
+		return ("first-touch-rr");
+	default:
+		return ("unknown");
+	}
+}
+
+/*
+ * Fill in *vd from a policy name.
+ *
+ * Policies that take no domain get domain -1; the fixed-domain
+ * policies start at domain 0, which --memdomain may then override.
+ * "none" is accepted as well, as documented in numactl(1).
+ *
+ * Returns 0 on success, -1 if the name is not recognised (in which
+ * case *vd is left untouched).
+ */
+static int
+parse_policy(struct vm_domain_policy *vd, const char *str)
+{
+	static const struct {
+		const char		*name;
+		vm_domain_policy_type_t	policy;
+		int			domain;
+	} policies[] = {
+		{ "none",		VM_POLICY_NONE,				-1 },
+		{ "rr",			VM_POLICY_ROUND_ROBIN,			-1 },
+		{ "first-touch-rr",	VM_POLICY_FIRST_TOUCH_ROUND_ROBIN,	-1 },
+		{ "first-touch",	VM_POLICY_FIRST_TOUCH,			-1 },
+		{ "fixed-domain",	VM_POLICY_FIXED_DOMAIN,			0 },
+		{ "fixed-domain-rr",	VM_POLICY_FIXED_DOMAIN_ROUND_ROBIN,	0 },
+	};
+	size_t i;
+
+	for (i = 0; i < sizeof(policies) / sizeof(policies[0]); i++) {
+		if (strcmp(str, policies[i].name) == 0) {
+			vd->policy = policies[i].policy;
+			vd->domain = policies[i].domain;
+			vd->seq = 0;
+			return (0);
+		}
+	}
+
+	return (-1);
+}
+
+/*
+ * Print the usage message to stderr and exit with EX_USAGE.
+ */
+static void
+usage(void)
+{
+
+	fprintf(stderr,
+	    "usage: numactl --get [--tid=<tid>] [--pid=<pid>]\n"
+	    "       numactl --set [--tid=<tid>] [--pid=<pid>]\n"
+	    "               [--mempolicy=<policy>] [--memdomain=<domain>]\n"
+	    "               [--cpudomain=<domain>]\n"
+	    "       numactl [--mempolicy=<policy>] [--memdomain=<domain>]\n"
+	    "               [--cpudomain=<domain>] <cmd> ...\n");
+
+	exit(EX_USAGE);
+}
+
+/*
+ * Parse a decimal integer option argument.
+ *
+ * Unlike atoi(), empty strings, trailing garbage and values that do
+ * not fit in an int are rejected: the program exits with EX_USAGE and
+ * a message naming the offending option ('what').
+ */
+static int
+parse_int(const char *what, const char *str)
+{
+	char *end;
+	long val;
+
+	errno = 0;
+	val = strtol(str, &end, 10);
+	if (end == str || *end != '\0' || errno == ERANGE ||
+	    val != (int)val)
+		errx(EX_USAGE, "invalid %s: '%s'", what, str);
+	return ((int)val);
+}
+
+/*
+ * Restrict the object selected by (which, id) to the CPUs that belong
+ * to NUMA domain 'cpu_domain'.
+ *
+ * The CPU set of the domain is fetched with cpuset_getaffinity() and
+ * then applied to the target with cpuset_setaffinity().  Exits with
+ * EXIT_FAILURE if either call fails; otherwise returns 0.
+ */
+static int
+set_numa_domain_cpuaffinity(cpuwhich_t which, id_t id, int cpu_domain)
+{
+	cpuset_t set;
+	int error;
+
+	error = cpuset_getaffinity(CPU_LEVEL_WHICH, CPU_WHICH_DOMAIN,
+	    cpu_domain, sizeof(set), &set);
+	if (error != 0)
+		err(EXIT_FAILURE, "cpuset_getaffinity");
+	error = cpuset_setaffinity(CPU_LEVEL_WHICH, which, id,
+	    sizeof(set), &set);
+	if (error != 0)
+		err(EXIT_FAILURE, "cpuset_setaffinity");
+
+	return (0);
+}
+
+/*
+ * numactl: run a command under a NUMA policy, or get/set the NUMA
+ * policy of an existing process or thread.
+ */
+int
+main(int argc, char *argv[])
+{
+	id_t id;
+	cpuwhich_t which;
+	struct vm_domain_policy vd;
+	int error;
+	int is_set = 0, is_get = 0;
+	int mem_policy_set = 0;
+	lwpid_t tid = -1;
+	pid_t pid = -1;
+	int ch;
+	int cpu_domain = -1;
+
+	/* Never hand stack garbage to the kernel. */
+	memset(&vd, 0, sizeof(vd));
+
+	while ((ch = getopt_long(argc, argv, "", longopts, NULL)) != -1) {
+		switch (ch) {
+		case OPT_TID:
+			tid = parse_int("tid", optarg);
+			break;
+		case OPT_PID:
+			pid = parse_int("pid", optarg);
+			break;
+		case OPT_CPUDOMAIN:
+			cpu_domain = parse_int("cpudomain", optarg);
+			break;
+		case OPT_MEMDOMAIN:
+			/* parse_policy() resets the domain, so order matters. */
+			if (mem_policy_set == 0) {
+				fprintf(stderr,
+				    "Error: set policy first before domain\n");
+				exit(EXIT_FAILURE);
+			}
+			vd.domain = parse_int("memdomain", optarg);
+			break;
+		case OPT_MEMPOLICY:
+			if (parse_policy(&vd, optarg) != 0) {
+				fprintf(stderr,
+				    "Could not parse policy: '%s'\n", optarg);
+				exit(EXIT_FAILURE);
+			}
+			mem_policy_set = 1;
+			break;
+		case OPT_SET:
+			is_set = 1;
+			break;
+		case OPT_GET:
+			is_get = 1;
+			break;
+		default:
+			usage();
+		}
+	}
+	argc -= optind;
+	argv += optind;
+
+	/* Handle the user wishing to run a command */
+	if (argc) {
+		/* Ensure that a policy was set */
+		if (mem_policy_set == 0) {
+			fprintf(stderr, "Error: no policy given\n");
+			usage();
+		}
+
+		/* Set current memory process policy, will be inherited */
+		if (numa_setaffinity(CPU_WHICH_PID, -1, &vd) != 0)
+			err(EXIT_FAILURE, "numa_setaffinity");
+
+		/* If a CPU domain policy was given, include that too */
+		if (cpu_domain != -1)
+			(void) set_numa_domain_cpuaffinity(CPU_WHICH_PID, -1,
+			    cpu_domain);
+
+		errno = 0;
+		execvp(*argv, argv);
+		err(errno == ENOENT ? 127 : 126, "%s", *argv);
+	}
+
+	/* Sanity checks */
+	if (is_set && is_get) {
+		fprintf(stderr, "Error: can't set both 'set' and 'get'\n");
+		usage();
+	}
+
+	if (is_set && ! mem_policy_set) {
+		fprintf(stderr, "Error: --set given, but no policy\n");
+		usage();
+	}
+
+	if (tid == -1 && pid == -1) {
+		fprintf(stderr, "Error: no TID or PID given\n");
+		usage();
+	}
+
+	/*
+	 * Anything that is not a get is treated as a set, so a policy
+	 * must have been given even when --set itself was omitted.
+	 */
+	if (! is_get && ! mem_policy_set) {
+		fprintf(stderr, "Error: no policy given\n");
+		usage();
+	}
+
+	/* Figure out which; a TID takes precedence over a PID */
+	if (tid != -1) {
+		which = CPU_WHICH_TID;
+		id = tid;
+	} else {
+		which = CPU_WHICH_PID;
+		id = pid;
+	}
+
+	/* If it's get, then get the policy and return */
+	if (is_get) {
+		error = numa_getaffinity(which, id, &vd);
+		if (error != 0)
+			err(EXIT_FAILURE, "numa_getaffinity");
+		printf(" Policy: %s; domain: %d\n",
+		    policy_to_str(vd.policy),
+		    vd.domain);
+		exit(EXIT_SUCCESS);
+	}
+
+	/* Assume it's set */
+
+	/* Syscall */
+	error = numa_setaffinity(which, id, &vd);
+	if (error != 0)
+		err(EXIT_FAILURE, "numa_setaffinity");
+
+	/* If a CPU domain policy was given, apply it to the same target */
+	if (cpu_domain != -1)
+		(void) set_numa_domain_cpuaffinity(which, id, cpu_domain);
+
+	exit(EXIT_SUCCESS);
+}