Index: kern/kern_cpuset.c
===================================================================
--- kern/kern_cpuset.c
+++ kern/kern_cpuset.c
@@ -64,6 +64,9 @@
 #include
 #include
+#include
+#include
+#include
 #include
 #include
@@ -2005,6 +2008,59 @@
 	return (error);
 }
 
+static int
+domainset_copyin(struct domainset *domain, size_t domainsetsize,
+    const domainset_t *maskp, int policy)
+{
+	domainset_t *mask;
+	int error;
+
+	if (domainsetsize < sizeof(domainset_t) ||
+	    domainsetsize > DOMAINSET_MAXSIZE / NBBY)
+		return (ERANGE);
+
+	if (policy <= DOMAINSET_POLICY_INVALID ||
+	    policy > DOMAINSET_POLICY_MAX)
+		return (EINVAL);
+
+	memset(domain, 0, sizeof(*domain));
+	mask = malloc(domainsetsize, M_TEMP, M_WAITOK | M_ZERO);
+	error = copyin(maskp, mask, domainsetsize);
+	if (error)
+		goto out;
+	/*
+	 * Verify that no high bits are set.
+	 */
+	if (domainsetsize > sizeof(domainset_t)) {
+		char *end;
+		char *cp;
+
+		end = cp = (char *)&mask->__bits;
+		end += domainsetsize;
+		cp += sizeof(domainset_t);
+		while (cp != end)
+			if (*cp++ != 0) {
+				error = EINVAL;
+				goto out;
+			}
+
+	}
+	DOMAINSET_COPY(mask, &domain->ds_mask);
+	domain->ds_policy = policy;
+	/* Translate preferred policy into a mask and fallback. */
+	if (policy == DOMAINSET_POLICY_PREFER) {
+		/* Only support a single preferred domain. */
+		if (DOMAINSET_COUNT(&domain->ds_mask) != 1) {
+			error = EINVAL;
+			goto out;
+		}
+		domain->ds_prefer = DOMAINSET_FFS(&domain->ds_mask) - 1;
+	}
+out:
+	free(mask, M_TEMP);
+	return (error);
+}
+
 #ifndef _SYS_SYSPROTO_H_
 struct cpuset_setdomain_args {
 	cpulevel_t	level;
@@ -2015,6 +2071,7 @@
 	int		policy;
 };
 #endif
+
 int
 sys_cpuset_setdomain(struct thread *td, struct cpuset_setdomain_args *uap)
 {
@@ -2032,12 +2089,8 @@
 	struct thread *ttd;
 	struct proc *p;
 	struct domainset domain;
-	domainset_t *mask;
 	int error;
 
-	if (domainsetsize < sizeof(domainset_t) ||
-	    domainsetsize > DOMAINSET_MAXSIZE / NBBY)
-		return (ERANGE);
 	/* In Capability mode, you can only set your own CPU set. */
 	if (IN_CAPABILITY_MODE(td)) {
 		if (level != CPU_LEVEL_WHICH)
@@ -2047,43 +2100,13 @@
 		if (id != -1)
 			return (ECAPMODE);
 	}
-	memset(&domain, 0, sizeof(domain));
-	mask = malloc(domainsetsize, M_TEMP, M_WAITOK | M_ZERO);
-	error = copyin(maskp, mask, domainsetsize);
-	if (error)
-		goto out;
-	/*
-	 * Verify that no high bits are set.
-	 */
-	if (domainsetsize > sizeof(domainset_t)) {
-		char *end;
-		char *cp;
-
-		end = cp = (char *)&mask->__bits;
-		end += domainsetsize;
-		cp += sizeof(domainset_t);
-		while (cp != end)
-			if (*cp++ != 0) {
-				error = EINVAL;
-				goto out;
-			}
-
-	}
-	DOMAINSET_COPY(mask, &domain.ds_mask);
-	domain.ds_policy = policy;
-	if (policy <= DOMAINSET_POLICY_INVALID ||
-	    policy > DOMAINSET_POLICY_MAX)
-		return (EINVAL);
-
-	/* Translate preferred policy into a mask and fallback. */
-	if (policy == DOMAINSET_POLICY_PREFER) {
-		/* Only support a single preferred domain. */
-		if (DOMAINSET_COUNT(&domain.ds_mask) != 1)
-			return (EINVAL);
-		domain.ds_prefer = DOMAINSET_FFS(&domain.ds_mask) - 1;
-		/* This will be constrained by domainset_shadow(). */
+	error = domainset_copyin(&domain, domainsetsize, maskp, policy);
+	if (error)
+		return (error);
+	/* This will be constrained by cpuset_shadow(). */
+	if (policy == DOMAINSET_POLICY_PREFER)
 		DOMAINSET_FILL(&domain.ds_mask);
-	}
 
 	switch (level) {
 	case CPU_LEVEL_ROOT:
@@ -2146,12 +2169,106 @@
 		break;
 	}
 out:
-	free(mask, M_TEMP);
 	return (error);
 }
 
-#ifdef DDB
+#ifndef _SYS_SYSPROTO_H_
+struct msetdomain_args {
+	void		*addr;
+	size_t		size;
+	size_t		domainsetsize;
+	domainset_t	*mask;
+	int		policy;
+	int		flags;
+};
+#endif
+int
+sys_msetdomain(struct thread *td, struct msetdomain_args *uap)
+{
+	return (kern_msetdomain(td, (uintptr_t)uap->addr, uap->size,
+	    uap->domainsetsize, uap->mask, uap->policy, uap->flags));
+}
+
+int
+kern_msetdomain(struct thread *td, uintptr_t addr0, size_t size,
+    size_t domainsetsize, const domainset_t *mask, int policy, int flags)
+{
+	struct domainset domain, *set, *nset;
+	struct cpuset *cset;
+	struct thread *ttd;
+	struct proc *p;
+	vm_offset_t addr;
+	vm_size_t pageoff;
+	int error;
+
+	/* Normalize the addresses. */
+	addr = addr0;
+	pageoff = (addr & PAGE_MASK);
+	addr -= pageoff;
+	size += pageoff;
+	size = (vm_size_t)round_page(size);
+	if (addr + size < addr)
+		return (EINVAL);
+
+	/* Short-circuit for POLICY_INVALID, which means reset to default. */
+	if (policy == DOMAINSET_POLICY_INVALID) {
+		nset = NULL;
+		goto apply;
+	}
+
+	/*
+	 * Copy in and initialize the domainset from the user arguments.
+	 */
+	error = domainset_copyin(&domain, domainsetsize, mask, policy);
+	if (error)
+		return (error);
+
+	/*
+	 * Grab the list of allowed domains from the numbered cpuset this
+	 * process is a member of.
+	 */
+	error = cpuset_which(CPU_WHICH_PID, -1, &p, &ttd, &cset);
+	if (error)
+		return (error);
+	thread_lock(ttd);
+	set = cpuset_getbase(ttd->td_cpuset)->cs_domain;
+	thread_unlock(ttd);
+	PROC_UNLOCK(p);
+
+	/*
+	 * Validate the new policy against the allowed set.
+	 */
+	if (policy == DOMAINSET_POLICY_PREFER)
+		DOMAINSET_COPY(&set->ds_mask, &domain.ds_mask);
+	if (!domainset_valid(set, &domain))
+		return (EINVAL);
+
+	/*
+	 * Attempt to create a new set based on this key.
+	 */
+	nset = domainset_create(&domain);
+	if (nset == NULL)
+		return (EINVAL);
+
+	/*
+	 * Attempt to apply the new set to the memory range.
+	 */
apply:
+	switch (vm_map_setdomain(&td->td_proc->p_vmspace->vm_map, addr,
+	    addr + size, nset, flags)) {
+	case KERN_SUCCESS:
+		break;
+	case KERN_INVALID_ADDRESS:
+		return (EFAULT);
+	default:
+		return (EINVAL);
	}
+
+	return (0);
+}
+
+#ifdef DDB
 static void
 ddb_display_bitset(const struct bitset *set, int size)
 {
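
Usage sketch (illustrative, not part of the diff): a minimal userspace
program exercising the msetdomain(2) syscall proposed above. The
msetdomain() prototype is declared by hand because this patch adds no
libc wrapper; the preferred domain number assumes domain 0 exists.

	#include <sys/param.h>
	#include <sys/domainset.h>
	#include <sys/mman.h>
	#include <err.h>

	/* Assumed prototype; no libc wrapper exists yet for this syscall. */
	int msetdomain(void *, size_t, size_t, const domainset_t *, int, int);

	int
	main(void)
	{
		domainset_t mask;
		size_t len = 16 * 1024 * 1024;
		void *p;

		p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		    MAP_ANON | MAP_PRIVATE, -1, 0);
		if (p == MAP_FAILED)
			err(1, "mmap");

		/*
		 * Prefer domain 0 for this range.  domainset_copyin()
		 * requires exactly one bit set for POLICY_PREFER.
		 */
		DOMAINSET_ZERO(&mask);
		DOMAINSET_SET(0, &mask);
		if (msetdomain(p, len, sizeof(mask), &mask,
		    DOMAINSET_POLICY_PREFER, 0) != 0)
			err(1, "msetdomain");

		/*
		 * DOMAINSET_POLICY_INVALID short-circuits in
		 * kern_msetdomain() and resets the range to the default
		 * policy; the mask arguments are ignored on this path.
		 */
		if (msetdomain(p, len, 0, NULL, DOMAINSET_POLICY_INVALID,
		    0) != 0)
			err(1, "msetdomain reset");
		return (0);
	}
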
Index: vm/vm_fault.c
===================================================================
--- vm/vm_fault.c
+++ vm/vm_fault.c
@@ -1609,7 +1609,6 @@
 	KASSERT(upgrade || dst_entry->object.vm_object == NULL,
 	    ("vm_fault_copy_entry: vm_object not NULL"));
 	if (src_object != dst_object) {
-		dst_object->domain = src_object->domain;
 		dst_entry->object.vm_object = dst_object;
 		dst_entry->offset = 0;
 		dst_object->charge = dst_entry->end - dst_entry->start;
Index: vm/vm_map.h
===================================================================
--- vm/vm_map.h
+++ vm/vm_map.h
@@ -403,5 +403,8 @@
 int vm_map_wire(vm_map_t map, vm_offset_t start, vm_offset_t end,
     int flags);
 long vmspace_swap_count(struct vmspace *vmspace);
+struct domainset;
+int vm_map_setdomain(vm_map_t, vm_offset_t, vm_offset_t,
+    struct domainset *, int);
 #endif				/* _KERNEL */
 #endif				/* _VM_MAP_ */
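
Usage sketch (illustrative, not part of the diff): the new
vm_map_setdomain() follows the usual KERN_* return convention, and a
NULL domainset resets the range to the default policy (the
DOMAINSET_POLICY_INVALID path in kern_msetdomain() above). A minimal
kernel-side caller modeled on kern_msetdomain(); the helper name is
hypothetical.

	#include <sys/param.h>
	#include <sys/systm.h>
	#include <sys/proc.h>
	#include <vm/vm.h>
	#include <vm/vm_param.h>
	#include <vm/vm_map.h>

	/* Hypothetical helper: reset a range of 'p' to the default policy. */
	static int
	reset_range_domain(struct proc *p, vm_offset_t start, vm_offset_t end)
	{
		vm_map_t map;

		map = &p->p_vmspace->vm_map;
		switch (vm_map_setdomain(map, start, end, NULL, 0)) {
		case KERN_SUCCESS:
			return (0);
		case KERN_INVALID_ADDRESS:
			return (EFAULT);
		default:
			return (EINVAL);
		}
	}
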
Index: vm/vm_map.c
===================================================================
--- vm/vm_map.c
+++ vm/vm_map.c
@@ -69,6 +69,7 @@
 #include
 #include
+#include
 #include
 #include
 #include
@@ -848,6 +849,34 @@
 }
 
 /*
+ * vm_map_entry_object_allocate:	[ internal use only ]
+ *
+ *	Returns the object associated with a map entry, allocating
+ *	a default object if none presently exists.
+ */
+static vm_object_t
+vm_map_entry_object_allocate(vm_map_t map, vm_map_entry_t entry)
+{
+	vm_object_t object;
+
+	VM_MAP_ASSERT_LOCKED(map);
+	if (entry->object.vm_object != NULL)
+		return (entry->object.vm_object);
+
+	object = vm_object_allocate(OBJT_DEFAULT,
+	    atop(entry->end - entry->start));
+	entry->object.vm_object = object;
+	entry->offset = 0;
+	if (entry->cred != NULL) {
+		object->cred = entry->cred;
+		object->charge = entry->end - entry->start;
+		entry->cred = NULL;
+	}
+
+	return (object);
+}
+
+/*
  * vm_map_entry_set_behavior:
  *
  *	Set the expected access behavior, either normal, random, or
@@ -1773,16 +1802,7 @@
 	 */
 	if (entry->object.vm_object == NULL && !map->system_map &&
 	    (entry->eflags & MAP_ENTRY_GUARD) == 0) {
-		vm_object_t object;
-
-		object = vm_object_allocate(OBJT_DEFAULT,
-		    atop(entry->end - entry->start));
-		entry->object.vm_object = object;
-		entry->offset = 0;
-		if (entry->cred != NULL) {
-			object->cred = entry->cred;
-			object->charge = entry->end - entry->start;
-			entry->cred = NULL;
-		}
+		vm_map_entry_object_allocate(map, entry);
 	} else if (entry->object.vm_object != NULL &&
 	    ((entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0) &&
 	    entry->cred != NULL) {
@@ -1853,16 +1873,7 @@
 	 */
 	if (entry->object.vm_object == NULL && !map->system_map &&
 	    (entry->eflags & MAP_ENTRY_GUARD) == 0) {
-		vm_object_t object;
-
-		object = vm_object_allocate(OBJT_DEFAULT,
-		    atop(entry->end - entry->start));
-		entry->object.vm_object = object;
-		entry->offset = 0;
-		if (entry->cred != NULL) {
-			object->cred = entry->cred;
-			object->charge = entry->end - entry->start;
-			entry->cred = NULL;
-		}
+		vm_map_entry_object_allocate(map, entry);
 	} else if (entry->object.vm_object != NULL &&
 	    ((entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0) &&
 	    entry->cred != NULL) {
@@ -3449,21 +3460,11 @@
 
 		case VM_INHERIT_SHARE:
 			/*
-			 * Clone the entry, creating the shared object if necessary.
+			 * Clone the entry, creating the shared object if
+			 * necessary.
 			 */
-			object = old_entry->object.vm_object;
-			if (object == NULL) {
-				object = vm_object_allocate(OBJT_DEFAULT,
-				    atop(old_entry->end - old_entry->start));
-				old_entry->object.vm_object = object;
-				old_entry->offset = 0;
-				if (old_entry->cred != NULL) {
-					object->cred = old_entry->cred;
-					object->charge = old_entry->end -
-					    old_entry->start;
-					old_entry->cred = NULL;
-				}
-			}
+			object = vm_map_entry_object_allocate(old_map,
+			    old_entry);
 
 			/*
 			 * Add the reference before calling vm_object_shadow
@@ -4195,16 +4196,7 @@
 	    !map->system_map) {
 		if (vm_map_lock_upgrade(map))
 			goto RetryLookup;
-		entry->object.vm_object = vm_object_allocate(OBJT_DEFAULT,
-		    atop(size));
-		entry->offset = 0;
-		if (entry->cred != NULL) {
-			VM_OBJECT_WLOCK(entry->object.vm_object);
-			entry->object.vm_object->cred = entry->cred;
-			entry->object.vm_object->charge = size;
-			VM_OBJECT_WUNLOCK(entry->object.vm_object);
-			entry->cred = NULL;
-		}
+		vm_map_entry_object_allocate(map, entry);
 		vm_map_lock_downgrade(map);
 	}
 
@@ -4313,6 +4305,117 @@
 	 * Unlock the main-level map
 	 */
 	vm_map_unlock_read(map);
+}
+
+/*
+ * vm_map_setdomain:
+ *
+ *	Assigns the NUMA policy contained in 'domain' to all objects
+ *	overlapping the requested address range.
+ */
+int
+vm_map_setdomain(vm_map_t map, vm_offset_t start, vm_offset_t end,
+    struct domainset *domain, int flags)
+{
+	vm_map_entry_t current, entry;
+	vm_object_t object;
+	int error;
+
+	error = KERN_SUCCESS;
+	vm_map_lock(map);
+	if (start < vm_map_min(map) || end > vm_map_max(map) ||
+	    start >= end || map->system_map) {
+		error = KERN_INVALID_ADDRESS;
+		goto out;
+	}
+
+	/*
+	 * Locate the starting entry and clip if necessary.
+	 */
+	if (!vm_map_lookup_entry(map, start, &entry)) {
+		error = KERN_INVALID_ADDRESS;
+		goto out;
+	}
+	if (entry->start > start) {
+		error = KERN_INVALID_ADDRESS;
+		goto out;
+	}
+	vm_map_clip_start(map, entry, start);
+
+	/*
+	 * Walk the range looking for holes before we apply policy.
+	 */
+	for (current = entry;
+	    (current != &map->header) && (current->start < end);
+	    current = current->next
+	) {
+		if (current->end >= end)
+			break;
+		/* We don't support gaps. */
+		if (current->end != current->next->start) {
+			error = KERN_INVALID_ADDRESS;
+			goto out;
+		}
+	}
+
+	/*
+	 * Walk each overlapping map entry and update the backing
+	 * object's memory policy.
+	 */
+	for (current = entry;
+	    (current != &map->header) && (current->start < end);
+	    current = current->next
+	) {
+		/* Skip incompatible entries. */
+		if ((current->eflags &
+		    (MAP_ENTRY_GUARD | MAP_ENTRY_IS_SUB_MAP)) != 0)
+			continue;
+
+		/*
+		 * Clip the end and allocate the object so that we are
+		 * only modifying the requested range.
+		 */
+		vm_map_clip_end(map, current, end);
+		object = vm_map_entry_object_allocate(map, current);
+		if ((current->eflags & MAP_ENTRY_NEEDS_COPY) != 0) {
+			vm_object_shadow(&current->object.vm_object,
+			    &current->offset, current->end - current->start);
+			current->eflags &= ~MAP_ENTRY_NEEDS_COPY;
+			object = current->object.vm_object;
+		}
+
+		/*
+		 * If the object is anonymous memory we need to split it
+		 * so that we can apply the unique allocation property to
+		 * this range.
+		 */
+		VM_OBJECT_WLOCK(object);
+		if (object->type == OBJT_DEFAULT ||
+		    object->type == OBJT_SWAP) {
+			vm_object_collapse(object);
+			if ((object->flags & OBJ_NOSPLIT) == 0) {
+				vm_object_split(current);
+				object = current->object.vm_object;
+			}
+		}
+		/*
+		 * Linux does not allow this to be applied to anything but
+		 * private mappings and other anonymous memory.  Should we?
+		 * XXX
+		 */
+		object->domain.dr_policy = domain;
+		VM_OBJECT_WUNLOCK(object);
+
+		/*
+		 * XXX This simplify is probably not fruitful; should I
+		 * elide it?
+		 */
+		vm_map_simplify_entry(map, current);
+	}
out:
+	vm_map_unlock(map);
+
+	return (error);
 }
 
 #include "opt_ddb.h"
Index: vm/vm_object.c
===================================================================
--- vm/vm_object.c
+++ vm/vm_object.c
@@ -1328,7 +1328,6 @@
 	result->backing_object_offset = *offset;
 	if (source != NULL) {
 		VM_OBJECT_WLOCK(source);
-		result->domain = source->domain;
 		LIST_INSERT_HEAD(&source->shadow_head, result, shadow_list);
 		source->shadow_count++;
 #if VM_NRESERVLEVEL > 0
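
Usage sketch (illustrative, not part of the diff): the same
domainset_copyin() validation now also backs the existing
cpuset_setdomain(2) path, so its rules apply there too. For example,
DOMAINSET_POLICY_PREFER requires exactly one domain in the mask or the
copyin fails with EINVAL. The preferred domain number below assumes a
machine with at least two NUMA domains.

	#include <sys/param.h>
	#include <sys/cpuset.h>
	#include <sys/domainset.h>
	#include <err.h>

	int
	main(void)
	{
		domainset_t mask;

		/*
		 * Prefer domain 1 for the current process.  Exactly one
		 * bit may be set; domainset_copyin() rejects anything
		 * else for POLICY_PREFER with EINVAL.
		 */
		DOMAINSET_ZERO(&mask);
		DOMAINSET_SET(1, &mask);
		if (cpuset_setdomain(CPU_LEVEL_WHICH, CPU_WHICH_PID, -1,
		    sizeof(mask), &mask, DOMAINSET_POLICY_PREFER) != 0)
			err(1, "cpuset_setdomain");
		return (0);
	}
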