Index: share/man/man9/malloc.9 =================================================================== --- share/man/man9/malloc.9 +++ share/man/man9/malloc.9 @@ -29,7 +29,7 @@ .\" $NetBSD: malloc.9,v 1.3 1996/11/11 00:05:11 lukem Exp $ .\" $FreeBSD$ .\" -.Dd January 24, 2018 +.Dd June 8, 2018 .Dt MALLOC 9 .Os .Sh NAME @@ -189,6 +189,11 @@ .Dv M_NOWAIT when an allocation failure cannot be tolerated by the caller without catastrophic effects on the system. +.It Dv M_EXEC +Indicates that the system should allocate executable memory. +If this flag is not set, the system will not allocate executable memory. +Not all platforms enforce a distinction between executable and +non-executable memory. .El .Pp Exactly one of either Index: share/man/man9/zone.9 =================================================================== --- share/man/man9/zone.9 +++ share/man/man9/zone.9 @@ -25,7 +25,7 @@ .\" .\" $FreeBSD$ .\" -.Dd April 26, 2017 +.Dd June 8, 2018 .Dt ZONE 9 .Os .Sh NAME @@ -375,6 +375,13 @@ and .Dv M_NOWAIT was specified. +.Sh IMPLEMENTATION NOTES +The memory that these allocation calls return is not executable. +The +.Fn uma_zalloc +function does not support the +.Dv M_EXEC +flag to allocate executable memory. .Sh SEE ALSO .Xr malloc 9 .Sh HISTORY Index: sys/amd64/amd64/bpf_jit_machdep.c =================================================================== --- sys/amd64/amd64/bpf_jit_machdep.c +++ sys/amd64/amd64/bpf_jit_machdep.c @@ -44,9 +44,6 @@ #include #include -#include -#include -#include #else #include #include @@ -605,11 +602,7 @@ *size = stream.cur_ip; #ifdef _KERNEL - /* - * We cannot use malloc(9) because DMAP is mapped as NX. 
- */ - stream.ibuf = (void *)kmem_malloc(kernel_arena, *size, - M_NOWAIT); + stream.ibuf = malloc(*size, M_BPFJIT, M_EXEC | M_NOWAIT); if (stream.ibuf == NULL) break; #else @@ -657,15 +650,4 @@ #endif return ((bpf_filter_func)(void *)stream.ibuf); -} - -void -bpf_jit_free(void *func, size_t size) -{ - -#ifdef _KERNEL - kmem_free(kernel_arena, (vm_offset_t)func, size); -#else - munmap(func, size); -#endif } Index: sys/i386/i386/bpf_jit_machdep.c =================================================================== --- sys/i386/i386/bpf_jit_machdep.c +++ sys/i386/i386/bpf_jit_machdep.c @@ -632,7 +632,7 @@ *size = stream.cur_ip; #ifdef _KERNEL - stream.ibuf = malloc(*size, M_BPFJIT, M_NOWAIT); + stream.ibuf = malloc(*size, M_BPFJIT, M_EXEC | M_NOWAIT); if (stream.ibuf == NULL) break; #else @@ -680,15 +680,4 @@ #endif return ((bpf_filter_func)(void *)stream.ibuf); -} - -void -bpf_jit_free(void *func, size_t size) -{ - -#ifdef _KERNEL - free(func, M_BPFJIT); -#else - munmap(func, size); -#endif } Index: sys/kern/kern_malloc.c =================================================================== --- sys/kern/kern_malloc.c +++ sys/kern/kern_malloc.c @@ -564,7 +564,7 @@ return (va); #endif - if (size <= kmem_zmax) { + if (size <= kmem_zmax && (flags & M_EXEC) == 0) { if (size & KMEM_ZMASK) size = (size & ~KMEM_ZMASK) + KMEM_ZBASE; indx = kmemsize[size >> KMEM_ZSHIFT]; @@ -609,7 +609,7 @@ if (malloc_dbg(&va, &size, mtp, flags) != 0) return (va); #endif - if (size <= kmem_zmax) { + if (size <= kmem_zmax && (flags & M_EXEC) == 0) { if (size & KMEM_ZMASK) size = (size & ~KMEM_ZMASK) + KMEM_ZBASE; indx = kmemsize[size >> KMEM_ZSHIFT]; Index: sys/kern/subr_vmem.c =================================================================== --- sys/kern/subr_vmem.c +++ sys/kern/subr_vmem.c @@ -241,6 +241,9 @@ static struct vmem transient_arena_storage; /* kernel and kmem arenas are aliased for backwards KPI compat. 
*/ vmem_t *kernel_arena = &kernel_arena_storage; +#if VM_NRESERVLEVEL > 0 +vmem_t *kernel_rwx_arena = NULL; +#endif vmem_t *kmem_arena = &kernel_arena_storage; vmem_t *buffer_arena = &buffer_arena_storage; vmem_t *transient_arena = &transient_arena_storage; Index: sys/net/bpf_jitter.h =================================================================== --- sys/net/bpf_jitter.h +++ sys/net/bpf_jitter.h @@ -88,6 +88,5 @@ struct bpf_insn; bpf_filter_func bpf_jit_compile(struct bpf_insn *, u_int, size_t *); -void bpf_jit_free(void *, size_t); #endif /* _NET_BPF_JITTER_H_ */ Index: sys/net/bpf_jitter.c =================================================================== --- sys/net/bpf_jitter.c +++ sys/net/bpf_jitter.c @@ -101,11 +101,13 @@ bpf_destroy_jit_filter(bpf_jit_filter *filter) { - if (filter->func != bpf_jit_accept_all) - bpf_jit_free(filter->func, filter->size); #ifdef _KERNEL + if (filter->func != bpf_jit_accept_all) + free(filter->func, M_BPFJIT); free(filter, M_BPFJIT); #else + if (filter->func != bpf_jit_accept_all) + munmap(filter->func, filter->size); free(filter); #endif } Index: sys/sys/malloc.h =================================================================== --- sys/sys/malloc.h +++ sys/sys/malloc.h @@ -49,7 +49,7 @@ #define MINALLOCSIZE UMA_SMALLEST_UNIT /* - * flags to malloc. + * Flags to memory allocation functions. */ #define M_NOWAIT 0x0001 /* do not block */ #define M_WAITOK 0x0002 /* ok to block */ @@ -59,6 +59,7 @@ #define M_NODUMP 0x0800 /* don't dump pages in this allocation */ #define M_FIRSTFIT 0x1000 /* Only for vmem, fast fit. */ #define M_BESTFIT 0x2000 /* Only for vmem, low fragmentation. */ +#define M_EXEC 0x4000 /* allocate executable space. */ #define M_MAGIC 877983977 /* time when first defined :-) */ Index: sys/vm/uma.h =================================================================== --- sys/vm/uma.h +++ sys/vm/uma.h @@ -617,11 +617,12 @@ * These flags are setable in the allocf and visible in the freef. 
*/ #define UMA_SLAB_BOOT 0x01 /* Slab alloced from boot pages */ +#define UMA_SLAB_KRWX 0x02 /* Slab alloced from kernel_rwx_arena */ #define UMA_SLAB_KERNEL 0x04 /* Slab alloced from kernel_map */ #define UMA_SLAB_PRIV 0x08 /* Slab alloced from priv allocator */ #define UMA_SLAB_OFFP 0x10 /* Slab is managed separately */ #define UMA_SLAB_MALLOC 0x20 /* Slab is a large malloc slab */ -/* 0x02, 0x40 and 0x80 are available */ +/* 0x40 and 0x80 are available */ /* * Used to pre-fill a zone with some number of items Index: sys/vm/uma_core.c =================================================================== --- sys/vm/uma_core.c +++ sys/vm/uma_core.c @@ -1167,7 +1167,7 @@ void *p; /* Returned page */ *pflag = UMA_SLAB_KERNEL; - p = (void *) kmem_malloc_domain(domain, bytes, wait); + p = (void *) kmem_malloc_domain(kernel_arena, domain, bytes, wait); return (p); } @@ -2280,6 +2280,7 @@ WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "uma_zalloc_arg: zone \"%s\"", zone->uz_name); } + KASSERT((flags & M_EXEC) == 0, ("uma_zalloc_arg: called with M_EXEC")); KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(), ("uma_zalloc_arg: called with spinlock or critical section held")); if (zone->uz_flags & UMA_ZONE_PCPU) @@ -3587,20 +3588,34 @@ void * uma_large_malloc_domain(vm_size_t size, int domain, int wait) { + struct vmem *arena; vm_offset_t addr; uma_slab_t slab; +#if VM_NRESERVLEVEL > 0 + if (__predict_true((wait & M_EXEC) == 0)) + arena = kernel_arena; + else + arena = kernel_rwx_arena; +#else + arena = kernel_arena; +#endif + slab = zone_alloc_item(slabzone, NULL, domain, wait); if (slab == NULL) return (NULL); if (domain == UMA_ANYDOMAIN) - addr = kmem_malloc(kernel_arena, size, wait); + addr = kmem_malloc(arena, size, wait); else - addr = kmem_malloc_domain(domain, size, wait); + addr = kmem_malloc_domain(arena, domain, size, wait); if (addr != 0) { vsetslab(addr, slab); slab->us_data = (void *)addr; slab->us_flags = UMA_SLAB_KERNEL | UMA_SLAB_MALLOC; +#if 
VM_NRESERVLEVEL > 0 + if (__predict_false(arena == kernel_rwx_arena)) + slab->us_flags |= UMA_SLAB_KRWX; +#endif slab->us_size = size; slab->us_domain = vm_phys_domain(PHYS_TO_VM_PAGE( pmap_kextract(addr))); @@ -3622,10 +3637,19 @@ void uma_large_free(uma_slab_t slab) { + struct vmem *arena; KASSERT((slab->us_flags & UMA_SLAB_KERNEL) != 0, ("uma_large_free: Memory not allocated with uma_large_malloc.")); - kmem_free(kernel_arena, (vm_offset_t)slab->us_data, slab->us_size); +#if VM_NRESERVLEVEL > 0 + if (__predict_true((slab->us_flags & UMA_SLAB_KRWX) == 0)) + arena = kernel_arena; + else + arena = kernel_rwx_arena; +#else + arena = kernel_arena; +#endif + kmem_free(arena, (vm_offset_t)slab->us_data, slab->us_size); uma_total_dec(slab->us_size); zone_free_item(slabzone, slab, NULL, SKIP_NONE); } Index: sys/vm/vm_extern.h =================================================================== --- sys/vm/vm_extern.h +++ sys/vm/vm_extern.h @@ -65,7 +65,8 @@ vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary, vm_memattr_t memattr); vm_offset_t kmem_malloc(struct vmem *, vm_size_t size, int flags); -vm_offset_t kmem_malloc_domain(int domain, vm_size_t size, int flags); +vm_offset_t kmem_malloc_domain(struct vmem *, int domain, vm_size_t size, + int flags); void kmem_free(struct vmem *, vm_offset_t, vm_size_t); /* This provides memory for previously allocated address space. */ Index: sys/vm/vm_init.c =================================================================== --- sys/vm/vm_init.c +++ sys/vm/vm_init.c @@ -135,7 +135,24 @@ return (0); } +#if VM_NRESERVLEVEL > 0 /* + * Import a superpage from the normal kernel arena into the special + * arena for allocations with different permissions. 
+ */ +static int +kernel_rwx_alloc(void *arena, vmem_size_t size, int flags, vmem_addr_t *addrp) +{ + + KASSERT((size % KVA_QUANTUM) == 0, + ("kernel_rwx_alloc: Size %jd is not a multiple of %d", + (intmax_t)size, (int)KVA_QUANTUM)); + return (vmem_xalloc(arena, size, KVA_QUANTUM, 0, 0, VMEM_ADDR_MIN, + VMEM_ADDR_MAX, flags, addrp)); +} +#endif + +/* * vm_init initializes the virtual memory system. * This is done only by the first cpu up. * @@ -173,12 +190,31 @@ vmem_init(kernel_arena, "kernel arena", 0, 0, PAGE_SIZE, 0, 0); vmem_set_import(kernel_arena, kva_import, NULL, NULL, KVA_QUANTUM); +#if VM_NRESERVLEVEL > 0 + /* + * In an architecture with superpages, maintain a separate arena + * for allocations with permissions that differ from the "standard" + * read/write permissions used for memory in the kernel_arena. + */ + kernel_rwx_arena = vmem_create("kernel rwx arena", 0, 0, PAGE_SIZE, + 0, M_WAITOK); + vmem_set_import(kernel_rwx_arena, kernel_rwx_alloc, + (vmem_release_t *)vmem_xfree, kernel_arena, KVA_QUANTUM); +#endif + for (domain = 0; domain < vm_ndomains; domain++) { vm_dom[domain].vmd_kernel_arena = vmem_create( "kernel arena domain", 0, 0, PAGE_SIZE, 0, M_WAITOK); vmem_set_import(vm_dom[domain].vmd_kernel_arena, (vmem_import_t *)vmem_alloc, NULL, kernel_arena, KVA_QUANTUM); +#if VM_NRESERVLEVEL > 0 + vm_dom[domain].vmd_kernel_rwx_arena = vmem_create( + "kernel rwx arena domain", 0, 0, PAGE_SIZE, 0, M_WAITOK); + vmem_set_import(vm_dom[domain].vmd_kernel_rwx_arena, + kernel_rwx_alloc, (vmem_release_t *)vmem_xfree, + vm_dom[domain].vmd_kernel_arena, KVA_QUANTUM); +#endif } #ifndef UMA_MD_SMALL_ALLOC Index: sys/vm/vm_kern.h =================================================================== --- sys/vm/vm_kern.h +++ sys/vm/vm_kern.h @@ -70,6 +70,7 @@ extern vm_map_t exec_map; extern vm_map_t pipe_map; extern struct vmem *kernel_arena; +extern struct vmem *kernel_rwx_arena; extern struct vmem *kmem_arena; extern struct vmem *buffer_arena; extern struct vmem 
*transient_arena; Index: sys/vm/vm_kern.c =================================================================== --- sys/vm/vm_kern.c +++ sys/vm/vm_kern.c @@ -212,8 +212,8 @@ if ((flags & M_ZERO) && (m->flags & PG_ZERO) == 0) pmap_zero_page(m); m->valid = VM_PAGE_BITS_ALL; - pmap_enter(kernel_pmap, addr + i, m, VM_PROT_ALL, - VM_PROT_ALL | PMAP_ENTER_WIRED, 0); + pmap_enter(kernel_pmap, addr + i, m, VM_PROT_RW, + VM_PROT_RW | PMAP_ENTER_WIRED, 0); } VM_OBJECT_WUNLOCK(object); return (addr); @@ -298,8 +298,8 @@ if ((flags & M_ZERO) && (m->flags & PG_ZERO) == 0) pmap_zero_page(m); m->valid = VM_PAGE_BITS_ALL; - pmap_enter(kernel_pmap, tmp, m, VM_PROT_ALL, - VM_PROT_ALL | PMAP_ENTER_WIRED, 0); + pmap_enter(kernel_pmap, tmp, m, VM_PROT_RW, + VM_PROT_RW | PMAP_ENTER_WIRED, 0); tmp += PAGE_SIZE; } VM_OBJECT_WUNLOCK(object); @@ -372,20 +372,32 @@ * Allocate wired-down pages in the kernel's address space. */ vm_offset_t -kmem_malloc_domain(int domain, vm_size_t size, int flags) +kmem_malloc_domain(struct vmem *vmem, int domain, vm_size_t size, int flags) { - vmem_t *vmem; + vmem_t *arena; vm_offset_t addr; int rv; - vmem = vm_dom[domain].vmd_kernel_arena; +#if VM_NRESERVLEVEL > 0 + KASSERT(vmem == kernel_arena || vmem == kernel_rwx_arena, + ("kmem_malloc_domain: Only kernel_arena or kernel_rwx_arena " + "are supported.")); + if (__predict_true(vmem == kernel_arena)) + arena = vm_dom[domain].vmd_kernel_arena; + else + arena = vm_dom[domain].vmd_kernel_rwx_arena; +#else + KASSERT(vmem == kernel_arena, + ("kmem_malloc_domain: Only kernel_arena is supported.")); + arena = vm_dom[domain].vmd_kernel_arena; +#endif size = round_page(size); - if (vmem_alloc(vmem, size, flags | M_BESTFIT, &addr)) + if (vmem_alloc(arena, size, flags | M_BESTFIT, &addr)) return (0);
rv = kmem_back_domain(domain, kernel_object, addr, size, flags); if (rv != KERN_SUCCESS) { - vmem_free(vmem, addr, size); + vmem_free(arena, addr, size); return (0); } return (addr); @@ -398,12 +410,9 @@ vm_offset_t addr; int domain; - KASSERT(vmem == kernel_arena, - ("kmem_malloc: Only kernel_arena is supported.")); - vm_domainset_iter_malloc_init(&di, kernel_object, &domain, &flags); do { - addr = kmem_malloc_domain(domain, size, flags); + addr = kmem_malloc_domain(vmem, domain, size, flags); if (addr != 0) break; } while (vm_domainset_iter_malloc(&di, &domain, &flags) == 0); @@ -422,6 +431,7 @@ { vm_offset_t offset, i; vm_page_t m, mpred; + vm_prot_t prot; int pflags; KASSERT(object == kernel_object, @@ -432,6 +442,7 @@ pflags &= ~(VM_ALLOC_NOWAIT | VM_ALLOC_WAITOK | VM_ALLOC_WAITFAIL); if (flags & M_WAITOK) pflags |= VM_ALLOC_WAITFAIL; + prot = (flags & M_EXEC) != 0 ? VM_PROT_ALL : VM_PROT_RW; i = 0; VM_OBJECT_WLOCK(object); @@ -461,8 +472,8 @@ KASSERT((m->oflags & VPO_UNMANAGED) != 0, ("kmem_malloc: page %p is managed", m)); m->valid = VM_PAGE_BITS_ALL; - pmap_enter(kernel_pmap, addr + i, m, VM_PROT_ALL, - VM_PROT_ALL | PMAP_ENTER_WIRED, 0); + pmap_enter(kernel_pmap, addr + i, m, prot, + prot | PMAP_ENTER_WIRED, 0); } VM_OBJECT_WUNLOCK(object); @@ -542,13 +553,28 @@ void kmem_free(struct vmem *vmem, vm_offset_t addr, vm_size_t size) { + struct vmem *arena; int domain; +#if VM_NRESERVLEVEL > 0 + KASSERT(vmem == kernel_arena || vmem == kernel_rwx_arena, + ("kmem_free: Only kernel_arena or kernel_rwx_arena are supported.")); +#else KASSERT(vmem == kernel_arena, ("kmem_free: Only kernel_arena is supported.")); +#endif + size = round_page(size); domain = _kmem_unback(kernel_object, addr, size); - vmem_free(vm_dom[domain].vmd_kernel_arena, addr, size); +#if VM_NRESERVLEVEL > 0 + if (__predict_true(vmem == kernel_arena)) + arena = vm_dom[domain].vmd_kernel_arena; + else + arena = 
vm_dom[domain].vmd_kernel_rwx_arena; +#else + arena = vm_dom[domain].vmd_kernel_arena; +#endif + vmem_free(arena, addr, size); } /* Index: sys/vm/vm_pagequeue.h =================================================================== --- sys/vm/vm_pagequeue.h +++ sys/vm/vm_pagequeue.h @@ -86,6 +86,10 @@ #include struct sysctl_oid; +#ifndef _VM_PARAM_ +#error "Requires <vm/vm_param.h>" +#endif + /* * One vm_domain per-numa domain. Contains pagequeues, free page structures, * and accounting. @@ -103,7 +107,10 @@ struct mtx_padalign vmd_free_mtx; struct mtx_padalign vmd_pageout_mtx; uma_zone_t vmd_pgcache; /* (c) page free cache. */ - struct vmem *vmd_kernel_arena; /* (c) per-domain kva arena. */ + struct vmem *vmd_kernel_arena; /* (c) per-domain kva R/W arena. */ +#if VM_NRESERVLEVEL > 0 + struct vmem *vmd_kernel_rwx_arena; /* (c) per-domain kva R/W/X arena. */ +#endif u_int vmd_domain; /* (c) Domain number. */ u_int vmd_page_count; /* (c) Total page count. */ long vmd_segs; /* (c) bitmask of the segments */