Index: sys/sys/malloc.h
===================================================================
--- sys/sys/malloc.h
+++ sys/sys/malloc.h
@@ -53,6 +53,7 @@
  */
 #define	M_NOWAIT	0x0001		/* do not block */
 #define	M_WAITOK	0x0002		/* ok to block */
+#define	M_STABLE	0x0004		/* memory is unlikely to be reused */
 #define	M_ZERO		0x0100		/* bzero the allocation */
 #define	M_NOVM		0x0200		/* don't ask VM for pages */
 #define	M_USE_RESERVE	0x0400		/* can alloc out of reserve memory */
Index: sys/vm/uma_core.c
===================================================================
--- sys/vm/uma_core.c
+++ sys/vm/uma_core.c
@@ -1506,7 +1506,6 @@
     int aflags)
 {
 	uma_domain_t dom;
-	uma_alloc allocf;
 	uma_slab_t slab;
 	unsigned long size;
 	uint8_t *mem;
@@ -1516,7 +1515,6 @@
 	KASSERT(domain >= 0 && domain < vm_ndomains,
 	    ("keg_alloc_slab: domain %d out of range", domain));
 
-	allocf = keg->uk_allocf;
 	slab = NULL;
 	mem = NULL;
 	if (keg->uk_flags & UMA_ZFLAG_OFFPAGE) {
@@ -1540,12 +1538,14 @@
 	else
 		aflags &= ~M_ZERO;
 
-	if (keg->uk_flags & UMA_ZONE_NODUMP)
+	if ((keg->uk_flags & UMA_ZONE_NODUMP) != 0)
 		aflags |= M_NODUMP;
+	if ((keg->uk_flags & UMA_ZONE_NOFREE) != 0)
+		aflags |= M_STABLE;
 
 	/* zone is passed for legacy reasons. */
 	size = keg->uk_ppera * PAGE_SIZE;
-	mem = allocf(zone, size, domain, &sflags, aflags);
+	mem = keg->uk_allocf(zone, size, domain, &sflags, aflags);
 	if (mem == NULL) {
 		if (keg->uk_flags & UMA_ZFLAG_OFFPAGE)
 			zone_free_item(slabzone(keg->uk_ipers),
@@ -1673,26 +1673,14 @@
 	}
 }
 
-/*
- * Allocates a number of pages from the system
- *
- * Arguments:
- *	bytes  The number of bytes requested
- *	wait  Shall we wait?
- *
- * Returns:
- *	A pointer to the alloced memory or possibly
- *	NULL if M_NOWAIT is set.
- */
 static void *
 page_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag,
-    int wait)
+    int flags)
 {
-	void *p;	/* Returned page */
+	void *p;
 
 	*pflag = UMA_SLAB_KERNEL;
-	p = (void *)kmem_malloc_domainset(DOMAINSET_FIXED(domain), bytes, wait);
-
+	p = (void *)kmem_malloc_domainset(DOMAINSET_FIXED(domain), bytes, flags);
 	return (p);
 }
 
@@ -1817,12 +1805,12 @@
  */
 static void *
 contig_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag,
-    int wait)
+    int flags)
 {
 
 	*pflag = UMA_SLAB_KERNEL;
 	return ((void *)kmem_alloc_contig_domainset(DOMAINSET_FIXED(domain),
-	    bytes, wait, 0, ~(vm_paddr_t)0, 1, 0, VM_MEMATTR_DEFAULT));
+	    bytes, flags, 0, ~(vm_paddr_t)0, 1, 0, VM_MEMATTR_DEFAULT));
 }
 
 /*
@@ -2249,7 +2237,7 @@
 	 * startup cache until the vm is ready.
 	 */
 #ifdef UMA_MD_SMALL_ALLOC
-	if (keg->uk_ppera == 1)
+	if (keg->uk_ppera == 1 && (keg->uk_flags & UMA_ZONE_NOFREE) == 0)
 		keg->uk_allocf = uma_small_alloc;
 	else
 #endif
Index: sys/vm/vm_kern.c
===================================================================
--- sys/vm/vm_kern.c
+++ sys/vm/vm_kern.c
@@ -121,6 +121,15 @@
 #endif
     "Max kernel address");
 
+static SYSCTL_NODE(_vm_stats, OID_AUTO, kern, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
+    "Kernel memory statistics");
+
+static counter_u64_t nofree_import_failures = EARLY_COUNTER;
+COUNTER_U64_SYSINIT(nofree_import_failures);
+SYSCTL_COUNTER_U64(_vm_stats_kern, OID_AUTO, nofree_import_failures,
+    CTLFLAG_RD, &nofree_import_failures,
+    "Number of failed M_STABLE imports");
+
 #if VM_NRESERVLEVEL > 0
 #define	KVA_QUANTUM_SHIFT	(VM_LEVEL_0_ORDER + PAGE_SHIFT)
 #else
@@ -169,6 +178,30 @@
 	vmem_free(kernel_arena, addr, size);
 }
 
+#if VM_NRESERVLEVEL > 0
+/*
+ * Allocate kernel memory to satisfy an M_STABLE request, meaning that the
+ * memory is expected never to be freed.  Such allocations are physically
+ * contiguous so as to minimize fragmentation.
+ */
+static vm_offset_t
+kmem_alloc_nofree(int domain, vm_size_t size, u_long align, int flags)
+{
+	vm_offset_t addr;
+
+	KASSERT(size + align <= KVA_QUANTUM,
+	    ("%s: unhandled size %#lx and alignment %#lx",
+	    __func__, size, align));
+
+	if (vmem_xalloc(vm_dom[domain].vmd_kernel_nofree_arena, size, align,
+	    0, 0, VMEM_ADDR_MIN, VMEM_ADDR_MAX, flags | M_BESTFIT, &addr) != 0)
+		return (0);
+	if ((flags & M_ZERO) != 0)
+		memset((void *)addr, 0, size);
+	return (addr);
+}
+#endif
+
 /*
  * Allocates a region from the kernel address map and physical pages
  * within the specified address range to the kernel object.  Creates a
@@ -188,6 +221,9 @@
 	int pflags, tries;
 	vm_prot_t prot;
 
+	KASSERT((flags & M_STABLE) == 0,
+	    ("%s: does not handle M_STABLE allocations", __func__));
+
 	size = round_page(size);
 	vmem = vm_dom[domain].vmd_kernel_arena;
 	if (vmem_alloc(vmem, size, M_BESTFIT | flags, &addr))
@@ -273,13 +309,24 @@
     vm_memattr_t memattr)
 {
 	vmem_t *vmem;
-	vm_object_t object = kernel_object;
+	vm_object_t object;
 	vm_offset_t addr, offset, tmp;
 	vm_page_t end_m, m;
 	u_long npages;
 	int pflags, tries;
 
 	size = round_page(size);
+	if ((flags & M_STABLE) != 0) {
+		flags &= ~M_STABLE;
+#if VM_NRESERVLEVEL > 0
+		if (low == 0 && high == ~(vm_paddr_t)0 && boundary == 0) {
+			addr = kmem_alloc_nofree(domain, size, alignment,
+			    flags);
+			if (addr != 0)
+				return (addr);
+		}
+#endif
+	}
 	vmem = vm_dom[domain].vmd_kernel_arena;
 	if (vmem_alloc(vmem, size, flags | M_BESTFIT, &addr))
 		return (0);
@@ -288,6 +335,7 @@
 	pflags &= ~(VM_ALLOC_NOWAIT | VM_ALLOC_WAITOK | VM_ALLOC_WAITFAIL);
 	pflags |= VM_ALLOC_NOWAIT;
 	npages = atop(size);
+	object = kernel_object;
 	VM_OBJECT_WLOCK(object);
 	tries = 0;
retry:
@@ -402,15 +450,20 @@
 	vm_offset_t addr;
 	int rv;
 
-#if VM_NRESERVLEVEL > 0
-	if (__predict_true((flags & M_EXEC) == 0))
+	size = round_page(size);
+	if (__predict_true((flags & (M_EXEC | M_STABLE)) == 0))
 		arena = vm_dom[domain].vmd_kernel_arena;
-	else
+	else if ((flags & M_EXEC) != 0)
 		arena = vm_dom[domain].vmd_kernel_rwx_arena;
-#else
-	arena = vm_dom[domain].vmd_kernel_arena;
+	else {
+		flags &= ~M_STABLE;
+#if VM_NRESERVLEVEL > 0
+		addr = kmem_alloc_nofree(domain, size, PAGE_SIZE, flags);
+		if (addr != 0)
+			return (addr);
 #endif
-	size = round_page(size);
+		arena = vm_dom[domain].vmd_kernel_arena;
+	}
 
 	if (vmem_alloc(arena, size, flags | M_BESTFIT, &addr))
 		return (0);
@@ -462,7 +515,9 @@
 	int pflags;
 
 	KASSERT(object == kernel_object,
-	    ("kmem_back_domain: only supports kernel object."));
+	    ("%s: only supports kernel object", __func__));
+	KASSERT((flags & M_STABLE) == 0,
+	    ("%s: does not handle M_STABLE allocations", __func__));
 
 	offset = addr - VM_MIN_KERNEL_ADDRESS;
 	pflags = malloc2vm_flags(flags) | VM_ALLOC_WIRED;
@@ -501,10 +556,8 @@
 		vm_page_valid(m);
 		pmap_enter(kernel_pmap, addr + i, m, prot,
 		    prot | PMAP_ENTER_WIRED, 0);
-#if VM_NRESERVLEVEL > 0
 		if (__predict_false((prot & VM_PROT_EXECUTE) != 0))
 			m->oflags |= VPO_KMEM_EXEC;
-#endif
 	}
 	VM_OBJECT_WUNLOCK(object);
 
@@ -578,14 +631,10 @@
 	VM_OBJECT_WLOCK(object);
 	m = vm_page_lookup(object, atop(offset));
 	domain = vm_phys_domain(m);
-#if VM_NRESERVLEVEL > 0
 	if (__predict_true((m->oflags & VPO_KMEM_EXEC) == 0))
 		arena = vm_dom[domain].vmd_kernel_arena;
 	else
 		arena = vm_dom[domain].vmd_kernel_rwx_arena;
-#else
-	arena = vm_dom[domain].vmd_kernel_arena;
-#endif
 	for (; offset < end; offset += PAGE_SIZE, m = next) {
 		next = vm_page_next(m);
 		vm_page_xbusy_claim(m);
@@ -736,12 +785,37 @@
 {
 
 	KASSERT((size % KVA_QUANTUM) == 0,
-	    ("kva_import_domain: Size %jd is not a multiple of %d",
+	    ("%s: Size %jd is not a multiple of %d", __func__,
 	    (intmax_t)size, (int)KVA_QUANTUM));
 	return (vmem_xalloc(arena, size, KVA_QUANTUM, 0, 0, VMEM_ADDR_MIN,
 	    VMEM_ADDR_MAX, flags, addrp));
 }
 
+#if VM_NRESERVLEVEL > 0
+static int
+kva_import_nofree(void *arg, vmem_size_t size, int flags, vmem_addr_t *addrp)
+{
+	int domain;
+
+	domain = (int)(uintptr_t)arg;
+	KASSERT(domain >= 0 && domain < vm_ndomains,
+	    ("%s: Invalid domain index %d", __func__, domain));
+	KASSERT((size % KVA_QUANTUM) == 0,
+	    ("%s: Size %jd is not a multiple of %d", __func__,
+	    (intmax_t)size, (int)KVA_QUANTUM));
+
+	/* XXX this does not necessarily give suitably aligned KVA */
+	*addrp = (vmem_addr_t)kmem_alloc_contig_domain(domain, size, flags,
+	    0, ~(vm_paddr_t)0, KVA_QUANTUM, 0, VM_MEMATTR_DEFAULT);
+	if (*addrp == 0) {
+		counter_u64_add(nofree_import_failures, 1);
+		return (ENOMEM);
+	} else {
+		return (0);
+	}
+}
+#endif
+
 /*
  *	kmem_init:
  *
@@ -804,9 +878,13 @@
 
 		/*
 		 * In architectures with superpages, maintain separate arenas
-		 * for allocations with permissions that differ from the
-		 * "standard" read/write permissions used for kernel memory,
-		 * so as not to inhibit superpage promotion.
+		 * for
+		 * 1) allocations with permissions that differ from the
+		 *    "standard" read/write permissions used for kernel memory,
+		 *    and
+		 * 2) allocations which are never going to be freed.
+		 *
+		 * This helps minimize fragmentation of physical memory.
 		 */
 #if VM_NRESERVLEVEL > 0
 		vm_dom[domain].vmd_kernel_rwx_arena = vmem_create(
@@ -814,6 +892,14 @@
 		vmem_set_import(vm_dom[domain].vmd_kernel_rwx_arena,
 		    kva_import_domain, (vmem_release_t *)vmem_xfree,
 		    kernel_arena, KVA_QUANTUM);
+		vm_dom[domain].vmd_kernel_nofree_arena = vmem_create(
+		    "kernel nofree arena domain", 0, 0, PAGE_SIZE, 0, M_WAITOK);
+		vmem_set_import(vm_dom[domain].vmd_kernel_nofree_arena,
+		    kva_import_nofree, (vmem_release_t *)vmem_xfree,
+		    (void *)(uintptr_t)domain, KVA_QUANTUM);
+#else
+		vm_dom[domain].vmd_kernel_rwx_arena = kernel_arena;
+		vm_dom[domain].vmd_kernel_nofree_arena = kernel_arena;
 #endif
 	}
 
Index: sys/vm/vm_pagequeue.h
===================================================================
--- sys/vm/vm_pagequeue.h
+++ sys/vm/vm_pagequeue.h
@@ -242,8 +242,9 @@
 		int pool;
 		uma_zone_t zone;
 	} vmd_pgcache[VM_NFREEPOOL];
-	struct vmem *vmd_kernel_arena;	/* (c) per-domain kva R/W arena. */
-	struct vmem *vmd_kernel_rwx_arena; /* (c) per-domain kva R/W/X arena. */
+	struct vmem *vmd_kernel_arena;	/* (c) per-domain KVA */
+	struct vmem *vmd_kernel_rwx_arena; /* (c) per-domain RWX KVA */
+	struct vmem *vmd_kernel_nofree_arena; /* (c) per-domain nofree KVA */
 	u_int vmd_domain;		/* (c) Domain number. */
 	u_int vmd_page_count;		/* (c) Total page count. */
 	long vmd_segs;			/* (c) bitmask of the segments */
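
Note: the sketch below is not part of the patch; it is an illustrative consumer under the patch's semantics. A UMA zone created with UMA_ZONE_NOFREE now has its slab allocations tagged M_STABLE in keg_alloc_slab(), so the backing pages are carved out of the per-domain nofree arena (kmem_alloc_nofree()) rather than the general kernel arena. The zone name, item size, and function names here are made up for illustration.

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/malloc.h>

#include <vm/uma.h>

/* Hypothetical zone whose items are allocated once and cached forever. */
static uma_zone_t example_nofree_zone;

static void
example_nofree_init(void *arg __unused)
{
	/*
	 * UMA_ZONE_NOFREE: slab pages are never returned to the VM system.
	 * With this patch, such slabs are requested with M_STABLE and their
	 * KVA/pages come from vmd_kernel_nofree_arena when superpage
	 * reservations are configured (VM_NRESERVLEVEL > 0).
	 */
	example_nofree_zone = uma_zcreate("example nofree", 128,
	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
}
SYSINIT(example_nofree, SI_SUB_VM_CONF, SI_ORDER_ANY, example_nofree_init,
    NULL);

static void *
example_nofree_alloc(void)
{
	/* The pages backing this item are never freed back to the system. */
	return (uma_zalloc(example_nofree_zone, M_WAITOK | M_ZERO));
}

When kva_import_nofree() cannot find KVA_QUANTUM bytes of contiguous physical memory, the new vm.stats.kern.nofree_import_failures counter is bumped and the allocation falls back to the regular per-domain kernel arena.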