Index: TODO =================================================================== --- /dev/null +++ TODO @@ -0,0 +1,5 @@ +- per-user limit on the total superpages allocations +- man pages +- export shm_open2(2) from libc.so ? +- make pmap_superpagesizes[] per-pmap ? +- more test programs Index: lib/libc/sys/Symbol.map =================================================================== --- lib/libc/sys/Symbol.map +++ lib/libc/sys/Symbol.map @@ -411,6 +411,7 @@ getfhat; funlinkat; memfd_create; + shm_open_largepage; shm_rename; }; @@ -919,6 +920,7 @@ __sys_setuid; _shm_open; __sys_shm_open; + __sys_shm_open2; _shm_unlink; __sys_shm_unlink; _shmat; Index: lib/libc/sys/shm_open.c =================================================================== --- lib/libc/sys/shm_open.c +++ lib/libc/sys/shm_open.c @@ -31,14 +31,17 @@ #include __FBSDID("$FreeBSD$"); -#include +#include +#include #include #include #include #include +#include #include #include +#include #include "libc_private.h" @@ -54,6 +57,51 @@ return (__sys_shm_open2(path, flags | O_CLOEXEC, mode, 0, NULL)); } +int +shm_open_largepage(const char *path, int flags, int psind, int alloc_policy, + mode_t mode) +{ + struct shm_largepage_conf slc; + int error, fd, saved_errno; + + fd = __sys_shm_open2(path, flags, mode, SHM_LARGEPAGE, NULL); + if (fd == -1) + return (-1); + + memset(&slc, 0, sizeof(slc)); + slc.psind = psind; + slc.alloc_policy = alloc_policy; + error = ioctl(fd, FIOSHMLPGCNF, &slc); + if (error == -1) { + saved_errno = errno; + close(fd); + errno = saved_errno; + return (-1); + } + return (fd); +} + +#define K(x) ((size_t)(x) * 1024) +#define M(x) (K(x) * 1024) +#define G(x) (M(x) * 1024) +static const struct { + int mask; + size_t pgsize; +} mfd_huge_sizes[] = { + { .mask = MFD_HUGE_64KB, .pgsize = K(64) }, + { .mask = MFD_HUGE_512KB, .pgsize = K(512) }, + { .mask = MFD_HUGE_1MB, .pgsize = M(1) }, + { .mask = MFD_HUGE_2MB, .pgsize = M(2) }, + { .mask = MFD_HUGE_8MB, .pgsize = M(8) }, + { .mask = 
MFD_HUGE_16MB, .pgsize = M(16) }, + { .mask = MFD_HUGE_32MB, .pgsize = M(32) }, + { .mask = MFD_HUGE_256MB, .pgsize = M(256) }, + { .mask = MFD_HUGE_512MB, .pgsize = M(512) }, + { .mask = MFD_HUGE_1GB, .pgsize = G(1) }, + { .mask = MFD_HUGE_2GB, .pgsize = G(2) }, + { .mask = MFD_HUGE_16GB, .pgsize = G(16) }, +}; + /* * The path argument is passed to the kernel, but the kernel doesn't currently * do anything with it. Linux exposes it in linprocfs for debugging purposes @@ -63,8 +111,9 @@ memfd_create(const char *name, unsigned int flags) { char memfd_name[NAME_MAX + 1]; - size_t namelen; - int oflags, shmflags; + size_t namelen, *pgs; + struct shm_largepage_conf slc; + int error, fd, i, npgs, oflags, pgidx, saved_errno, shmflags; if (name == NULL) return (EBADF); @@ -75,11 +124,9 @@ MFD_HUGE_MASK)) != 0) return (EINVAL); /* Size specified but no HUGETLB. */ - if ((flags & MFD_HUGE_MASK) != 0 && (flags & MFD_HUGETLB) == 0) + if (((flags & MFD_HUGE_MASK) != 0 && (flags & MFD_HUGETLB) == 0) || + __bitcount(flags & MFD_HUGE_MASK) > 1) return (EINVAL); - /* We don't actually support HUGETLB. */ - if ((flags & MFD_HUGETLB) != 0) - return (ENOSYS); /* We've already validated that we're sufficiently sized. 
*/ snprintf(memfd_name, NAME_MAX + 1, "%s%s", MEMFD_NAME_PREFIX, name); @@ -89,5 +136,61 @@ oflags |= O_CLOEXEC; if ((flags & MFD_ALLOW_SEALING) != 0) shmflags |= SHM_ALLOW_SEALING; - return (__sys_shm_open2(SHM_ANON, oflags, 0, shmflags, memfd_name)); + if ((flags & MFD_HUGETLB) != 0) + shmflags |= SHM_LARGEPAGE; + fd = __sys_shm_open2(SHM_ANON, oflags, 0, shmflags, memfd_name); + if (fd == -1 || (flags & MFD_HUGETLB) == 0) + return (fd); + + pgs = NULL; + npgs = getpagesizes(NULL, 0); + if (npgs == -1) + goto clean; + pgs = calloc(npgs, sizeof(size_t)); + if (pgs == NULL) + goto clean; + error = getpagesizes(pgs, npgs); + if (error == -1) + goto clean; + if ((flags & MFD_HUGE_MASK) == 0) { + if (npgs == 1) { + errno = EOPNOTSUPP; + goto clean; + } + pgidx = 1; + } else { + for (i = 0; i < nitems(mfd_huge_sizes); i++) { + if (mfd_huge_sizes[i].mask == (flags & MFD_HUGE_MASK)) + break; + } + if (i == nitems(mfd_huge_sizes)) { + errno = EINVAL; + goto clean; + } + for (pgidx = 0; pgidx < npgs; pgidx++) { + if (mfd_huge_sizes[i].pgsize == pgs[pgidx]) + break; + } + if (pgidx == npgs) { + errno = EOPNOTSUPP; + goto clean; + } + } + free(pgs); + pgs = NULL; + + memset(&slc, 0, sizeof(slc)); + slc.psind = pgidx; + slc.alloc_policy = SHM_LARGEPAGE_ALLOC_DEFAULT; + error = ioctl(fd, FIOSHMLPGCNF, &slc); + if (error == -1) + goto clean; + return (fd); + +clean: + saved_errno = errno; + close(fd); + free(pgs); + errno = saved_errno; + return (-1); } Index: sys/amd64/amd64/pmap.c =================================================================== --- sys/amd64/amd64/pmap.c +++ sys/amd64/amd64/pmap.c @@ -48,7 +48,7 @@ */ /*- * Copyright (c) 2003 Networks Associates Technology, Inc. - * Copyright (c) 2014-2019 The FreeBSD Foundation + * Copyright (c) 2014-2020 The FreeBSD Foundation * All rights reserved. 
* * This software was developed for the FreeBSD Project by Jake Burkholder, @@ -1334,6 +1334,8 @@ pdpe = pmap_pdpe(pmap, va); if (pdpe == NULL || (*pdpe & PG_V) == 0) return (NULL); + KASSERT((*pdpe & PG_PS) == 0, + ("pmap_pde for 1G page, pmap %p va %#lx", pmap, va)); return (pmap_pdpe_to_pde(pdpe, va)); } @@ -2141,6 +2143,11 @@ KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0, ("pmap_init: can't assign to pagesizes[1]")); pagesizes[1] = NBPDR; + if ((amd_feature & AMDID_PAGE1GB) != 0) { + KASSERT(MAXPAGESIZES > 2 && pagesizes[2] == 0, + ("pmap_init: can't assign to pagesizes[2]")); + pagesizes[2] = NBPDP; + } } /* @@ -5447,6 +5454,7 @@ pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { struct rwlock *lock; + vm_page_t mt; vm_offset_t va_next; pml4_entry_t *pml4e; pdp_entry_t *pdpe; @@ -5499,13 +5507,28 @@ } pdpe = pmap_pml4e_to_pdpe(pml4e, sva); + va_next = (sva + NBPDP) & ~PDPMASK; if ((*pdpe & PG_V) == 0) { - va_next = (sva + NBPDP) & ~PDPMASK; if (va_next < sva) va_next = eva; continue; } + KASSERT((*pdpe & PG_PS) == 0 || va_next <= eva, + ("pmap_remove of non-transient 1G page " + "pdpe %#lx sva %#lx eva %#lx va_next %#lx", + *pdpe, sva, eva, va_next)); + if ((*pdpe & PG_PS) != 0) { + MPASS(pmap != kernel_pmap); /* XXXKIB */ + MPASS((*pdpe & (PG_MANAGED | PG_G)) == 0); + anyvalid = 1; + *pdpe = 0; + pmap_resident_count_dec(pmap, NBPDP / PAGE_SIZE); + mt = PHYS_TO_VM_PAGE(*pmap_pml4e(pmap, sva) & PG_FRAME); + pmap_unwire_ptp(pmap, sva, mt, &free); + continue; + } + /* * Calculate index for next page table. 
*/ @@ -5721,11 +5744,13 @@ void pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) { + vm_page_t m; vm_offset_t va_next; pml4_entry_t *pml4e; pdp_entry_t *pdpe; pd_entry_t ptpaddr, *pde; pt_entry_t *pte, PG_G, PG_M, PG_RW, PG_V; + pt_entry_t obits, pbits; boolean_t anychanged; KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot)); @@ -5776,13 +5801,36 @@ } pdpe = pmap_pml4e_to_pdpe(pml4e, sva); + va_next = (sva + NBPDP) & ~PDPMASK; if ((*pdpe & PG_V) == 0) { - va_next = (sva + NBPDP) & ~PDPMASK; if (va_next < sva) va_next = eva; continue; } + KASSERT((*pdpe & PG_PS) == 0 || va_next <= eva, + ("pmap_protect of non-transient 1G page " + "pdpe %#lx sva %#lx eva %#lx va_next %#lx", + *pdpe, sva, eva, va_next)); + if ((*pdpe & PG_PS) != 0) { +retry_pdpe: + obits = pbits = *pdpe; + MPASS((pbits & (PG_MANAGED | PG_G)) == 0); + MPASS(pmap != kernel_pmap); /* XXXKIB */ + if ((prot & VM_PROT_WRITE) == 0) + pbits &= ~(PG_RW | PG_M); + if ((prot & VM_PROT_EXECUTE) == 0) + pbits |= pg_nx; + + if (pbits != obits) { + if (!atomic_cmpset_long(pdpe, obits, pbits)) + /* PG_PS cannot be cleared under us. */ + goto retry_pdpe; + anychanged = TRUE; + } + continue; + } + va_next = (sva + NBPDR) & ~PDRMASK; if (va_next < sva) va_next = eva; @@ -5825,9 +5873,6 @@ for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, sva += PAGE_SIZE) { - pt_entry_t obits, pbits; - vm_page_t m; - retry: obits = pbits = *pte; if ((pbits & PG_V) == 0) @@ -6002,6 +6047,122 @@ } #endif /* VM_NRESERVLEVEL > 0 */ +static int +pmap_enter_largepage(pmap_t pmap, vm_offset_t va, pt_entry_t newpte, int flags, + int psind) +{ + vm_page_t mp; + pt_entry_t origpte, *pml4e, *pdpe, *pde, pten, PG_V; + vm_pindex_t ptepindex; + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + KASSERT(psind > 0 && psind < MAXPAGESIZES, + ("psind %d unexpected", psind)); + KASSERT(((newpte & PG_FRAME) & (pagesizes[psind] - 1)) == 0, + ("unaligned phys address %#lx newpte %#lx psind %d", + newpte & PG_FRAME, 
newpte, psind)); + KASSERT((va & (pagesizes[psind] - 1)) == 0, + ("unaligned va %#lx psind %d", va, psind)); + KASSERT(va < VM_MAXUSER_ADDRESS, + ("kernel mode non-transparent superpage")); /* XXXKIB */ + KASSERT(va + pagesizes[psind] < VM_MAXUSER_ADDRESS, + ("overflowing user map va %#lx psind %d", va, psind)); /* XXXKIB */ + + PG_V = pmap_valid_bit(pmap); + +restart: + pten = newpte; + if (va < VM_MAXUSER_ADDRESS && pmap->pm_type == PT_X86) + pten |= pmap_pkru_get(pmap, va); + + ptepindex = pmap_pde_pindex(va); + + if (psind == 2) { /* 1G */ + if (!pmap_pkru_same(pmap, va, va + NBPDP)) + return (KERN_PROTECTION_FAILURE); + pml4e = pmap_pml4e(pmap, va); + if ((*pml4e & PG_V) == 0) { + mp = _pmap_allocpte(pmap, NUPDE + NUPDPE + + ((ptepindex - NUPDE) >> NPML4EPGSHIFT), NULL); + if (mp == NULL) { + if ((flags & PMAP_ENTER_NOSLEEP) != 0) + return (KERN_RESOURCE_SHORTAGE); + PMAP_UNLOCK(pmap); + vm_wait(NULL); + PMAP_LOCK(pmap); + + /* + * Restart at least to recalculate the pkru + * key. Our caller must keep the map locked + * so no paging structure can be validated + * under us. 
+ */ + goto restart; + } + pdpe = pmap_pdpe(pmap, va); + KASSERT(pdpe != NULL, ("va %#lx lost pdpe", va)); + origpte = *pdpe; + MPASS(origpte == 0); + } else { + mp = PHYS_TO_VM_PAGE(*pml4e & PG_FRAME); + pdpe = pmap_pdpe(pmap, va); + KASSERT(pdpe != NULL, ("va %#lx lost pdpe", va)); + origpte = *pdpe; + if ((origpte & PG_V) == 0) + mp->ref_count++; + } + KASSERT((origpte & PG_V) == 0 || ((origpte & PG_PS) != 0 && + (origpte & PG_FRAME) == (newpte & PG_FRAME)), + ("va %#lx changing 1G phys page pdpe %#lx newpte %#lx", + va, origpte, newpte)); + if ((newpte & PG_W) != 0 && (origpte & PG_W) == 0) + pmap->pm_stats.wired_count += NBPDP / PAGE_SIZE; + else if ((newpte & PG_W) == 0 && (origpte & PG_W) != 0) + pmap->pm_stats.wired_count -= NBPDP / PAGE_SIZE; + *pdpe = pten; + } else /* (psind == 1) */ { /* 2M */ + if (!pmap_pkru_same(pmap, va, va + NBPDR)) + return (KERN_PROTECTION_FAILURE); + pde = pmap_pde(pmap, va); + if (pde == NULL) { + mp = _pmap_allocpte(pmap, NUPDE + + (ptepindex >> NPDPEPGSHIFT), NULL); + if (mp == NULL) { + if ((flags & PMAP_ENTER_NOSLEEP) != 0) + return (KERN_RESOURCE_SHORTAGE); + PMAP_UNLOCK(pmap); + vm_wait(NULL); + PMAP_LOCK(pmap); + goto restart; + } + pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mp)); + pde = &pde[pmap_pde_index(va)]; + origpte = *pde; + MPASS(origpte == 0); + } else { + pdpe = pmap_pdpe(pmap, va); + MPASS(pdpe != NULL && (*pdpe & PG_V) != 0); + mp = PHYS_TO_VM_PAGE(*pdpe & PG_FRAME); + origpte = *pde; + if ((origpte & PG_V) == 0) + mp->ref_count++; + } + KASSERT((origpte & PG_V) == 0 || ((origpte & PG_PS) != 0 && + (origpte & PG_FRAME) == (newpte & PG_FRAME)), + ("va %#lx changing 2M phys page pde %#lx newpte %#lx", + va, origpte, newpte)); + if ((newpte & PG_W) != 0 && (origpte & PG_W) == 0) + pmap->pm_stats.wired_count += NBPDR / PAGE_SIZE; + else if ((newpte & PG_W) == 0 && (origpte & PG_W) != 0) + pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE; + *pde = pten; + } + if ((origpte & PG_V) == 0) + 
pmap_resident_count_inc(pmap, pagesizes[psind] / PAGE_SIZE); + + return (KERN_SUCCESS); +} + /* * Insert the given physical page (p) at * the specified virtual address (v) in the @@ -6081,6 +6242,13 @@ lock = NULL; PMAP_LOCK(pmap); + if ((flags & PMAP_ENTER_LARGEPAGE) != 0) { + KASSERT((m->oflags & VPO_UNMANAGED) != 0, + ("managed largepage va %#lx flags %#x", va, flags)); + rv = pmap_enter_largepage(pmap, va, newpte | PG_PS, flags, + psind); + goto out; + } if (psind == 1) { /* Assert the required virtual and physical alignment. */ KASSERT((va & PDRMASK) == 0, ("pmap_enter: va unaligned")); @@ -6766,9 +6934,10 @@ pml4_entry_t *pml4e; pdp_entry_t *pdpe; pd_entry_t *pde; - pt_entry_t *pte, PG_V; + pt_entry_t *pte, PG_V, PG_G; PG_V = pmap_valid_bit(pmap); + PG_G = pmap_global_bit(pmap); PMAP_LOCK(pmap); for (; sva < eva; sva = va_next) { pml4e = pmap_pml4e(pmap, sva); @@ -6785,6 +6954,18 @@ va_next = eva; continue; } + KASSERT((*pdpe & PG_PS) == 0 || va_next <= eva, + ("pmap_unwire of non-transient 1G page " + "pdpe %#lx sva %#lx eva %#lx va_next %#lx", + *pdpe, sva, eva, va_next)); + if ((*pdpe & PG_PS) != 0) { + MPASS(pmap != kernel_pmap); /* XXXKIB */ + MPASS((*pdpe & (PG_MANAGED | PG_G)) == 0); + atomic_clear_long(pdpe, PG_W); + pmap->pm_stats.wired_count -= NBPDP / PAGE_SIZE; + continue; + } + va_next = (sva + NBPDR) & ~PDRMASK; if (va_next < sva) va_next = eva; @@ -6901,6 +7082,12 @@ } va_next = (addr + NBPDR) & ~PDRMASK; + KASSERT((*pdpe & PG_PS) == 0 || va_next <= end_addr, + ("pmap_copy of partial non-transient 1G page " + "pdpe %#lx sva %#lx eva %#lx va_next %#lx", + *pdpe, addr, end_addr, va_next)); + if ((*pdpe & PG_PS) != 0) + continue; if (va_next < addr) va_next = end_addr; @@ -7957,6 +8144,12 @@ va_next = (sva + NBPDR) & ~PDRMASK; if (va_next < sva) va_next = eva; + KASSERT((*pdpe & PG_PS) == 0 || va_next <= eva, + ("pmap_advise of non-transient 1G page " + "pdpe %#lx sva %#lx eva %#lx va_next %#lx", + *pdpe, sva, eva, va_next)); + if ((*pdpe & PG_PS) 
!= 0) + continue; pde = pmap_pdpe_to_pde(pdpe, sva); oldpde = *pde; if ((oldpde & PG_V) == 0) @@ -8718,6 +8911,7 @@ int pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *pap) { + pdp_entry_t *pdpe; pd_entry_t *pdep; pt_entry_t pte, PG_A, PG_M, PG_RW, PG_V; vm_paddr_t pa; @@ -8729,23 +8923,32 @@ PG_RW = pmap_rw_bit(pmap); PMAP_LOCK(pmap); - pdep = pmap_pde(pmap, addr); - if (pdep != NULL && (*pdep & PG_V)) { - if (*pdep & PG_PS) { - pte = *pdep; - /* Compute the physical address of the 4KB page. */ - pa = ((*pdep & PG_PS_FRAME) | (addr & PDRMASK)) & + pte = 0; + pa = 0; + val = 0; + pdpe = pmap_pdpe(pmap, addr); + if (pdpe != NULL && (*pdpe & PG_V) != 0) { + if ((*pdpe & PG_PS) != 0) { + pte = *pdpe; + pa = ((pte & PG_PS_PDP_FRAME) | (addr & PDPMASK)) & PG_FRAME; val = MINCORE_SUPER; } else { - pte = *pmap_pde_to_pte(pdep, addr); - pa = pte & PG_FRAME; - val = 0; + pdep = pmap_pde(pmap, addr); + if (pdep != NULL && (*pdep & PG_V) != 0) { + if ((*pdep & PG_PS) != 0) { + pte = *pdep; + /* Compute the physical address of the 4KB page. 
*/ + pa = ((pte & PG_PS_FRAME) | (addr & + PDRMASK)) & PG_FRAME; + val = MINCORE_SUPER; + } else { + pte = *pmap_pde_to_pte(pdep, addr); + pa = pte & PG_FRAME; + val = 0; + } + } } - } else { - pte = 0; - pa = 0; - val = 0; } if ((pte & PG_V) != 0) { val |= MINCORE_INCORE; Index: sys/dev/ksyms/ksyms.c =================================================================== --- sys/dev/ksyms/ksyms.c +++ sys/dev/ksyms/ksyms.c @@ -41,6 +41,7 @@ #include #include #include +#include #include #include #include @@ -51,6 +52,8 @@ #include #include #include +#include +#include #include "linker_if.h" @@ -442,8 +445,8 @@ ksyms_size_calc(&ts); elfsz = sizeof(struct ksyms_hdr) + ts.ts_symsz + ts.ts_strsz; - object = vm_object_allocate(OBJT_PHYS, - OFF_TO_IDX(round_page(elfsz))); + object = vm_pager_allocate(OBJT_PHYS, NULL, round_page(elfsz), + VM_PROT_ALL, 0, td->td_ucred); sc->sc_obj = object; sc->sc_objsz = elfsz; Index: sys/dev/xen/gntdev/gntdev.c =================================================================== --- sys/dev/xen/gntdev/gntdev.c +++ sys/dev/xen/gntdev/gntdev.c @@ -1068,7 +1068,8 @@ vm_object_t mem_obj; struct gntdev_gref *gref; - mem_obj = vm_object_allocate(OBJT_PHYS, size); + mem_obj = vm_pager_allocate(OBJT_PHYS, NULL, size, VM_PROT_ALL, 0, + curthread->td_ucred); if (mem_obj == NULL) return (ENOMEM); Index: sys/kern/kern_umtx.c =================================================================== --- sys/kern/kern_umtx.c +++ sys/kern/kern_umtx.c @@ -3933,7 +3933,7 @@ reg = uma_zalloc(umtx_shm_reg_zone, M_WAITOK | M_ZERO); reg->ushm_refcnt = 1; bcopy(key, &reg->ushm_key, sizeof(*key)); - reg->ushm_obj = shm_alloc(td->td_ucred, O_RDWR); + reg->ushm_obj = shm_alloc(td->td_ucred, O_RDWR, false); reg->ushm_cred = crhold(cred); error = shm_dotruncate(reg->ushm_obj, PAGE_SIZE); if (error != 0) { Index: sys/kern/link_elf.c =================================================================== --- sys/kern/link_elf.c +++ sys/kern/link_elf.c @@ -1107,7 +1107,8 @@ ef = 
(elf_file_t) lf; #ifdef SPARSE_MAPPING - ef->object = vm_object_allocate(OBJT_PHYS, atop(mapsize)); + ef->object = vm_pager_allocate(OBJT_PHYS, NULL, mapsize, VM_PROT_ALL, + 0, thread0.td_ucred); if (ef->object == NULL) { error = ENOMEM; goto out; Index: sys/kern/link_elf_obj.c =================================================================== --- sys/kern/link_elf_obj.c +++ sys/kern/link_elf_obj.c @@ -34,16 +34,17 @@ #include #include +#include #include #include #include +#include #include #include -#include #include -#include +#include +#include #include -#include #include @@ -53,11 +54,13 @@ #include #include -#include -#include -#include #include +#include +#include #include +#include +#include +#include #include @@ -905,7 +908,8 @@ * This stuff needs to be in a single chunk so that profiling etc * can get the bounds and gdb can associate offsets with modules */ - ef->object = vm_object_allocate(OBJT_PHYS, atop(round_page(mapsize))); + ef->object = vm_pager_allocate(OBJT_PHYS, NULL, round_page(mapsize), + VM_PROT_ALL, 0, thread0.td_ucred); if (ef->object == NULL) { error = ENOMEM; goto out; Index: sys/kern/uipc_shm.c =================================================================== --- sys/kern/uipc_shm.c +++ sys/kern/uipc_shm.c @@ -2,6 +2,7 @@ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2006, 2011, 2016-2017 Robert N. M. Watson + * Copyright 2020 The FreeBSD Foundation * All rights reserved. * * Portions of this software were developed by BAE Systems, the University of @@ -9,6 +10,9 @@ * contract FA8650-15-C-7558 ("CADETS"), as part of the DARPA Transparent * Computing (TC) research program. * + * Portions of this software were developed by Konstantin Belousov + * under sponsorship from the FreeBSD Foundation. 
+ * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -80,6 +84,7 @@ #include #include #include +#include #include #include #include @@ -120,6 +125,8 @@ static void shm_insert(char *path, Fnv32_t fnv, struct shmfd *shmfd); static struct shmfd *shm_lookup(char *path, Fnv32_t fnv); static int shm_remove(char *path, Fnv32_t fnv, struct ucred *ucred); +static int shm_dotruncate_cookie(struct shmfd *shmfd, off_t length, + void *rl_cookie); static int shm_dotruncate_locked(struct shmfd *shmfd, off_t length, void *rl_cookie); static int shm_copyin_path(struct thread *td, const char *userpath_in, @@ -159,11 +166,19 @@ .fo_get_seals = shm_get_seals, .fo_add_seals = shm_add_seals, .fo_fallocate = shm_fallocate, - .fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE + .fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE, }; FEATURE(posix_shm, "POSIX shared memory"); +static SYSCTL_NODE(_vm, OID_AUTO, largepages, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, + ""); + +static int largepage_reclaim_tries = 1; +SYSCTL_INT(_vm_largepages, OID_AUTO, reclaim_tries, + CTLFLAG_RWTUN, &largepage_reclaim_tries, 0, + "Number of contig reclaims before giving up for default alloc policy"); + static int uiomove_object_page(vm_object_t obj, size_t len, struct uio *uio) { @@ -242,6 +257,89 @@ return (error); } +static u_long count_largepages[MAXPAGESIZES]; + +static int +shm_largepage_phys_populate(vm_object_t object, vm_pindex_t pidx, + int fault_type, vm_prot_t max_prot, vm_pindex_t *first, vm_pindex_t *last) +{ + vm_page_t m; + int psind; + + psind = object->un_pager.phys.data_val; + if (psind == 0 || pidx >= object->size) + return (VM_PAGER_FAIL); + *first = rounddown2(pidx, pagesizes[psind] / PAGE_SIZE); + + /* + * We only busy the first page in the superpage run. It is + * useless to busy whole run since we only remove full + * superpage, and it takes too long to busy e.g. 
512 * 512 == + * 262144 pages constituting a 1G amd64 superpage. + */ + m = vm_page_grab(object, *first, VM_ALLOC_NORMAL | VM_ALLOC_NOCREAT); + MPASS(m != NULL); + + *last = *first + atop(pagesizes[psind]) - 1; + return (VM_PAGER_OK); +} + +static boolean_t +shm_largepage_phys_haspage(vm_object_t object, vm_pindex_t pindex, + int *before, int *after) +{ + int psind; + + psind = object->un_pager.phys.data_val; + if (psind == 0 || pindex >= object->size) + return (FALSE); + if (before != NULL) { + *before = pindex - rounddown2(pindex, pagesizes[psind] / + PAGE_SIZE); + } + if (after != NULL) { + *after = roundup2(pindex, pagesizes[psind] / PAGE_SIZE) - + pindex; + } + return (TRUE); +} + +static void +shm_largepage_phys_ctor(vm_object_t object, vm_prot_t prot, + vm_ooffset_t foff, struct ucred *cred) +{ +} + +static void +shm_largepage_phys_dtor(vm_object_t object) +{ + int psind; + + psind = object->un_pager.phys.data_val; + if (psind != 0) { + atomic_subtract_long(&count_largepages[psind], + object->size / (pagesizes[psind] / PAGE_SIZE)); + vm_wire_sub(object->size); + } else { + KASSERT(object->size == 0, + ("largepage phys obj %p not initialized bit size %#jx > 0", + object, (uintmax_t)object->size)); + } +} + +static struct phys_pager_ops shm_largepage_phys_ops = { + .phys_pg_populate = shm_largepage_phys_populate, + .phys_pg_haspage = shm_largepage_phys_haspage, + .phys_pg_ctor = shm_largepage_phys_ctor, + .phys_pg_dtor = shm_largepage_phys_dtor, +}; + +static inline bool +shm_largepage(struct shmfd *shmfd) +{ + return (shmfd->shm_object->type == OBJT_PHYS); +} + static int shm_seek(struct file *fp, off_t offset, int whence, struct thread *td) { @@ -321,6 +419,8 @@ if (error) return (error); #endif + if (shm_largepage(shmfd) && shmfd->shm_lp_psind == 0) + return (EINVAL); foffset_lock_uio(fp, uio, flags); if (uio->uio_resid > OFF_MAX - uio->uio_offset) { /* @@ -385,7 +485,11 @@ shm_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred, struct 
thread *td) { + struct shmfd *shmfd; + struct shm_largepage_conf *conf; + void *rl_cookie; + shmfd = fp->f_data; switch (com) { case FIONBIO: case FIOASYNC: @@ -394,6 +498,28 @@ * just like it would on an unlinked regular file */ return (0); + case FIOSHMLPGCNF: + if (!shm_largepage(shmfd)) + return (ENOTTY); + conf = data; + if (shmfd->shm_lp_psind != 0 && + conf->psind != shmfd->shm_lp_psind) + return (EINVAL); + if (conf->psind <= 0 || conf->psind >= MAXPAGESIZES || + pagesizes[conf->psind] == 0) + return (EINVAL); + if (conf->alloc_policy != SHM_LARGEPAGE_ALLOC_DEFAULT && + conf->alloc_policy != SHM_LARGEPAGE_ALLOC_NOWAIT && + conf->alloc_policy != SHM_LARGEPAGE_ALLOC_HARD) + return (EINVAL); + + rl_cookie = rangelock_wlock(&shmfd->shm_rl, 0, OFF_MAX, + &shmfd->shm_mtx); + shmfd->shm_lp_psind = conf->psind; + shmfd->shm_lp_alloc_policy = conf->alloc_policy; + shmfd->shm_object->un_pager.phys.data_val = conf->psind; + rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx); + return (0); default: return (ENOTTY); } @@ -436,6 +562,8 @@ sb->st_dev = shm_dev_ino; sb->st_ino = shmfd->shm_ino; sb->st_nlink = shmfd->shm_object->ref_count; + sb->st_blocks = shmfd->shm_object->size / + (pagesizes[shmfd->shm_lp_psind] >> PAGE_SHIFT); return (0); } @@ -592,6 +720,103 @@ return (0); } +static int +shm_dotruncate_largepage(struct shmfd *shmfd, off_t length, void *rl_cookie) +{ + vm_object_t object; + vm_page_t m; + vm_pindex_t newobjsz, oldobjsz; + int aflags, error, i, psind, try; + + KASSERT(length >= 0, ("shm_dotruncate: length < 0")); + object = shmfd->shm_object; + VM_OBJECT_ASSERT_WLOCKED(object); + rangelock_cookie_assert(rl_cookie, RA_WLOCKED); + + oldobjsz = object->size; + newobjsz = OFF_TO_IDX(length); + if (length == shmfd->shm_size) + return (0); + psind = shmfd->shm_lp_psind; + if (psind == 0 && length != 0) + return (EINVAL); + if ((length & (pagesizes[psind] - 1)) != 0) + return (EINVAL); + + if (length < shmfd->shm_size) { + if ((shmfd->shm_seals & 
F_SEAL_SHRINK) != 0) + return (EPERM); + if (shmfd->shm_kmappings > 0) + return (EBUSY); + return (ENOTSUP); /* Pages are unmanaged. */ +#if 0 + vm_object_page_remove(object, newobjsz, oldobjsz, 0); + object->size = newobjsz; + shmfd->shm_size = length; + return (0); +#endif + } + + aflags = VM_ALLOC_NORMAL | VM_ALLOC_ZERO; + if (shmfd->shm_lp_alloc_policy == SHM_LARGEPAGE_ALLOC_NOWAIT) + aflags |= VM_ALLOC_WAITFAIL; + try = 0; + + /* + * Extend shmfd and object, keeping all already fully + * allocated large pages intact even on error, because dropped + * object lock might have allowed mapping of them. + */ + while (object->size < newobjsz) { + m = vm_page_alloc_contig(object, object->size, aflags, + pagesizes[psind] / PAGE_SIZE, 0, ~0, + pagesizes[psind], 0, + VM_MEMATTR_DEFAULT); + if (m == NULL) { + if (shmfd->shm_lp_alloc_policy == + SHM_LARGEPAGE_ALLOC_NOWAIT || + (shmfd->shm_lp_alloc_policy == + SHM_LARGEPAGE_ALLOC_DEFAULT && + try >= largepage_reclaim_tries)) + return (ENOMEM); + VM_OBJECT_WUNLOCK(object); + error = vm_page_reclaim_contig(aflags, + pagesizes[psind] / PAGE_SIZE, 0, ~0, + pagesizes[psind], 0) ? 0 : + vm_wait_intr(object); + VM_OBJECT_WLOCK(object); + if (error != 0) + return (error); + try++; + continue; + } + try = 0; + for (i = 0; i < pagesizes[psind] / PAGE_SIZE; i++) { + if ((m[i].flags & PG_ZERO) == 0) + pmap_zero_page(&m[i]); + vm_page_valid(&m[i]); + vm_page_xunbusy(&m[i]); + } + object->size += OFF_TO_IDX(pagesizes[psind]); + shmfd->shm_size += pagesizes[psind]; + atomic_add_long(&count_largepages[psind], 1); + vm_wire_add(atop(pagesizes[psind])); + } + return (0); +} + +static int +shm_dotruncate_cookie(struct shmfd *shmfd, off_t length, void *rl_cookie) +{ + int error; + + VM_OBJECT_WLOCK(shmfd->shm_object); + error = shm_largepage(shmfd) ? 
shm_dotruncate_largepage(shmfd, length, + rl_cookie) : shm_dotruncate_locked(shmfd, length, rl_cookie); + VM_OBJECT_WUNLOCK(shmfd->shm_object); + return (error); +} + int shm_dotruncate(struct shmfd *shmfd, off_t length) { @@ -600,9 +825,7 @@ rl_cookie = rangelock_wlock(&shmfd->shm_rl, 0, OFF_MAX, &shmfd->shm_mtx); - VM_OBJECT_WLOCK(shmfd->shm_object); - error = shm_dotruncate_locked(shmfd, length, rl_cookie); - VM_OBJECT_WUNLOCK(shmfd->shm_object); + error = shm_dotruncate_cookie(shmfd, length, rl_cookie); rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx); return (error); } @@ -612,7 +835,7 @@ * routines. */ struct shmfd * -shm_alloc(struct ucred *ucred, mode_t mode) +shm_alloc(struct ucred *ucred, mode_t mode, bool largepage) { struct shmfd *shmfd; @@ -621,8 +844,15 @@ shmfd->shm_uid = ucred->cr_uid; shmfd->shm_gid = ucred->cr_gid; shmfd->shm_mode = mode; - shmfd->shm_object = vm_pager_allocate(OBJT_SWAP, NULL, - shmfd->shm_size, VM_PROT_DEFAULT, 0, ucred); + if (largepage) { + shmfd->shm_object = phys_pager_allocate(NULL, + &shm_largepage_phys_ops, NULL, shmfd->shm_size, + VM_PROT_DEFAULT, 0, ucred); + shmfd->shm_lp_alloc_policy = SHM_LARGEPAGE_ALLOC_DEFAULT; + } else { + shmfd->shm_object = vm_pager_allocate(OBJT_SWAP, NULL, + shmfd->shm_size, VM_PROT_DEFAULT, 0, ucred); + } KASSERT(shmfd->shm_object != NULL, ("shm_create: vm_pager_allocate")); vfs_timestamp(&shmfd->shm_birthtime); shmfd->shm_atime = shmfd->shm_mtime = shmfd->shm_ctime = @@ -684,14 +914,11 @@ return (error); } -/* - * Dictionary management. We maintain an in-kernel dictionary to map - * paths to shmfd objects. We use the FNV hash on the path to store - * the mappings in a hash table. 
- */ static void shm_init(void *arg) { + char name[32]; + int i; mtx_init(&shm_timestamp_lock, "shm timestamps", NULL, MTX_DEF); sx_init(&shm_dict_lock, "shm dictionary"); @@ -699,9 +926,32 @@ new_unrhdr64(&shm_ino_unr, 1); shm_dev_ino = devfs_alloc_cdp_inode(); KASSERT(shm_dev_ino > 0, ("shm dev inode not initialized")); + + for (i = 1; i < MAXPAGESIZES; i++) { + if (pagesizes[i] == 0) + break; +#define M (1024 * 1024) +#define G (1024 * M) + if (pagesizes[i] >= G) + snprintf(name, sizeof(name), "%luG", pagesizes[i] / G); + else if (pagesizes[i] >= M) + snprintf(name, sizeof(name), "%luM", pagesizes[i] / M); + else + snprintf(name, sizeof(name), "%lu", pagesizes[i]); +#undef G +#undef M + SYSCTL_ADD_ULONG(NULL, SYSCTL_STATIC_CHILDREN(_vm_largepages), + OID_AUTO, name, CTLFLAG_RD, &count_largepages[i], + "number of non-transient largepages allocated"); + } } SYSINIT(shm_init, SI_SUB_SYSV_SHM, SI_ORDER_ANY, shm_init, NULL); +/* + * Dictionary management. We maintain an in-kernel dictionary to map + * paths to shmfd objects. We use the FNV hash on the path to store + * the mappings in a hash table. + */ static struct shmfd * shm_lookup(char *path, Fnv32_t fnv) { @@ -773,8 +1023,10 @@ Fnv32_t fnv; mode_t cmode; int error, fd, initial_seals; + bool largepage; - if ((shmflags & ~(SHM_ALLOW_SEALING | SHM_GROW_ON_WRITE)) != 0) + if ((shmflags & ~(SHM_ALLOW_SEALING | SHM_GROW_ON_WRITE | + SHM_LARGEPAGE)) != 0) return (EINVAL); initial_seals = F_SEAL_SEAL; @@ -798,6 +1050,8 @@ if ((flags & ~(O_ACCMODE | O_CREAT | O_EXCL | O_TRUNC | O_CLOEXEC)) != 0) return (EINVAL); + largepage = (shmflags & SHM_LARGEPAGE) != 0; + /* * Currently only F_SEAL_SEAL may be set when creating or opening shmfd. 
* If the decision is made later to allow additional seals, care must be @@ -831,7 +1085,7 @@ fdrop(fp, td); return (EINVAL); } - shmfd = shm_alloc(td->td_ucred, cmode); + shmfd = shm_alloc(td->td_ucred, cmode, largepage); shmfd->shm_seals = initial_seals; } else { error = shm_copyin_path(td, userpath, &path); @@ -853,7 +1107,8 @@ path); if (error == 0) { #endif - shmfd = shm_alloc(td->td_ucred, cmode); + shmfd = shm_alloc(td->td_ucred, cmode, + largepage); shmfd->shm_seals = initial_seals; shm_insert(path, fnv, shmfd); #ifdef MAC @@ -1136,7 +1391,118 @@ return (error); } -int +static int +shm_mmap_large(struct shmfd *shmfd, vm_map_t map, vm_offset_t *addr, + vm_size_t size, vm_prot_t prot, vm_prot_t max_prot, int flags, + vm_ooffset_t foff, bool writecounted, struct thread *td) +{ + struct vmspace *vms; + vm_map_entry_t next_entry, prev_entry; + vm_offset_t align, mask, maxaddr; + int docow, error, rv, try; + bool curmap; + + if (shmfd->shm_lp_psind == 0) + return (EINVAL); + + /* MAP_PRIVATE is disabled */ + if ((flags & ~(MAP_SHARED | MAP_FIXED | MAP_EXCL | + MAP_NOCORE | +#ifdef MAP_32BIT + MAP_32BIT | +#endif + MAP_ALIGNMENT_MASK)) != 0) + return (EINVAL); + + vms = td->td_proc->p_vmspace; + curmap = map == &vms->vm_map; + if (curmap) { + error = kern_mmap_racct_check(td, map, size); + if (error != 0) + return (error); + } + + docow = shmfd->shm_lp_psind << MAP_SPLIT_BOUNDARY_SHIFT; + docow |= MAP_INHERIT_SHARE; + if ((flags & MAP_NOCORE) != 0) + docow |= MAP_DISABLE_COREDUMP; + if (writecounted) + docow |= MAP_WRITECOUNT; + + mask = pagesizes[shmfd->shm_lp_psind] - 1; + if ((foff & mask) != 0) + return (EINVAL); + maxaddr = vm_map_max(map); +#ifdef MAP_32BIT + if ((flags & MAP_32BIT) != 0 && maxaddr > MAP_32BIT_MAX_ADDR) + maxaddr = MAP_32BIT_MAX_ADDR; +#endif + if (size == 0 || (size & mask) != 0 || + (*addr != 0 && ((*addr & mask) != 0 || + *addr + size < *addr || *addr + size > maxaddr))) + return (EINVAL); + + align = flags & MAP_ALIGNMENT_MASK; + if 
(align == 0) { + align = pagesizes[shmfd->shm_lp_psind]; + } else if (align == MAP_ALIGNED_SUPER) { + if (shmfd->shm_lp_psind != 1) + return (EINVAL); + align = pagesizes[1]; + } else { + align >>= MAP_ALIGNMENT_SHIFT; + align = 1ULL << align; + /* Also handles overflow. */ + if (align < pagesizes[shmfd->shm_lp_psind]) + return (EINVAL); + } + + vm_map_lock(map); + if ((flags & MAP_FIXED) == 0) { + try = 1; + if (curmap && (*addr == 0 || + (*addr >= round_page((vm_offset_t)vms->vm_taddr) && + *addr < round_page((vm_offset_t)vms->vm_daddr + + lim_max(td, RLIMIT_DATA))))) { + *addr = roundup2((vm_offset_t)vms->vm_daddr + + lim_max(td, RLIMIT_DATA), + pagesizes[shmfd->shm_lp_psind]); + } +again: + rv = vm_map_find_aligned(map, addr, size, maxaddr, align); + if (rv != KERN_SUCCESS) { + if (try == 1) { + try = 2; + *addr = vm_map_min(map); + if ((*addr & mask) != 0) + *addr = (*addr + mask) & ~mask; + goto again; + } + goto fail1; + } + } else if ((flags & MAP_EXCL) == 0) { + rv = vm_map_delete(map, *addr, *addr + size); + if (rv != KERN_SUCCESS) + goto fail1; + } else { + error = ENOSPC; + if (vm_map_lookup_entry(map, *addr, &prev_entry)) + goto fail; + next_entry = vm_map_entry_succ(prev_entry); + if (next_entry->start < *addr + size) + goto fail; + } + + rv = vm_map_insert(map, shmfd->shm_object, foff, *addr, *addr + size, + prot, max_prot, docow); +fail1: + error = vm_mmap_to_errno(rv); +fail: + vm_map_unlock(map); + return (error); +} + +static int shm_mmap(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t objsize, vm_prot_t prot, vm_prot_t cap_maxprot, int flags, vm_ooffset_t foff, struct thread *td) @@ -1208,8 +1574,13 @@ if (writecnt) vm_pager_update_writecount(shmfd->shm_object, 0, objsize); - error = vm_mmap_object(map, addr, objsize, prot, maxprot, flags, - shmfd->shm_object, foff, writecnt, td); + if (shm_largepage(shmfd)) { + error = shm_mmap_large(shmfd, map, addr, objsize, prot, + maxprot, flags, foff, writecnt, td); + } else { + error = 
vm_mmap_object(map, addr, objsize, prot, maxprot, flags, + shmfd->shm_object, foff, writecnt, td); + } if (error != 0) { if (writecnt) vm_pager_release_writecount(shmfd->shm_object, 0, @@ -1500,11 +1871,8 @@ */ rl_cookie = rangelock_wlock(&shmfd->shm_rl, offset, size, &shmfd->shm_mtx); - if (size > shmfd->shm_size) { - VM_OBJECT_WLOCK(shmfd->shm_object); - error = shm_dotruncate_locked(shmfd, size, rl_cookie); - VM_OBJECT_WUNLOCK(shmfd->shm_object); - } + if (size > shmfd->shm_size) + error = shm_dotruncate_cookie(shmfd, size, rl_cookie); rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx); /* Translate to posix_fallocate(2) return value as needed. */ if (error == ENOMEM) Index: sys/sys/filio.h =================================================================== --- sys/sys/filio.h +++ sys/sys/filio.h @@ -70,6 +70,7 @@ }; /* Get the file's bmap info for the logical block bn. */ #define FIOBMAP2 _IOWR('f', 99, struct fiobmap2_arg) +#define FIOSHMLPGCNF _IOW('f', 100, struct shm_largepage_conf) #ifdef _KERNEL #ifdef COMPAT_FREEBSD32 Index: sys/sys/mman.h =================================================================== --- sys/sys/mman.h +++ sys/sys/mman.h @@ -191,6 +191,17 @@ */ #define SHM_ALLOW_SEALING 0x00000001 #define SHM_GROW_ON_WRITE 0x00000002 +#define SHM_LARGEPAGE 0x00000004 + +#define SHM_LARGEPAGE_ALLOC_DEFAULT 0 +#define SHM_LARGEPAGE_ALLOC_NOWAIT 1 +#define SHM_LARGEPAGE_ALLOC_HARD 2 + +struct shm_largepage_conf { + int psind; + int alloc_policy; + int pad[10]; +}; /* * Flags for memfd_create(). 
@@ -198,7 +209,6 @@ #define MFD_CLOEXEC 0x00000001 #define MFD_ALLOW_SEALING 0x00000002 -/* UNSUPPORTED */ #define MFD_HUGETLB 0x00000004 #define MFD_HUGE_MASK 0xFC000000 @@ -281,6 +291,10 @@ int shm_flags; int shm_seals; + + /* largepage config */ + int shm_lp_psind; + int shm_lp_alloc_policy; }; #endif @@ -289,12 +303,15 @@ int shm_unmap(struct file *fp, void *mem, size_t size); int shm_access(struct shmfd *shmfd, struct ucred *ucred, int flags); -struct shmfd *shm_alloc(struct ucred *ucred, mode_t mode); +struct shmfd *shm_alloc(struct ucred *ucred, mode_t mode, bool largepage); struct shmfd *shm_hold(struct shmfd *shmfd); void shm_drop(struct shmfd *shmfd); int shm_dotruncate(struct shmfd *shmfd, off_t length); extern struct fileops shm_ops; + +#define MAP_32BIT_MAX_ADDR ((vm_offset_t)1 << 31) + #else /* !_KERNEL */ __BEGIN_DECLS @@ -328,6 +345,7 @@ #endif #if __BSD_VISIBLE int memfd_create(const char *, unsigned int); +int shm_open_largepage(const char *, int, int, int, mode_t); int shm_rename(const char *, const char *, int); #endif __END_DECLS Index: sys/sys/syscallsubr.h =================================================================== --- sys/sys/syscallsubr.h +++ sys/sys/syscallsubr.h @@ -62,6 +62,7 @@ struct stat; struct thr_param; struct uio; +struct vm_map; typedef int (*mmap_check_fp_fn)(struct file *, int, int, int); @@ -197,8 +198,10 @@ size_t len); int kern_mmap(struct thread *td, uintptr_t addr, size_t len, int prot, int flags, int fd, off_t pos); -int kern_mmap_req(struct thread *td, const struct mmap_req *mrp); +int kern_mmap_racct_check(struct thread *td, struct vm_map *map, + vm_size_t size); int kern_mmap_maxprot(struct proc *p, int prot); +int kern_mmap_req(struct thread *td, const struct mmap_req *mrp); int kern_mprotect(struct thread *td, uintptr_t addr, size_t size, int prot); int kern_msgctl(struct thread *, int, int, struct msqid_ds *); int kern_msgrcv(struct thread *, int, void *, size_t, long, int, long *); Index: 
sys/vm/phys_pager.c =================================================================== --- sys/vm/phys_pager.c +++ sys/vm/phys_pager.c @@ -51,6 +51,20 @@ /* protect access to phys_pager_object_list */ static struct mtx phys_pager_mtx; +static int default_phys_pager_getpages(vm_object_t object, vm_page_t *m, + int count, int *rbehind, int *rahead); +static int default_phys_pager_populate(vm_object_t object, vm_pindex_t pidx, + int fault_type, vm_prot_t max_prot, vm_pindex_t *first, vm_pindex_t *last); +static boolean_t default_phys_pager_haspage(vm_object_t object, + vm_pindex_t pindex, int *before, int *after); +struct phys_pager_ops default_phys_pg_ops = { + .phys_pg_getpages = default_phys_pager_getpages, + .phys_pg_populate = default_phys_pager_populate, + .phys_pg_haspage = default_phys_pager_haspage, + .phys_pg_ctor = NULL, + .phys_pg_dtor = NULL, +}; + static void phys_pager_init(void) { @@ -59,12 +73,13 @@ mtx_init(&phys_pager_mtx, "phys_pager list", NULL, MTX_DEF); } -static vm_object_t -phys_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot, - vm_ooffset_t foff, struct ucred *cred) +vm_object_t +phys_pager_allocate(void *handle, struct phys_pager_ops *ops, void *data, + vm_ooffset_t size, vm_prot_t prot, vm_ooffset_t foff, struct ucred *cred) { vm_object_t object, object1; vm_pindex_t pindex; + bool init; /* * Offset should be page aligned. 
@@ -73,6 +88,7 @@ return (NULL); pindex = OFF_TO_IDX(foff + PAGE_MASK + size); + init = true; if (handle != NULL) { mtx_lock(&phys_pager_mtx); @@ -97,11 +113,15 @@ */ if (pindex > object->size) object->size = pindex; + init = false; } else { object = object1; object1 = NULL; object->handle = handle; - vm_object_set_flag(object, OBJ_POPULATE); + object->un_pager.phys.ops = ops; + object->un_pager.phys.data_ptr = data; + if (ops->phys_pg_populate != NULL) + vm_object_set_flag(object, OBJ_POPULATE); TAILQ_INSERT_TAIL(&phys_pager_object_list, object, pager_object_list); } @@ -113,12 +133,25 @@ vm_object_deallocate(object1); } else { object = vm_object_allocate(OBJT_PHYS, pindex); - vm_object_set_flag(object, OBJ_POPULATE); + object->un_pager.phys.ops = ops; + object->un_pager.phys.data_ptr = data; + if (ops->phys_pg_populate != NULL) + vm_object_set_flag(object, OBJ_POPULATE); } + if (init && ops->phys_pg_ctor != NULL) + ops->phys_pg_ctor(object, prot, foff, cred); return (object); } +static vm_object_t +phys_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot, + vm_ooffset_t foff, struct ucred *ucred) +{ + return (phys_pager_allocate(handle, &default_phys_pg_ops, NULL, + size, prot, foff, ucred)); +} + static void phys_pager_dealloc(vm_object_t object) { @@ -130,16 +163,18 @@ mtx_unlock(&phys_pager_mtx); VM_OBJECT_WLOCK(object); } - object->handle = NULL; object->type = OBJT_DEAD; + if (object->un_pager.phys.ops->phys_pg_dtor != NULL) + object->un_pager.phys.ops->phys_pg_dtor(object); + object->handle = NULL; } /* * Fill as many pages as vm_fault has allocated for us. 
*/ static int -phys_pager_getpages(vm_object_t object, vm_page_t *m, int count, int *rbehind, - int *rahead) +default_phys_pager_getpages(vm_object_t object, vm_page_t *m, int count, + int *rbehind, int *rahead) { int i; @@ -161,6 +196,14 @@ return (VM_PAGER_OK); } +static int +phys_pager_getpages(vm_object_t object, vm_page_t *m, int count, int *rbehind, + int *rahead) +{ + return (object->un_pager.phys.ops->phys_pg_getpages(object, m, + count, rbehind, rahead)); +} + /* * Implement a pretty aggressive clustered getpages strategy. Hint that * everything in an entire 4MB window should be prefaulted at once. @@ -185,7 +228,7 @@ #define PHYSALLOC 16 static int -phys_pager_populate(vm_object_t object, vm_pindex_t pidx, +default_phys_pager_populate(vm_object_t object, vm_pindex_t pidx, int fault_type __unused, vm_prot_t max_prot __unused, vm_pindex_t *first, vm_pindex_t *last) { @@ -216,6 +259,14 @@ return (VM_PAGER_OK); } +static int +phys_pager_populate(vm_object_t object, vm_pindex_t pidx, int fault_type, + vm_prot_t max_prot, vm_pindex_t *first, vm_pindex_t *last) +{ + return (object->un_pager.phys.ops->phys_pg_populate(object, pidx, + fault_type, max_prot, first, last)); +} + static void phys_pager_putpages(vm_object_t object, vm_page_t *m, int count, boolean_t sync, int *rtvals) @@ -225,7 +276,7 @@ } static boolean_t -phys_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before, +default_phys_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before, int *after) { vm_pindex_t base, end; @@ -239,6 +290,14 @@ return (TRUE); } +static boolean_t +phys_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before, + int *after) +{ + return (object->un_pager.phys.ops->phys_pg_haspage(object, pindex, + before, after)); +} + struct pagerops physpagerops = { .pgo_init = phys_pager_init, .pgo_alloc = phys_pager_alloc, Index: sys/vm/pmap.h =================================================================== --- sys/vm/pmap.h +++ sys/vm/pmap.h @@ -106,6 
+106,7 @@ */ #define PMAP_ENTER_NOSLEEP 0x00000100 #define PMAP_ENTER_WIRED 0x00000200 +#define PMAP_ENTER_LARGEPAGE 0x00000400 #define PMAP_ENTER_RESERVED 0xFF000000 /* @@ -171,5 +172,8 @@ #define pmap_resident_count(pm) ((pm)->pm_stats.resident_count) #define pmap_wired_count(pm) ((pm)->pm_stats.wired_count) +extern u_long pmap_superpagesize[]; +extern u_int pmap_superpagesize_nitems; + #endif /* _KERNEL */ #endif /* _PMAP_VM_ */ Index: sys/vm/uma_core.c =================================================================== --- sys/vm/uma_core.c +++ sys/vm/uma_core.c @@ -3612,7 +3612,7 @@ break; if (rr && vm_domainset_iter_policy(&di, &domain) != 0) { if ((flags & M_WAITOK) != 0) { - vm_wait_doms(&keg->uk_dr.dr_policy->ds_mask); + vm_wait_doms(&keg->uk_dr.dr_policy->ds_mask, 0); goto restart; } break; @@ -4754,7 +4754,7 @@ break; } if (vm_domainset_iter_policy(&di, &domain) != 0) - vm_wait_doms(&keg->uk_dr.dr_policy->ds_mask); + vm_wait_doms(&keg->uk_dr.dr_policy->ds_mask, 0); } } } Index: sys/vm/vm_domainset.h =================================================================== --- sys/vm/vm_domainset.h +++ sys/vm/vm_domainset.h @@ -50,6 +50,6 @@ void vm_domainset_iter_policy_ref_init(struct vm_domainset_iter *, struct domainset_ref *, int *, int *); -void vm_wait_doms(const domainset_t *); +int vm_wait_doms(const domainset_t *, int mflags); #endif /* __VM_DOMAINSET_H__ */ Index: sys/vm/vm_domainset.c =================================================================== --- sys/vm/vm_domainset.c +++ sys/vm/vm_domainset.c @@ -245,7 +245,7 @@ /* Wait for one of the domains to accumulate some free pages. */ if (obj != NULL) VM_OBJECT_WUNLOCK(obj); - vm_wait_doms(&di->di_domain->ds_mask); + vm_wait_doms(&di->di_domain->ds_mask, 0); if (obj != NULL) VM_OBJECT_WLOCK(obj); if ((di->di_flags & VM_ALLOC_WAITFAIL) != 0) @@ -310,7 +310,7 @@ return (ENOMEM); /* Wait for one of the domains to accumulate some free pages. 
*/ - vm_wait_doms(&di->di_domain->ds_mask); + vm_wait_doms(&di->di_domain->ds_mask, 0); /* Restart the search. */ vm_domainset_iter_first(di, domain); Index: sys/vm/vm_fault.c =================================================================== --- sys/vm/vm_fault.c +++ sys/vm/vm_fault.c @@ -420,7 +420,7 @@ vm_offset_t vaddr; vm_page_t m; vm_pindex_t map_first, map_last, pager_first, pager_last, pidx; - int i, npages, psind, rv; + int bdry_idx, i, npages, psind, rv; MPASS(fs->object == fs->first_object); VM_OBJECT_ASSERT_WLOCKED(fs->first_object); @@ -442,7 +442,8 @@ * to the driver. */ rv = vm_pager_populate(fs->first_object, fs->first_pindex, - fs->fault_type, fs->entry->max_protection, &pager_first, &pager_last); + fs->fault_type, fs->entry->max_protection, &pager_first, + &pager_last); VM_OBJECT_ASSERT_WLOCKED(fs->first_object); if (rv == VM_PAGER_BAD) { @@ -465,15 +466,49 @@ MPASS(pager_last < fs->first_object->size); vm_fault_restore_map_lock(fs); + bdry_idx = (fs->entry->eflags & MAP_ENTRY_SPLIT_BOUNDARY_MASK) >> + MAP_ENTRY_SPLIT_BOUNDARY_SHIFT; if (fs->map->timestamp != fs->map_generation) { - vm_fault_populate_cleanup(fs->first_object, pager_first, - pager_last); + if (bdry_idx == 0) { + vm_fault_populate_cleanup(fs->first_object, pager_first, + pager_last); + } else { + m = vm_page_lookup(fs->first_object, pager_first); + if (m != fs->m) + vm_page_xunbusy(m); + } return (KERN_RESTART); } /* * The map is unchanged after our last unlock. Process the fault. * + * First, the special case of largepage mappings, where + * populate only busies the first page in superpage run. 
+ */ + if (bdry_idx != 0) { + m = vm_page_lookup(fs->first_object, pager_first); + vm_fault_populate_check_page(m); + VM_OBJECT_WUNLOCK(fs->first_object); + vaddr = fs->entry->start + IDX_TO_OFF(pager_first) - + fs->entry->offset; + /* assert alignment for entry */ + KASSERT((vaddr & (pagesizes[bdry_idx] - 1)) == 0, + ("unaligned superpage start %#jx pager_first %#jx offset %#jx vaddr %#jx", + (uintmax_t)fs->entry->start, (uintmax_t)pager_first, + (uintmax_t)fs->entry->offset, (uintmax_t)vaddr)); + KASSERT((VM_PAGE_TO_PHYS(m) & (pagesizes[bdry_idx] - 1)) == 0, + ("unaligned superpage m %p %#jx", m, + (uintmax_t)VM_PAGE_TO_PHYS(m))); + rv = pmap_enter(fs->map->pmap, vaddr, m, fs->prot, + fs->fault_type | (fs->wired ? PMAP_ENTER_WIRED : 0) | + PMAP_ENTER_LARGEPAGE, bdry_idx); + VM_OBJECT_WLOCK(fs->first_object); + vm_page_xunbusy(m); + goto out; + } + + /* * The range [pager_first, pager_last] that is given to the * pager is only a hint. The pager may populate any range * within the object that includes the requested page index. @@ -539,6 +574,7 @@ vm_page_xunbusy(&m[i]); } } +out: curthread->td_ru.ru_majflt++; return (KERN_SUCCESS); } @@ -1255,6 +1291,7 @@ * multiple page faults of a similar type to run in parallel. 
*/ if (fs.vp == NULL /* avoid locked vnode leak */ && + (fs.entry->eflags & MAP_ENTRY_SPLIT_BOUNDARY_MASK) == 0 && (fs.fault_flags & (VM_FAULT_WIRE | VM_FAULT_DIRTY)) == 0) { VM_OBJECT_RLOCK(fs.first_object); rv = vm_fault_soft_fast(&fs); @@ -1287,6 +1324,27 @@ */ fs.object = fs.first_object; fs.pindex = fs.first_pindex; + + if ((fs.entry->eflags & MAP_ENTRY_SPLIT_BOUNDARY_MASK) != 0) { + rv = vm_fault_allocate(&fs); + switch (rv) { + case KERN_RESTART: + unlock_and_deallocate(&fs); + /* FALLTHROUGH */ + case KERN_RESOURCE_SHORTAGE: + goto RetryFault; + case KERN_SUCCESS: + case KERN_FAILURE: + case KERN_OUT_OF_BOUNDS: + unlock_and_deallocate(&fs); + return (rv); + case KERN_NOT_RECEIVER: + break; + default: + panic("vm_fault: Unhandled rv %d", rv); + } + } + while (TRUE) { KASSERT(fs.m == NULL, ("page still set %p at loop start", fs.m)); Index: sys/vm/vm_glue.c =================================================================== --- sys/vm/vm_glue.c +++ sys/vm/vm_glue.c @@ -565,7 +565,7 @@ } dset = td2->td_domain.dr_policy; while (vm_page_count_severe_set(&dset->ds_mask)) { - vm_wait_doms(&dset->ds_mask); + vm_wait_doms(&dset->ds_mask, 0); } if ((flags & RFMEM) == 0) { Index: sys/vm/vm_map.h =================================================================== --- sys/vm/vm_map.h +++ sys/vm/vm_map.h @@ -149,6 +149,10 @@ #define MAP_ENTRY_STACK_GAP_UP 0x00040000 #define MAP_ENTRY_HEADER 0x00080000 +#define MAP_ENTRY_SPLIT_BOUNDARY_MASK 0x00300000 + +#define MAP_ENTRY_SPLIT_BOUNDARY_SHIFT 20 + #ifdef _KERNEL static __inline u_char vm_map_entry_behavior(vm_map_entry_t entry) @@ -377,6 +381,9 @@ #define MAP_CREATE_STACK_GAP_UP 0x00010000 #define MAP_CREATE_STACK_GAP_DN 0x00020000 #define MAP_VN_EXEC 0x00040000 +#define MAP_SPLIT_BOUNDARY_MASK 0x00180000 + +#define MAP_SPLIT_BOUNDARY_SHIFT 19 /* * vm_fault option flags @@ -465,6 +472,8 @@ vm_offset_t, int, vm_prot_t, vm_prot_t, int); int vm_map_find_min(vm_map_t, vm_object_t, vm_ooffset_t, vm_offset_t *, vm_size_t, 
vm_offset_t, vm_offset_t, int, vm_prot_t, vm_prot_t, int); +int vm_map_find_aligned(vm_map_t map, vm_offset_t *addr, vm_size_t length, + vm_offset_t max_addr, vm_offset_t alignment); int vm_map_fixed(vm_map_t, vm_object_t, vm_ooffset_t, vm_offset_t, vm_size_t, vm_prot_t, vm_prot_t, int); vm_offset_t vm_map_findspace(vm_map_t, vm_offset_t, vm_size_t); Index: sys/vm/vm_map.c =================================================================== --- sys/vm/vm_map.c +++ sys/vm/vm_map.c @@ -1603,13 +1603,17 @@ struct ucred *cred; vm_eflags_t protoeflags; vm_inherit_t inheritance; + u_long bdry; + u_int bidx; VM_MAP_ASSERT_LOCKED(map); KASSERT(object != kernel_object || (cow & MAP_COPY_ON_WRITE) == 0, ("vm_map_insert: kernel object and COW")); - KASSERT(object == NULL || (cow & MAP_NOFAULT) == 0, - ("vm_map_insert: paradoxical MAP_NOFAULT request")); + KASSERT(object == NULL || (cow & MAP_NOFAULT) == 0 || + (cow & MAP_SPLIT_BOUNDARY_MASK) != 0, + ("vm_map_insert: paradoxical MAP_NOFAULT request, obj %p cow %#x", + object, cow)); KASSERT((prot & ~max) == 0, ("prot %#x is not subset of max_prot %#x", prot, max)); @@ -1664,6 +1668,17 @@ inheritance = VM_INHERIT_SHARE; else inheritance = VM_INHERIT_DEFAULT; + if ((cow & MAP_SPLIT_BOUNDARY_MASK) != 0) { + /* This magically ignores index 0, for usual page size. 
*/ + bidx = (cow & MAP_SPLIT_BOUNDARY_MASK) >> + MAP_SPLIT_BOUNDARY_SHIFT; + if (bidx >= MAXPAGESIZES) + return (KERN_INVALID_ARGUMENT); + bdry = pagesizes[bidx] - 1; + if ((start & bdry) != 0 || (end & bdry) != 0) + return (KERN_INVALID_ARGUMENT); + protoeflags |= bidx << MAP_ENTRY_SPLIT_BOUNDARY_SHIFT; + } cred = NULL; if ((cow & (MAP_ACC_NO_CHARGE | MAP_NOFAULT | MAP_CREATE_GUARD)) != 0) @@ -1917,8 +1932,11 @@ ("vm_map_fixed: non-NULL backing object for stack")); vm_map_lock(map); VM_MAP_RANGE_CHECK(map, start, end); - if ((cow & MAP_CHECK_EXCL) == 0) - vm_map_delete(map, start, end); + if ((cow & MAP_CHECK_EXCL) == 0) { + result = vm_map_delete(map, start, end); + if (result != KERN_SUCCESS) + goto out; + } if ((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) != 0) { result = vm_map_stack_locked(map, start, length, sgrowsiz, prot, max, cow); @@ -1926,6 +1944,7 @@ result = vm_map_insert(map, object, offset, start, end, prot, max, cow); } +out: vm_map_unlock(map); return (result); } @@ -1958,8 +1977,6 @@ &aslr_restarts, 0, "Number of aslr failures"); -#define MAP_32BIT_MAX_ADDR ((vm_offset_t)1 << 31) - /* * Searches for the specified amount of free space in the given map with the * specified alignment. Performs an address-ordered, first-fit search from @@ -2027,6 +2044,19 @@ } } +int +vm_map_find_aligned(vm_map_t map, vm_offset_t *addr, vm_size_t length, + vm_offset_t max_addr, vm_offset_t alignment) +{ + /* XXXKIB ASLR eh ? */ + *addr = vm_map_findspace(map, *addr, length); + if (*addr + length > vm_map_max(map) || + (max_addr != 0 && *addr + length > max_addr)) + return (KERN_NO_SPACE); + return (vm_map_alignspace(map, NULL, 0, addr, length, max_addr, + alignment)); +} + /* * vm_map_find finds an unallocated region in the target address * map with the given length. 
The search is defined to be @@ -2164,7 +2194,9 @@ rv = KERN_INVALID_ADDRESS; goto done; } - vm_map_delete(map, *addr, *addr + length); + rv = vm_map_delete(map, *addr, *addr + length); + if (rv != KERN_SUCCESS) + goto done; } if ((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) != 0) { rv = vm_map_stack_locked(map, *addr, length, sgrowsiz, prot, @@ -2374,31 +2406,40 @@ * the specified address; if necessary, * it splits the entry into two. */ -static inline void -vm_map_clip_start(vm_map_t map, vm_map_entry_t entry, vm_offset_t start) +static int +vm_map_clip_start(vm_map_t map, vm_map_entry_t entry, vm_offset_t startaddr) { vm_map_entry_t new_entry; + int bdry_idx; if (!map->system_map) WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "%s: map %p entry %p start 0x%jx", __func__, map, entry, - (uintmax_t)start); + (uintmax_t)startaddr); - if (start <= entry->start) - return; + if (startaddr <= entry->start) + return (KERN_SUCCESS); VM_MAP_ASSERT_LOCKED(map); - KASSERT(entry->end > start && entry->start < start, + KASSERT(entry->end > startaddr && entry->start < startaddr, ("%s: invalid clip of entry %p", __func__, entry)); + bdry_idx = (entry->eflags & MAP_ENTRY_SPLIT_BOUNDARY_MASK) >> + MAP_ENTRY_SPLIT_BOUNDARY_SHIFT; + if (bdry_idx != 0) { + if ((startaddr & (pagesizes[bdry_idx] - 1)) != 0) + return (KERN_INVALID_ARGUMENT); + } + new_entry = vm_map_entry_clone(map, entry); /* * Split off the front portion. Insert the new entry BEFORE this one, * so that this entry has the specified starting address. */ - new_entry->end = start; + new_entry->end = startaddr; vm_map_entry_link(map, new_entry); + return (KERN_SUCCESS); } /* @@ -2408,11 +2449,12 @@ * the interior of the entry. Return entry after 'start', and in * prev_entry set the entry before 'start'. 
*/ -static inline vm_map_entry_t +static int vm_map_lookup_clip_start(vm_map_t map, vm_offset_t start, - vm_map_entry_t *prev_entry) + vm_map_entry_t *res_entry, vm_map_entry_t *prev_entry) { vm_map_entry_t entry; + int rv; if (!map->system_map) WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, @@ -2421,11 +2463,14 @@ if (vm_map_lookup_entry(map, start, prev_entry)) { entry = *prev_entry; - vm_map_clip_start(map, entry, start); + rv = vm_map_clip_start(map, entry, start); + if (rv != KERN_SUCCESS) + return (rv); *prev_entry = vm_map_entry_pred(entry); } else entry = vm_map_entry_succ(*prev_entry); - return (entry); + *res_entry = entry; + return (KERN_SUCCESS); } /* @@ -2435,31 +2480,41 @@ * the specified address; if necessary, * it splits the entry into two. */ -static inline void -vm_map_clip_end(vm_map_t map, vm_map_entry_t entry, vm_offset_t end) +static int +vm_map_clip_end(vm_map_t map, vm_map_entry_t entry, vm_offset_t endaddr) { vm_map_entry_t new_entry; + int bdry_idx; if (!map->system_map) WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "%s: map %p entry %p end 0x%jx", __func__, map, entry, - (uintmax_t)end); + (uintmax_t)endaddr); - if (end >= entry->end) - return; + if (endaddr >= entry->end) + return (KERN_SUCCESS); VM_MAP_ASSERT_LOCKED(map); - KASSERT(entry->start < end && entry->end > end, + KASSERT(entry->start < endaddr && entry->end > endaddr, ("%s: invalid clip of entry %p", __func__, entry)); + bdry_idx = (entry->eflags & MAP_ENTRY_SPLIT_BOUNDARY_MASK) >> + MAP_ENTRY_SPLIT_BOUNDARY_SHIFT; + if (bdry_idx != 0) { + if ((endaddr & (pagesizes[bdry_idx] - 1)) != 0) + return (KERN_INVALID_ARGUMENT); + } + new_entry = vm_map_entry_clone(map, entry); /* * Split off the back portion. Insert the new entry AFTER this one, * so that this entry has the specified ending address. 
*/ - new_entry->start = end; + new_entry->start = endaddr; vm_map_entry_link(map, new_entry); + + return (KERN_SUCCESS); } /* @@ -2501,12 +2556,17 @@ if (vm_map_lookup_entry(map, start, &entry) && entry->end >= end && (entry->eflags & MAP_ENTRY_COW) == 0 && entry->object.vm_object == NULL) { - vm_map_clip_start(map, entry, start); - vm_map_clip_end(map, entry, end); + result = vm_map_clip_start(map, entry, start); + if (result != KERN_SUCCESS) + goto unlock; + result = vm_map_clip_end(map, entry, end); + if (result != KERN_SUCCESS) + goto unlock; entry->object.sub_map = submap; entry->eflags |= MAP_ENTRY_IS_SUB_MAP; result = KERN_SUCCESS; } +unlock: vm_map_unlock(map); if (result != KERN_SUCCESS) { @@ -2693,11 +2753,18 @@ * of this loop early and let the next loop simplify the entries, since * some may now be mergeable. */ - rv = KERN_SUCCESS; - vm_map_clip_start(map, first_entry, start); + rv = vm_map_clip_start(map, first_entry, start); + if (rv != KERN_SUCCESS) { + vm_map_unlock(map); + return (rv); + } for (entry = first_entry; entry->start < end; entry = vm_map_entry_succ(entry)) { - vm_map_clip_end(map, entry, end); + rv = vm_map_clip_end(map, entry, end); + if (rv != KERN_SUCCESS) { + vm_map_unlock(map); + return (rv); + } if (set_max || ((new_prot & ~entry->protection) & VM_PROT_WRITE) == 0 || @@ -2817,6 +2884,7 @@ int behav) { vm_map_entry_t entry, prev_entry; + int rv; bool modify_map; /* @@ -2862,13 +2930,22 @@ * We clip the vm_map_entry so that behavioral changes are * limited to the specified address range. 
*/ - for (entry = vm_map_lookup_clip_start(map, start, &prev_entry); - entry->start < end; - prev_entry = entry, entry = vm_map_entry_succ(entry)) { + rv = vm_map_lookup_clip_start(map, start, &entry, &prev_entry); + if (rv != KERN_SUCCESS) { + vm_map_unlock(map); + return (vm_mmap_to_errno(rv)); + } + + for (; entry->start < end; prev_entry = entry, + entry = vm_map_entry_succ(entry)) { if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0) continue; - vm_map_clip_end(map, entry, end); + rv = vm_map_clip_end(map, entry, end); + if (rv != KERN_SUCCESS) { + vm_map_unlock(map); + return (vm_mmap_to_errno(rv)); + } switch (behav) { case MADV_NORMAL: @@ -3002,7 +3079,8 @@ vm_map_inherit(vm_map_t map, vm_offset_t start, vm_offset_t end, vm_inherit_t new_inheritance) { - vm_map_entry_t entry, prev_entry; + vm_map_entry_t entry, lentry, prev_entry, start_entry; + int rv; switch (new_inheritance) { case VM_INHERIT_NONE: @@ -3017,18 +3095,37 @@ return (KERN_SUCCESS); vm_map_lock(map); VM_MAP_RANGE_CHECK(map, start, end); - for (entry = vm_map_lookup_clip_start(map, start, &prev_entry); - entry->start < end; - prev_entry = entry, entry = vm_map_entry_succ(entry)) { - vm_map_clip_end(map, entry, end); + rv = vm_map_lookup_clip_start(map, start, &start_entry, &prev_entry); + if (rv != KERN_SUCCESS) + goto unlock; + if (vm_map_lookup_entry(map, end - 1, &lentry)) { + rv = vm_map_clip_end(map, lentry, end); + if (rv != KERN_SUCCESS) + goto unlock; + } + if (new_inheritance != VM_INHERIT_SHARE) { + for (entry = start_entry; entry->start < end; + prev_entry = entry, entry = vm_map_entry_succ(entry)) { + if ((entry->eflags & MAP_ENTRY_SPLIT_BOUNDARY_MASK) + != 0) { + rv = KERN_INVALID_ARGUMENT; + goto unlock; + } + } + } + for (entry = start_entry; entry->start < end; prev_entry = entry, + entry = vm_map_entry_succ(entry)) { + KASSERT(entry->end <= end, ("non-clipped entry %p end %jx %jx", + entry, (uintmax_t)entry->end, (uintmax_t)end)); if ((entry->eflags & MAP_ENTRY_GUARD) == 0 || 
new_inheritance != VM_INHERIT_ZERO) entry->inheritance = new_inheritance; vm_map_try_merge_entries(map, prev_entry, entry); } vm_map_try_merge_entries(map, prev_entry, entry); +unlock: vm_map_unlock(map); - return (KERN_SUCCESS); + return (rv); } /* @@ -3127,8 +3224,13 @@ next_entry : NULL; continue; } - vm_map_clip_start(map, entry, start); - vm_map_clip_end(map, entry, end); + rv = vm_map_clip_start(map, entry, start); + if (rv != KERN_SUCCESS) + break; + rv = vm_map_clip_end(map, entry, end); + if (rv != KERN_SUCCESS) + break; + /* * Mark the entry in case the map lock is released. (See * above.) @@ -3260,7 +3362,8 @@ * If any pages at the start of this entry were successfully wired, * then unwire them. */ - if (failed_addr > entry->start) { + if (failed_addr > entry->start && (entry->eflags & + MAP_ENTRY_SPLIT_BOUNDARY_MASK) == 0) { pmap_unwire(map->pmap, entry->start, failed_addr); vm_object_unwire(entry->object.vm_object, entry->offset, failed_addr - entry->start, PQ_ACTIVE); @@ -3335,8 +3438,13 @@ next_entry : NULL; continue; } - vm_map_clip_start(map, entry, start); - vm_map_clip_end(map, entry, end); + rv = vm_map_clip_start(map, entry, start); + if (rv != KERN_SUCCESS) + goto done; + rv = vm_map_clip_end(map, entry, end); + if (rv != KERN_SUCCESS) + goto done; + /* * Mark the entry in case the map lock is released. (See * above.) @@ -3461,10 +3569,14 @@ * Moreover, another thread could be simultaneously * wiring this new mapping entry. Detect these cases * and skip any entries marked as in transition not by us. + * + * Another way to get an entry not marked with + * MAP_ENTRY_IN_TRANSITION is after failed clipping, + * which set rv to KERN_INVALID_ARGUMENT. 
*/ if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 || entry->wiring_thread != curthread) { - KASSERT(holes_ok, + KASSERT(holes_ok || rv == KERN_INVALID_ARGUMENT, ("vm_map_wire: !HOLESOK and new/changed entry")); continue; } @@ -3542,6 +3654,7 @@ vm_object_t object; vm_ooffset_t offset; unsigned int last_timestamp; + int bdry_idx; boolean_t failed; vm_map_lock_read(map); @@ -3553,14 +3666,26 @@ start = first_entry->start; end = first_entry->end; } + /* - * Make a first pass to check for user-wired memory and holes. + * Make a first pass to check for user-wired memory, holes, + * and partial invalidation of largepage mappings. */ for (entry = first_entry; entry->start < end; entry = next_entry) { - if (invalidate && - (entry->eflags & MAP_ENTRY_USER_WIRED) != 0) { - vm_map_unlock_read(map); - return (KERN_INVALID_ARGUMENT); + if (invalidate) { + if ((entry->eflags & MAP_ENTRY_USER_WIRED) != 0) { + vm_map_unlock_read(map); + return (KERN_INVALID_ARGUMENT); + } + bdry_idx = (entry->eflags & + MAP_ENTRY_SPLIT_BOUNDARY_MASK) >> + MAP_ENTRY_SPLIT_BOUNDARY_SHIFT; + if (bdry_idx != 0 && + ((start & (pagesizes[bdry_idx] - 1)) != 0 || + (end & (pagesizes[bdry_idx] - 1)) != 0)) { + vm_map_unlock_read(map); + return (KERN_INVALID_ARGUMENT); + } } next_entry = vm_map_entry_succ(entry); if (end > entry->end && @@ -3635,9 +3760,11 @@ size = entry->end - entry->start; if ((entry->eflags & MAP_ENTRY_USER_WIRED) != 0) vm_map_wire_user_count_sub(atop(size)); - pmap_unwire(map->pmap, entry->start, entry->end); - vm_object_unwire(entry->object.vm_object, entry->offset, size, - PQ_ACTIVE); + if ((entry->eflags & MAP_ENTRY_SPLIT_BOUNDARY_MASK) == 0) { + pmap_unwire(map->pmap, entry->start, entry->end); + vm_object_unwire(entry->object.vm_object, entry->offset, size, + PQ_ACTIVE); + } entry->wired_count = 0; } @@ -3737,7 +3864,8 @@ int vm_map_delete(vm_map_t map, vm_offset_t start, vm_offset_t end) { - vm_map_entry_t entry, next_entry; + vm_map_entry_t entry, next_entry, scratch_entry; 
+ int rv; VM_MAP_ASSERT_LOCKED(map); @@ -3748,8 +3876,10 @@ * Find the start of the region, and clip it. * Step through all entries in this region. */ - for (entry = vm_map_lookup_clip_start(map, start, &entry); - entry->start < end; entry = next_entry) { + rv = vm_map_lookup_clip_start(map, start, &entry, &scratch_entry); + if (rv != KERN_SUCCESS) + return (rv); + for (; entry->start < end; entry = next_entry) { /* * Wait for wiring or unwiring of an entry to complete. * Also wait for any system wirings to disappear on @@ -3773,13 +3903,19 @@ * Specifically, the entry may have been * clipped, merged, or deleted. */ - next_entry = vm_map_lookup_clip_start(map, - saved_start, &next_entry); + rv = vm_map_lookup_clip_start(map, saved_start, + &next_entry, &scratch_entry); + if (rv != KERN_SUCCESS) + break; } else next_entry = entry; continue; } - vm_map_clip_end(map, entry, end); + + /* XXXKIB or delete to the upper superpage boundary ? */ + rv = vm_map_clip_end(map, entry, end); + if (rv != KERN_SUCCESS) + break; next_entry = vm_map_entry_succ(entry); /* @@ -3809,7 +3945,7 @@ */ vm_map_entry_delete(map, entry); } - return (KERN_SUCCESS); + return (rv); } /* Index: sys/vm/vm_mmap.c =================================================================== --- sys/vm/vm_mmap.c +++ sys/vm/vm_mmap.c @@ -218,14 +218,14 @@ struct file *fp; struct proc *p; off_t pos; - vm_offset_t addr; + vm_offset_t addr, orig_addr; vm_size_t len, pageoff, size; vm_prot_t cap_maxprot; int align, error, fd, flags, max_prot, prot; cap_rights_t rights; mmap_check_fp_fn check_fp_fn; - addr = mrp->mr_hint; + orig_addr = addr = mrp->mr_hint; len = mrp->mr_len; prot = mrp->mr_prot; flags = mrp->mr_flags; @@ -421,6 +421,9 @@ if (error != 0) goto done; } + if (fp->f_ops == &shm_ops && (((struct shmfd *)fp->f_data)-> + shm_flags & SHM_LARGEPAGE) != 0) + addr = orig_addr; /* This relies on VM_PROT_* matching PROT_*. 
*/ error = fo_mmap(fp, &vms->vm_map, &addr, size, prot, max_prot & cap_maxprot, flags, pos, td); @@ -577,6 +580,7 @@ vm_offset_t addr, end; vm_size_t pageoff; vm_map_t map; + int rv; if (size == 0) return (EINVAL); @@ -614,10 +618,10 @@ } } #endif - vm_map_delete(map, addr, end); + rv = vm_map_delete(map, addr, end); #ifdef HWPMC_HOOKS - if (__predict_false(pmc_handled)) { + if (rv == KERN_SUCCESS && __predict_false(pmc_handled)) { /* downgrade the lock to prevent a LOR with the pmc-sx lock */ vm_map_lock_downgrade(map); if (pkm.pm_address != (uintptr_t) NULL) @@ -627,8 +631,7 @@ #endif vm_map_unlock(map); - /* vm_map_delete returns nothing but KERN_SUCCESS anyway */ - return (0); + return (vm_mmap_to_errno(rv)); } #ifndef _SYS_SYSPROTO_H_ @@ -1105,7 +1108,14 @@ PROC_UNLOCK(proc); } #endif - return (error == KERN_SUCCESS ? 0 : ENOMEM); + switch (error) { + case KERN_SUCCESS: + return (0); + case KERN_INVALID_ARGUMENT: + return (EINVAL); + default: + return (ENOMEM); + } } #ifndef _SYS_SYSPROTO_H_ @@ -1510,6 +1520,39 @@ return (error); } +int +kern_mmap_racct_check(struct thread *td, vm_map_t map, vm_size_t size) +{ + int error; + + RACCT_PROC_LOCK(td->td_proc); + if (map->size + size > lim_cur(td, RLIMIT_VMEM)) { + RACCT_PROC_UNLOCK(td->td_proc); + return (ENOMEM); + } + if (racct_set(td->td_proc, RACCT_VMEM, map->size + size)) { + RACCT_PROC_UNLOCK(td->td_proc); + return (ENOMEM); + } + if (!old_mlock && map->flags & MAP_WIREFUTURE) { + if (ptoa(pmap_wired_count(map->pmap)) + size > + lim_cur(td, RLIMIT_MEMLOCK)) { + racct_set_force(td->td_proc, RACCT_VMEM, map->size); + RACCT_PROC_UNLOCK(td->td_proc); + return (ENOMEM); + } + error = racct_set(td->td_proc, RACCT_MEMLOCK, + ptoa(pmap_wired_count(map->pmap)) + size); + if (error != 0) { + racct_set_force(td->td_proc, RACCT_VMEM, map->size); + RACCT_PROC_UNLOCK(td->td_proc); + return (error); + } + } + RACCT_PROC_UNLOCK(td->td_proc); + return (0); +} + /* * Internal version of mmap that maps a specific VM object 
into an * map. Called by mmap for MAP_ANON, vm_mmap, shm_mmap, and vn_mmap. @@ -1519,39 +1562,15 @@ vm_prot_t maxprot, int flags, vm_object_t object, vm_ooffset_t foff, boolean_t writecounted, struct thread *td) { - boolean_t curmap, fitit; vm_offset_t max_addr; int docow, error, findspace, rv; + bool curmap, fitit; curmap = map == &td->td_proc->p_vmspace->vm_map; if (curmap) { - RACCT_PROC_LOCK(td->td_proc); - if (map->size + size > lim_cur(td, RLIMIT_VMEM)) { - RACCT_PROC_UNLOCK(td->td_proc); - return (ENOMEM); - } - if (racct_set(td->td_proc, RACCT_VMEM, map->size + size)) { - RACCT_PROC_UNLOCK(td->td_proc); - return (ENOMEM); - } - if (!old_mlock && map->flags & MAP_WIREFUTURE) { - if (ptoa(pmap_wired_count(map->pmap)) + size > - lim_cur(td, RLIMIT_MEMLOCK)) { - racct_set_force(td->td_proc, RACCT_VMEM, - map->size); - RACCT_PROC_UNLOCK(td->td_proc); - return (ENOMEM); - } - error = racct_set(td->td_proc, RACCT_MEMLOCK, - ptoa(pmap_wired_count(map->pmap)) + size); - if (error != 0) { - racct_set_force(td->td_proc, RACCT_VMEM, - map->size); - RACCT_PROC_UNLOCK(td->td_proc); - return (error); - } - } - RACCT_PROC_UNLOCK(td->td_proc); + error = kern_mmap_racct_check(td, map, size); + if (error != 0) + return (error); } /* Index: sys/vm/vm_object.h =================================================================== --- sys/vm/vm_object.h +++ sys/vm/vm_object.h @@ -173,6 +173,17 @@ struct pctrie swp_blks; vm_ooffset_t writemappings; } swp; + + /* + * Phys pager + */ + struct { + struct phys_pager_ops *ops; + union { + void *data_ptr; + uintptr_t data_val; + }; + } phys; } un_pager; struct ucred *cred; vm_ooffset_t charge; Index: sys/vm/vm_object.c =================================================================== --- sys/vm/vm_object.c +++ sys/vm/vm_object.c @@ -289,6 +289,7 @@ kernel_object->flags |= OBJ_COLORED; kernel_object->pg_color = (u_short)atop(VM_MIN_KERNEL_ADDRESS); #endif + kernel_object->un_pager.phys.ops = &default_phys_pg_ops; /* * The lock portion of 
struct vm_object must be type stable due Index: sys/vm/vm_page.c =================================================================== --- sys/vm/vm_page.c +++ sys/vm/vm_page.c @@ -3147,9 +3147,12 @@ return (vm_severe_waiters + vm_min_waiters + vm_pageproc_waiters); } -void -vm_wait_doms(const domainset_t *wdoms) +int +vm_wait_doms(const domainset_t *wdoms, int mflags) { + int error; + + error = 0; /* * We use racey wakeup synchronization to avoid expensive global @@ -3162,8 +3165,8 @@ if (curproc == pageproc) { mtx_lock(&vm_domainset_lock); vm_pageproc_waiters++; - msleep(&vm_pageproc_waiters, &vm_domainset_lock, PVM | PDROP, - "pageprocwait", 1); + error = msleep(&vm_pageproc_waiters, &vm_domainset_lock, + PVM | PDROP | mflags, "pageprocwait", 1); } else { /* * XXX Ideally we would wait only until the allocation could @@ -3173,11 +3176,12 @@ mtx_lock(&vm_domainset_lock); if (vm_page_count_min_set(wdoms)) { vm_min_waiters++; - msleep(&vm_min_domains, &vm_domainset_lock, - PVM | PDROP, "vmwait", 0); + error = msleep(&vm_min_domains, &vm_domainset_lock, + PVM | PDROP | mflags, "vmwait", 0); } else mtx_unlock(&vm_domainset_lock); } + return (error); } /* @@ -3208,20 +3212,12 @@ panic("vm_wait in early boot"); DOMAINSET_ZERO(&wdom); DOMAINSET_SET(vmd->vmd_domain, &wdom); - vm_wait_doms(&wdom); + vm_wait_doms(&wdom, 0); } } -/* - * vm_wait: - * - * Sleep until free pages are available for allocation in the - * affinity domains of the obj. If obj is NULL, the domain set - * for the calling thread is used. - * Called in various places after failed memory allocations. - */ -void -vm_wait(vm_object_t obj) +static int +vm_wait_flags(vm_object_t obj, int mflags) { struct domainset *d; @@ -3236,7 +3232,27 @@ if (d == NULL) d = curthread->td_domain.dr_policy; - vm_wait_doms(&d->ds_mask); + return (vm_wait_doms(&d->ds_mask, mflags)); +} + +/* + * vm_wait: + * + * Sleep until free pages are available for allocation in the + * affinity domains of the obj. 
If obj is NULL, the domain set + * for the calling thread is used. + * Called in various places after failed memory allocations. + */ +void +vm_wait(vm_object_t obj) +{ + (void)vm_wait_flags(obj, 0); +} + +int +vm_wait_intr(vm_object_t obj) +{ + return (vm_wait_flags(obj, PCATCH)); } /* Index: sys/vm/vm_pageout.h =================================================================== --- sys/vm/vm_pageout.h +++ sys/vm/vm_pageout.h @@ -97,6 +97,7 @@ */ void vm_wait(vm_object_t obj); +int vm_wait_intr(vm_object_t obj); void vm_waitpfault(struct domainset *, int timo); void vm_wait_domain(int domain); void vm_wait_min(void); Index: sys/vm/vm_pager.h =================================================================== --- sys/vm/vm_pager.h +++ sys/vm/vm_pager.h @@ -229,5 +229,22 @@ vm_object_t cdev_pager_lookup(void *handle); void cdev_pager_free_page(vm_object_t object, vm_page_t m); +struct phys_pager_ops { + int (*phys_pg_getpages)(vm_object_t vm_obj, vm_page_t *m, int count, + int *rbehind, int *rahead); + int (*phys_pg_populate)(vm_object_t vm_obj, vm_pindex_t pidx, + int fault_type, vm_prot_t max_prot, vm_pindex_t *first, + vm_pindex_t *last); + boolean_t (*phys_pg_haspage)(vm_object_t obj, vm_pindex_t pindex, + int *before, int *after); + void (*phys_pg_ctor)(vm_object_t vm_obj, vm_prot_t prot, + vm_ooffset_t foff, struct ucred *cred); + void (*phys_pg_dtor)(vm_object_t vm_obj); +}; +extern struct phys_pager_ops default_phys_pg_ops; +vm_object_t phys_pager_allocate(void *handle, struct phys_pager_ops *ops, + void *data, vm_ooffset_t size, vm_prot_t prot, vm_ooffset_t foff, + struct ucred *cred); + #endif /* _KERNEL */ #endif /* _VM_PAGER_ */ Index: sys/vm/vm_unix.c =================================================================== --- sys/vm/vm_unix.c +++ sys/vm/vm_unix.c @@ -188,7 +188,7 @@ rv = vm_map_wire_locked(map, old, new, VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES); if (rv != KERN_SUCCESS) - vm_map_delete(map, old, new); + (void)vm_map_delete(map, old, new); } 
if (rv != KERN_SUCCESS) { #ifdef RACCT Index: usr.bin/posixshmcontrol/posixshmcontrol.c =================================================================== --- usr.bin/posixshmcontrol/posixshmcontrol.c +++ usr.bin/posixshmcontrol/posixshmcontrol.c @@ -30,8 +30,10 @@ __FBSDID("$FreeBSD$"); #include +#include #include #include +#include #include #include #include @@ -50,7 +52,7 @@ { fprintf(stderr, "Usage:\n" - "posixshmcontrol create [-m ] ...\n" + "posixshmcontrol create [-m ] [-l largepage] ...\n" "posixshmcontrol rm ...\n" "posixshmcontrol ls [-h] [-n]\n" "posixshmcontrol dump ...\n" @@ -59,14 +61,28 @@ } static int -create_one_shm(const char *path, long mode) +create_one_shm(const char *path, long mode, int idx) { - int fd; + struct shm_largepage_conf slc; + int error, fd; - fd = shm_open(path, O_RDWR | O_CREAT, mode); - if (fd == -1) { - warn("create %s", path); - return (1); + if (idx == -1) { + fd = shm_open(path, O_RDWR | O_CREAT, mode); + if (fd == -1) { + warn("create %s", path); + return (1); + } + } else { + fd = syscall(SYS_shm_open2, path, O_CREAT | O_RDWR, + (mode_t)mode, SHM_LARGEPAGE, (void *)NULL); + if (fd == -1) + err(1, "shm_open2 %s", path); + memset(&slc, 0, sizeof(slc)); + slc.psind = idx; + slc.alloc_policy = SHM_LARGEPAGE_ALLOC_DEFAULT; + error = ioctl(fd, FIOSHMLPGCNF, &slc); + if (error == -1) + err(1, "FIOSHMLPGCNF"); } close(fd); return (0); @@ -76,20 +92,60 @@ create_shm(int argc, char **argv) { char *end; + size_t *pagesizes; long mode; - int c, i, ret, ret1; + uint64_t pgsz; + int c, i, idx, pn, ret, ret1; + bool printed; mode = 0600; - while ((c = getopt(argc, argv, "m:")) != -1) { + idx = -1; + while ((c = getopt(argc, argv, "l:m:")) != -1) { switch (c) { case 'm': errno = 0; mode = strtol(optarg, &end, 0); if (mode == 0 && errno != 0) - err(1, "mode:"); + err(1, "mode"); if (*end != '\0') errx(1, "non-integer mode"); break; + case 'l': + if (expand_number(optarg, &pgsz) == -1) + err(1, "size"); + pn = getpagesizes(NULL, 0); + if (pn == -1) 
+ err(1, "getpagesizes"); + pagesizes = malloc(sizeof(size_t) * pn); + if (pagesizes == NULL) + err(1, "malloc"); + if (getpagesizes(pagesizes, pn) == -1) + err(1, "getpagesizes"); + for (idx = 0; idx < pn; idx++) { + if (pagesizes[idx] == pgsz) + break; + } + if (idx == pn) { + fprintf(stderr, + "pagesize should be superpagesize, supported sizes:"); + printed = false; + for (i = 0; i < pn; i++) { + if (pagesizes[i] == 0 || + pagesizes[i] == (size_t) + getpagesize()) + continue; + printed = true; + fprintf(stderr, " %zu", pagesizes[i]); + } + if (!printed) + fprintf(stderr, " none"); + fprintf(stderr, "\n"); + exit(1); + } + if (pgsz == (uint64_t)getpagesize()) + errx(1, "pagesize should be large"); + free(pagesizes); + break; case '?': default: usage(); @@ -101,7 +157,7 @@ argv += optind; ret = 0; for (i = 0; i < argc; i++) { - ret1 = create_one_shm(argv[i], mode); + ret1 = create_one_shm(argv[i], mode, idx); if (ret1 != 0 && ret == 0) ret = ret1; } @@ -349,6 +405,9 @@ (long)st.st_ctim.tv_nsec); printf("birth\t%ld.%09ld\n", (long)st.st_birthtim.tv_sec, (long)st.st_birthtim.tv_nsec); + if (st.st_blocks != 0) + printf("pagesz\t%ju\n", roundup((uintmax_t)st.st_size, + PAGE_SIZE) / st.st_blocks); } close(fd); return (ret);