Index: TODO =================================================================== --- /dev/null +++ TODO @@ -0,0 +1,4 @@ +- per-user limit on the total superpages allocations +- man pages +- make pmap_superpagesizes[] per-pmap ? +- more test programs Index: lib/libc/sys/Symbol.map =================================================================== --- lib/libc/sys/Symbol.map +++ lib/libc/sys/Symbol.map @@ -411,6 +411,7 @@ getfhat; funlinkat; memfd_create; + shm_create_largepage; shm_rename; }; @@ -919,6 +920,7 @@ __sys_setuid; _shm_open; __sys_shm_open; + __sys_shm_open2; _shm_unlink; __sys_shm_unlink; _shmat; Index: lib/libc/sys/shm_open.c =================================================================== --- lib/libc/sys/shm_open.c +++ lib/libc/sys/shm_open.c @@ -31,14 +31,17 @@ #include __FBSDID("$FreeBSD$"); -#include +#include +#include #include #include #include #include +#include #include #include +#include #include "libc_private.h" @@ -54,6 +57,51 @@ return (__sys_shm_open2(path, flags | O_CLOEXEC, mode, 0, NULL)); } +int +shm_create_largepage(const char *path, int flags, int psind, int alloc_policy, + mode_t mode) +{ + struct shm_largepage_conf slc; + int error, fd, saved_errno; + + fd = __sys_shm_open2(path, flags | O_CREAT, mode, SHM_LARGEPAGE, NULL); + if (fd == -1) + return (-1); + + memset(&slc, 0, sizeof(slc)); + slc.psind = psind; + slc.alloc_policy = alloc_policy; + error = ioctl(fd, FIOSSHMLPGCNF, &slc); + if (error == -1) { + saved_errno = errno; + close(fd); + errno = saved_errno; + return (-1); + } + return (fd); +} + +#define K(x) ((size_t)(x) * 1024) +#define M(x) (K(x) * 1024) +#define G(x) (M(x) * 1024) +static const struct { + int mask; + size_t pgsize; +} mfd_huge_sizes[] = { + { .mask = MFD_HUGE_64KB, .pgsize = K(64) }, + { .mask = MFD_HUGE_512KB, .pgsize = K(512) }, + { .mask = MFD_HUGE_1MB, .pgsize = M(1) }, + { .mask = MFD_HUGE_2MB, .pgsize = M(2) }, + { .mask = MFD_HUGE_8MB, .pgsize = M(8) }, + { .mask = MFD_HUGE_16MB, .pgsize = M(16) }, + { .mask = MFD_HUGE_32MB, .pgsize = M(32) }, + { .mask = MFD_HUGE_256MB, .pgsize = M(256) }, + { .mask = MFD_HUGE_512MB, .pgsize = M(512) }, + { .mask = MFD_HUGE_1GB, .pgsize = G(1) }, + { .mask = MFD_HUGE_2GB, .pgsize = G(2) }, + { .mask = MFD_HUGE_16GB, .pgsize = G(16) }, +}; + /* * The path argument is passed to the kernel, but the kernel doesn't currently * do anything with it. Linux exposes it in linprocfs for debugging purposes @@ -63,8 +111,9 @@ memfd_create(const char *name, unsigned int flags) { char memfd_name[NAME_MAX + 1]; - size_t namelen; - int oflags, shmflags; + size_t namelen, *pgs; + struct shm_largepage_conf slc; + int error, fd, i, npgs, oflags, pgidx, saved_errno, shmflags; if (name == NULL) return (EBADF); @@ -75,11 +124,9 @@ MFD_HUGE_MASK)) != 0) return (EINVAL); /* Size specified but no HUGETLB. */ - if ((flags & MFD_HUGE_MASK) != 0 && (flags & MFD_HUGETLB) == 0) + if (((flags & MFD_HUGE_MASK) != 0 && (flags & MFD_HUGETLB) == 0) || + __bitcount(flags & MFD_HUGE_MASK) > 1) return (EINVAL); - /* We don't actually support HUGETLB. */ - if ((flags & MFD_HUGETLB) != 0) - return (ENOSYS); /* We've already validated that we're sufficiently sized.
*/ snprintf(memfd_name, NAME_MAX + 1, "%s%s", MEMFD_NAME_PREFIX, name); @@ -89,5 +136,57 @@ oflags |= O_CLOEXEC; if ((flags & MFD_ALLOW_SEALING) != 0) shmflags |= SHM_ALLOW_SEALING; - return (__sys_shm_open2(SHM_ANON, oflags, 0, shmflags, memfd_name)); + if ((flags & MFD_HUGETLB) != 0) + shmflags |= SHM_LARGEPAGE; + fd = __sys_shm_open2(SHM_ANON, oflags, 0, shmflags, memfd_name); + if (fd == -1 || (flags & MFD_HUGETLB) == 0) + return (fd); + + pgs = NULL; + npgs = getpagesizes(NULL, 0); + if (npgs == -1) + goto clean; + pgs = calloc(npgs, sizeof(size_t)); + if (pgs == NULL) + goto clean; + error = getpagesizes(pgs, npgs); + if (error == -1) + goto clean; + if ((flags & MFD_HUGE_MASK) == 0) { + if (npgs == 1) { + errno = EOPNOTSUPP; + goto clean; + } + pgidx = 1; + } else { + for (i = 0; i < nitems(mfd_huge_sizes); i++) { + if (mfd_huge_sizes[i].mask == (flags & MFD_HUGE_MASK)) + break; + } + for (pgidx = 0; pgidx < npgs; pgidx++) { + if (mfd_huge_sizes[i].pgsize == pgs[pgidx]) + break; + } + if (pgidx == npgs) { + errno = EOPNOTSUPP; + goto clean; + } + } + free(pgs); + pgs = NULL; + + memset(&slc, 0, sizeof(slc)); + slc.psind = pgidx; + slc.alloc_policy = SHM_LARGEPAGE_ALLOC_DEFAULT; + error = ioctl(fd, FIOSSHMLPGCNF, &slc); + if (error == -1) + goto clean; + return (fd); + +clean: + saved_errno = errno; + close(fd); + free(pgs); + errno = saved_errno; + return (-1); } Index: lib/libsysdecode/mkioctls =================================================================== --- lib/libsysdecode/mkioctls +++ lib/libsysdecode/mkioctls @@ -51,6 +51,7 @@ print "#include " print "#include " print "#include " + print "#include " print "#include " print "#include " print "#include " Index: sys/amd64/amd64/pmap.c =================================================================== --- sys/amd64/amd64/pmap.c +++ sys/amd64/amd64/pmap.c @@ -48,7 +48,7 @@ */ /*- * Copyright (c) 2003 Networks Associates Technology, Inc. - * Copyright (c) 2014-2019 The FreeBSD Foundation + * Copyright (c) 2014-2020 The FreeBSD Foundation * All rights reserved.
* * This software was developed for the FreeBSD Project by Jake Burkholder, @@ -1415,6 +1415,8 @@ pdpe = pmap_pdpe(pmap, va); if (pdpe == NULL || (*pdpe & PG_V) == 0) return (NULL); + KASSERT((*pdpe & PG_PS) == 0, + ("pmap_pde for 1G page, pmap %p va %#lx", pmap, va)); return (pmap_pdpe_to_pde(pdpe, va)); } @@ -2382,6 +2384,11 @@ KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0, ("pmap_init: can't assign to pagesizes[1]")); pagesizes[1] = NBPDR; + if ((amd_feature & AMDID_PAGE1GB) != 0) { + KASSERT(MAXPAGESIZES > 1 && pagesizes[2] == 0, + ("pmap_init: can't assign to pagesizes[2]")); + pagesizes[2] = NBPDP; + } } /* @@ -3565,6 +3572,7 @@ vm_page_t pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) { + pdp_entry_t pdpe, *pdpep; pd_entry_t pde, *pdep; pt_entry_t pte, PG_RW, PG_V; vm_page_t m; @@ -3572,23 +3580,42 @@ m = NULL; PG_RW = pmap_rw_bit(pmap); PG_V = pmap_valid_bit(pmap); - PMAP_LOCK(pmap); - pdep = pmap_pde(pmap, va); - if (pdep != NULL && (pde = *pdep)) { - if (pde & PG_PS) { - if ((pde & PG_RW) != 0 || (prot & VM_PROT_WRITE) == 0) - m = PHYS_TO_VM_PAGE((pde & PG_PS_FRAME) | - (va & PDRMASK)); - } else { - pte = *pmap_pde_to_pte(pdep, va); - if ((pte & PG_V) != 0 && - ((pte & PG_RW) != 0 || (prot & VM_PROT_WRITE) == 0)) - m = PHYS_TO_VM_PAGE(pte & PG_FRAME); + + pdpep = pmap_pdpe(pmap, va); + if (pdpep == NULL || ((pdpe = *pdpep) & PG_V) == 0) + goto out; + if ((pdpe & PG_PS) != 0) { + if ((pdpe & PG_RW) != 0 || (prot & VM_PROT_WRITE) == 0) { + m = PHYS_TO_VM_PAGE((pdpe & PG_PS_FRAME) | + (va & PDPMASK)); + goto check_page; } - if (m != NULL && !vm_page_wire_mapped(m)) - m = NULL; + goto out; } + + pdep = pmap_pdpe_to_pde(pdpep, va); + if (pdep == NULL || ((pde = *pdep) & PG_V) == 0) + goto out; + if ((pde & PG_PS) != 0) { + if ((pde & PG_RW) != 0 || (prot & VM_PROT_WRITE) == 0) { + m = PHYS_TO_VM_PAGE((pde & PG_PS_FRAME) | + (va & PDRMASK)); + goto check_page; + } + goto out; + } + + pte = *pmap_pde_to_pte(pdep, va); + if ((pte & PG_V) == 0) + goto out; + if ((pte & PG_RW) != 0 || (prot & VM_PROT_WRITE) == 0) + m = PHYS_TO_VM_PAGE(pte & PG_FRAME); + +check_page: + if (m != NULL && !vm_page_wire_mapped(m)) + m = NULL; +out: PMAP_UNLOCK(pmap); return (m); } @@ -5849,6 +5876,7 @@ pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { struct rwlock *lock; + vm_page_t mt; vm_offset_t va_next; pml5_entry_t *pml5e; pml4_entry_t *pml4e; @@ -5912,13 +5940,28 @@ } pdpe = pmap_pml4e_to_pdpe(pml4e, sva); + va_next = (sva + NBPDP) & ~PDPMASK; if ((*pdpe & PG_V) == 0) { - va_next = (sva + NBPDP) & ~PDPMASK; if (va_next < sva) va_next = eva; continue; } + KASSERT((*pdpe & PG_PS) == 0 || va_next <= eva, + ("pmap_remove of non-transient 1G page " + "pdpe %#lx sva %#lx eva %#lx va_next %#lx", + *pdpe, sva, eva, va_next)); + if ((*pdpe & PG_PS) != 0) { + MPASS(pmap != kernel_pmap); /* XXXKIB */ + MPASS((*pdpe & (PG_MANAGED | PG_G)) == 0); + anyvalid = 1; + *pdpe = 0; + pmap_resident_count_dec(pmap, NBPDP / PAGE_SIZE); + mt = PHYS_TO_VM_PAGE(*pmap_pml4e(pmap, sva) & PG_FRAME); + pmap_unwire_ptp(pmap, sva, mt, &free); + continue; + } + /* * Calculate index for next page table. 
*/ @@ -6134,11 +6177,13 @@ void pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) { + vm_page_t m; vm_offset_t va_next; pml4_entry_t *pml4e; pdp_entry_t *pdpe; pd_entry_t ptpaddr, *pde; pt_entry_t *pte, PG_G, PG_M, PG_RW, PG_V; + pt_entry_t obits, pbits; boolean_t anychanged; KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot)); @@ -6188,13 +6233,36 @@ } pdpe = pmap_pml4e_to_pdpe(pml4e, sva); + va_next = (sva + NBPDP) & ~PDPMASK; if ((*pdpe & PG_V) == 0) { - va_next = (sva + NBPDP) & ~PDPMASK; if (va_next < sva) va_next = eva; continue; } + KASSERT((*pdpe & PG_PS) == 0 || va_next <= eva, + ("pmap_remove of non-transient 1G page " + "pdpe %#lx sva %#lx eva %#lx va_next %#lx", + *pdpe, sva, eva, va_next)); + if ((*pdpe & PG_PS) != 0) { +retry_pdpe: + obits = pbits = *pdpe; + MPASS((pbits & (PG_MANAGED | PG_G)) == 0); + MPASS(pmap != kernel_pmap); /* XXXKIB */ + if ((prot & VM_PROT_WRITE) == 0) + pbits &= ~(PG_RW | PG_M); + if ((prot & VM_PROT_EXECUTE) == 0) + pbits |= pg_nx; + + if (pbits != obits) { + if (!atomic_cmpset_long(pdpe, obits, pbits)) + /* PG_PS cannot be cleared under us, */ + goto retry_pdpe; + anychanged = TRUE; + } + continue; + } + va_next = (sva + NBPDR) & ~PDRMASK; if (va_next < sva) va_next = eva; @@ -6237,9 +6305,6 @@ for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, sva += PAGE_SIZE) { - pt_entry_t obits, pbits; - vm_page_t m; - retry: obits = pbits = *pte; if ((pbits & PG_V) == 0) @@ -6414,6 +6479,119 @@ } #endif /* VM_NRESERVLEVEL > 0 */ +static int +pmap_enter_largepage(pmap_t pmap, vm_offset_t va, pt_entry_t newpte, int flags, + int psind) +{ + vm_page_t mp; + pt_entry_t origpte, *pml4e, *pdpe, *pde, pten, PG_V; + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + KASSERT(psind > 0 && psind < MAXPAGESIZES, + ("psind %d unexpected", psind)); + KASSERT(((newpte & PG_FRAME) & (pagesizes[psind] - 1)) == 0, + ("unaligned phys address %#lx newpte %#lx psind %d", + newpte & PG_FRAME, newpte, psind)); + KASSERT((va & (pagesizes[psind] - 1)) == 0, + ("unaligned va %#lx psind %d", va, psind)); + KASSERT(va < VM_MAXUSER_ADDRESS, + ("kernel mode non-transparent superpage")); /* XXXKIB */ + KASSERT(va + pagesizes[psind] < VM_MAXUSER_ADDRESS, + ("overflowing user map va %#lx psind %d", va, psind)); /* XXXKIB */ + + PG_V = pmap_valid_bit(pmap); + +restart: + pten = newpte; + if (va < VM_MAXUSER_ADDRESS && pmap->pm_type == PT_X86) + pten |= pmap_pkru_get(pmap, va); + + if (psind == 2) { /* 1G */ + if (!pmap_pkru_same(pmap, va, va + NBPDP)) + return (KERN_PROTECTION_FAILURE); + pml4e = pmap_pml4e(pmap, va); + if ((*pml4e & PG_V) == 0) { + mp = _pmap_allocpte(pmap, pmap_pml4e_pindex(va), + NULL, va); + if (mp == NULL) { + if ((flags & PMAP_ENTER_NOSLEEP) != 0) + return (KERN_RESOURCE_SHORTAGE); + PMAP_UNLOCK(pmap); + vm_wait(NULL); + PMAP_LOCK(pmap); + + /* + * Restart at least to recalcuate the pkru + * key. Our caller must keep the map locked + * so no paging structure can be validated + * under us. 
+ */ + goto restart; + } + pdpe = pmap_pdpe(pmap, va); + KASSERT(pdpe != NULL, ("va %#lx lost pdpe", va)); + origpte = *pdpe; + MPASS(origpte == 0); + } else { + mp = PHYS_TO_VM_PAGE(*pml4e & PG_FRAME); + pdpe = pmap_pdpe(pmap, va); + KASSERT(pdpe != NULL, ("va %#lx lost pdpe", va)); + origpte = *pdpe; + if ((origpte & PG_V) == 0) + mp->ref_count++; + } + KASSERT((origpte & PG_V) == 0 || ((origpte & PG_PS) != 0 && + (origpte & PG_FRAME) == (newpte & PG_FRAME)), + ("va %#lx changing 1G phys page pdpe %#lx newpte %#lx", + va, origpte, newpte)); + if ((newpte & PG_W) != 0 && (origpte & PG_W) == 0) + pmap->pm_stats.wired_count += NBPDP / PAGE_SIZE; + else if ((newpte & PG_W) == 0 && (origpte & PG_W) != 0) + pmap->pm_stats.wired_count -= NBPDP / PAGE_SIZE; + *pdpe = newpte; + } else /* (psind == 1) */ { /* 2M */ + if (!pmap_pkru_same(pmap, va, va + NBPDR)) + return (KERN_PROTECTION_FAILURE); + pde = pmap_pde(pmap, va); + if (pde == NULL) { + mp = _pmap_allocpte(pmap, pmap_pdpe_pindex(va), + NULL, va); + if (mp == NULL) { + if ((flags & PMAP_ENTER_NOSLEEP) != 0) + return (KERN_RESOURCE_SHORTAGE); + PMAP_UNLOCK(pmap); + vm_wait(NULL); + PMAP_LOCK(pmap); + goto restart; + } + pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mp)); + pde = &pde[pmap_pde_index(va)]; + origpte = *pde; + MPASS(origpte == 0); + } else { + pdpe = pmap_pdpe(pmap, va); + MPASS(pdpe != NULL && (*pdpe & PG_V) != 0); + mp = PHYS_TO_VM_PAGE(*pdpe & PG_FRAME); + origpte = *pde; + if ((origpte & PG_V) == 0) + mp->ref_count++; + } + KASSERT((origpte & PG_V) == 0 || ((origpte & PG_PS) != 0 && + (origpte & PG_FRAME) == (newpte & PG_FRAME)), + ("va %#lx changing 2M phys page pde %#lx newpte %#lx", + va, origpte, newpte)); + if ((newpte & PG_W) != 0 && (origpte & PG_W) == 0) + pmap->pm_stats.wired_count += NBPDR / PAGE_SIZE; + else if ((newpte & PG_W) == 0 && (origpte & PG_W) != 0) + pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE; + *pde = newpte; + } + if ((origpte & PG_V) == 0) + pmap_resident_count_inc(pmap, pagesizes[psind] / PAGE_SIZE); + + return (KERN_SUCCESS); +} + /* * Insert the given physical page (p) at * the specified virtual address (v) in the @@ -6493,6 +6671,13 @@ lock = NULL; PMAP_LOCK(pmap); + if ((flags & PMAP_ENTER_LARGEPAGE) != 0) { + KASSERT((m->oflags & VPO_UNMANAGED) != 0, + ("managed largepage va %#lx flags %#x", va, flags)); + rv = pmap_enter_largepage(pmap, va, newpte | PG_PS, flags, + psind); + goto out; + } if (psind == 1) { /* Assert the required virtual and physical alignment. 
*/ KASSERT((va & PDRMASK) == 0, ("pmap_enter: va unaligned")); @@ -7179,9 +7364,10 @@ pml4_entry_t *pml4e; pdp_entry_t *pdpe; pd_entry_t *pde; - pt_entry_t *pte, PG_V; + pt_entry_t *pte, PG_V, PG_G; PG_V = pmap_valid_bit(pmap); + PG_G = pmap_global_bit(pmap); PMAP_LOCK(pmap); for (; sva < eva; sva = va_next) { pml4e = pmap_pml4e(pmap, sva); @@ -7192,12 +7378,23 @@ continue; } pdpe = pmap_pml4e_to_pdpe(pml4e, sva); - if ((*pdpe & PG_V) == 0) { - va_next = (sva + NBPDP) & ~PDPMASK; - if (va_next < sva) - va_next = eva; + va_next = (sva + NBPDP) & ~PDPMASK; + if (va_next < sva) + va_next = eva; + if ((*pdpe & PG_V) == 0) + continue; + KASSERT((*pdpe & PG_PS) == 0 || va_next <= eva, + ("pmap_unwire of non-transient 1G page " + "pdpe %#lx sva %#lx eva %#lx va_next %#lx", + *pdpe, sva, eva, va_next)); + if ((*pdpe & PG_PS) != 0) { + MPASS(pmap != kernel_pmap); /* XXXKIB */ + MPASS((*pdpe & (PG_MANAGED | PG_G)) == 0); + atomic_clear_long(pdpe, PG_W); + pmap->pm_stats.wired_count -= NBPDP / PAGE_SIZE; continue; } + va_next = (sva + NBPDR) & ~PDRMASK; if (va_next < sva) va_next = eva; @@ -7314,6 +7511,12 @@ } va_next = (addr + NBPDR) & ~PDRMASK; + KASSERT((*pdpe & PG_PS) == 0 || va_next <= end_addr, + ("pmap_copy of partial non-transient 1G page " + "pdpe %#lx sva %#lx eva %#lx va_next %#lx", + *pdpe, addr, end_addr, va_next)); + if ((*pdpe & PG_PS) != 0) + continue; if (va_next < addr) va_next = end_addr; @@ -8370,6 +8573,12 @@ va_next = (sva + NBPDR) & ~PDRMASK; if (va_next < sva) va_next = eva; + KASSERT((*pdpe & PG_PS) == 0 || va_next <= eva, + ("pmap_advise of non-transient 1G page " + "pdpe %#lx sva %#lx eva %#lx va_next %#lx", + *pdpe, sva, eva, va_next)); + if ((*pdpe & PG_PS) != 0) + continue; pde = pmap_pdpe_to_pde(pdpe, sva); oldpde = *pde; if ((oldpde & PG_V) == 0) @@ -9131,6 +9340,7 @@ int pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *pap) { + pdp_entry_t *pdpe; pd_entry_t *pdep; pt_entry_t pte, PG_A, PG_M, PG_RW, PG_V; vm_paddr_t pa; @@ -9142,23 +9352,32 @@ PG_RW = pmap_rw_bit(pmap); PMAP_LOCK(pmap); - pdep = pmap_pde(pmap, addr); - if (pdep != NULL && (*pdep & PG_V)) { - if (*pdep & PG_PS) { - pte = *pdep; - /* Compute the physical address of the 4KB page. */ - pa = ((*pdep & PG_PS_FRAME) | (addr & PDRMASK)) & + pte = 0; + pa = 0; + val = 0; + pdpe = pmap_pdpe(pmap, addr); + if ((*pdpe & PG_V) != 0) { + if ((*pdpe & PG_PS) != 0) { + pte = *pdpe; + pa = ((pte & PG_PS_PDP_FRAME) | (addr & PDPMASK)) & PG_FRAME; - val = MINCORE_PSIND(1); + val = MINCORE_PSIND(2); } else { - pte = *pmap_pde_to_pte(pdep, addr); - pa = pte & PG_FRAME; - val = 0; + pdep = pmap_pde(pmap, addr); + if (pdep != NULL && (*pdep & PG_V) != 0) { + if ((*pdep & PG_PS) != 0) { + pte = *pdep; + /* Compute the physical address of the 4KB page. 
*/ + pa = ((pte & PG_PS_FRAME) | (addr & + PDRMASK)) & PG_FRAME; + val = MINCORE_PSIND(1); + } else { + pte = *pmap_pde_to_pte(pdep, addr); + pa = pte & PG_FRAME; + val = 0; + } + } } - } else { - pte = 0; - pa = 0; - val = 0; } if ((pte & PG_V) != 0) { val |= MINCORE_INCORE; Index: sys/kern/kern_umtx.c =================================================================== --- sys/kern/kern_umtx.c +++ sys/kern/kern_umtx.c @@ -3933,7 +3933,7 @@ reg = uma_zalloc(umtx_shm_reg_zone, M_WAITOK | M_ZERO); reg->ushm_refcnt = 1; bcopy(key, ®->ushm_key, sizeof(*key)); - reg->ushm_obj = shm_alloc(td->td_ucred, O_RDWR); + reg->ushm_obj = shm_alloc(td->td_ucred, O_RDWR, false); reg->ushm_cred = crhold(cred); error = shm_dotruncate(reg->ushm_obj, PAGE_SIZE); if (error != 0) { Index: sys/kern/uipc_shm.c =================================================================== --- sys/kern/uipc_shm.c +++ sys/kern/uipc_shm.c @@ -2,6 +2,7 @@ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2006, 2011, 2016-2017 Robert N. M. Watson + * Copyright 2020 The FreeBSD Foundation * All rights reserved. * * Portions of this software were developed by BAE Systems, the University of @@ -9,6 +10,9 @@ * contract FA8650-15-C-7558 ("CADETS"), as part of the DARPA Transparent * Computing (TC) research program. * + * Portions of this software were developed by Konstantin Belousov + * under sponsorship from the FreeBSD Foundation. + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -80,6 +84,7 @@ #include #include #include +#include #include #include #include @@ -120,6 +125,8 @@ static void shm_insert(char *path, Fnv32_t fnv, struct shmfd *shmfd); static struct shmfd *shm_lookup(char *path, Fnv32_t fnv); static int shm_remove(char *path, Fnv32_t fnv, struct ucred *ucred); +static int shm_dotruncate_cookie(struct shmfd *shmfd, off_t length, + void *rl_cookie); static int shm_dotruncate_locked(struct shmfd *shmfd, off_t length, void *rl_cookie); static int shm_copyin_path(struct thread *td, const char *userpath_in, @@ -159,11 +166,19 @@ .fo_get_seals = shm_get_seals, .fo_add_seals = shm_add_seals, .fo_fallocate = shm_fallocate, - .fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE + .fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE, }; FEATURE(posix_shm, "POSIX shared memory"); +static SYSCTL_NODE(_vm, OID_AUTO, largepages, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, + ""); + +static int largepage_reclaim_tries = 1; +SYSCTL_INT(_vm_largepages, OID_AUTO, reclaim_tries, + CTLFLAG_RWTUN, &largepage_reclaim_tries, 0, + "Number of contig reclaims before giving up for default alloc policy"); + static int uiomove_object_page(vm_object_t obj, size_t len, struct uio *uio) { @@ -242,6 +257,89 @@ return (error); } +static u_long count_largepages[MAXPAGESIZES]; + +static int +shm_largepage_phys_populate(vm_object_t object, vm_pindex_t pidx, + int fault_type, vm_prot_t max_prot, vm_pindex_t *first, vm_pindex_t *last) +{ + vm_page_t m; + int psind; + + psind = object->un_pager.phys.data_val; + if (psind == 0 || pidx >= object->size) + return (VM_PAGER_FAIL); + *first = rounddown2(pidx, pagesizes[psind] / PAGE_SIZE); + + /* + * We only busy the first page in the superpage run. It is + * useless to busy whole run since we only remove full + * superpage, and it takes too long to busy e.g. 512 * 512 == + * 262144 pages constituing 1G amd64 superage. 
+ */ + m = vm_page_grab(object, *first, VM_ALLOC_NORMAL | VM_ALLOC_NOCREAT); + MPASS(m != NULL); + + *last = *first + atop(pagesizes[psind]) - 1; + return (VM_PAGER_OK); +} + +static boolean_t +shm_largepage_phys_haspage(vm_object_t object, vm_pindex_t pindex, + int *before, int *after) +{ + int psind; + + psind = object->un_pager.phys.data_val; + if (psind == 0 || pindex >= object->size) + return (FALSE); + if (before != NULL) { + *before = pindex - rounddown2(pindex, pagesizes[psind] / + PAGE_SIZE); + } + if (after != NULL) { + *after = roundup2(pindex, pagesizes[psind] / PAGE_SIZE) - + pindex; + } + return (TRUE); +} + +static void +shm_largepage_phys_ctor(vm_object_t object, vm_prot_t prot, + vm_ooffset_t foff, struct ucred *cred) +{ +} + +static void +shm_largepage_phys_dtor(vm_object_t object) +{ + int psind; + + psind = object->un_pager.phys.data_val; + if (psind != 0) { + atomic_subtract_long(&count_largepages[psind], + object->size / (pagesizes[psind] / PAGE_SIZE)); + vm_wire_sub(object->size); + } else { + KASSERT(object->size == 0, + ("largepage phys obj %p not initialized bit size %#jx > 0", + object, (uintmax_t)object->size)); + } +} + +static struct phys_pager_ops shm_largepage_phys_ops = { + .phys_pg_populate = shm_largepage_phys_populate, + .phys_pg_haspage = shm_largepage_phys_haspage, + .phys_pg_ctor = shm_largepage_phys_ctor, + .phys_pg_dtor = shm_largepage_phys_dtor, +}; + +bool +shm_largepage(struct shmfd *shmfd) +{ + return (shmfd->shm_object->type == OBJT_PHYS); +} + static int shm_seek(struct file *fp, off_t offset, int whence, struct thread *td) { @@ -321,6 +419,8 @@ if (error) return (error); #endif + if (shm_largepage(shmfd) && shmfd->shm_lp_psind == 0) + return (EINVAL); foffset_lock_uio(fp, uio, flags); if (uio->uio_resid > OFF_MAX - uio->uio_offset) { /* @@ -385,7 +485,11 @@ shm_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred, struct thread *td) { + struct shmfd *shmfd; + struct shm_largepage_conf *conf; + void *rl_cookie; + shmfd = fp->f_data; switch (com) { case FIONBIO: case FIOASYNC: @@ -394,6 +498,38 @@ * just like it would on an unlinked regular file */ return (0); + case FIOSSHMLPGCNF: + if (!shm_largepage(shmfd)) + return (ENOTTY); + conf = data; + if (shmfd->shm_lp_psind != 0 && + conf->psind != shmfd->shm_lp_psind) + return (EINVAL); + if (conf->psind <= 0 || conf->psind >= MAXPAGESIZES || + pagesizes[conf->psind] == 0) + return (EINVAL); + if (conf->alloc_policy != SHM_LARGEPAGE_ALLOC_DEFAULT && + conf->alloc_policy != SHM_LARGEPAGE_ALLOC_NOWAIT && + conf->alloc_policy != SHM_LARGEPAGE_ALLOC_HARD) + return (EINVAL); + + rl_cookie = rangelock_wlock(&shmfd->shm_rl, 0, OFF_MAX, + &shmfd->shm_mtx); + shmfd->shm_lp_psind = conf->psind; + shmfd->shm_lp_alloc_policy = conf->alloc_policy; + shmfd->shm_object->un_pager.phys.data_val = conf->psind; + rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx); + return (0); + case FIOGSHMLPGCNF: + if (!shm_largepage(shmfd)) + return (ENOTTY); + conf = data; + rl_cookie = rangelock_rlock(&shmfd->shm_rl, 0, OFF_MAX, + &shmfd->shm_mtx); + conf->psind = shmfd->shm_lp_psind; + conf->alloc_policy = shmfd->shm_lp_alloc_policy; + rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx); + return (0); default: return (ENOTTY); } @@ -436,6 +572,8 @@ sb->st_dev = shm_dev_ino; sb->st_ino = shmfd->shm_ino; sb->st_nlink = shmfd->shm_object->ref_count; + sb->st_blocks = shmfd->shm_object->size / + (pagesizes[shmfd->shm_lp_psind] >> PAGE_SHIFT); return (0); } @@ -592,6 +730,108 @@ return (0); } 
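/*
 * Illustrative userspace sketch, not part of this patch: how the
 * FIOSSHMLPGCNF/FIOGSHMLPGCNF handlers above are expected to be driven.
 * The helper name is hypothetical.  It assumes psind 1 (the first
 * superpage size, 2M on amd64) is reported by getpagesizes(2), creates a
 * largepage shmfd with the shm_create_largepage(3) wrapper added by this
 * patch, reads the configuration back, and sizes the object with
 * ftruncate(2), which ends up in shm_dotruncate_largepage() below.
 */
#include <sys/param.h>
#include <sys/filio.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <fcntl.h>
#include <unistd.h>

static int
largepage_shm_example(off_t nsuperpages)
{
	struct shm_largepage_conf slc;
	size_t ps[MAXPAGESIZES];
	int fd;

	if (getpagesizes(ps, MAXPAGESIZES) < 2)
		return (-1);	/* no superpage support on this machine */

	fd = shm_create_largepage(SHM_ANON, O_CREAT | O_RDWR, 1,
	    SHM_LARGEPAGE_ALLOC_DEFAULT, 0600);
	if (fd == -1)
		return (-1);

	/* The configuration is readable back through FIOGSHMLPGCNF. */
	if (ioctl(fd, FIOGSHMLPGCNF, &slc) == -1 || slc.psind != 1) {
		close(fd);
		return (-1);
	}

	/* The length must be a multiple of pagesizes[psind]. */
	if (ftruncate(fd, nsuperpages * (off_t)ps[1]) == -1) {
		close(fd);
		return (-1);
	}
	return (fd);
}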
+static int +shm_dotruncate_largepage(struct shmfd *shmfd, off_t length, void *rl_cookie) +{ + vm_object_t object; + vm_page_t m; + vm_pindex_t newobjsz, oldobjsz; + int aflags, error, i, psind, try; + + KASSERT(length >= 0, ("shm_dotruncate: length < 0")); + object = shmfd->shm_object; + VM_OBJECT_ASSERT_WLOCKED(object); + rangelock_cookie_assert(rl_cookie, RA_WLOCKED); + + oldobjsz = object->size; + newobjsz = OFF_TO_IDX(length); + if (length == shmfd->shm_size) + return (0); + psind = shmfd->shm_lp_psind; + if (psind == 0 && length != 0) + return (EINVAL); + if ((length & (pagesizes[psind] - 1)) != 0) + return (EINVAL); + + if (length < shmfd->shm_size) { + if ((shmfd->shm_seals & F_SEAL_SHRINK) != 0) + return (EPERM); + if (shmfd->shm_kmappings > 0) + return (EBUSY); + return (ENOTSUP); /* Pages are unmanaged. */ +#if 0 + vm_object_page_remove(object, newobjsz, oldobjsz, 0); + object->size = newobjsz; + shmfd->shm_size = length; + return (0); +#endif + } + + aflags = VM_ALLOC_NORMAL | VM_ALLOC_ZERO; + if (shmfd->shm_lp_alloc_policy == SHM_LARGEPAGE_ALLOC_NOWAIT) + aflags |= VM_ALLOC_WAITFAIL; + try = 0; + + /* + * Extend shmfd and object, keeping all already fully + * allocated large pages intact even on error, because dropped + * object lock might allowed mapping of them. + */ + while (object->size < newobjsz) { + m = vm_page_alloc_contig(object, object->size, aflags, + pagesizes[psind] / PAGE_SIZE, 0, ~0, + pagesizes[psind], 0, + VM_MEMATTR_DEFAULT); + if (m == NULL) { + VM_OBJECT_WUNLOCK(object); + if (shmfd->shm_lp_alloc_policy == + SHM_LARGEPAGE_ALLOC_NOWAIT || + (shmfd->shm_lp_alloc_policy == + SHM_LARGEPAGE_ALLOC_DEFAULT && + try >= largepage_reclaim_tries)) { + VM_OBJECT_WLOCK(object); + return (ENOMEM); + } + error = vm_page_reclaim_contig(aflags, + pagesizes[psind] / PAGE_SIZE, 0, ~0, + pagesizes[psind], 0) ? 0 : + vm_wait_intr(object); + if (error != 0) { + VM_OBJECT_WLOCK(object); + return (error); + } + try++; + VM_OBJECT_WLOCK(object); + continue; + } + try = 0; + for (i = 0; i < pagesizes[psind] / PAGE_SIZE; i++) { + if ((m[i].flags & PG_ZERO) == 0) + pmap_zero_page(&m[i]); + vm_page_valid(&m[i]); + vm_page_xunbusy(&m[i]); + } + object->size += OFF_TO_IDX(pagesizes[psind]); + shmfd->shm_size += pagesizes[psind]; + atomic_add_long(&count_largepages[psind], 1); + vm_wire_add(atop(pagesizes[psind])); + } + return (0); +} + +static int +shm_dotruncate_cookie(struct shmfd *shmfd, off_t length, void *rl_cookie) +{ + int error; + + VM_OBJECT_WLOCK(shmfd->shm_object); + error = shm_largepage(shmfd) ? shm_dotruncate_largepage(shmfd, + length, rl_cookie) : shm_dotruncate_locked(shmfd, length, + rl_cookie); + VM_OBJECT_WUNLOCK(shmfd->shm_object); + return (error); +} + int shm_dotruncate(struct shmfd *shmfd, off_t length) { @@ -600,9 +840,7 @@ rl_cookie = rangelock_wlock(&shmfd->shm_rl, 0, OFF_MAX, &shmfd->shm_mtx); - VM_OBJECT_WLOCK(shmfd->shm_object); - error = shm_dotruncate_locked(shmfd, length, rl_cookie); - VM_OBJECT_WUNLOCK(shmfd->shm_object); + error = shm_dotruncate_cookie(shmfd, length, rl_cookie); rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx); return (error); } @@ -612,7 +850,7 @@ * routines. 
*/ struct shmfd * -shm_alloc(struct ucred *ucred, mode_t mode) +shm_alloc(struct ucred *ucred, mode_t mode, bool largepage) { struct shmfd *shmfd; @@ -621,8 +859,15 @@ shmfd->shm_uid = ucred->cr_uid; shmfd->shm_gid = ucred->cr_gid; shmfd->shm_mode = mode; - shmfd->shm_object = vm_pager_allocate(OBJT_SWAP, NULL, - shmfd->shm_size, VM_PROT_DEFAULT, 0, ucred); + if (largepage) { + shmfd->shm_object = phys_pager_allocate(NULL, + &shm_largepage_phys_ops, NULL, shmfd->shm_size, + VM_PROT_DEFAULT, 0, ucred); + shmfd->shm_lp_alloc_policy = SHM_LARGEPAGE_ALLOC_DEFAULT; + } else { + shmfd->shm_object = vm_pager_allocate(OBJT_SWAP, NULL, + shmfd->shm_size, VM_PROT_DEFAULT, 0, ucred); + } KASSERT(shmfd->shm_object != NULL, ("shm_create: vm_pager_allocate")); vfs_timestamp(&shmfd->shm_birthtime); shmfd->shm_atime = shmfd->shm_mtime = shmfd->shm_ctime = @@ -684,14 +929,11 @@ return (error); } -/* - * Dictionary management. We maintain an in-kernel dictionary to map - * paths to shmfd objects. We use the FNV hash on the path to store - * the mappings in a hash table. - */ static void shm_init(void *arg) { + char name[32]; + int i; mtx_init(&shm_timestamp_lock, "shm timestamps", NULL, MTX_DEF); sx_init(&shm_dict_lock, "shm dictionary"); @@ -699,9 +941,32 @@ new_unrhdr64(&shm_ino_unr, 1); shm_dev_ino = devfs_alloc_cdp_inode(); KASSERT(shm_dev_ino > 0, ("shm dev inode not initialized")); + + for (i = 1; i < MAXPAGESIZES; i++) { + if (pagesizes[i] == 0) + break; +#define M (1024 * 1024) +#define G (1024 * M) + if (pagesizes[i] >= G) + snprintf(name, sizeof(name), "%luG", pagesizes[i] / G); + else if (pagesizes[i] >= M) + snprintf(name, sizeof(name), "%luM", pagesizes[i] / M); + else + snprintf(name, sizeof(name), "%lu", pagesizes[i]); +#undef G +#undef M + SYSCTL_ADD_ULONG(NULL, SYSCTL_STATIC_CHILDREN(_vm_largepages), + OID_AUTO, name, CTLFLAG_RD, &count_largepages[i], + "number of non-transient largepages allocated"); + } } SYSINIT(shm_init, SI_SUB_SYSV_SHM, SI_ORDER_ANY, shm_init, NULL); +/* + * Dictionary management. We maintain an in-kernel dictionary to map + * paths to shmfd objects. We use the FNV hash on the path to store + * the mappings in a hash table. + */ static struct shmfd * shm_lookup(char *path, Fnv32_t fnv) { @@ -773,8 +1038,10 @@ Fnv32_t fnv; mode_t cmode; int error, fd, initial_seals; + bool largepage; - if ((shmflags & ~(SHM_ALLOW_SEALING | SHM_GROW_ON_WRITE)) != 0) + if ((shmflags & ~(SHM_ALLOW_SEALING | SHM_GROW_ON_WRITE | + SHM_LARGEPAGE)) != 0) return (EINVAL); initial_seals = F_SEAL_SEAL; @@ -798,6 +1065,12 @@ if ((flags & ~(O_ACCMODE | O_CREAT | O_EXCL | O_TRUNC | O_CLOEXEC)) != 0) return (EINVAL); + largepage = (shmflags & SHM_LARGEPAGE) != 0; +#if !defined(__amd64__) + if (largepage) + return (ENOTTY); +#endif + /* * Currently only F_SEAL_SEAL may be set when creating or opening shmfd. 
* If the decision is made later to allow additional seals, care must be @@ -831,7 +1104,7 @@ fdrop(fp, td); return (EINVAL); } - shmfd = shm_alloc(td->td_ucred, cmode); + shmfd = shm_alloc(td->td_ucred, cmode, largepage); shmfd->shm_seals = initial_seals; shmfd->shm_flags = shmflags; } else { @@ -854,7 +1127,8 @@ path); if (error == 0) { #endif - shmfd = shm_alloc(td->td_ucred, cmode); + shmfd = shm_alloc(td->td_ucred, cmode, + largepage); shmfd->shm_seals = initial_seals; shmfd->shm_flags = shmflags; shm_insert(path, fnv, shmfd); @@ -1139,7 +1413,118 @@ return (error); } -int +static int +shm_mmap_large(struct shmfd *shmfd, vm_map_t map, vm_offset_t *addr, + vm_size_t size, vm_prot_t prot, vm_prot_t max_prot, int flags, + vm_ooffset_t foff, bool writecounted, struct thread *td) +{ + struct vmspace *vms; + vm_map_entry_t next_entry, prev_entry; + vm_offset_t align, mask, maxaddr; + int docow, error, rv, try; + bool curmap; + + if (shmfd->shm_lp_psind == 0) + return (EINVAL); + + /* MAP_PRIVATE is disabled */ + if ((flags & ~(MAP_SHARED | MAP_FIXED | MAP_EXCL | + MAP_NOCORE | +#ifdef MAP_32BIT + MAP_32BIT | +#endif + MAP_ALIGNMENT_MASK)) != 0) + return (EINVAL); + + vms = td->td_proc->p_vmspace; + curmap = map == &vms->vm_map; + if (curmap) { + error = kern_mmap_racct_check(td, map, size); + if (error != 0) + return (error); + } + + docow = shmfd->shm_lp_psind << MAP_SPLIT_BOUNDARY_SHIFT; + docow |= MAP_INHERIT_SHARE; + if ((flags & MAP_NOCORE) != 0) + docow |= MAP_DISABLE_COREDUMP; + if (writecounted) + docow |= MAP_WRITECOUNT; + + mask = pagesizes[shmfd->shm_lp_psind] - 1; + if ((foff & mask) != 0) + return (EINVAL); + maxaddr = vm_map_max(map); +#ifdef MAP_32BIT + if ((flags & MAP_32BIT) != 0 && maxaddr > MAP_32BIT_MAX_ADDR) + maxaddr = MAP_32BIT_MAX_ADDR; +#endif + if (size == 0 || (size & mask) != 0 || + (*addr != 0 && ((*addr & mask) != 0 || + *addr + size < *addr || *addr + size > maxaddr))) + return (EINVAL); + + align = flags & MAP_ALIGNMENT_MASK; + if (align == 0) { + align = pagesizes[shmfd->shm_lp_psind]; + } else if (align == MAP_ALIGNED_SUPER) { + if (shmfd->shm_lp_psind != 1) + return (EINVAL); + align = pagesizes[1]; + } else { + align >>= MAP_ALIGNMENT_SHIFT; + align = 1ULL << align; + /* Also handles overflow. 
*/ + if (align < pagesizes[shmfd->shm_lp_psind]) + return (EINVAL); + } + + vm_map_lock(map); + if ((flags & MAP_FIXED) == 0) { + try = 1; + if (curmap && (*addr == 0 || + (*addr >= round_page((vm_offset_t)vms->vm_taddr) && + *addr < round_page((vm_offset_t)vms->vm_daddr + + lim_max(td, RLIMIT_DATA))))) { + *addr = roundup2((vm_offset_t)vms->vm_daddr + + lim_max(td, RLIMIT_DATA), + pagesizes[shmfd->shm_lp_psind]); + } +again: + rv = vm_map_find_aligned(map, addr, size, maxaddr, align); + if (rv != KERN_SUCCESS) { + if (try == 1) { + try = 2; + *addr = vm_map_min(map); + if ((*addr & mask) != 0) + *addr = (*addr + mask) & mask; + goto again; + } + goto fail1; + } + } else if ((flags & MAP_EXCL) == 0) { + rv = vm_map_delete(map, *addr, *addr + size); + if (rv != KERN_SUCCESS) + goto fail1; + } else { + error = ENOSPC; + if (vm_map_lookup_entry(map, *addr, &prev_entry)) + goto fail; + next_entry = vm_map_entry_succ(prev_entry); + if (next_entry->start < *addr + size) + goto fail; + } + + rv = vm_map_insert(map, shmfd->shm_object, foff, *addr, *addr + size, + prot, max_prot, docow); +fail1: + error = vm_mmap_to_errno(rv); +fail: + vm_map_unlock(map); + return (error); +} + +static int shm_mmap(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t objsize, vm_prot_t prot, vm_prot_t cap_maxprot, int flags, vm_ooffset_t foff, struct thread *td) @@ -1211,8 +1596,13 @@ if (writecnt) vm_pager_update_writecount(shmfd->shm_object, 0, objsize); - error = vm_mmap_object(map, addr, objsize, prot, maxprot, flags, - shmfd->shm_object, foff, writecnt, td); + if (shm_largepage(shmfd)) { + error = shm_mmap_large(shmfd, map, addr, objsize, prot, + maxprot, flags, foff, writecnt, td); + } else { + error = vm_mmap_object(map, addr, objsize, prot, maxprot, flags, + shmfd->shm_object, foff, writecnt, td); + } if (error != 0) { if (writecnt) vm_pager_release_writecount(shmfd->shm_object, 0, @@ -1503,11 +1893,8 @@ */ rl_cookie = rangelock_wlock(&shmfd->shm_rl, offset, size, &shmfd->shm_mtx); - if (size > shmfd->shm_size) { - VM_OBJECT_WLOCK(shmfd->shm_object); - error = shm_dotruncate_locked(shmfd, size, rl_cookie); - VM_OBJECT_WUNLOCK(shmfd->shm_object); - } + if (size > shmfd->shm_size) + error = shm_dotruncate_cookie(shmfd, size, rl_cookie); rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx); /* Translate to posix_fallocate(2) return value as needed. */ if (error == ENOMEM) Index: sys/sys/filio.h =================================================================== --- sys/sys/filio.h +++ sys/sys/filio.h @@ -70,6 +70,9 @@ }; /* Get the file's bmap info for the logical block bn. */ #define FIOBMAP2 _IOWR('f', 99, struct fiobmap2_arg) +/* POSIX shm largepage set/get config */ +#define FIOSSHMLPGCNF _IOW('f', 100, struct shm_largepage_conf) +#define FIOGSHMLPGCNF _IOR('f', 101, struct shm_largepage_conf) #ifdef _KERNEL #ifdef COMPAT_FREEBSD32 Index: sys/sys/mman.h =================================================================== --- sys/sys/mman.h +++ sys/sys/mman.h @@ -192,6 +192,17 @@ */ #define SHM_ALLOW_SEALING 0x00000001 #define SHM_GROW_ON_WRITE 0x00000002 +#define SHM_LARGEPAGE 0x00000004 + +#define SHM_LARGEPAGE_ALLOC_DEFAULT 0 +#define SHM_LARGEPAGE_ALLOC_NOWAIT 1 +#define SHM_LARGEPAGE_ALLOC_HARD 2 + +struct shm_largepage_conf { + int psind; + int alloc_policy; + int pad[10]; +}; /* * Flags for memfd_create(). 
@@ -199,7 +210,6 @@ #define MFD_CLOEXEC 0x00000001 #define MFD_ALLOW_SEALING 0x00000002 -/* UNSUPPORTED */ #define MFD_HUGETLB 0x00000004 #define MFD_HUGE_MASK 0xFC000000 @@ -282,6 +292,10 @@ int shm_flags; int shm_seals; + + /* largepage config */ + int shm_lp_psind; + int shm_lp_alloc_policy; }; #endif @@ -290,12 +304,16 @@ int shm_unmap(struct file *fp, void *mem, size_t size); int shm_access(struct shmfd *shmfd, struct ucred *ucred, int flags); -struct shmfd *shm_alloc(struct ucred *ucred, mode_t mode); +struct shmfd *shm_alloc(struct ucred *ucred, mode_t mode, bool largepage); struct shmfd *shm_hold(struct shmfd *shmfd); void shm_drop(struct shmfd *shmfd); int shm_dotruncate(struct shmfd *shmfd, off_t length); +bool shm_largepage(struct shmfd *shmfd); extern struct fileops shm_ops; + +#define MAP_32BIT_MAX_ADDR ((vm_offset_t)1 << 31) + #else /* !_KERNEL */ __BEGIN_DECLS @@ -329,6 +347,7 @@ #endif #if __BSD_VISIBLE int memfd_create(const char *, unsigned int); +int shm_create_largepage(const char *, int, int, int, mode_t); int shm_rename(const char *, const char *, int); #endif __END_DECLS Index: sys/vm/pmap.h =================================================================== --- sys/vm/pmap.h +++ sys/vm/pmap.h @@ -106,6 +106,7 @@ */ #define PMAP_ENTER_NOSLEEP 0x00000100 #define PMAP_ENTER_WIRED 0x00000200 +#define PMAP_ENTER_LARGEPAGE 0x00000400 #define PMAP_ENTER_RESERVED 0xFF000000 /* Index: sys/vm/vm_fault.c =================================================================== --- sys/vm/vm_fault.c +++ sys/vm/vm_fault.c @@ -420,7 +420,7 @@ vm_offset_t vaddr; vm_page_t m; vm_pindex_t map_first, map_last, pager_first, pager_last, pidx; - int i, npages, psind, rv; + int bdry_idx, i, npages, psind, rv; MPASS(fs->object == fs->first_object); VM_OBJECT_ASSERT_WLOCKED(fs->first_object); @@ -442,7 +442,8 @@ * to the driver. */ rv = vm_pager_populate(fs->first_object, fs->first_pindex, - fs->fault_type, fs->entry->max_protection, &pager_first, &pager_last); + fs->fault_type, fs->entry->max_protection, &pager_first, + &pager_last); VM_OBJECT_ASSERT_WLOCKED(fs->first_object); if (rv == VM_PAGER_BAD) { @@ -465,15 +466,57 @@ MPASS(pager_last < fs->first_object->size); vm_fault_restore_map_lock(fs); + bdry_idx = (fs->entry->eflags & MAP_ENTRY_SPLIT_BOUNDARY_MASK) >> + MAP_ENTRY_SPLIT_BOUNDARY_SHIFT; if (fs->map->timestamp != fs->map_generation) { - vm_fault_populate_cleanup(fs->first_object, pager_first, - pager_last); + if (bdry_idx == 0) { + vm_fault_populate_cleanup(fs->first_object, pager_first, + pager_last); + } else { + m = vm_page_lookup(fs->first_object, pager_first); + if (m != fs->m) + vm_page_xunbusy(m); + } return (KERN_RESTART); } /* * The map is unchanged after our last unlock. Process the fault. * + * First, the special case of largepage mappings, where + * populate only busies the first page in superpage run. 
+ */ + if (bdry_idx != 0) { + m = vm_page_lookup(fs->first_object, pager_first); + vm_fault_populate_check_page(m); + VM_OBJECT_WUNLOCK(fs->first_object); + vaddr = fs->entry->start + IDX_TO_OFF(pager_first) - + fs->entry->offset; + /* assert alignment for entry */ + KASSERT((vaddr & (pagesizes[bdry_idx] - 1)) == 0, + ("unaligned superpage start %#jx pager_first %#jx offset %#jx vaddr %#jx", + (uintmax_t)fs->entry->start, (uintmax_t)pager_first, + (uintmax_t)fs->entry->offset, (uintmax_t)vaddr)); + KASSERT((VM_PAGE_TO_PHYS(m) & (pagesizes[bdry_idx] - 1)) == 0, + ("unaligned superpage m %p %#jx", m, + (uintmax_t)VM_PAGE_TO_PHYS(m))); + rv = pmap_enter(fs->map->pmap, vaddr, m, fs->prot, + fs->fault_type | (fs->wired ? PMAP_ENTER_WIRED : 0) | + PMAP_ENTER_LARGEPAGE, bdry_idx); + VM_OBJECT_WLOCK(fs->first_object); + vm_page_xunbusy(m); + if ((fs->fault_flags & VM_FAULT_WIRE) != 0) { + for (i = 0; i < atop(pagesizes[bdry_idx]); i++) + vm_page_wire(m + i); + } + if (fs->m_hold != NULL) { + *fs->m_hold = m + (fs->first_pindex - pager_first); + vm_page_wire(*fs->m_hold); + } + goto out; + } + + /* * The range [pager_first, pager_last] that is given to the * pager is only a hint. The pager may populate any range * within the object that includes the requested page index. @@ -539,6 +582,7 @@ vm_page_xunbusy(&m[i]); } } +out: curthread->td_ru.ru_majflt++; return (KERN_SUCCESS); } @@ -1253,6 +1297,7 @@ * multiple page faults of a similar type to run in parallel. */ if (fs.vp == NULL /* avoid locked vnode leak */ && + (fs.entry->eflags & MAP_ENTRY_SPLIT_BOUNDARY_MASK) == 0 && (fs.fault_flags & (VM_FAULT_WIRE | VM_FAULT_DIRTY)) == 0) { VM_OBJECT_RLOCK(fs.first_object); rv = vm_fault_soft_fast(&fs); @@ -1285,6 +1330,27 @@ */ fs.object = fs.first_object; fs.pindex = fs.first_pindex; + + if ((fs.entry->eflags & MAP_ENTRY_SPLIT_BOUNDARY_MASK) != 0) { + rv = vm_fault_allocate(&fs); + switch (rv) { + case KERN_RESTART: + unlock_and_deallocate(&fs); + /* FALLTHROUGH */ + case KERN_RESOURCE_SHORTAGE: + goto RetryFault; + case KERN_SUCCESS: + case KERN_FAILURE: + case KERN_OUT_OF_BOUNDS: + unlock_and_deallocate(&fs); + return (rv); + case KERN_NOT_RECEIVER: + break; + default: + panic("vm_fault: Unhandled rv %d", rv); + } + } + while (TRUE) { KASSERT(fs.m == NULL, ("page still set %p at loop start", fs.m)); Index: sys/vm/vm_map.h =================================================================== --- sys/vm/vm_map.h +++ sys/vm/vm_map.h @@ -149,6 +149,10 @@ #define MAP_ENTRY_STACK_GAP_UP 0x00040000 #define MAP_ENTRY_HEADER 0x00080000 +#define MAP_ENTRY_SPLIT_BOUNDARY_MASK 0x00300000 + +#define MAP_ENTRY_SPLIT_BOUNDARY_SHIFT 20 + #ifdef _KERNEL static __inline u_char vm_map_entry_behavior(vm_map_entry_t entry) @@ -373,6 +377,9 @@ #define MAP_CREATE_STACK_GAP_UP 0x00010000 #define MAP_CREATE_STACK_GAP_DN 0x00020000 #define MAP_VN_EXEC 0x00040000 +#define MAP_SPLIT_BOUNDARY_MASK 0x00180000 + +#define MAP_SPLIT_BOUNDARY_SHIFT 19 /* * vm_fault option flags @@ -460,6 +467,8 @@ vm_offset_t, int, vm_prot_t, vm_prot_t, int); int vm_map_find_min(vm_map_t, vm_object_t, vm_ooffset_t, vm_offset_t *, vm_size_t, vm_offset_t, vm_offset_t, int, vm_prot_t, vm_prot_t, int); +int vm_map_find_aligned(vm_map_t map, vm_offset_t *addr, vm_size_t length, + vm_offset_t max_addr, vm_offset_t alignment); int vm_map_fixed(vm_map_t, vm_object_t, vm_ooffset_t, vm_offset_t, vm_size_t, vm_prot_t, vm_prot_t, int); vm_offset_t vm_map_findspace(vm_map_t, vm_offset_t, vm_size_t); Index: sys/vm/vm_map.c 
=================================================================== --- sys/vm/vm_map.c +++ sys/vm/vm_map.c @@ -1554,13 +1554,17 @@ struct ucred *cred; vm_eflags_t protoeflags; vm_inherit_t inheritance; + u_long bdry; + u_int bidx; VM_MAP_ASSERT_LOCKED(map); KASSERT(object != kernel_object || (cow & MAP_COPY_ON_WRITE) == 0, ("vm_map_insert: kernel object and COW")); - KASSERT(object == NULL || (cow & MAP_NOFAULT) == 0, - ("vm_map_insert: paradoxical MAP_NOFAULT request")); + KASSERT(object == NULL || (cow & MAP_NOFAULT) == 0 || + (cow & MAP_SPLIT_BOUNDARY_MASK) != 0, + ("vm_map_insert: paradoxical MAP_NOFAULT request, obj %p cow %#x", + object, cow)); KASSERT((prot & ~max) == 0, ("prot %#x is not subset of max_prot %#x", prot, max)); @@ -1615,6 +1619,17 @@ inheritance = VM_INHERIT_SHARE; else inheritance = VM_INHERIT_DEFAULT; + if ((cow & MAP_SPLIT_BOUNDARY_MASK) != 0) { + /* This magically ignores index 0, for usual page size. */ + bidx = (cow & MAP_SPLIT_BOUNDARY_MASK) >> + MAP_SPLIT_BOUNDARY_SHIFT; + if (bidx >= MAXPAGESIZES) + return (KERN_INVALID_ARGUMENT); + bdry = pagesizes[bidx] - 1; + if ((start & bdry) != 0 || (end & bdry) != 0) + return (KERN_INVALID_ARGUMENT); + protoeflags |= bidx << MAP_ENTRY_SPLIT_BOUNDARY_SHIFT; + } cred = NULL; if ((cow & (MAP_ACC_NO_CHARGE | MAP_NOFAULT | MAP_CREATE_GUARD)) != 0) @@ -1868,8 +1883,11 @@ ("vm_map_fixed: non-NULL backing object for stack")); vm_map_lock(map); VM_MAP_RANGE_CHECK(map, start, end); - if ((cow & MAP_CHECK_EXCL) == 0) - vm_map_delete(map, start, end); + if ((cow & MAP_CHECK_EXCL) == 0) { + result = vm_map_delete(map, start, end); + if (result != KERN_SUCCESS) + goto out; + } if ((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) != 0) { result = vm_map_stack_locked(map, start, length, sgrowsiz, prot, max, cow); @@ -1877,6 +1895,7 @@ result = vm_map_insert(map, object, offset, start, end, prot, max, cow); } +out: vm_map_unlock(map); return (result); } @@ -1909,8 +1928,6 @@ &aslr_restarts, 0, "Number of aslr failures"); -#define MAP_32BIT_MAX_ADDR ((vm_offset_t)1 << 31) - /* * Searches for the specified amount of free space in the given map with the * specified alignment. Performs an address-ordered, first-fit search from @@ -1978,6 +1995,19 @@ } } +int +vm_map_find_aligned(vm_map_t map, vm_offset_t *addr, vm_size_t length, + vm_offset_t max_addr, vm_offset_t alignment) +{ + /* XXXKIB ASLR eh ? */ + *addr = vm_map_findspace(map, *addr, length); + if (*addr + length > vm_map_max(map) || + (max_addr != 0 && *addr + length > max_addr)) + return (KERN_NO_SPACE); + return (vm_map_alignspace(map, NULL, 0, addr, length, max_addr, + alignment)); +} + /* * vm_map_find finds an unallocated region in the target address * map with the given length. The search is defined to be @@ -2115,7 +2145,9 @@ rv = KERN_INVALID_ADDRESS; goto done; } - vm_map_delete(map, *addr, *addr + length); + rv = vm_map_delete(map, *addr, *addr + length); + if (rv != KERN_SUCCESS) + goto done; } if ((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) != 0) { rv = vm_map_stack_locked(map, *addr, length, sgrowsiz, prot, @@ -2325,31 +2357,40 @@ * the specified address; if necessary, * it splits the entry into two. 
*/ -static inline void -vm_map_clip_start(vm_map_t map, vm_map_entry_t entry, vm_offset_t start) +static int +vm_map_clip_start(vm_map_t map, vm_map_entry_t entry, vm_offset_t startaddr) { vm_map_entry_t new_entry; + int bdry_idx; if (!map->system_map) WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "%s: map %p entry %p start 0x%jx", __func__, map, entry, - (uintmax_t)start); + (uintmax_t)startaddr); - if (start <= entry->start) - return; + if (startaddr <= entry->start) + return (KERN_SUCCESS); VM_MAP_ASSERT_LOCKED(map); - KASSERT(entry->end > start && entry->start < start, + KASSERT(entry->end > startaddr && entry->start < startaddr, ("%s: invalid clip of entry %p", __func__, entry)); + bdry_idx = (entry->eflags & MAP_ENTRY_SPLIT_BOUNDARY_MASK) >> + MAP_ENTRY_SPLIT_BOUNDARY_SHIFT; + if (bdry_idx != 0) { + if ((startaddr & (pagesizes[bdry_idx] - 1)) != 0) + return (KERN_INVALID_ARGUMENT); + } + new_entry = vm_map_entry_clone(map, entry); /* * Split off the front portion. Insert the new entry BEFORE this one, * so that this entry has the specified starting address. */ - new_entry->end = start; + new_entry->end = startaddr; vm_map_entry_link(map, new_entry); + return (KERN_SUCCESS); } /* @@ -2359,11 +2400,12 @@ * the interior of the entry. Return entry after 'start', and in * prev_entry set the entry before 'start'. */ -static inline vm_map_entry_t +static int vm_map_lookup_clip_start(vm_map_t map, vm_offset_t start, - vm_map_entry_t *prev_entry) + vm_map_entry_t *res_entry, vm_map_entry_t *prev_entry) { vm_map_entry_t entry; + int rv; if (!map->system_map) WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, @@ -2372,11 +2414,14 @@ if (vm_map_lookup_entry(map, start, prev_entry)) { entry = *prev_entry; - vm_map_clip_start(map, entry, start); + rv = vm_map_clip_start(map, entry, start); + if (rv != KERN_SUCCESS) + return (rv); *prev_entry = vm_map_entry_pred(entry); } else entry = vm_map_entry_succ(*prev_entry); - return (entry); + *res_entry = entry; + return (KERN_SUCCESS); } /* @@ -2386,31 +2431,41 @@ * the specified address; if necessary, * it splits the entry into two. */ -static inline void -vm_map_clip_end(vm_map_t map, vm_map_entry_t entry, vm_offset_t end) +static int +vm_map_clip_end(vm_map_t map, vm_map_entry_t entry, vm_offset_t endaddr) { vm_map_entry_t new_entry; + int bdry_idx; if (!map->system_map) WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "%s: map %p entry %p end 0x%jx", __func__, map, entry, - (uintmax_t)end); + (uintmax_t)endaddr); - if (end >= entry->end) - return; + if (endaddr >= entry->end) + return (KERN_SUCCESS); VM_MAP_ASSERT_LOCKED(map); - KASSERT(entry->start < end && entry->end > end, + KASSERT(entry->start < endaddr && entry->end > endaddr, ("%s: invalid clip of entry %p", __func__, entry)); + bdry_idx = (entry->eflags & MAP_ENTRY_SPLIT_BOUNDARY_MASK) >> + MAP_ENTRY_SPLIT_BOUNDARY_SHIFT; + if (bdry_idx != 0) { + if ((endaddr & (pagesizes[bdry_idx] - 1)) != 0) + return (KERN_INVALID_ARGUMENT); + } + new_entry = vm_map_entry_clone(map, entry); /* * Split off the back portion. Insert the new entry AFTER this one, * so that this entry has the specified ending address. 
*/ - new_entry->start = end; + new_entry->start = endaddr; vm_map_entry_link(map, new_entry); + + return (KERN_SUCCESS); } /* @@ -2452,12 +2507,17 @@ if (vm_map_lookup_entry(map, start, &entry) && entry->end >= end && (entry->eflags & MAP_ENTRY_COW) == 0 && entry->object.vm_object == NULL) { - vm_map_clip_start(map, entry, start); - vm_map_clip_end(map, entry, end); + result = vm_map_clip_start(map, entry, start); + if (result != KERN_SUCCESS) + goto unlock; + result = vm_map_clip_end(map, entry, end); + if (result != KERN_SUCCESS) + goto unlock; entry->object.sub_map = submap; entry->eflags |= MAP_ENTRY_IS_SUB_MAP; result = KERN_SUCCESS; } +unlock: vm_map_unlock(map); if (result != KERN_SUCCESS) { @@ -2644,11 +2704,18 @@ * of this loop early and let the next loop simplify the entries, since * some may now be mergeable. */ - rv = KERN_SUCCESS; - vm_map_clip_start(map, first_entry, start); + rv = vm_map_clip_start(map, first_entry, start); + if (rv != KERN_SUCCESS) { + vm_map_unlock(map); + return (rv); + } for (entry = first_entry; entry->start < end; entry = vm_map_entry_succ(entry)) { - vm_map_clip_end(map, entry, end); + rv = vm_map_clip_end(map, entry, end); + if (rv != KERN_SUCCESS) { + vm_map_unlock(map); + return (rv); + } if (set_max || ((new_prot & ~entry->protection) & VM_PROT_WRITE) == 0 || @@ -2768,6 +2835,7 @@ int behav) { vm_map_entry_t entry, prev_entry; + int rv; bool modify_map; /* @@ -2813,13 +2881,22 @@ * We clip the vm_map_entry so that behavioral changes are * limited to the specified address range. */ - for (entry = vm_map_lookup_clip_start(map, start, &prev_entry); - entry->start < end; - prev_entry = entry, entry = vm_map_entry_succ(entry)) { + rv = vm_map_lookup_clip_start(map, start, &entry, &prev_entry); + if (rv != KERN_SUCCESS) { + vm_map_unlock(map); + return (vm_mmap_to_errno(rv)); + } + + for (; entry->start < end; prev_entry = entry, + entry = vm_map_entry_succ(entry)) { if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0) continue; - vm_map_clip_end(map, entry, end); + rv = vm_map_clip_end(map, entry, end); + if (rv != KERN_SUCCESS) { + vm_map_unlock(map); + return (vm_mmap_to_errno(rv)); + } switch (behav) { case MADV_NORMAL: @@ -2952,7 +3029,8 @@ vm_map_inherit(vm_map_t map, vm_offset_t start, vm_offset_t end, vm_inherit_t new_inheritance) { - vm_map_entry_t entry, prev_entry; + vm_map_entry_t entry, lentry, prev_entry, start_entry; + int rv; switch (new_inheritance) { case VM_INHERIT_NONE: @@ -2967,18 +3045,37 @@ return (KERN_SUCCESS); vm_map_lock(map); VM_MAP_RANGE_CHECK(map, start, end); - for (entry = vm_map_lookup_clip_start(map, start, &prev_entry); - entry->start < end; - prev_entry = entry, entry = vm_map_entry_succ(entry)) { - vm_map_clip_end(map, entry, end); + rv = vm_map_lookup_clip_start(map, start, &start_entry, &prev_entry); + if (rv != KERN_SUCCESS) + goto unlock; + if (vm_map_lookup_entry(map, end - 1, &lentry)) { + rv = vm_map_clip_end(map, lentry, end); + if (rv != KERN_SUCCESS) + goto unlock; + } + if (new_inheritance == VM_INHERIT_COPY) { + for (entry = start_entry; entry->start < end; + prev_entry = entry, entry = vm_map_entry_succ(entry)) { + if ((entry->eflags & MAP_ENTRY_SPLIT_BOUNDARY_MASK) + != 0) { + rv = KERN_INVALID_ARGUMENT; + goto unlock; + } + } + } + for (entry = start_entry; entry->start < end; prev_entry = entry, + entry = vm_map_entry_succ(entry)) { + KASSERT(entry->end <= end, ("non-clipped entry %p end %jx %jx", + entry, (uintmax_t)entry->end, (uintmax_t)end)); if ((entry->eflags & MAP_ENTRY_GUARD) == 0 || 
new_inheritance != VM_INHERIT_ZERO) entry->inheritance = new_inheritance; vm_map_try_merge_entries(map, prev_entry, entry); } vm_map_try_merge_entries(map, prev_entry, entry); +unlock: vm_map_unlock(map); - return (KERN_SUCCESS); + return (rv); } /* @@ -3077,8 +3174,13 @@ next_entry : NULL; continue; } - vm_map_clip_start(map, entry, start); - vm_map_clip_end(map, entry, end); + rv = vm_map_clip_start(map, entry, start); + if (rv != KERN_SUCCESS) + break; + rv = vm_map_clip_end(map, entry, end); + if (rv != KERN_SUCCESS) + break; + /* * Mark the entry in case the map lock is released. (See * above.) @@ -3245,8 +3347,8 @@ { vm_map_entry_t entry, first_entry, next_entry, prev_entry; vm_offset_t faddr, saved_end, saved_start; - u_long npages; - u_int last_timestamp; + u_long incr, npages; + u_int bidx, last_timestamp; int rv; bool holes_ok, need_wakeup, user_wire; vm_prot_t prot; @@ -3284,8 +3386,13 @@ next_entry : NULL; continue; } - vm_map_clip_start(map, entry, start); - vm_map_clip_end(map, entry, end); + rv = vm_map_clip_start(map, entry, start); + if (rv != KERN_SUCCESS) + goto done; + rv = vm_map_clip_end(map, entry, end); + if (rv != KERN_SUCCESS) + goto done; + /* * Mark the entry in case the map lock is released. (See * above.) @@ -3322,20 +3429,23 @@ saved_start = entry->start; saved_end = entry->end; last_timestamp = map->timestamp; + bidx = (entry->eflags & MAP_ENTRY_SPLIT_BOUNDARY_MASK) + >> MAP_ENTRY_SPLIT_BOUNDARY_SHIFT; + incr = pagesizes[bidx]; vm_map_busy(map); vm_map_unlock(map); - faddr = saved_start; - do { + for (faddr = saved_start; faddr < saved_end; + faddr += incr) { /* * Simulate a fault to get the page and enter * it into the physical map. */ - if ((rv = vm_fault(map, faddr, - VM_PROT_NONE, VM_FAULT_WIRE, NULL)) != - KERN_SUCCESS) + rv = vm_fault(map, faddr, VM_PROT_NONE, + VM_FAULT_WIRE, NULL); + if (rv != KERN_SUCCESS) break; - } while ((faddr += PAGE_SIZE) < saved_end); + } vm_map_lock(map); vm_map_unbusy(map); if (last_timestamp + 1 != map->timestamp) { @@ -3410,10 +3520,14 @@ * Moreover, another thread could be simultaneously * wiring this new mapping entry. Detect these cases * and skip any entries marked as in transition not by us. + * + * Another way to get an entry not marked with + * MAP_ENTRY_IN_TRANSITION is after failed clipping, + * which set rv to KERN_INVALID_ARGUMENT. */ if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 || entry->wiring_thread != curthread) { - KASSERT(holes_ok, + KASSERT(holes_ok || rv == KERN_INVALID_ARGUMENT, ("vm_map_wire: !HOLESOK and new/changed entry")); continue; } @@ -3491,6 +3605,7 @@ vm_object_t object; vm_ooffset_t offset; unsigned int last_timestamp; + int bdry_idx; boolean_t failed; vm_map_lock_read(map); @@ -3502,14 +3617,26 @@ start = first_entry->start; end = first_entry->end; } + /* - * Make a first pass to check for user-wired memory and holes. + * Make a first pass to check for user-wired memory, holes, + * and partial invalidation of largepage mappings. 
*/ for (entry = first_entry; entry->start < end; entry = next_entry) { - if (invalidate && - (entry->eflags & MAP_ENTRY_USER_WIRED) != 0) { - vm_map_unlock_read(map); - return (KERN_INVALID_ARGUMENT); + if (invalidate) { + if ((entry->eflags & MAP_ENTRY_USER_WIRED) != 0) { + vm_map_unlock_read(map); + return (KERN_INVALID_ARGUMENT); + } + bdry_idx = (entry->eflags & + MAP_ENTRY_SPLIT_BOUNDARY_MASK) >> + MAP_ENTRY_SPLIT_BOUNDARY_SHIFT; + if (bdry_idx != 0 && + ((start & (pagesizes[bdry_idx] - 1)) != 0 || + (end & (pagesizes[bdry_idx] - 1)) != 0)) { + vm_map_unlock_read(map); + return (KERN_INVALID_ARGUMENT); + } } next_entry = vm_map_entry_succ(entry); if (end > entry->end && @@ -3686,7 +3813,8 @@ int vm_map_delete(vm_map_t map, vm_offset_t start, vm_offset_t end) { - vm_map_entry_t entry, next_entry; + vm_map_entry_t entry, next_entry, scratch_entry; + int rv; VM_MAP_ASSERT_LOCKED(map); @@ -3697,8 +3825,10 @@ * Find the start of the region, and clip it. * Step through all entries in this region. */ - for (entry = vm_map_lookup_clip_start(map, start, &entry); - entry->start < end; entry = next_entry) { + rv = vm_map_lookup_clip_start(map, start, &entry, &scratch_entry); + if (rv != KERN_SUCCESS) + return (rv); + for (; entry->start < end; entry = next_entry) { /* * Wait for wiring or unwiring of an entry to complete. * Also wait for any system wirings to disappear on @@ -3722,13 +3852,19 @@ * Specifically, the entry may have been * clipped, merged, or deleted. */ - next_entry = vm_map_lookup_clip_start(map, - saved_start, &next_entry); + rv = vm_map_lookup_clip_start(map, saved_start, + &next_entry, &scratch_entry); + if (rv != KERN_SUCCESS) + break; } else next_entry = entry; continue; } - vm_map_clip_end(map, entry, end); + + /* XXXKIB or delete to the upper superpage boundary ? */ + rv = vm_map_clip_end(map, entry, end); + if (rv != KERN_SUCCESS) + break; next_entry = vm_map_entry_succ(entry); /* @@ -3758,7 +3894,7 @@ */ vm_map_entry_delete(map, entry); } - return (KERN_SUCCESS); + return (rv); } /* @@ -4202,7 +4338,8 @@ new_entry->end = old_entry->end; new_entry->eflags = old_entry->eflags & ~(MAP_ENTRY_USER_WIRED | MAP_ENTRY_IN_TRANSITION | - MAP_ENTRY_WRITECNT | MAP_ENTRY_VN_EXEC); + MAP_ENTRY_WRITECNT | MAP_ENTRY_VN_EXEC | + MAP_ENTRY_SPLIT_BOUNDARY_MASK); new_entry->protection = old_entry->protection; new_entry->max_protection = old_entry->max_protection; new_entry->inheritance = VM_INHERIT_ZERO; Index: sys/vm/vm_mmap.c =================================================================== --- sys/vm/vm_mmap.c +++ sys/vm/vm_mmap.c @@ -219,14 +219,14 @@ struct file *fp; struct proc *p; off_t pos; - vm_offset_t addr; + vm_offset_t addr, orig_addr; vm_size_t len, pageoff, size; vm_prot_t cap_maxprot; int align, error, fd, flags, max_prot, prot; cap_rights_t rights; mmap_check_fp_fn check_fp_fn; - addr = mrp->mr_hint; + orig_addr = addr = mrp->mr_hint; len = mrp->mr_len; prot = mrp->mr_prot; flags = mrp->mr_flags; @@ -422,6 +422,8 @@ if (error != 0) goto done; } + if (fp->f_ops == &shm_ops && shm_largepage(fp->f_data)) + addr = orig_addr; /* This relies on VM_PROT_* matching PROT_*. 
*/ error = fo_mmap(fp, &vms->vm_map, &addr, size, prot, max_prot & cap_maxprot, flags, pos, td); @@ -577,6 +579,7 @@ vm_offset_t addr, end; vm_size_t pageoff; vm_map_t map; + int rv; if (size == 0) return (EINVAL); @@ -614,10 +617,10 @@ } } #endif - vm_map_delete(map, addr, end); + rv = vm_map_delete(map, addr, end); #ifdef HWPMC_HOOKS - if (__predict_false(pmc_handled)) { + if (rv == KERN_SUCCESS && __predict_false(pmc_handled)) { /* downgrade the lock to prevent a LOR with the pmc-sx lock */ vm_map_lock_downgrade(map); if (pkm.pm_address != (uintptr_t) NULL) @@ -627,8 +630,7 @@ #endif vm_map_unlock(map); - /* vm_map_delete returns nothing but KERN_SUCCESS anyway */ - return (0); + return (vm_mmap_to_errno(rv)); } #ifndef _SYS_SYSPROTO_H_ @@ -1104,7 +1106,14 @@ PROC_UNLOCK(proc); } #endif - return (error == KERN_SUCCESS ? 0 : ENOMEM); + switch (error) { + case KERN_SUCCESS: + return (0); + case KERN_INVALID_ARGUMENT: + return (EINVAL); + default: + return (ENOMEM); + } } #ifndef _SYS_SYSPROTO_H_ Index: sys/vm/vm_unix.c =================================================================== --- sys/vm/vm_unix.c +++ sys/vm/vm_unix.c @@ -188,7 +188,7 @@ rv = vm_map_wire_locked(map, old, new, VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES); if (rv != KERN_SUCCESS) - vm_map_delete(map, old, new); + (void)vm_map_delete(map, old, new); } if (rv != KERN_SUCCESS) { #ifdef RACCT Index: tests/sys/posixshm/posixshm_test.c =================================================================== --- tests/sys/posixshm/posixshm_test.c +++ tests/sys/posixshm/posixshm_test.c @@ -28,6 +28,7 @@ __FBSDID("$FreeBSD$"); #include +#include #include #include #include @@ -956,6 +957,667 @@ close(fd); } +static int +shm_open_large(int psind, int policy, size_t sz) +{ + int error, fd; + + fd = shm_create_largepage(SHM_ANON, O_CREAT | O_RDWR, psind, policy, 0); + ATF_REQUIRE_MSG(fd >= 0, "shm_create_largepage failed; errno=%d", errno); + + error = ftruncate(fd, sz); + if (error != 0 && errno == ENOMEM) + /* XXX depends on alloc policy */ + atf_tc_skip("failed to allocate %zu-byte superpage", sz); + ATF_REQUIRE_MSG(error == 0, "ftruncate failed; errno=%d", errno); + + return (fd); +} + +static int +pagesizes(size_t ps[MAXPAGESIZES]) +{ + int pscnt; + + pscnt = getpagesizes(ps, MAXPAGESIZES); + ATF_REQUIRE_MSG(pscnt != -1, "getpagesizes failed; errno=%d", errno); + ATF_REQUIRE_MSG(ps[0] == PAGE_SIZE, "psind 0 is %zu", ps[0]); + if (pscnt == 1) + atf_tc_skip("no large page support"); + + return (pscnt); +} + +ATF_TC_WITHOUT_HEAD(largepage_basic); +ATF_TC_BODY(largepage_basic, tc) +{ + char zeroes[PAGE_SIZE]; + char *addr, *vec; + size_t ps[MAXPAGESIZES]; + int error, fd, pscnt; + + if (MAXPAGESIZES == 1) + atf_tc_pass(); + + memset(zeroes, 0, PAGE_SIZE); + + pscnt = pagesizes(ps); + for (int i = 1; i < pscnt; i++) { + fd = shm_open_large(i, SHM_LARGEPAGE_ALLOC_DEFAULT, ps[i]); + + addr = mmap(NULL, ps[i], PROT_READ | PROT_WRITE, MAP_SHARED, fd, + 0); + ATF_REQUIRE_MSG(addr != MAP_FAILED, + "mmap(%zu bytes) failed; errno=%d", ps[i], errno); + ATF_REQUIRE_MSG(((uintptr_t)addr & (ps[i] - 1)) == 0, + "mmap(%zu bytes) returned unaligned mapping; addr=%p", + ps[i], addr); + + /* Force a page fault. */ + *(volatile char *)addr = 0; + + vec = malloc(ps[i] / PAGE_SIZE); + ATF_REQUIRE(vec != NULL); + error = mincore(addr, ps[i], vec); + ATF_REQUIRE_MSG(error == 0, "mincore failed; errno=%d", errno); + + /* Verify that all pages in the run are mapped. 
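+		 * mincore() reports MINCORE_PSIND(i) for pages that are
+		 * mapped by a pagesizes[i] superpage.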
*/ + for (size_t p = 0; p < ps[i] / PAGE_SIZE; p++) { + ATF_REQUIRE_MSG((vec[p] & MINCORE_INCORE) != 0, + "page %zu is not mapped", p); + ATF_REQUIRE_MSG((vec[p] & MINCORE_PSIND(i)) != 0, + "page %zu is not in a %zu-byte superpage", + p, ps[i]); + } + + /* Validate zeroing. */ + for (size_t p = 0; p < ps[i] / PAGE_SIZE; p++) { + ATF_REQUIRE_MSG(memcmp(addr + p * PAGE_SIZE, zeroes, + PAGE_SIZE) == 0, "page %zu miscompare", p); + } + + free(vec); + ATF_REQUIRE(munmap(addr, ps[i]) == 0); + ATF_REQUIRE(close(fd) == 0); + } +} + +ATF_TC_WITHOUT_HEAD(largepage_config); +ATF_TC_BODY(largepage_config, tc) +{ + struct shm_largepage_conf lpc; + char *addr, *buf; + size_t ps[MAXPAGESIZES]; + int error, fd, pscnt; + + if (MAXPAGESIZES == 1) + atf_tc_pass(); + + pscnt = pagesizes(ps); + + fd = shm_open(SHM_ANON, O_CREAT | O_RDWR, 0); + ATF_REQUIRE_MSG(fd >= 0, "shm_open failed; error=%d", errno); + + /* + * Configure a large page policy for an object created without + * SHM_LARGEPAGE. + */ + lpc.psind = 1; + lpc.alloc_policy = SHM_LARGEPAGE_ALLOC_DEFAULT; + error = ioctl(fd, FIOSSHMLPGCNF, &lpc); + ATF_REQUIRE(error != 0); + ATF_REQUIRE_MSG(errno == ENOTTY, "ioctl(FIOSSHMLPGCNF) returned %d", + errno); + ATF_REQUIRE(close(fd) == 0); + + /* + * Create a largepage object and try to use it without actually + * configuring anything. + */ + fd = syscall(SYS_shm_open2, SHM_ANON, O_CREAT | O_RDWR, 0, + SHM_LARGEPAGE, NULL); + ATF_REQUIRE_MSG(fd >= 0, "shm_open2 failed; error=%d", errno); + + error = ftruncate(fd, ps[1]); + ATF_REQUIRE(error != 0); + ATF_REQUIRE_MSG(errno == EINVAL, "ftruncate returned %d", errno); + + addr = mmap(NULL, ps[1], PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + ATF_REQUIRE(addr == MAP_FAILED); + ATF_REQUIRE_MSG(errno == EINVAL, "mmap returned %d", errno); + addr = mmap(NULL, 0, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + ATF_REQUIRE(addr == MAP_FAILED); + ATF_REQUIRE_MSG(errno == EINVAL, "mmap returned %d", errno); + + buf = calloc(1, PAGE_SIZE); + ATF_REQUIRE(buf != NULL); + ATF_REQUIRE(write(fd, buf, PAGE_SIZE) == -1); + ATF_REQUIRE_MSG(errno == EINVAL, "write returned %d", errno); + free(buf); + buf = calloc(1, ps[1]); + ATF_REQUIRE(buf != NULL); + ATF_REQUIRE(write(fd, buf, ps[1]) == -1); + ATF_REQUIRE_MSG(errno == EINVAL, "write returned %d", errno); + free(buf); + + error = posix_fallocate(fd, 0, PAGE_SIZE); + ATF_REQUIRE_MSG(error == EINVAL, "posix_fallocate returned %d", error); + + ATF_REQUIRE(close(fd) == 0); +} + +ATF_TC_WITHOUT_HEAD(largepage_mmap); +ATF_TC_BODY(largepage_mmap, tc) +{ + char *addr, *addr1, *vec; + size_t ps[MAXPAGESIZES]; + int fd, pscnt; + + if (MAXPAGESIZES == 1) + atf_tc_pass(); + + pscnt = pagesizes(ps); + for (int i = 1; i < pscnt; i++) { + fd = shm_open_large(i, SHM_LARGEPAGE_ALLOC_DEFAULT, ps[i]); + + /* For mincore(). */ + vec = malloc(ps[i]); + ATF_REQUIRE(vec != NULL); + + /* + * Wrong mapping size. + */ + addr = mmap(NULL, ps[i - 1], PROT_READ | PROT_WRITE, MAP_SHARED, + fd, 0); + ATF_REQUIRE_MSG(addr == MAP_FAILED, "mmap(%zu bytes) succeeded", + ps[i - 1]); + ATF_REQUIRE_MSG(errno == EINVAL, "mmap(%zu bytes) error=%d", + ps[i - 1], errno); + + /* + * Fixed mappings. + */ + addr = mmap(NULL, ps[i], PROT_READ | PROT_WRITE, MAP_SHARED, fd, + 0); + ATF_REQUIRE_MSG(addr != MAP_FAILED, + "mmap(%zu bytes) failed; errno=%d", ps[i], errno); + ATF_REQUIRE_MSG(((uintptr_t)addr & (ps[i] - 1)) == 0, + "mmap(%zu bytes) returned unaligned mapping; addr=%p", + ps[i], addr); + + /* Try mapping a small page with anonymous memory. 
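+		 * Doing this on top of the largepage mapping would require
+		 * clipping the largepage entry, so the request is expected to
+		 * fail with EINVAL.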
 */
+		addr1 = mmap(addr, ps[i - 1], PROT_READ | PROT_WRITE,
+		    MAP_PRIVATE | MAP_ANON | MAP_FIXED, -1, 0);
+		ATF_REQUIRE_MSG(addr1 == MAP_FAILED,
+		    "anon mmap(%zu bytes) succeeded", ps[i - 1]);
+		ATF_REQUIRE_MSG(errno == EINVAL, "mmap returned %d", errno);
+
+		/* Check MAP_EXCL when creating a second largepage mapping. */
+		addr1 = mmap(addr, ps[i], PROT_READ | PROT_WRITE,
+		    MAP_SHARED | MAP_FIXED | MAP_EXCL, fd, 0);
+		ATF_REQUIRE_MSG(addr1 == MAP_FAILED,
+		    "remap(%zu bytes) succeeded", ps[i]);
+		/* XXX wrong errno */
+		ATF_REQUIRE_MSG(errno == ENOSPC, "mmap returned %d", errno);
+
+		/* Overwrite a largepage mapping with a largepage mapping. */
+		addr1 = mmap(addr, ps[i], PROT_READ | PROT_WRITE,
+		    MAP_SHARED | MAP_FIXED, fd, 0);
+		ATF_REQUIRE_MSG(addr1 != MAP_FAILED,
+		    "mmap(%zu bytes) failed; errno=%d", ps[i], errno);
+		ATF_REQUIRE_MSG(addr == addr1,
+		    "mmap(%zu bytes) moved from %p to %p", ps[i], addr, addr1);
+
+		ATF_REQUIRE(munmap(addr, ps[i]) == 0);
+
+		/* Clobber an anonymous mapping with a superpage. */
+		addr1 = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE,
+		    MAP_ANON | MAP_PRIVATE | MAP_ALIGNED(30 /* XXX */), -1, 0);
+		ATF_REQUIRE_MSG(addr1 != MAP_FAILED,
+		    "mmap failed; error=%d", errno);
+		addr = mmap(addr1, ps[i], PROT_READ | PROT_WRITE,
+		    MAP_SHARED | MAP_FIXED, fd, 0);
+		ATF_REQUIRE_MSG(addr != MAP_FAILED,
+		    "mmap failed; error=%d", errno);
+		ATF_REQUIRE_MSG(addr == addr1,
+		    "mmap disobeyed MAP_FIXED, %p %p", addr, addr1);
+		*(volatile char *)addr = 0;	/* fault */
+		ATF_REQUIRE(mincore(addr, ps[i], vec) == 0);
+		for (size_t p = 0; p < ps[i] / PAGE_SIZE; p++) {
+			/* XXX make it a subr */
+			ATF_REQUIRE_MSG((vec[p] & MINCORE_INCORE) != 0,
+			    "page %zu is not resident", p);
+			ATF_REQUIRE_MSG((vec[p] & MINCORE_PSIND(i)) != 0,
+			    "page %zu is not resident", p);
+		}
+
+		/*
+		 * Copy-on-write mappings are not permitted.
+		 */
+		addr = mmap(NULL, ps[i], PROT_READ | PROT_WRITE, MAP_PRIVATE,
+		    fd, 0);
+		ATF_REQUIRE_MSG(addr == MAP_FAILED,
+		    "mmap(%zu bytes) succeeded", ps[i]);
+
+		ATF_REQUIRE(close(fd) == 0);
+	}
+}
+
+ATF_TC_WITHOUT_HEAD(largepage_munmap);
+ATF_TC_BODY(largepage_munmap, tc)
+{
+	char *addr;
+	size_t ps[MAXPAGESIZES], ps1;
+	int fd, pscnt;
+
+	if (MAXPAGESIZES == 1)
+		atf_tc_pass();
+
+	pscnt = pagesizes(ps);
+	for (int i = 1; i < pscnt; i++) {
+		fd = shm_open_large(i, SHM_LARGEPAGE_ALLOC_DEFAULT, ps[i]);
+		ps1 = ps[i - 1];
+
+		addr = mmap(NULL, ps[i], PROT_READ | PROT_WRITE, MAP_SHARED, fd,
+		    0);
+		ATF_REQUIRE_MSG(addr != MAP_FAILED,
+		    "mmap(%zu bytes) failed; errno=%d", ps[i], errno);
+
+		/* Try several unaligned munmap() requests.
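+		 * Each covers only part of the largepage mapping and must
+		 * fail with EINVAL; the mapping can only be unmapped as a
+		 * whole.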
*/ + ATF_REQUIRE(munmap(addr, ps1) != 0); + ATF_REQUIRE_MSG(errno == EINVAL, + "unexpected error %d from munmap", errno); + ATF_REQUIRE(munmap(addr, ps[i] - ps1)); + ATF_REQUIRE_MSG(errno == EINVAL, + "unexpected error %d from munmap", errno); + ATF_REQUIRE(munmap(addr + ps1, ps1) != 0); + ATF_REQUIRE_MSG(errno == EINVAL, + "unexpected error %d from munmap", errno); + ATF_REQUIRE(munmap(addr, 0)); + ATF_REQUIRE_MSG(errno == EINVAL, + "unexpected error %d from munmap", errno); + + ATF_REQUIRE(munmap(addr, ps[i]) == 0); + ATF_REQUIRE(close(fd) == 0); + } +} + +static void +largepage_madvise(char *addr, size_t sz, int advice, int error) +{ + if (error == 0) { + ATF_REQUIRE_MSG(madvise(addr, sz, advice) == 0, + "madvise(%zu, %d) failed; error=%d", sz, advice, errno); + } else { + ATF_REQUIRE_MSG(madvise(addr, sz, advice) != 0, + "madvise(%zu, %d) succeeded", sz, advice); + ATF_REQUIRE_MSG(errno == error, + "unexpected error %d from madvise(%zu, %d)", + errno, sz, advice); + } +} + +ATF_TC_WITHOUT_HEAD(largepage_madvise); +ATF_TC_BODY(largepage_madvise, tc) +{ + char *addr; + size_t ps[MAXPAGESIZES]; + int fd, pscnt; + + if (MAXPAGESIZES == 1) + atf_tc_pass(); + + pscnt = pagesizes(ps); + for (int i = 1; i < pscnt; i++) { + fd = shm_open_large(i, SHM_LARGEPAGE_ALLOC_DEFAULT, ps[i]); + addr = mmap(NULL, ps[i], PROT_READ | PROT_WRITE, MAP_SHARED, fd, + 0); + ATF_REQUIRE_MSG(addr != MAP_FAILED, + "mmap(%zu bytes) failed; error=%d", ps[i], errno); + + /* Advice that requires clipping. */ + largepage_madvise(addr, PAGE_SIZE, MADV_NORMAL, EINVAL); + largepage_madvise(addr, ps[i], MADV_NORMAL, 0); + largepage_madvise(addr, PAGE_SIZE, MADV_RANDOM, EINVAL); + largepage_madvise(addr, ps[i], MADV_RANDOM, 0); + largepage_madvise(addr, PAGE_SIZE, MADV_SEQUENTIAL, EINVAL); + largepage_madvise(addr, ps[i], MADV_SEQUENTIAL, 0); + largepage_madvise(addr, PAGE_SIZE, MADV_NOSYNC, EINVAL); + largepage_madvise(addr, ps[i], MADV_NOSYNC, 0); + largepage_madvise(addr, PAGE_SIZE, MADV_AUTOSYNC, EINVAL); + largepage_madvise(addr, ps[i], MADV_AUTOSYNC, 0); + largepage_madvise(addr, PAGE_SIZE, MADV_CORE, EINVAL); + largepage_madvise(addr, ps[i], MADV_CORE, 0); + largepage_madvise(addr, PAGE_SIZE, MADV_NOCORE, EINVAL); + largepage_madvise(addr, ps[i], MADV_NOCORE, 0); + + /* Advice that does not result in clipping. */ + largepage_madvise(addr, PAGE_SIZE, MADV_DONTNEED, 0); + largepage_madvise(addr, ps[i], MADV_DONTNEED, 0); + largepage_madvise(addr, PAGE_SIZE, MADV_WILLNEED, 0); + largepage_madvise(addr, ps[i], MADV_WILLNEED, 0); + largepage_madvise(addr, PAGE_SIZE, MADV_FREE, 0); + largepage_madvise(addr, ps[i], MADV_FREE, 0); + + ATF_REQUIRE(munmap(addr, ps[i]) == 0); + ATF_REQUIRE(close(fd) == 0); + } +} + +ATF_TC(largepage_mlock); +ATF_TC_HEAD(largepage_mlock, tc) +{ + /* Needed to set rlimit. 
*/ + atf_tc_set_md_var(tc, "require.user", "root"); +} +ATF_TC_BODY(largepage_mlock, tc) +{ + struct rlimit rl; + char *addr; + size_t ps[MAXPAGESIZES]; + int fd, pscnt; + + if (MAXPAGESIZES == 1) + atf_tc_pass(); + + /* XXX max_user_wired also needs to be bumped */ + rl.rlim_cur = rl.rlim_max = RLIM_INFINITY; + ATF_REQUIRE_MSG(setrlimit(RLIMIT_MEMLOCK, &rl) == 0, + "setrlimit failed; error=%d", errno); + + pscnt = pagesizes(ps); + for (int i = 1; i < pscnt; i++) { + fd = shm_open_large(i, SHM_LARGEPAGE_ALLOC_DEFAULT, ps[i]); + addr = mmap(NULL, ps[i], PROT_READ | PROT_WRITE, MAP_SHARED, fd, + 0); + ATF_REQUIRE_MSG(addr != MAP_FAILED, + "mmap(%zu bytes) failed; error=%d", ps[i], errno); + + ATF_REQUIRE(mlock(addr, PAGE_SIZE) != 0); + ATF_REQUIRE_MSG(errno == EINVAL, + "unexpected error %d from mlock(%zu bytes)", errno, ps[i]); + ATF_REQUIRE(mlock(addr, ps[i] - PAGE_SIZE) != 0); + ATF_REQUIRE_MSG(errno == EINVAL, + "unexpected error %d from mlock(%zu bytes)", errno, ps[i]); + + ATF_REQUIRE_MSG(mlock(addr, ps[i]) == 0, + "mlock failed; error=%d", errno); + + ATF_REQUIRE(munmap(addr, ps[i]) == 0); + + ATF_REQUIRE(mlockall(MCL_FUTURE) == 0); + addr = mmap(NULL, ps[i], PROT_READ | PROT_WRITE, MAP_SHARED, fd, + 0); + ATF_REQUIRE_MSG(addr != MAP_FAILED, + "mmap(%zu bytes) failed; error=%d", ps[i], errno); + + ATF_REQUIRE(munmap(addr, ps[i]) == 0); + ATF_REQUIRE(close(fd) == 0); + } +} + +ATF_TC_WITHOUT_HEAD(largepage_msync); +ATF_TC_BODY(largepage_msync, tc) +{ + char *addr; + size_t ps[MAXPAGESIZES]; + int fd, pscnt; + + if (MAXPAGESIZES == 1) + atf_tc_pass(); + + pscnt = pagesizes(ps); + for (int i = 1; i < pscnt; i++) { + fd = shm_open_large(i, SHM_LARGEPAGE_ALLOC_DEFAULT, ps[i]); + addr = mmap(NULL, ps[i], PROT_READ | PROT_WRITE, MAP_SHARED, fd, + 0); + ATF_REQUIRE_MSG(addr != MAP_FAILED, + "mmap(%zu bytes) failed; error=%d", ps[i], errno); + + memset(addr, 0, ps[i]); + + /* + * "Sync" requests are no-ops for SHM objects, so small + * PAGE_SIZE-sized requests succeed. 
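+		 * MS_INVALIDATE is different: it must cover the whole
+		 * largepage mapping, so the PAGE_SIZE-sized request below is
+		 * expected to fail (currently reported as EBUSY).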
+ */ + ATF_REQUIRE_MSG(msync(addr, PAGE_SIZE, MS_ASYNC) == 0, + "msync(MS_ASYNC) failed; error=%d", errno); + ATF_REQUIRE_MSG(msync(addr, ps[i], MS_ASYNC) == 0, + "msync(MS_ASYNC) failed; error=%d", errno); + ATF_REQUIRE_MSG(msync(addr, PAGE_SIZE, MS_SYNC) == 0, + "msync(MS_SYNC) failed; error=%d", errno); + ATF_REQUIRE_MSG(msync(addr, ps[i], MS_SYNC) == 0, + "msync(MS_SYNC) failed; error=%d", errno); + + ATF_REQUIRE_MSG(msync(addr, PAGE_SIZE, MS_INVALIDATE) != 0, + "msync(MS_INVALIDATE) succeeded"); + /* XXX wrong errno */ + ATF_REQUIRE_MSG(errno == EBUSY, + "unexpected error %d from msync(MS_INVALIDATE)", errno); + ATF_REQUIRE_MSG(msync(addr, ps[i], MS_INVALIDATE) == 0, + "msync(MS_INVALIDATE) failed; error=%d", errno); + memset(addr, 0, ps[i]); + + ATF_REQUIRE(munmap(addr, ps[i]) == 0); + ATF_REQUIRE(close(fd) == 0); + } +} + +static void +largepage_protect(char *addr, size_t sz, int prot, int error) +{ + if (error == 0) { + ATF_REQUIRE_MSG(mprotect(addr, sz, prot) == 0, + "mprotect(%zu, %x) failed; error=%d", sz, prot, errno); + } else { + ATF_REQUIRE_MSG(mprotect(addr, sz, prot) != 0, + "mprotect(%zu, %x) succeeded", sz, prot); + ATF_REQUIRE_MSG(errno == error, + "unexpected error %d from mprotect(%zu, %x)", + errno, sz, prot); + } +} + +ATF_TC_WITHOUT_HEAD(largepage_mprotect); +ATF_TC_BODY(largepage_mprotect, tc) +{ + char *addr, *addr1; + size_t ps[MAXPAGESIZES]; + int fd, pscnt; + + if (MAXPAGESIZES == 1) + atf_tc_pass(); + + pscnt = pagesizes(ps); + for (int i = 1; i < pscnt; i++) { + fd = shm_open_large(i, SHM_LARGEPAGE_ALLOC_DEFAULT, ps[i]); + addr = mmap(NULL, ps[i], PROT_READ | PROT_WRITE, MAP_SHARED, fd, + 0); + ATF_REQUIRE_MSG(addr != MAP_FAILED, + "mmap(%zu bytes) failed; error=%d", ps[i], errno); + + /* + * These should be no-ops from the pmap perspective since the + * page is not yet entered into the pmap. + */ + largepage_protect(addr, PAGE_SIZE, PROT_READ, EINVAL); + largepage_protect(addr, ps[i], PROT_READ, 0); + largepage_protect(addr, PAGE_SIZE, PROT_NONE, EINVAL); + largepage_protect(addr, ps[i], PROT_NONE, 0); + largepage_protect(addr, PAGE_SIZE, + PROT_READ | PROT_WRITE | PROT_EXEC, EINVAL); + largepage_protect(addr, ps[i], + PROT_READ | PROT_WRITE | PROT_EXEC, 0); + + /* Trigger creation of a mapping and try again. */ + *(volatile char *)addr = 0; + largepage_protect(addr, PAGE_SIZE, PROT_READ, EINVAL); + largepage_protect(addr, ps[i], PROT_READ, 0); + largepage_protect(addr, PAGE_SIZE, PROT_NONE, EINVAL); + largepage_protect(addr, ps[i], PROT_NONE, 0); + largepage_protect(addr, PAGE_SIZE, + PROT_READ | PROT_WRITE | PROT_EXEC, EINVAL); + largepage_protect(addr, ps[i], + PROT_READ | PROT_WRITE | PROT_EXEC, 0); + + memset(addr, 0, ps[i]); + + /* Map two contiguous large pages and merge map entries. 
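+		 * An mprotect() request that straddles the boundary between
+		 * the two mappings but is not largepage-aligned must fail;
+		 * protecting the full two-superpage range succeeds.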
*/ + addr1 = mmap(addr + ps[i], ps[i], PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_FIXED | MAP_EXCL, fd, 0); + /* XXX can fail if no space exists, use MAP_GUARD */ + ATF_REQUIRE_MSG(addr1 != MAP_FAILED, + "mmap(%zu bytes) failed; error=%d", ps[i], errno); + + largepage_protect(addr1 - PAGE_SIZE, PAGE_SIZE * 2, + PROT_READ | PROT_WRITE, EINVAL); + largepage_protect(addr, ps[i] * 2, PROT_READ | PROT_WRITE, 0); + + memset(addr, 0, ps[i] * 2); + + ATF_REQUIRE(munmap(addr, ps[i]) == 0); + ATF_REQUIRE(munmap(addr1, ps[i]) == 0); + ATF_REQUIRE(close(fd) == 0); + } +} + +ATF_TC_WITHOUT_HEAD(largepage_minherit); +ATF_TC_BODY(largepage_minherit, tc) +{ + char *addr; + size_t ps[MAXPAGESIZES]; + pid_t child; + int fd, pscnt, status; + + if (MAXPAGESIZES == 1) + atf_tc_pass(); + + pscnt = pagesizes(ps); + for (int i = 1; i < pscnt; i++) { + fd = shm_open_large(i, SHM_LARGEPAGE_ALLOC_DEFAULT, ps[i]); + addr = mmap(NULL, ps[i], PROT_READ | PROT_WRITE, MAP_SHARED, fd, + 0); + ATF_REQUIRE_MSG(addr != MAP_FAILED, + "mmap(%zu bytes) failed; error=%d", ps[i], errno); + + ATF_REQUIRE(minherit(addr, PAGE_SIZE, INHERIT_SHARE) != 0); + + ATF_REQUIRE_MSG(minherit(addr, ps[i], INHERIT_SHARE) == 0, + "minherit(%zu bytes) failed; error=%d", ps[i], errno); + child = fork(); + ATF_REQUIRE_MSG(child != -1, "fork failed; error=%d", errno); + if (child == 0) { + char v; + + *(volatile char *)addr = 0; + if (mincore(addr, PAGE_SIZE, &v) != 0) + _exit(1); + if ((v & MINCORE_PSIND(i)) == 0) + _exit(2); + _exit(0); + } + ATF_REQUIRE_MSG(waitpid(child, &status, 0) == child, + "waitpid failed; error=%d", errno); + ATF_REQUIRE_MSG(WIFEXITED(status), + "child was killed by signal %d", WTERMSIG(status)); + ATF_REQUIRE_MSG(WEXITSTATUS(status) == 0, + "child exited with status %d", WEXITSTATUS(status)); + + ATF_REQUIRE_MSG(minherit(addr, ps[i], INHERIT_NONE) == 0, + "minherit(%zu bytes) failed; error=%d", ps[i], errno); + child = fork(); + ATF_REQUIRE_MSG(child != -1, "fork failed; error=%d", errno); + if (child == 0) { + char v; + + if (mincore(addr, PAGE_SIZE, &v) == 0) + _exit(1); + _exit(0); + } + ATF_REQUIRE_MSG(waitpid(child, &status, 0) == child, + "waitpid failed; error=%d", errno); + ATF_REQUIRE_MSG(WIFEXITED(status), + "child was killed by signal %d", WTERMSIG(status)); + ATF_REQUIRE_MSG(WEXITSTATUS(status) == 0, + "child exited with status %d", WEXITSTATUS(status)); + + /* Copy-on-write is not supported for static large pages. 
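+		 * INHERIT_COPY is therefore rejected.  INHERIT_ZERO is
+		 * allowed, but the child gets a fresh zero-filled mapping
+		 * that is no longer backed by a superpage.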
*/ + ATF_REQUIRE_MSG(minherit(addr, ps[i], INHERIT_COPY) != 0, + "minherit(%zu bytes) succeeded", ps[i]); + + ATF_REQUIRE_MSG(minherit(addr, ps[i], INHERIT_ZERO) == 0, + "minherit(%zu bytes) failed; error=%d", ps[i], errno); + child = fork(); + ATF_REQUIRE_MSG(child != -1, "fork failed; error=%d", errno); + if (child == 0) { + char v; + + *(volatile char *)addr = 0; + if (mincore(addr, PAGE_SIZE, &v) != 0) + _exit(1); + if ((v & MINCORE_SUPER) != 0) + _exit(2); + _exit(0); + } + ATF_REQUIRE_MSG(waitpid(child, &status, 0) == child, + "waitpid failed; error=%d", errno); + ATF_REQUIRE_MSG(WIFEXITED(status), + "child was killed by signal %d", WTERMSIG(status)); + ATF_REQUIRE_MSG(WEXITSTATUS(status) == 0, + "child exited with status %d", WEXITSTATUS(status)); + + ATF_REQUIRE(munmap(addr, ps[i]) == 0); + ATF_REQUIRE(close(fd) == 0); + } +} + +ATF_TC_WITHOUT_HEAD(largepage_reopen); +ATF_TC_BODY(largepage_reopen, tc) +{ + char *addr, *vec; + size_t ps[MAXPAGESIZES]; + int fd, psind; + + if (MAXPAGESIZES == 1) + atf_tc_pass(); + + (void)pagesizes(ps); + psind = 1; + + gen_test_path(); + fd = shm_create_largepage(test_path, O_CREAT | O_RDWR, psind, + SHM_LARGEPAGE_ALLOC_DEFAULT, 0600); + if (fd < 0 && errno == EINVAL) /* XXX is it the right errno? */ + atf_tc_skip("no large page support"); + ATF_REQUIRE_MSG(fd >= 0, "shm_create_largepage failed; error=%d", errno); + + ATF_REQUIRE_MSG(ftruncate(fd, ps[psind]) == 0, + "ftruncate failed; error=%d", errno); + + ATF_REQUIRE_MSG(close(fd) == 0, "close failed; error=%d", errno); + + fd = shm_open(test_path, O_RDWR, 0); + ATF_REQUIRE_MSG(fd >= 0, "shm_open failed; error=%d", errno); + + addr = mmap(NULL, ps[psind], PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + ATF_REQUIRE_MSG(addr != MAP_FAILED, "mmap failed; error=%d", errno); + + /* Trigger a fault and mapping creation. 
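+	 * The largepage configuration is a property of the shm object, so a
+	 * mapping created through the re-opened descriptor is still expected
+	 * to be backed by a superpage.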
*/ + *(volatile char *)addr = 0; + + vec = malloc(ps[psind] / PAGE_SIZE); + ATF_REQUIRE(vec != NULL); + ATF_REQUIRE_MSG(mincore(addr, ps[psind], vec) == 0, + "mincore failed; error=%d", errno); + ATF_REQUIRE_MSG((vec[0] & MINCORE_PSIND(psind)) != 0, + "page not mapped into a %zu-byte superpage", ps[psind]); + + ATF_REQUIRE_MSG(shm_unlink(test_path) == 0, + "shm_unlink failed; errno=%d", errno); + ATF_REQUIRE_MSG(close(fd) == 0, + "close failed; errno=%d", errno); +} + ATF_TP_ADD_TCS(tp) { @@ -990,6 +1652,16 @@ ATF_TP_ADD_TC(tp, cloexec); ATF_TP_ADD_TC(tp, mode); ATF_TP_ADD_TC(tp, fallocate); + ATF_TP_ADD_TC(tp, largepage_basic); + ATF_TP_ADD_TC(tp, largepage_config); + ATF_TP_ADD_TC(tp, largepage_mmap); + ATF_TP_ADD_TC(tp, largepage_munmap); + ATF_TP_ADD_TC(tp, largepage_madvise); + ATF_TP_ADD_TC(tp, largepage_mlock); + ATF_TP_ADD_TC(tp, largepage_msync); + ATF_TP_ADD_TC(tp, largepage_mprotect); + ATF_TP_ADD_TC(tp, largepage_minherit); + ATF_TP_ADD_TC(tp, largepage_reopen); return (atf_no_error()); } Index: usr.bin/posixshmcontrol/posixshmcontrol.c =================================================================== --- usr.bin/posixshmcontrol/posixshmcontrol.c +++ usr.bin/posixshmcontrol/posixshmcontrol.c @@ -30,8 +30,10 @@ __FBSDID("$FreeBSD$"); #include +#include #include #include +#include #include #include #include @@ -50,7 +52,7 @@ { fprintf(stderr, "Usage:\n" - "posixshmcontrol create [-m ] ...\n" + "posixshmcontrol create [-m ] [-l ] ...\n" "posixshmcontrol rm ...\n" "posixshmcontrol ls [-h] [-n]\n" "posixshmcontrol dump ...\n" @@ -59,14 +61,23 @@ } static int -create_one_shm(const char *path, long mode) +create_one_shm(const char *path, long mode, int idx) { int fd; - fd = shm_open(path, O_RDWR | O_CREAT, mode); - if (fd == -1) { - warn("create %s", path); - return (1); + if (idx == -1) { + fd = shm_open(path, O_RDWR | O_CREAT, mode); + if (fd == -1) { + warn("create %s", path); + return (1); + } + } else { + fd = shm_create_largepage(path, O_RDWR, idx, + SHM_LARGEPAGE_ALLOC_DEFAULT, mode); + if (fd == -1) { + warn("shm_create_largepage %s psind %d", path, idx); + return (1); + } } close(fd); return (0); @@ -76,20 +87,60 @@ create_shm(int argc, char **argv) { char *end; + size_t *pagesizes; long mode; - int c, i, ret, ret1; + uint64_t pgsz; + int c, i, idx, pn, ret, ret1; + bool printed; mode = 0600; - while ((c = getopt(argc, argv, "m:")) != -1) { + idx = -1; + while ((c = getopt(argc, argv, "l:m:")) != -1) { switch (c) { case 'm': errno = 0; mode = strtol(optarg, &end, 0); if (mode == 0 && errno != 0) - err(1, "mode:"); + err(1, "mode"); if (*end != '\0') errx(1, "non-integer mode"); break; + case 'l': + if (expand_number(optarg, &pgsz) == -1) + err(1, "size"); + pn = getpagesizes(NULL, 0); + if (pn == -1) + err(1, "getpagesizes"); + pagesizes = malloc(sizeof(size_t) * pn); + if (pagesizes == NULL) + err(1, "malloc"); + if (getpagesizes(pagesizes, pn) == -1) + err(1, "gtpagesizes"); + for (idx = 0; idx < pn; idx++) { + if (pagesizes[idx] == pgsz) + break; + } + if (idx == pn) { + fprintf(stderr, + "pagesize should be superpagesize, supported sizes:"); + printed = false; + for (i = 0; i < pn; i++) { + if (pagesizes[i] == 0 || + pagesizes[i] == (size_t) + getpagesize()) + continue; + printed = true; + fprintf(stderr, " %zu", pagesizes[i]); + } + if (!printed) + fprintf(stderr, " none"); + fprintf(stderr, "\n"); + exit(1); + } + if (pgsz == (uint64_t)getpagesize()) + errx(1, "pagesize should be large"); + free(pagesizes); + break; case '?': default: usage(); @@ -101,7 +152,7 @@ argv 
+= optind; ret = 0; for (i = 0; i < argc; i++) { - ret1 = create_one_shm(argv[i], mode); + ret1 = create_one_shm(argv[i], mode, idx); if (ret1 != 0 && ret == 0) ret = ret1; } @@ -349,6 +400,9 @@ (long)st.st_ctim.tv_nsec); printf("birth\t%ld.%09ld\n", (long)st.st_birthtim.tv_sec, (long)st.st_birthtim.tv_nsec); + if (st.st_blocks != 0) + printf("pagesz\t%jd\n", roundup((uintmax_t)st.st_size, + PAGE_SIZE) / st.st_blocks); } close(fd); return (ret);
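
For reference, below is a minimal user-level sketch of the interface this patch adds. It is illustrative only and not part of the change: it assumes a system where pagesizes[1] is a superpage size (for example 2 MB on amd64), relies on the new shm_create_largepage(), SHM_LARGEPAGE_ALLOC_DEFAULT, and MINCORE_PSIND() definitions from <sys/mman.h>, and abbreviates error handling.

#include <sys/param.h>
#include <sys/mman.h>

#include <err.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	char *addr, *vec;
	size_t len, ps[MAXPAGESIZES];
	int fd;

	/* Find the smallest superpage size; ps[0] is the base page size. */
	if (getpagesizes(ps, MAXPAGESIZES) < 2)
		errx(1, "no superpage support");
	len = ps[1];

	/* Create an anonymous largepage object using page size index 1. */
	fd = shm_create_largepage(SHM_ANON, O_CREAT | O_RDWR, 1,
	    SHM_LARGEPAGE_ALLOC_DEFAULT, 0600);
	if (fd == -1)
		err(1, "shm_create_largepage");

	/* The object size must be a multiple of the configured page size. */
	if (ftruncate(fd, len) != 0)
		err(1, "ftruncate");

	addr = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (addr == MAP_FAILED)
		err(1, "mmap");
	memset(addr, 0, len);		/* fault the superpage in */

	/* Check that the mapping is superpage-backed. */
	vec = malloc(len / getpagesize());
	if (vec == NULL || mincore(addr, len, vec) != 0)
		err(1, "mincore");
	printf("superpage-backed: %s\n",
	    (vec[0] & MINCORE_PSIND(1)) != 0 ? "yes" : "no");

	/* Partial unmaps of a largepage mapping are rejected. */
	if (munmap(addr, getpagesize()) == 0)
		errx(1, "partial munmap unexpectedly succeeded");

	free(vec);
	if (munmap(addr, len) != 0)
		err(1, "munmap");
	close(fd);
	return (0);
}

A named largepage object can also be created from the command line with the new posixshmcontrol flag, e.g. "posixshmcontrol create -l 2m /lptest" (the path and size are arbitrary examples; -l accepts any size reported by getpagesizes(3) other than the base page size).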