D24652.id74286.diff
Index: TODO
===================================================================
--- /dev/null
+++ TODO
@@ -0,0 +1,5 @@
+- per-user limit on the total superpages allocations
+- man pages
+- export shm_open2(2) from libc.so ?
+- make pmap_superpagesizes[] per-pmap ?
+- more test programs
Index: lib/libc/sys/shm_open.c
===================================================================
--- lib/libc/sys/shm_open.c
+++ lib/libc/sys/shm_open.c
@@ -31,14 +31,17 @@
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
-#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/filio.h>
#include <sys/mman.h>
#include <errno.h>
#include <fcntl.h>
#include <limits.h>
+#include <stdlib.h>
#include <stdio.h>
#include <string.h>
+#include <unistd.h>
#include "libc_private.h"
@@ -54,6 +57,27 @@
return (__sys_shm_open2(path, flags | O_CLOEXEC, mode, 0, NULL));
}
+#define K(x) ((size_t)(x) * 1024)
+#define M(x) (K(x) * 1024)
+#define G(x) (M(x) * 1024)
+static const struct {
+ int mask;
+ size_t pgsize;
+} mfd_huge_sizes[] = {
+ { .mask = MFD_HUGE_64KB, .pgsize = K(64) },
+ { .mask = MFD_HUGE_512KB, .pgsize = K(512) },
+ { .mask = MFD_HUGE_1MB, .pgsize = M(1) },
+ { .mask = MFD_HUGE_2MB, .pgsize = M(2) },
+ { .mask = MFD_HUGE_8MB, .pgsize = M(8) },
+ { .mask = MFD_HUGE_16MB, .pgsize = M(16) },
+ { .mask = MFD_HUGE_32MB, .pgsize = M(32) },
+ { .mask = MFD_HUGE_256MB, .pgsize = M(256) },
+ { .mask = MFD_HUGE_512MB, .pgsize = M(512) },
+ { .mask = MFD_HUGE_1GB, .pgsize = G(1) },
+ { .mask = MFD_HUGE_2GB, .pgsize = G(2) },
+ { .mask = MFD_HUGE_16GB, .pgsize = G(16) },
+};
+
/*
* The path argument is passed to the kernel, but the kernel doesn't currently
* do anything with it. Linux exposes it in linprocfs for debugging purposes
@@ -63,8 +87,9 @@
memfd_create(const char *name, unsigned int flags)
{
char memfd_name[NAME_MAX + 1];
- size_t namelen;
- int oflags, shmflags;
+ size_t namelen, *pgs;
+ struct shm_largepage_conf slc;
+ int error, fd, i, npgs, oflags, pgidx, saved_errno, shmflags;
if (name == NULL)
return (EBADF);
@@ -75,11 +100,9 @@
MFD_HUGE_MASK)) != 0)
return (EINVAL);
/* Size specified but no HUGETLB. */
- if ((flags & MFD_HUGE_MASK) != 0 && (flags & MFD_HUGETLB) == 0)
+ if (((flags & MFD_HUGE_MASK) != 0 && (flags & MFD_HUGETLB) == 0) ||
+ __bitcount(flags & MFD_HUGE_MASK) > 1)
return (EINVAL);
- /* We don't actually support HUGETLB. */
- if ((flags & MFD_HUGETLB) != 0)
- return (ENOSYS);
/* We've already validated that we're sufficiently sized. */
snprintf(memfd_name, NAME_MAX + 1, "%s%s", MEMFD_NAME_PREFIX, name);
@@ -89,5 +112,57 @@
oflags |= O_CLOEXEC;
if ((flags & MFD_ALLOW_SEALING) != 0)
shmflags |= SHM_ALLOW_SEALING;
- return (__sys_shm_open2(SHM_ANON, oflags, 0, shmflags, memfd_name));
+ if ((flags & MFD_HUGETLB) != 0)
+ shmflags |= SHM_LARGEPAGE;
+ fd = __sys_shm_open2(SHM_ANON, oflags, 0, shmflags, memfd_name);
+ if (fd == -1 || (flags & MFD_HUGETLB) == 0)
+ return (fd);
+
+ pgs = NULL;
+ npgs = getpagesizes(NULL, 0);
+ if (npgs == -1)
+ goto clean;
+ pgs = calloc(npgs, sizeof(size_t));
+ if (pgs == NULL)
+ goto clean;
+ error = getpagesizes(pgs, npgs);
+ if (error == -1)
+ goto clean;
+ if ((flags & MFD_HUGE_MASK) == 0) {
+ if (npgs == 1) {
+ errno = EOPNOTSUPP;
+ goto clean;
+ }
+ pgidx = 1;
+ } else {
+ for (i = 0; i < nitems(mfd_huge_sizes); i++) {
+ if (mfd_huge_sizes[i].mask == (flags & MFD_HUGE_MASK))
+ break;
+ }
+ for (pgidx = 0; pgidx < npgs; pgidx++) {
+ if (mfd_huge_sizes[i].pgsize == pgs[pgidx])
+ break;
+ }
+ if (pgidx == npgs) {
+ errno = EOPNOTSUPP;
+ goto clean;
+ }
+ }
+ free(pgs);
+ pgs = NULL;
+
+ memset(&slc, 0, sizeof(slc));
+ slc.psind = pgidx;
+ slc.alloc_policy = SHM_LARGEPAGE_ALLOC_DEFAULT;
+ error = ioctl(fd, FIOSHMLPGCNF, &slc);
+ if (error == -1)
+ goto clean;
+ return (fd);
+
+clean:
+ saved_errno = errno;
+ close(fd);
+ free(pgs);
+ errno = saved_errno;
+ return (-1);
}
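
For orientation, here is a minimal userland sketch of how the reworked memfd_create() above is intended to be driven once this lands. The object name, the choice of MFD_HUGE_2MB, the 4M length, and the err(3) error handling are illustrative assumptions, not part of the patch:

#include <sys/mman.h>

#include <err.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	size_t len = 4 * 1024 * 1024;	/* two 2M superpages */
	char *p;
	int fd;

	/* Anonymous shared memory explicitly backed by 2M pages. */
	fd = memfd_create("demo", MFD_CLOEXEC | MFD_HUGETLB | MFD_HUGE_2MB);
	if (fd == -1)
		err(1, "memfd_create");
	/* The length must be a multiple of the configured large page size. */
	if (ftruncate(fd, (off_t)len) == -1)
		err(1, "ftruncate");
	p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED)
		err(1, "mmap");
	memset(p, 0xa5, len);		/* fault the superpages in */
	return (0);
}
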
Index: sys/amd64/amd64/pmap.c
===================================================================
--- sys/amd64/amd64/pmap.c
+++ sys/amd64/amd64/pmap.c
@@ -48,7 +48,7 @@
*/
/*-
* Copyright (c) 2003 Networks Associates Technology, Inc.
- * Copyright (c) 2014-2019 The FreeBSD Foundation
+ * Copyright (c) 2014-2020 The FreeBSD Foundation
* All rights reserved.
*
* This software was developed for the FreeBSD Project by Jake Burkholder,
@@ -1334,6 +1334,8 @@
pdpe = pmap_pdpe(pmap, va);
if (pdpe == NULL || (*pdpe & PG_V) == 0)
return (NULL);
+ KASSERT((*pdpe & PG_PS) == 0,
+ ("pmap_pde for 1G page, pmap %p va %#lx", pmap, va));
return (pmap_pdpe_to_pde(pdpe, va));
}
@@ -2141,6 +2143,11 @@
KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0,
("pmap_init: can't assign to pagesizes[1]"));
pagesizes[1] = NBPDR;
+ if ((amd_feature & AMDID_PAGE1GB) != 0) {
+ KASSERT(MAXPAGESIZES > 1 && pagesizes[2] == 0,
+ ("pmap_init: can't assign to pagesizes[2]"));
+ pagesizes[2] = NBPDP;
+ }
}
/*
@@ -5445,6 +5452,7 @@
pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
struct rwlock *lock;
+ vm_page_t mt;
vm_offset_t va_next;
pml4_entry_t *pml4e;
pdp_entry_t *pdpe;
@@ -5497,13 +5505,28 @@
}
pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
+ va_next = (sva + NBPDP) & ~PDPMASK;
if ((*pdpe & PG_V) == 0) {
- va_next = (sva + NBPDP) & ~PDPMASK;
if (va_next < sva)
va_next = eva;
continue;
}
+ KASSERT((*pdpe & PG_PS) == 0 || va_next <= eva,
+ ("pmap_remove of non-transient 1G page "
+ "pdpe %#lx sva %#lx eva %#lx va_next %#lx",
+ *pdpe, sva, eva, va_next));
+ if ((*pdpe & PG_PS) != 0) {
+ MPASS(pmap != kernel_pmap); /* XXXKIB */
+ MPASS((*pdpe & (PG_MANAGED | PG_G)) == 0);
+ anyvalid = 1;
+ *pdpe = 0;
+ pmap_resident_count_dec(pmap, NBPDP / PAGE_SIZE);
+ mt = PHYS_TO_VM_PAGE(*pmap_pml4e(pmap, sva) & PG_FRAME);
+ pmap_unwire_ptp(pmap, sva, mt, &free);
+ continue;
+ }
+
/*
* Calculate index for next page table.
*/
@@ -5719,11 +5742,13 @@
void
pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
{
+ vm_page_t m;
vm_offset_t va_next;
pml4_entry_t *pml4e;
pdp_entry_t *pdpe;
pd_entry_t ptpaddr, *pde;
pt_entry_t *pte, PG_G, PG_M, PG_RW, PG_V;
+ pt_entry_t obits, pbits;
boolean_t anychanged;
KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot));
@@ -5774,13 +5799,36 @@
}
pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
+ va_next = (sva + NBPDP) & ~PDPMASK;
if ((*pdpe & PG_V) == 0) {
- va_next = (sva + NBPDP) & ~PDPMASK;
if (va_next < sva)
va_next = eva;
continue;
}
+ KASSERT((*pdpe & PG_PS) == 0 || va_next <= eva,
+ ("pmap_remove of non-transient 1G page "
+ "pdpe %#lx sva %#lx eva %#lx va_next %#lx",
+ *pdpe, sva, eva, va_next));
+ if ((*pdpe & PG_PS) != 0) {
+retry_pdpe:
+ obits = pbits = *pdpe;
+ MPASS((pbits & (PG_MANAGED | PG_G)) == 0);
+ MPASS(pmap != kernel_pmap); /* XXXKIB */
+ if ((prot & VM_PROT_WRITE) == 0)
+ pbits &= ~(PG_RW | PG_M);
+ if ((prot & VM_PROT_EXECUTE) == 0)
+ pbits |= pg_nx;
+
+ if (pbits != obits) {
+ if (!atomic_cmpset_long(pdpe, obits, pbits))
+ /* PG_PS cannot be cleared under us. */
+ goto retry_pdpe;
+ anychanged = TRUE;
+ }
+ continue;
+ }
+
va_next = (sva + NBPDR) & ~PDRMASK;
if (va_next < sva)
va_next = eva;
@@ -5823,9 +5871,6 @@
for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
sva += PAGE_SIZE) {
- pt_entry_t obits, pbits;
- vm_page_t m;
-
retry:
obits = pbits = *pte;
if ((pbits & PG_V) == 0)
@@ -6000,6 +6045,115 @@
}
#endif /* VM_NRESERVLEVEL > 0 */
+static int
+pmap_enter_largepage(pmap_t pmap, vm_offset_t va, pt_entry_t newpte, int flags,
+ int psind)
+{
+ vm_page_t mp;
+ pt_entry_t origpte, *pml4e, *pdpe, *pde, pten, PG_V;
+ vm_pindex_t ptepindex;
+
+ PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+ KASSERT(psind > 0 && psind < MAXPAGESIZES,
+ ("psind %d unexpected", psind));
+ KASSERT(((newpte & PG_FRAME) & (pagesizes[psind] - 1)) == 0,
+ ("unaligned phys address %#lx newpte %#lx psind %d",
+ newpte & PG_FRAME, newpte, psind));
+ KASSERT((va & (pagesizes[psind] - 1)) == 0,
+ ("unaligned va %#lx psind %d", va, psind));
+ KASSERT(va < VM_MAXUSER_ADDRESS,
+ ("kernel mode non-transparent superpage")); /* XXXKIB */
+ KASSERT(va + pagesizes[psind] < VM_MAXUSER_ADDRESS,
+ ("overflowing user map va %#lx psind %d", va, psind)); /* XXXKIB */
+
+ PG_V = pmap_valid_bit(pmap);
+
+restart:
+ pten = newpte;
+ if (va < VM_MAXUSER_ADDRESS && pmap->pm_type == PT_X86)
+ pten |= pmap_pkru_get(pmap, va);
+
+ ptepindex = pmap_pde_pindex(va);
+
+ if (psind == 2) { /* 1G */
+ if (!pmap_pkru_same(pmap, va, va + NBPDP))
+ return (KERN_PROTECTION_FAILURE);
+ pml4e = pmap_pml4e(pmap, va);
+ if ((*pml4e & PG_V) == 0) {
+ mp = _pmap_allocpte(pmap, NUPDE + NUPDPE +
+ ((ptepindex - NUPDE) >> NPML4EPGSHIFT), NULL);
+ if (mp == NULL) {
+ if ((flags & PMAP_ENTER_NOSLEEP) != 0)
+ return (KERN_RESOURCE_SHORTAGE);
+ PMAP_UNLOCK(pmap);
+ vm_wait(NULL);
+ PMAP_LOCK(pmap);
+
+ /*
+ * Restart at least to recalculate the pkru
+ * key. Our caller must keep the map locked
+ * so no paging structure can be validated
+ * under us.
+ */
+ goto restart;
+ }
+ } else {
+ mp = PHYS_TO_VM_PAGE(*pml4e & PG_FRAME);
+ mp->ref_count++;
+ }
+ pdpe = pmap_pdpe(pmap, va);
+ KASSERT(pdpe != NULL, ("va %#lx lost pdpe", va));
+ origpte = *pdpe;
+ KASSERT((origpte & PG_V) == 0 || ((origpte & PG_PS) != 0 &&
+ (origpte & PG_FRAME) == (newpte & PG_FRAME)),
+ ("va %#lx changing 1G phys page pdpe %#lx newpte %#lx",
+ va, origpte, newpte));
+ if ((newpte & PG_W) != 0 && (origpte & PG_W) == 0)
+ pmap->pm_stats.wired_count += NBPDP / PAGE_SIZE;
+ else if ((newpte & PG_W) == 0 && (origpte & PG_W) != 0)
+ pmap->pm_stats.wired_count -= NBPDP / PAGE_SIZE;
+ *pdpe = newpte;
+ } else /* (psind == 1) */ { /* 2M */
+ if (!pmap_pkru_same(pmap, va, va + NBPDR))
+ return (KERN_PROTECTION_FAILURE);
+ pde = pmap_pde(pmap, va);
+ if (pde == NULL) {
+ mp = _pmap_allocpte(pmap, NUPDE +
+ (ptepindex >> NPDPEPGSHIFT), NULL);
+ if (mp == NULL) {
+ if ((flags & PMAP_ENTER_NOSLEEP) != 0)
+ return (KERN_RESOURCE_SHORTAGE);
+ PMAP_UNLOCK(pmap);
+ vm_wait(NULL);
+ PMAP_LOCK(pmap);
+ goto restart;
+ }
+ pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mp));
+ pde = &pde[pmap_pde_index(va)];
+ } else {
+ pdpe = pmap_pdpe(pmap, va);
+ MPASS(pdpe != NULL && (*pdpe & PG_V) != 0);
+ mp = PHYS_TO_VM_PAGE(*pdpe & PG_FRAME);
+ mp->ref_count++;
+ }
+ KASSERT(pde != NULL, ("va %#lx lost pde", va));
+ origpte = *pde;
+ KASSERT((origpte & PG_V) == 0 || ((origpte & PG_PS) != 0 &&
+ (origpte & PG_FRAME) == (newpte & PG_FRAME)),
+ ("va %#lx changing 2M phys page pde %#lx newpte %#lx",
+ va, origpte, newpte));
+ if ((newpte & PG_W) != 0 && (origpte & PG_W) == 0)
+ pmap->pm_stats.wired_count += NBPDR / PAGE_SIZE;
+ else if ((newpte & PG_W) == 0 && (origpte & PG_W) != 0)
+ pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE;
+ *pde = newpte;
+ }
+ if ((origpte & PG_V) == 0)
+ pmap_resident_count_inc(pmap, pagesizes[psind] / PAGE_SIZE);
+
+ return (KERN_SUCCESS);
+}
+
/*
* Insert the given physical page (p) at
* the specified virtual address (v) in the
@@ -6079,6 +6233,13 @@
lock = NULL;
PMAP_LOCK(pmap);
+ if ((flags & PMAP_ENTER_LARGEPAGE) != 0) {
+ KASSERT((m->oflags & VPO_UNMANAGED) != 0,
+ ("managed largepage va %#lx flags %#x", va, flags));
+ rv = pmap_enter_largepage(pmap, va, newpte | PG_PS, flags,
+ psind);
+ goto out;
+ }
if (psind == 1) {
/* Assert the required virtual and physical alignment. */
KASSERT((va & PDRMASK) == 0, ("pmap_enter: va unaligned"));
@@ -6764,9 +6925,10 @@
pml4_entry_t *pml4e;
pdp_entry_t *pdpe;
pd_entry_t *pde;
- pt_entry_t *pte, PG_V;
+ pt_entry_t *pte, PG_V, PG_G;
PG_V = pmap_valid_bit(pmap);
+ PG_G = pmap_global_bit(pmap);
PMAP_LOCK(pmap);
for (; sva < eva; sva = va_next) {
pml4e = pmap_pml4e(pmap, sva);
@@ -6783,6 +6945,18 @@
va_next = eva;
continue;
}
+ KASSERT((*pdpe & PG_PS) == 0 || va_next <= eva,
+ ("pmap_unwire of non-transient 1G page "
+ "pdpe %#lx sva %#lx eva %#lx va_next %#lx",
+ *pdpe, sva, eva, va_next));
+ if ((*pdpe & PG_PS) != 0) {
+ MPASS(pmap != kernel_pmap); /* XXXKIB */
+ MPASS((*pdpe & (PG_MANAGED | PG_G)) == 0);
+ atomic_clear_long(pdpe, PG_W);
+ pmap->pm_stats.wired_count -= NBPDP / PAGE_SIZE;
+ continue;
+ }
+
va_next = (sva + NBPDR) & ~PDRMASK;
if (va_next < sva)
va_next = eva;
@@ -6899,6 +7073,12 @@
}
va_next = (addr + NBPDR) & ~PDRMASK;
+ KASSERT((*pdpe & PG_PS) == 0 || va_next <= end_addr,
+ ("pmap_copy of partial non-transient 1G page "
+ "pdpe %#lx sva %#lx eva %#lx va_next %#lx",
+ *pdpe, addr, end_addr, va_next));
+ if ((*pdpe & PG_PS) != 0)
+ continue;
if (va_next < addr)
va_next = end_addr;
@@ -7955,6 +8135,12 @@
va_next = (sva + NBPDR) & ~PDRMASK;
if (va_next < sva)
va_next = eva;
+ KASSERT((*pdpe & PG_PS) == 0 || va_next <= eva,
+ ("pmap_advise of non-transient 1G page "
+ "pdpe %#lx sva %#lx eva %#lx va_next %#lx",
+ *pdpe, sva, eva, va_next));
+ if ((*pdpe & PG_PS) != 0)
+ continue;
pde = pmap_pdpe_to_pde(pdpe, sva);
oldpde = *pde;
if ((oldpde & PG_V) == 0)
@@ -8714,6 +8900,7 @@
int
pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *pap)
{
+ pdp_entry_t *pdpe;
pd_entry_t *pdep;
pt_entry_t pte, PG_A, PG_M, PG_RW, PG_V;
vm_paddr_t pa;
@@ -8725,23 +8912,32 @@
PG_RW = pmap_rw_bit(pmap);
PMAP_LOCK(pmap);
- pdep = pmap_pde(pmap, addr);
- if (pdep != NULL && (*pdep & PG_V)) {
- if (*pdep & PG_PS) {
- pte = *pdep;
- /* Compute the physical address of the 4KB page. */
- pa = ((*pdep & PG_PS_FRAME) | (addr & PDRMASK)) &
+ pte = 0;
+ pa = 0;
+ val = 0;
+ pdpe = pmap_pdpe(pmap, addr);
+ if ((*pdpe & PG_V) != 0) {
+ if ((*pdpe & PG_PS) != 0) {
+ pte = *pdpe;
+ pa = ((pte & PG_PS_PDP_FRAME) | (addr & PDPMASK)) &
PG_FRAME;
val = MINCORE_SUPER;
} else {
- pte = *pmap_pde_to_pte(pdep, addr);
- pa = pte & PG_FRAME;
- val = 0;
+ pdep = pmap_pde(pmap, addr);
+ if (pdep != NULL && (*pdep & PG_V) != 0) {
+ if ((*pdep & PG_PS) != 0) {
+ pte = *pdep;
+ /* Compute the physical address of the 4KB page. */
+ pa = ((pte & PG_PS_FRAME) | (addr &
+ PDRMASK)) & PG_FRAME;
+ val = MINCORE_SUPER;
+ } else {
+ pte = *pmap_pde_to_pte(pdep, addr);
+ pa = pte & PG_FRAME;
+ val = 0;
+ }
+ }
}
- } else {
- pte = 0;
- pa = 0;
- val = 0;
}
if ((pte & PG_V) != 0) {
val |= MINCORE_INCORE;
Index: sys/dev/ksyms/ksyms.c
===================================================================
--- sys/dev/ksyms/ksyms.c
+++ sys/dev/ksyms/ksyms.c
@@ -41,6 +41,7 @@
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/resourcevar.h>
+#include <sys/rwlock.h>
#include <sys/stat.h>
#include <sys/sx.h>
#include <sys/uio.h>
@@ -51,6 +52,8 @@
#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_object.h>
+#include <vm/vm_page.h>
+#include <vm/vm_pager.h>
#include "linker_if.h"
@@ -442,8 +445,8 @@
ksyms_size_calc(&ts);
elfsz = sizeof(struct ksyms_hdr) + ts.ts_symsz + ts.ts_strsz;
- object = vm_object_allocate(OBJT_PHYS,
- OFF_TO_IDX(round_page(elfsz)));
+ object = vm_pager_allocate(OBJT_PHYS, NULL, round_page(elfsz),
+ VM_PROT_ALL, 0, td->td_ucred);
sc->sc_obj = object;
sc->sc_objsz = elfsz;
Index: sys/dev/xen/gntdev/gntdev.c
===================================================================
--- sys/dev/xen/gntdev/gntdev.c
+++ sys/dev/xen/gntdev/gntdev.c
@@ -1068,7 +1068,8 @@
vm_object_t mem_obj;
struct gntdev_gref *gref;
- mem_obj = vm_object_allocate(OBJT_PHYS, size);
+ mem_obj = vm_pager_allocate(OBJT_PHYS, NULL, size, VM_PROT_ALL, 0,
+ curthread->td_ucred);
if (mem_obj == NULL)
return (ENOMEM);
Index: sys/kern/kern_umtx.c
===================================================================
--- sys/kern/kern_umtx.c
+++ sys/kern/kern_umtx.c
@@ -3933,7 +3933,7 @@
reg = uma_zalloc(umtx_shm_reg_zone, M_WAITOK | M_ZERO);
reg->ushm_refcnt = 1;
bcopy(key, &reg->ushm_key, sizeof(*key));
- reg->ushm_obj = shm_alloc(td->td_ucred, O_RDWR);
+ reg->ushm_obj = shm_alloc(td->td_ucred, O_RDWR, false);
reg->ushm_cred = crhold(cred);
error = shm_dotruncate(reg->ushm_obj, PAGE_SIZE);
if (error != 0) {
Index: sys/kern/link_elf.c
===================================================================
--- sys/kern/link_elf.c
+++ sys/kern/link_elf.c
@@ -1107,7 +1107,8 @@
ef = (elf_file_t) lf;
#ifdef SPARSE_MAPPING
- ef->object = vm_object_allocate(OBJT_PHYS, atop(mapsize));
+ ef->object = vm_pager_allocate(OBJT_PHYS, NULL, mapsize, VM_PROT_ALL,
+ 0, thread0.td_ucred);
if (ef->object == NULL) {
error = ENOMEM;
goto out;
Index: sys/kern/link_elf_obj.c
===================================================================
--- sys/kern/link_elf_obj.c
+++ sys/kern/link_elf_obj.c
@@ -34,16 +34,17 @@
#include <sys/param.h>
#include <sys/systm.h>
+#include <sys/fcntl.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
+#include <sys/linker.h>
#include <sys/mutex.h>
#include <sys/mount.h>
-#include <sys/proc.h>
#include <sys/namei.h>
-#include <sys/fcntl.h>
+#include <sys/proc.h>
+#include <sys/rwlock.h>
#include <sys/vnode.h>
-#include <sys/linker.h>
#include <machine/elf.h>
@@ -53,11 +54,13 @@
#include <vm/vm.h>
#include <vm/vm_param.h>
-#include <vm/vm_object.h>
-#include <vm/vm_kern.h>
-#include <vm/vm_extern.h>
#include <vm/pmap.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_kern.h>
#include <vm/vm_map.h>
+#include <vm/vm_object.h>
+#include <vm/vm_page.h>
+#include <vm/vm_pager.h>
#include <sys/link_elf.h>
@@ -905,7 +908,8 @@
* This stuff needs to be in a single chunk so that profiling etc
* can get the bounds and gdb can associate offsets with modules
*/
- ef->object = vm_object_allocate(OBJT_PHYS, atop(round_page(mapsize)));
+ ef->object = vm_pager_allocate(OBJT_PHYS, NULL, round_page(mapsize),
+ VM_PROT_ALL, 0, thread0.td_ucred);
if (ef->object == NULL) {
error = ENOMEM;
goto out;
Index: sys/kern/uipc_shm.c
===================================================================
--- sys/kern/uipc_shm.c
+++ sys/kern/uipc_shm.c
@@ -2,6 +2,7 @@
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2006, 2011, 2016-2017 Robert N. M. Watson
+ * Copyright 2020 The FreeBSD Foundation
* All rights reserved.
*
* Portions of this software were developed by BAE Systems, the University of
@@ -9,6 +10,9 @@
* contract FA8650-15-C-7558 ("CADETS"), as part of the DARPA Transparent
* Computing (TC) research program.
*
+ * Portions of this software were developed by Konstantin Belousov
+ * under sponsorship from the FreeBSD Foundation.
+ *
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
@@ -159,11 +163,40 @@
.fo_get_seals = shm_get_seals,
.fo_add_seals = shm_add_seals,
.fo_fallocate = shm_fallocate,
- .fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE
+ .fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE,
+};
+
+struct fileops shm_ops_large = {
+ .fo_read = shm_read,
+ .fo_write = shm_write,
+ .fo_truncate = shm_truncate,
+ .fo_ioctl = shm_ioctl,
+ .fo_poll = invfo_poll,
+ .fo_kqfilter = invfo_kqfilter,
+ .fo_stat = shm_stat,
+ .fo_close = shm_close,
+ .fo_chmod = shm_chmod,
+ .fo_chown = shm_chown,
+ .fo_sendfile = vn_sendfile,
+ .fo_seek = shm_seek,
+ .fo_fill_kinfo = shm_fill_kinfo,
+ .fo_mmap = shm_mmap,
+ .fo_get_seals = shm_get_seals,
+ .fo_add_seals = shm_add_seals,
+ .fo_fallocate = shm_fallocate,
+ .fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE,
};
FEATURE(posix_shm, "POSIX shared memory");
+static SYSCTL_NODE(_vm, OID_AUTO, largepages, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
+ "");
+
+static int largepage_reclaim_tries = 1;
+SYSCTL_INT(_vm_largepages, OID_AUTO, reclaim_tries,
+ CTLFLAG_RWTUN, &largepage_reclaim_tries, 0,
+ "Number of contig reclaims before giving up for default alloc policy");
+
static int
uiomove_object_page(vm_object_t obj, size_t len, struct uio *uio)
{
@@ -242,6 +275,84 @@
return (error);
}
+static u_long count_largepages[MAXPAGESIZES];
+
+static int
+shm_largepage_phys_populate(vm_object_t object, vm_pindex_t pidx,
+ int fault_type, vm_prot_t max_prot, vm_pindex_t *first, vm_pindex_t *last)
+{
+ vm_page_t m;
+ int psind;
+
+ psind = object->un_pager.phys.data_val;
+ if (psind == 0 || pidx >= object->size)
+ return (VM_PAGER_FAIL);
+ *first = rounddown(pidx, pagesizes[psind] / PAGE_SIZE);
+
+ /*
+ * We only busy the first page in the superpage run. It is
+ * useless to busy the whole run since we only remove full
+ * superpages, and it takes too long to busy e.g. 512 * 512 ==
+ * 262144 pages constituting a 1G amd64 superpage.
+ */
+ m = vm_page_grab(object, *first, VM_ALLOC_NORMAL);
+
+ *last = roundup(pidx, pagesizes[psind] / PAGE_SIZE);
+ return (VM_PAGER_OK);
+}
+
+static boolean_t
+shm_largepage_phys_haspage(vm_object_t object, vm_pindex_t pindex,
+ int *before, int *after)
+{
+ int psind;
+
+ psind = object->un_pager.phys.data_val;
+ if (psind == 0 || pindex >= object->size)
+ return (FALSE);
+ if (before != NULL) {
+ *before = pindex - rounddown(pindex, pagesizes[psind] /
+ PAGE_SIZE);
+ }
+ if (after != NULL) {
+ *after = roundup(pindex, pagesizes[psind] / PAGE_SIZE) -
+ pindex;
+ }
+ return (TRUE);
+}
+
+static void
+shm_largepage_phys_ctor(vm_object_t object, vm_prot_t prot,
+ vm_ooffset_t foff, struct ucred *cred)
+{
+}
+
+static void
+shm_largepage_phys_dtor(vm_object_t object)
+{
+ int psind;
+
+ psind = object->un_pager.phys.data_val;
+ if (psind != 0) {
+ atomic_subtract_long(&count_largepages[psind],
+ object->size / (pagesizes[psind] / PAGE_SIZE));
+ }
+}
+
+static struct phys_pager_ops shm_largepage_phys_ops = {
+ .phys_pg_populate = shm_largepage_phys_populate,
+ .phys_pg_haspage = shm_largepage_phys_haspage,
+ .phys_pg_ctor = shm_largepage_phys_ctor,
+ .phys_pg_dtor = shm_largepage_phys_dtor,
+};
+
+static inline bool
+shm_largepage(struct shmfd *shmfd)
+{
+ return (shmfd->shm_object->type == OBJT_PHYS &&
+ shmfd->shm_object->un_pager.phys.ops == &shm_largepage_phys_ops);
+}
+
static int
shm_seek(struct file *fp, off_t offset, int whence, struct thread *td)
{
@@ -321,6 +432,8 @@
if (error)
return (error);
#endif
+ if (shm_largepage(shmfd) && shmfd->shm_lp_psind == 0)
+ return (EINVAL);
foffset_lock_uio(fp, uio, flags);
if (uio->uio_resid > OFF_MAX - uio->uio_offset) {
/*
@@ -385,7 +498,11 @@
shm_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred,
struct thread *td)
{
+ struct shmfd *shmfd;
+ struct shm_largepage_conf *conf;
+ void *rl_cookie;
+ shmfd = fp->f_data;
switch (com) {
case FIONBIO:
case FIOASYNC:
@@ -394,6 +511,28 @@
* just like it would on an unlinked regular file
*/
return (0);
+ case FIOSHMLPGCNF:
+ if (!shm_largepage(shmfd))
+ return (ENOTTY);
+ conf = data;
+ if (shmfd->shm_lp_psind != 0 &&
+ conf->psind != shmfd->shm_lp_psind)
+ return (EINVAL);
+ if (conf->psind <= 0 || conf->psind >= MAXPAGESIZES ||
+ pagesizes[conf->psind] == 0)
+ return (EINVAL);
+ if (conf->alloc_policy != SHM_LARGEPAGE_ALLOC_DEFAULT &&
+ conf->alloc_policy != SHM_LARGEPAGE_ALLOC_NOWAIT &&
+ conf->alloc_policy != SHM_LARGEPAGE_ALLOC_HARD)
+ return (EINVAL);
+
+ rl_cookie = rangelock_wlock(&shmfd->shm_rl, 0, OFF_MAX,
+ &shmfd->shm_mtx);
+ shmfd->shm_lp_psind = conf->psind;
+ shmfd->shm_lp_alloc_policy = conf->alloc_policy;
+ shmfd->shm_object->un_pager.phys.data_val = conf->psind;
+ rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx);
+ return (0);
default:
return (ENOTTY);
}
@@ -436,6 +575,8 @@
sb->st_dev = shm_dev_ino;
sb->st_ino = shmfd->shm_ino;
sb->st_nlink = shmfd->shm_object->ref_count;
+ sb->st_blocks = shmfd->shm_object->size /
+ (pagesizes[shmfd->shm_lp_psind] >> PAGE_SHIFT);
return (0);
}
@@ -592,6 +733,90 @@
return (0);
}
+static int
+shm_dotruncate_largepage(struct shmfd *shmfd, off_t length, void *rl_cookie)
+{
+ vm_object_t object;
+ vm_page_t m;
+ vm_pindex_t newobjsz, oldobjsz;
+ int aflags, error, i, try;
+
+ KASSERT(length >= 0, ("shm_dotruncate: length < 0"));
+ object = shmfd->shm_object;
+ VM_OBJECT_ASSERT_WLOCKED(object);
+ rangelock_cookie_assert(rl_cookie, RA_WLOCKED);
+
+ oldobjsz = object->size;
+ newobjsz = OFF_TO_IDX(length);
+ if (length == shmfd->shm_size)
+ return (0);
+ if (shmfd->shm_lp_psind == 0 && length != 0)
+ return (EINVAL);
+ if ((length & (pagesizes[shmfd->shm_lp_psind] - 1)) != 0)
+ return (EINVAL);
+
+ if (length < shmfd->shm_size) {
+ if ((shmfd->shm_seals & F_SEAL_SHRINK) != 0)
+ return (EPERM);
+ if (shmfd->shm_kmappings > 0)
+ return (EBUSY);
+ return (ENOTSUP); /* Pages are unmanaged. */
+#if 0
+ vm_object_page_remove(object, newobjsz, oldobjsz, 0);
+ object->size = newobjsz;
+ shmfd->shm_size = length;
+ return (0);
+#endif
+ }
+
+ aflags = VM_ALLOC_NORMAL | VM_ALLOC_ZERO;
+ if (shmfd->shm_lp_alloc_policy == SHM_LARGEPAGE_ALLOC_NOWAIT)
+ aflags |= VM_ALLOC_WAITFAIL;
+ try = 0;
+
+ /*
+ * Extend shmfd and object, keeping all already fully
+ * allocated large pages intact even on error, because the
+ * dropped object lock might have allowed them to be mapped.
+ */
+ while (object->size < newobjsz) {
+ m = vm_page_alloc_contig(object, object->size, aflags,
+ pagesizes[shmfd->shm_lp_psind] / PAGE_SIZE, 0, ~0,
+ pagesizes[shmfd->shm_lp_psind], 0,
+ VM_MEMATTR_DEFAULT);
+ if (m == NULL) {
+ VM_OBJECT_WUNLOCK(object);
+ if (shmfd->shm_lp_alloc_policy ==
+ SHM_LARGEPAGE_ALLOC_NOWAIT ||
+ (shmfd->shm_lp_alloc_policy ==
+ SHM_LARGEPAGE_ALLOC_DEFAULT &&
+ try >= largepage_reclaim_tries))
+ return (ENOMEM);
+ error = vm_page_reclaim_contig(aflags,
+ pagesizes[shmfd->shm_lp_psind] / PAGE_SIZE, 0, ~0,
+ pagesizes[shmfd->shm_lp_psind], 0) ? 0 :
+ vm_wait_intr(object);
+ if (error != 0)
+ return (error);
+ try++;
+ VM_OBJECT_WLOCK(object);
+ continue;
+ }
+ try = 0;
+ for (i = 0; i < pagesizes[shmfd->shm_lp_psind] / PAGE_SIZE;
+ i++) {
+ if ((m[i].flags & PG_ZERO) == 0)
+ pmap_zero_page(&m[i]);
+ vm_page_valid(&m[i]);
+ vm_page_xunbusy(&m[i]);
+ }
+ object->size += OFF_TO_IDX(pagesizes[shmfd->shm_lp_psind]);
+ shmfd->shm_size += pagesizes[shmfd->shm_lp_psind];
+ atomic_add_long(&count_largepages[shmfd->shm_lp_psind], 1);
+ }
+ return (0);
+}
+
int
shm_dotruncate(struct shmfd *shmfd, off_t length)
{
@@ -601,7 +826,8 @@
rl_cookie = rangelock_wlock(&shmfd->shm_rl, 0, OFF_MAX,
&shmfd->shm_mtx);
VM_OBJECT_WLOCK(shmfd->shm_object);
- error = shm_dotruncate_locked(shmfd, length, rl_cookie);
+ error = shm_largepage(shmfd) ? shm_dotruncate_largepage(shmfd, length,
+ rl_cookie) : shm_dotruncate_locked(shmfd, length, rl_cookie);
VM_OBJECT_WUNLOCK(shmfd->shm_object);
rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx);
return (error);
@@ -612,7 +838,7 @@
* routines.
*/
struct shmfd *
-shm_alloc(struct ucred *ucred, mode_t mode)
+shm_alloc(struct ucred *ucred, mode_t mode, bool largepage)
{
struct shmfd *shmfd;
@@ -621,8 +847,15 @@
shmfd->shm_uid = ucred->cr_uid;
shmfd->shm_gid = ucred->cr_gid;
shmfd->shm_mode = mode;
- shmfd->shm_object = vm_pager_allocate(OBJT_SWAP, NULL,
- shmfd->shm_size, VM_PROT_DEFAULT, 0, ucred);
+ if (largepage) {
+ shmfd->shm_object = phys_pager_allocate(NULL,
+ &shm_largepage_phys_ops, NULL, shmfd->shm_size,
+ VM_PROT_DEFAULT, 0, ucred);
+ shmfd->shm_lp_alloc_policy = SHM_LARGEPAGE_ALLOC_DEFAULT;
+ } else {
+ shmfd->shm_object = vm_pager_allocate(OBJT_SWAP, NULL,
+ shmfd->shm_size, VM_PROT_DEFAULT, 0, ucred);
+ }
KASSERT(shmfd->shm_object != NULL, ("shm_create: vm_pager_allocate"));
vfs_timestamp(&shmfd->shm_birthtime);
shmfd->shm_atime = shmfd->shm_mtime = shmfd->shm_ctime =
@@ -684,14 +917,11 @@
return (error);
}
-/*
- * Dictionary management. We maintain an in-kernel dictionary to map
- * paths to shmfd objects. We use the FNV hash on the path to store
- * the mappings in a hash table.
- */
static void
shm_init(void *arg)
{
+ char name[32];
+ int i;
mtx_init(&shm_timestamp_lock, "shm timestamps", NULL, MTX_DEF);
sx_init(&shm_dict_lock, "shm dictionary");
@@ -699,9 +929,32 @@
new_unrhdr64(&shm_ino_unr, 1);
shm_dev_ino = devfs_alloc_cdp_inode();
KASSERT(shm_dev_ino > 0, ("shm dev inode not initialized"));
+
+ for (i = 1; i < MAXPAGESIZES; i++) {
+ if (pagesizes[i] == 0)
+ break;
+#define M (1024 * 1024)
+#define G (1024 * M)
+ if (pagesizes[i] >= G)
+ snprintf(name, sizeof(name), "%luG", pagesizes[i] / G);
+ else if (pagesizes[i] >= M)
+ snprintf(name, sizeof(name), "%luM", pagesizes[i] / M);
+ else
+ snprintf(name, sizeof(name), "%lu", pagesizes[i]);
+#undef G
+#undef M
+ SYSCTL_ADD_ULONG(NULL, SYSCTL_STATIC_CHILDREN(_vm_largepages),
+ OID_AUTO, name, CTLFLAG_RD, &count_largepages[i],
+ "number of non-transient largepages allocated");
+ }
}
SYSINIT(shm_init, SI_SUB_SYSV_SHM, SI_ORDER_ANY, shm_init, NULL);
+/*
+ * Dictionary management. We maintain an in-kernel dictionary to map
+ * paths to shmfd objects. We use the FNV hash on the path to store
+ * the mappings in a hash table.
+ */
static struct shmfd *
shm_lookup(char *path, Fnv32_t fnv)
{
@@ -773,8 +1026,10 @@
Fnv32_t fnv;
mode_t cmode;
int error, fd, initial_seals;
+ bool largepage;
- if ((shmflags & ~(SHM_ALLOW_SEALING | SHM_GROW_ON_WRITE)) != 0)
+ if ((shmflags & ~(SHM_ALLOW_SEALING | SHM_GROW_ON_WRITE |
+ SHM_LARGEPAGE)) != 0)
return (EINVAL);
initial_seals = F_SEAL_SEAL;
@@ -798,6 +1053,8 @@
if ((flags & ~(O_ACCMODE | O_CREAT | O_EXCL | O_TRUNC | O_CLOEXEC)) != 0)
return (EINVAL);
+ largepage = (shmflags & SHM_LARGEPAGE) != 0;
+
/*
* Currently only F_SEAL_SEAL may be set when creating or opening shmfd.
* If the decision is made later to allow additional seals, care must be
@@ -831,7 +1088,7 @@
fdrop(fp, td);
return (EINVAL);
}
- shmfd = shm_alloc(td->td_ucred, cmode);
+ shmfd = shm_alloc(td->td_ucred, cmode, largepage);
shmfd->shm_seals = initial_seals;
} else {
error = shm_copyin_path(td, userpath, &path);
@@ -853,7 +1110,8 @@
path);
if (error == 0) {
#endif
- shmfd = shm_alloc(td->td_ucred, cmode);
+ shmfd = shm_alloc(td->td_ucred, cmode,
+ largepage);
shmfd->shm_seals = initial_seals;
shm_insert(path, fnv, shmfd);
#ifdef MAC
@@ -948,7 +1206,8 @@
}
shmfd->shm_flags = shmflags;
- finit(fp, FFLAGS(flags & O_ACCMODE), DTYPE_SHM, shmfd, &shm_ops);
+ finit(fp, FFLAGS(flags & O_ACCMODE), DTYPE_SHM, shmfd,
+ shm_largepage(shmfd) ? &shm_ops_large : &shm_ops);
td->td_retval[0] = fd;
fdrop(fp, td);
@@ -1136,7 +1395,98 @@
return (error);
}
-int
+static int
+shm_mmap_large(struct shmfd *shmfd, vm_map_t map, vm_offset_t *addr,
+ vm_size_t size, vm_prot_t prot, vm_prot_t max_prot, int flags,
+ vm_ooffset_t foff, bool writecounted, struct thread *td)
+{
+ struct vmspace *vms;
+ vm_map_entry_t next_entry, prev_entry;
+ vm_offset_t mask, maxaddr;
+ int docow, error, rv, try;
+ bool curmap;
+
+ if (shmfd->shm_lp_psind == 0)
+ return (EINVAL);
+
+ if ((flags & ~(MAP_SHARED | MAP_PRIVATE | MAP_FIXED | MAP_EXCL |
+ MAP_NOCORE | MAP_32BIT)) != 0)
+ return (EINVAL);
+
+ vms = td->td_proc->p_vmspace;
+ curmap = map == &vms->vm_map;
+ if (curmap) {
+ error = kern_mmap_racct_check(td, map, size);
+ if (error != 0)
+ return (error);
+ }
+
+ docow = shmfd->shm_lp_psind << MAP_SPLIT_BOUNDARY_SHIFT;
+ if ((flags & MAP_NOCORE) != 0)
+ docow |= MAP_DISABLE_COREDUMP;
+ if ((flags & MAP_SHARED) != 0)
+ docow |= MAP_INHERIT_SHARE;
+ if (writecounted)
+ docow |= MAP_WRITECOUNT;
+
+ mask = pagesizes[shmfd->shm_lp_psind] - 1;
+ if ((foff & mask) != 0)
+ return (EINVAL);
+ maxaddr = vm_map_max(map);
+#ifdef MAP_32BIT
+ if ((flags & MAP_32BIT) != 0 && maxaddr > MAP_32BIT_MAX_ADDR)
+ maxaddr = MAP_32BIT_MAX_ADDR;
+#endif
+ if (size == 0 || (size & mask) != 0 ||
+ (*addr != 0 && ((*addr & mask) != 0 ||
+ *addr + size < *addr || *addr + size > maxaddr)))
+ return (EINVAL);
+
+ vm_map_lock(map);
+ if ((flags & MAP_FIXED) == 0) {
+ try = 1;
+ if (curmap && (*addr == 0 ||
+ (*addr >= round_page((vm_offset_t)vms->vm_taddr) &&
+ *addr < round_page((vm_offset_t)vms->vm_daddr +
+ lim_max(td, RLIMIT_DATA))))) {
+ *addr = roundup2((vm_offset_t)vms->vm_daddr +
+ lim_max(td, RLIMIT_DATA),
+ pagesizes[shmfd->shm_lp_psind]);
+ }
+again:
+ rv = vm_map_find_aligned(map, addr, size, maxaddr,
+ pagesizes[shmfd->shm_lp_psind]);
+ if (rv != KERN_SUCCESS) {
+ if (try == 1) {
+ try = 2;
+ *addr = vm_map_min(map);
+ if ((*addr & mask) != 0)
+ *addr = (*addr + mask) & ~mask;
+ goto again;
+ }
+ goto fail1;
+ }
+ } else if ((flags & MAP_EXCL) == 0) {
+ vm_map_delete(map, *addr, *addr + size);
+ } else {
+ error = ENOSPC;
+ if (vm_map_lookup_entry(map, *addr, &prev_entry))
+ goto fail;
+ next_entry = vm_map_entry_succ(prev_entry);
+ if (next_entry->start < *addr + size)
+ goto fail;
+ }
+
+ rv = vm_map_insert(map, shmfd->shm_object, foff, *addr, *addr + size,
+ prot, max_prot, docow);
+fail1:
+ error = vm_mmap_to_errno(rv);
+fail:
+ vm_map_unlock(map);
+ return (error);
+}
+
+static int
shm_mmap(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t objsize,
vm_prot_t prot, vm_prot_t cap_maxprot, int flags,
vm_ooffset_t foff, struct thread *td)
@@ -1208,8 +1558,13 @@
if (writecnt)
vm_pager_update_writecount(shmfd->shm_object, 0, objsize);
- error = vm_mmap_object(map, addr, objsize, prot, maxprot, flags,
- shmfd->shm_object, foff, writecnt, td);
+ if (shm_largepage(shmfd)) {
+ error = shm_mmap_large(shmfd, map, addr, objsize, prot,
+ maxprot, flags, foff, writecnt, td);
+ } else {
+ error = vm_mmap_object(map, addr, objsize, prot, maxprot, flags,
+ shmfd->shm_object, foff, writecnt, td);
+ }
if (error != 0) {
if (writecnt)
vm_pager_release_writecount(shmfd->shm_object, 0,
Index: sys/sys/filio.h
===================================================================
--- sys/sys/filio.h
+++ sys/sys/filio.h
@@ -70,6 +70,7 @@
};
/* Get the file's bmap info for the logical block bn. */
#define FIOBMAP2 _IOWR('f', 99, struct fiobmap2_arg)
+#define FIOSHMLPGCNF _IOW('f', 100, struct shm_largepage_conf)
#ifdef _KERNEL
#ifdef COMPAT_FREEBSD32
Index: sys/sys/mman.h
===================================================================
--- sys/sys/mman.h
+++ sys/sys/mman.h
@@ -191,6 +191,17 @@
*/
#define SHM_ALLOW_SEALING 0x00000001
#define SHM_GROW_ON_WRITE 0x00000002
+#define SHM_LARGEPAGE 0x00000004
+
+#define SHM_LARGEPAGE_ALLOC_DEFAULT 0
+#define SHM_LARGEPAGE_ALLOC_NOWAIT 1
+#define SHM_LARGEPAGE_ALLOC_HARD 2
+
+struct shm_largepage_conf {
+ int psind;
+ int alloc_policy;
+ int pad[10];
+};
/*
* Flags for memfd_create().
@@ -198,7 +209,6 @@
#define MFD_CLOEXEC 0x00000001
#define MFD_ALLOW_SEALING 0x00000002
-/* UNSUPPORTED */
#define MFD_HUGETLB 0x00000004
#define MFD_HUGE_MASK 0xFC000000
@@ -281,6 +291,10 @@
int shm_flags;
int shm_seals;
+
+ /* largepage config */
+ int shm_lp_psind;
+ int shm_lp_alloc_policy;
};
#endif
@@ -289,12 +303,15 @@
int shm_unmap(struct file *fp, void *mem, size_t size);
int shm_access(struct shmfd *shmfd, struct ucred *ucred, int flags);
-struct shmfd *shm_alloc(struct ucred *ucred, mode_t mode);
+struct shmfd *shm_alloc(struct ucred *ucred, mode_t mode, bool largepage);
struct shmfd *shm_hold(struct shmfd *shmfd);
void shm_drop(struct shmfd *shmfd);
int shm_dotruncate(struct shmfd *shmfd, off_t length);
-extern struct fileops shm_ops;
+extern struct fileops shm_ops, shm_ops_large;
+
+#define MAP_32BIT_MAX_ADDR ((vm_offset_t)1 << 31)
+
#else /* !_KERNEL */
__BEGIN_DECLS
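
The uipc_shm.c, filio.h and mman.h changes above add the SHM_LARGEPAGE object flavor, the FIOSHMLPGCNF ioctl and the aligned largepage mmap path. What follows is a hedged sketch of the userspace sequence they are designed for; because the TODO notes that shm_open2(2) is not yet exported from libc.so, the sketch invokes the syscall directly via syscall(2), and the psind of 1 (pagesizes[1], 2M on amd64) and the SHM_LARGEPAGE_ALLOC_HARD policy are assumptions chosen for illustration:

#include <sys/types.h>
#include <sys/filio.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/syscall.h>

#include <err.h>
#include <fcntl.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	struct shm_largepage_conf slc;
	size_t len = 2 * 1024 * 1024;	/* one pagesizes[1] page */
	char *p;
	int fd;

	/* shm_open2(SHM_ANON, O_RDWR, 0, SHM_LARGEPAGE, name) via syscall(2). */
	fd = syscall(SYS_shm_open2, SHM_ANON, O_RDWR, 0, SHM_LARGEPAGE,
	    "largepage-demo");
	if (fd == -1)
		err(1, "shm_open2");

	/* Select the superpage size index and allocation policy first. */
	memset(&slc, 0, sizeof(slc));
	slc.psind = 1;			/* 2M on amd64 */
	slc.alloc_policy = SHM_LARGEPAGE_ALLOC_HARD;
	if (ioctl(fd, FIOSHMLPGCNF, &slc) == -1)
		err(1, "FIOSHMLPGCNF");

	/* Grow in multiples of pagesizes[psind]; pages are allocated here. */
	if (ftruncate(fd, (off_t)len) == -1)
		err(1, "ftruncate");
	p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED)
		err(1, "mmap");
	memset(p, 0, len);		/* mapped by a single 2M pmap entry */
	return (0);
}
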
Index: sys/sys/syscallsubr.h
===================================================================
--- sys/sys/syscallsubr.h
+++ sys/sys/syscallsubr.h
@@ -62,6 +62,7 @@
struct stat;
struct thr_param;
struct uio;
+struct vm_map;
typedef int (*mmap_check_fp_fn)(struct file *, int, int, int);
@@ -197,8 +198,10 @@
size_t len);
int kern_mmap(struct thread *td, uintptr_t addr, size_t len, int prot,
int flags, int fd, off_t pos);
-int kern_mmap_req(struct thread *td, const struct mmap_req *mrp);
+int kern_mmap_racct_check(struct thread *td, struct vm_map *map,
+ vm_size_t size);
int kern_mmap_maxprot(struct proc *p, int prot);
+int kern_mmap_req(struct thread *td, const struct mmap_req *mrp);
int kern_mprotect(struct thread *td, uintptr_t addr, size_t size, int prot);
int kern_msgctl(struct thread *, int, int, struct msqid_ds *);
int kern_msgrcv(struct thread *, int, void *, size_t, long, int, long *);
Index: sys/vm/phys_pager.c
===================================================================
--- sys/vm/phys_pager.c
+++ sys/vm/phys_pager.c
@@ -51,6 +51,20 @@
/* protect access to phys_pager_object_list */
static struct mtx phys_pager_mtx;
+static int default_phys_pager_getpages(vm_object_t object, vm_page_t *m,
+ int count, int *rbehind, int *rahead);
+static int default_phys_pager_populate(vm_object_t object, vm_pindex_t pidx,
+ int fault_type, vm_prot_t max_prot, vm_pindex_t *first, vm_pindex_t *last);
+static boolean_t default_phys_pager_haspage(vm_object_t object,
+ vm_pindex_t pindex, int *before, int *after);
+struct phys_pager_ops default_phys_pg_ops = {
+ .phys_pg_getpages = default_phys_pager_getpages,
+ .phys_pg_populate = default_phys_pager_populate,
+ .phys_pg_haspage = default_phys_pager_haspage,
+ .phys_pg_ctor = NULL,
+ .phys_pg_dtor = NULL,
+};
+
static void
phys_pager_init(void)
{
@@ -59,12 +73,13 @@
mtx_init(&phys_pager_mtx, "phys_pager list", NULL, MTX_DEF);
}
-static vm_object_t
-phys_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot,
- vm_ooffset_t foff, struct ucred *cred)
+vm_object_t
+phys_pager_allocate(void *handle, struct phys_pager_ops *ops, void *data,
+ vm_ooffset_t size, vm_prot_t prot, vm_ooffset_t foff, struct ucred *cred)
{
vm_object_t object, object1;
vm_pindex_t pindex;
+ bool init;
/*
* Offset should be page aligned.
@@ -73,6 +88,7 @@
return (NULL);
pindex = OFF_TO_IDX(foff + PAGE_MASK + size);
+ init = true;
if (handle != NULL) {
mtx_lock(&phys_pager_mtx);
@@ -97,11 +113,15 @@
*/
if (pindex > object->size)
object->size = pindex;
+ init = false;
} else {
object = object1;
object1 = NULL;
object->handle = handle;
- vm_object_set_flag(object, OBJ_POPULATE);
+ object->un_pager.phys.ops = ops;
+ object->un_pager.phys.data_ptr = data;
+ if (ops->phys_pg_populate != NULL)
+ vm_object_set_flag(object, OBJ_POPULATE);
TAILQ_INSERT_TAIL(&phys_pager_object_list,
object, pager_object_list);
}
@@ -113,12 +133,25 @@
vm_object_deallocate(object1);
} else {
object = vm_object_allocate(OBJT_PHYS, pindex);
- vm_object_set_flag(object, OBJ_POPULATE);
+ object->un_pager.phys.ops = ops;
+ object->un_pager.phys.data_ptr = data;
+ if (ops->phys_pg_populate != NULL)
+ vm_object_set_flag(object, OBJ_POPULATE);
}
+ if (init && ops->phys_pg_ctor != NULL)
+ ops->phys_pg_ctor(object, prot, foff, cred);
return (object);
}
+static vm_object_t
+phys_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot,
+ vm_ooffset_t foff, struct ucred *ucred)
+{
+ return (phys_pager_allocate(handle, &default_phys_pg_ops, NULL,
+ size, prot, foff, ucred));
+}
+
static void
phys_pager_dealloc(vm_object_t object)
{
@@ -130,16 +163,18 @@
mtx_unlock(&phys_pager_mtx);
VM_OBJECT_WLOCK(object);
}
- object->handle = NULL;
object->type = OBJT_DEAD;
+ if (object->un_pager.phys.ops->phys_pg_dtor != NULL)
+ object->un_pager.phys.ops->phys_pg_dtor(object);
+ object->handle = NULL;
}
/*
* Fill as many pages as vm_fault has allocated for us.
*/
static int
-phys_pager_getpages(vm_object_t object, vm_page_t *m, int count, int *rbehind,
- int *rahead)
+default_phys_pager_getpages(vm_object_t object, vm_page_t *m, int count,
+ int *rbehind, int *rahead)
{
int i;
@@ -161,6 +196,14 @@
return (VM_PAGER_OK);
}
+static int
+phys_pager_getpages(vm_object_t object, vm_page_t *m, int count, int *rbehind,
+ int *rahead)
+{
+ return (object->un_pager.phys.ops->phys_pg_getpages(object, m,
+ count, rbehind, rahead));
+}
+
/*
* Implement a pretty aggressive clustered getpages strategy. Hint that
* everything in an entire 4MB window should be prefaulted at once.
@@ -185,7 +228,7 @@
#define PHYSALLOC 16
static int
-phys_pager_populate(vm_object_t object, vm_pindex_t pidx,
+default_phys_pager_populate(vm_object_t object, vm_pindex_t pidx,
int fault_type __unused, vm_prot_t max_prot __unused, vm_pindex_t *first,
vm_pindex_t *last)
{
@@ -216,6 +259,14 @@
return (VM_PAGER_OK);
}
+static int
+phys_pager_populate(vm_object_t object, vm_pindex_t pidx, int fault_type,
+ vm_prot_t max_prot, vm_pindex_t *first, vm_pindex_t *last)
+{
+ return (object->un_pager.phys.ops->phys_pg_populate(object, pidx,
+ fault_type, max_prot, first, last));
+}
+
static void
phys_pager_putpages(vm_object_t object, vm_page_t *m, int count, boolean_t sync,
int *rtvals)
@@ -225,7 +276,7 @@
}
static boolean_t
-phys_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before,
+default_phys_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before,
int *after)
{
vm_pindex_t base, end;
@@ -239,6 +290,14 @@
return (TRUE);
}
+static boolean_t
+phys_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before,
+ int *after)
+{
+ return (object->un_pager.phys.ops->phys_pg_haspage(object, pindex,
+ before, after));
+}
+
struct pagerops physpagerops = {
.pgo_init = phys_pager_init,
.pgo_alloc = phys_pager_alloc,
Index: sys/vm/pmap.h
===================================================================
--- sys/vm/pmap.h
+++ sys/vm/pmap.h
@@ -106,6 +106,7 @@
*/
#define PMAP_ENTER_NOSLEEP 0x00000100
#define PMAP_ENTER_WIRED 0x00000200
+#define PMAP_ENTER_LARGEPAGE 0x00000400
#define PMAP_ENTER_RESERVED 0xFF000000
/*
@@ -171,5 +172,8 @@
#define pmap_resident_count(pm) ((pm)->pm_stats.resident_count)
#define pmap_wired_count(pm) ((pm)->pm_stats.wired_count)
+extern u_long pmap_superpagesize[];
+extern u_int pmap_superpagesize_nitems;
+
#endif /* _KERNEL */
#endif /* _PMAP_VM_ */
Index: sys/vm/uma_core.c
===================================================================
--- sys/vm/uma_core.c
+++ sys/vm/uma_core.c
@@ -3617,7 +3617,7 @@
break;
if (rr && vm_domainset_iter_policy(&di, &domain) != 0) {
if ((flags & M_WAITOK) != 0) {
- vm_wait_doms(&keg->uk_dr.dr_policy->ds_mask);
+ vm_wait_doms(&keg->uk_dr.dr_policy->ds_mask, 0);
goto restart;
}
break;
@@ -4777,7 +4777,7 @@
break;
}
if (vm_domainset_iter_policy(&di, &domain) != 0)
- vm_wait_doms(&keg->uk_dr.dr_policy->ds_mask);
+ vm_wait_doms(&keg->uk_dr.dr_policy->ds_mask, 0);
}
}
}
Index: sys/vm/vm_domainset.h
===================================================================
--- sys/vm/vm_domainset.h
+++ sys/vm/vm_domainset.h
@@ -50,6 +50,6 @@
void vm_domainset_iter_policy_ref_init(struct vm_domainset_iter *,
struct domainset_ref *, int *, int *);
-void vm_wait_doms(const domainset_t *);
+int vm_wait_doms(const domainset_t *, int mflags);
#endif /* __VM_DOMAINSET_H__ */
Index: sys/vm/vm_domainset.c
===================================================================
--- sys/vm/vm_domainset.c
+++ sys/vm/vm_domainset.c
@@ -245,7 +245,7 @@
/* Wait for one of the domains to accumulate some free pages. */
if (obj != NULL)
VM_OBJECT_WUNLOCK(obj);
- vm_wait_doms(&di->di_domain->ds_mask);
+ vm_wait_doms(&di->di_domain->ds_mask, 0);
if (obj != NULL)
VM_OBJECT_WLOCK(obj);
if ((di->di_flags & VM_ALLOC_WAITFAIL) != 0)
@@ -310,7 +310,7 @@
return (ENOMEM);
/* Wait for one of the domains to accumulate some free pages. */
- vm_wait_doms(&di->di_domain->ds_mask);
+ vm_wait_doms(&di->di_domain->ds_mask, 0);
/* Restart the search. */
vm_domainset_iter_first(di, domain);
Index: sys/vm/vm_fault.c
===================================================================
--- sys/vm/vm_fault.c
+++ sys/vm/vm_fault.c
@@ -420,7 +420,7 @@
vm_offset_t vaddr;
vm_page_t m;
vm_pindex_t map_first, map_last, pager_first, pager_last, pidx;
- int i, npages, psind, rv;
+ int bdry_idx, i, npages, psind, rv;
MPASS(fs->object == fs->first_object);
VM_OBJECT_ASSERT_WLOCKED(fs->first_object);
@@ -442,7 +442,8 @@
* to the driver.
*/
rv = vm_pager_populate(fs->first_object, fs->first_pindex,
- fs->fault_type, fs->entry->max_protection, &pager_first, &pager_last);
+ fs->fault_type, fs->entry->max_protection, &pager_first,
+ &pager_last);
VM_OBJECT_ASSERT_WLOCKED(fs->first_object);
if (rv == VM_PAGER_BAD) {
@@ -465,15 +466,48 @@
MPASS(pager_last < fs->first_object->size);
vm_fault_restore_map_lock(fs);
+ bdry_idx = (fs->entry->eflags & MAP_ENTRY_SPLIT_BOUNDARY_MASK) >>
+ MAP_ENTRY_SPLIT_BOUNDARY_SHIFT;
if (fs->map->timestamp != fs->map_generation) {
- vm_fault_populate_cleanup(fs->first_object, pager_first,
- pager_last);
+ if (bdry_idx == 0) {
+ vm_fault_populate_cleanup(fs->first_object, pager_first,
+ pager_last);
+ } else {
+ m = vm_page_lookup(fs->first_object, pager_first);
+ if (m != fs->m)
+ vm_page_xunbusy(m);
+ }
return (KERN_RESTART);
}
/*
* The map is unchanged after our last unlock. Process the fault.
*
+ * First, the special case of largepage mappings, where
+ * populate only busies the first page in superpage run.
+ */
+ if (bdry_idx != 0) {
+ m = vm_page_lookup(fs->first_object, pager_first);
+ vm_fault_populate_check_page(m);
+ VM_OBJECT_WUNLOCK(fs->first_object);
+ vaddr = fs->entry->start + IDX_TO_OFF(pager_first) -
+ fs->entry->offset;
+ /* assert alignment for entry */
+ KASSERT((vaddr & (pagesizes[bdry_idx] - 1)) == 0,
+ ("unaligned superpage start %#jx pager_first %#jx offset %#jx vaddr %#jx",
+ (uintmax_t)fs->entry->start, (uintmax_t)pager_first,
+ (uintmax_t)fs->entry->offset, (uintmax_t)vaddr));
+ KASSERT((VM_PAGE_TO_PHYS(m) & (pagesizes[bdry_idx] - 1)) == 0,
+ ("unaligned superpage m %p %#jx", m,
+ (uintmax_t)VM_PAGE_TO_PHYS(m)));
+ rv = pmap_enter(fs->map->pmap, vaddr, m, fs->prot,
+ fs->fault_type | PMAP_ENTER_LARGEPAGE, bdry_idx);
+ VM_OBJECT_WLOCK(fs->first_object);
+ vm_page_xunbusy(m);
+ goto out;
+ }
+
+ /*
* The range [pager_first, pager_last] that is given to the
* pager is only a hint. The pager may populate any range
* within the object that includes the requested page index.
@@ -539,6 +573,7 @@
vm_page_xunbusy(&m[i]);
}
}
+out:
curthread->td_ru.ru_majflt++;
return (KERN_SUCCESS);
}
@@ -1255,6 +1290,7 @@
* multiple page faults of a similar type to run in parallel.
*/
if (fs.vp == NULL /* avoid locked vnode leak */ &&
+ (fs.entry->eflags & MAP_ENTRY_SPLIT_BOUNDARY_MASK) == 0 &&
(fs.fault_flags & (VM_FAULT_WIRE | VM_FAULT_DIRTY)) == 0) {
VM_OBJECT_RLOCK(fs.first_object);
rv = vm_fault_soft_fast(&fs);
@@ -1287,6 +1323,27 @@
*/
fs.object = fs.first_object;
fs.pindex = fs.first_pindex;
+
+ if ((fs.entry->eflags & MAP_ENTRY_SPLIT_BOUNDARY_MASK) != 0) {
+ rv = vm_fault_allocate(&fs);
+ switch (rv) {
+ case KERN_RESTART:
+ unlock_and_deallocate(&fs);
+ /* FALLTHROUGH */
+ case KERN_RESOURCE_SHORTAGE:
+ goto RetryFault;
+ case KERN_SUCCESS:
+ case KERN_FAILURE:
+ case KERN_OUT_OF_BOUNDS:
+ unlock_and_deallocate(&fs);
+ return (rv);
+ case KERN_NOT_RECEIVER:
+ break;
+ default:
+ panic("vm_fault: Unhandled rv %d", rv);
+ }
+ }
+
while (TRUE) {
KASSERT(fs.m == NULL,
("page still set %p at loop start", fs.m));
Index: sys/vm/vm_glue.c
===================================================================
--- sys/vm/vm_glue.c
+++ sys/vm/vm_glue.c
@@ -565,7 +565,7 @@
}
dset = td2->td_domain.dr_policy;
while (vm_page_count_severe_set(&dset->ds_mask)) {
- vm_wait_doms(&dset->ds_mask);
+ vm_wait_doms(&dset->ds_mask, 0);
}
if ((flags & RFMEM) == 0) {
Index: sys/vm/vm_map.h
===================================================================
--- sys/vm/vm_map.h
+++ sys/vm/vm_map.h
@@ -149,6 +149,10 @@
#define MAP_ENTRY_STACK_GAP_UP 0x00040000
#define MAP_ENTRY_HEADER 0x00080000
+#define MAP_ENTRY_SPLIT_BOUNDARY_MASK 0x00300000
+
+#define MAP_ENTRY_SPLIT_BOUNDARY_SHIFT 20
+
#ifdef _KERNEL
static __inline u_char
vm_map_entry_behavior(vm_map_entry_t entry)
@@ -374,6 +378,9 @@
#define MAP_CREATE_STACK_GAP_UP 0x00010000
#define MAP_CREATE_STACK_GAP_DN 0x00020000
#define MAP_VN_EXEC 0x00040000
+#define MAP_SPLIT_BOUNDARY_MASK 0x00180000
+
+#define MAP_SPLIT_BOUNDARY_SHIFT 19
/*
* vm_fault option flags
@@ -462,6 +469,8 @@
vm_offset_t, int, vm_prot_t, vm_prot_t, int);
int vm_map_find_min(vm_map_t, vm_object_t, vm_ooffset_t, vm_offset_t *,
vm_size_t, vm_offset_t, vm_offset_t, int, vm_prot_t, vm_prot_t, int);
+int vm_map_find_aligned(vm_map_t map, vm_offset_t *addr, vm_size_t length,
+ vm_offset_t max_addr, vm_offset_t alignment);
int vm_map_fixed(vm_map_t, vm_object_t, vm_ooffset_t, vm_offset_t, vm_size_t,
vm_prot_t, vm_prot_t, int);
vm_offset_t vm_map_findspace(vm_map_t, vm_offset_t, vm_size_t);
Index: sys/vm/vm_map.c
===================================================================
--- sys/vm/vm_map.c
+++ sys/vm/vm_map.c
@@ -1603,13 +1603,17 @@
struct ucred *cred;
vm_eflags_t protoeflags;
vm_inherit_t inheritance;
+ u_long bdry;
+ u_int bidx;
VM_MAP_ASSERT_LOCKED(map);
KASSERT(object != kernel_object ||
(cow & MAP_COPY_ON_WRITE) == 0,
("vm_map_insert: kernel object and COW"));
- KASSERT(object == NULL || (cow & MAP_NOFAULT) == 0,
- ("vm_map_insert: paradoxical MAP_NOFAULT request"));
+ KASSERT(object == NULL || (cow & MAP_NOFAULT) == 0 ||
+ (cow & MAP_SPLIT_BOUNDARY_MASK) != 0,
+ ("vm_map_insert: paradoxical MAP_NOFAULT request, obj %p cow %#x",
+ object, cow));
KASSERT((prot & ~max) == 0,
("prot %#x is not subset of max_prot %#x", prot, max));
@@ -1664,6 +1668,17 @@
inheritance = VM_INHERIT_SHARE;
else
inheritance = VM_INHERIT_DEFAULT;
+ if ((cow & MAP_SPLIT_BOUNDARY_MASK) != 0) {
+ /* This magically ignores index 0, for the usual page size. */
+ bidx = (cow & MAP_SPLIT_BOUNDARY_MASK) >>
+ MAP_SPLIT_BOUNDARY_SHIFT;
+ if (bidx >= MAXPAGESIZES)
+ return (KERN_INVALID_ARGUMENT);
+ bdry = pagesizes[bidx] - 1;
+ if ((start & bdry) != 0 || (end & bdry) != 0)
+ return (KERN_INVALID_ARGUMENT);
+ protoeflags |= bidx << MAP_ENTRY_SPLIT_BOUNDARY_SHIFT;
+ }
cred = NULL;
if ((cow & (MAP_ACC_NO_CHARGE | MAP_NOFAULT | MAP_CREATE_GUARD)) != 0)
@@ -1958,8 +1973,6 @@
&aslr_restarts, 0,
"Number of aslr failures");
-#define MAP_32BIT_MAX_ADDR ((vm_offset_t)1 << 31)
-
/*
* Searches for the specified amount of free space in the given map with the
* specified alignment. Performs an address-ordered, first-fit search from
@@ -2027,6 +2040,19 @@
}
}
+int
+vm_map_find_aligned(vm_map_t map, vm_offset_t *addr, vm_size_t length,
+ vm_offset_t max_addr, vm_offset_t alignment)
+{
+ /* XXXKIB ASLR eh ? */
+ *addr = vm_map_findspace(map, *addr, length);
+ if (*addr + length > vm_map_max(map) ||
+ (max_addr != 0 && *addr + length > max_addr))
+ return (KERN_NO_SPACE);
+ return (vm_map_alignspace(map, NULL, 0, addr, length, max_addr,
+ alignment));
+}
+
/*
* vm_map_find finds an unallocated region in the target address
* map with the given length. The search is defined to be
@@ -2374,31 +2400,40 @@
* the specified address; if necessary,
* it splits the entry into two.
*/
-static inline void
-vm_map_clip_start(vm_map_t map, vm_map_entry_t entry, vm_offset_t start)
+static int
+vm_map_clip_start(vm_map_t map, vm_map_entry_t entry, vm_offset_t startaddr)
{
vm_map_entry_t new_entry;
+ int bdry_idx;
if (!map->system_map)
WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
"%s: map %p entry %p start 0x%jx", __func__, map, entry,
- (uintmax_t)start);
+ (uintmax_t)startaddr);
- if (start <= entry->start)
- return;
+ if (startaddr <= entry->start)
+ return (KERN_SUCCESS);
VM_MAP_ASSERT_LOCKED(map);
- KASSERT(entry->end > start && entry->start < start,
+ KASSERT(entry->end > startaddr && entry->start < startaddr,
("%s: invalid clip of entry %p", __func__, entry));
+ bdry_idx = (entry->eflags & MAP_ENTRY_SPLIT_BOUNDARY_MASK) >>
+ MAP_ENTRY_SPLIT_BOUNDARY_SHIFT;
+ if (bdry_idx != 0) {
+ if ((startaddr & (pagesizes[bdry_idx] - 1)) != 0)
+ return (KERN_INVALID_ARGUMENT);
+ }
+
new_entry = vm_map_entry_clone(map, entry);
/*
* Split off the front portion. Insert the new entry BEFORE this one,
* so that this entry has the specified starting address.
*/
- new_entry->end = start;
+ new_entry->end = startaddr;
vm_map_entry_link(map, new_entry);
+ return (KERN_SUCCESS);
}
/*
@@ -2408,11 +2443,12 @@
* the interior of the entry. Return entry after 'start', and in
* prev_entry set the entry before 'start'.
*/
-static inline vm_map_entry_t
+static int
vm_map_lookup_clip_start(vm_map_t map, vm_offset_t start,
- vm_map_entry_t *prev_entry)
+ vm_map_entry_t *res_entry, vm_map_entry_t *prev_entry)
{
vm_map_entry_t entry;
+ int rv;
if (!map->system_map)
WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
@@ -2421,11 +2457,14 @@
if (vm_map_lookup_entry(map, start, prev_entry)) {
entry = *prev_entry;
- vm_map_clip_start(map, entry, start);
+ rv = vm_map_clip_start(map, entry, start);
+ if (rv != KERN_SUCCESS)
+ return (rv);
*prev_entry = vm_map_entry_pred(entry);
} else
entry = vm_map_entry_succ(*prev_entry);
- return (entry);
+ *res_entry = entry;
+ return (KERN_SUCCESS);
}
/*
@@ -2435,31 +2474,41 @@
* the specified address; if necessary,
* it splits the entry into two.
*/
-static inline void
-vm_map_clip_end(vm_map_t map, vm_map_entry_t entry, vm_offset_t end)
+static int
+vm_map_clip_end(vm_map_t map, vm_map_entry_t entry, vm_offset_t endaddr)
{
vm_map_entry_t new_entry;
+ int bdry_idx;
if (!map->system_map)
WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
"%s: map %p entry %p end 0x%jx", __func__, map, entry,
- (uintmax_t)end);
+ (uintmax_t)endaddr);
- if (end >= entry->end)
- return;
+ if (endaddr >= entry->end)
+ return (KERN_SUCCESS);
VM_MAP_ASSERT_LOCKED(map);
- KASSERT(entry->start < end && entry->end > end,
+ KASSERT(entry->start < endaddr && entry->end > endaddr,
("%s: invalid clip of entry %p", __func__, entry));
+ bdry_idx = (entry->eflags & MAP_ENTRY_SPLIT_BOUNDARY_MASK) >>
+ MAP_ENTRY_SPLIT_BOUNDARY_SHIFT;
+ if (bdry_idx != 0) {
+ if ((endaddr & (pagesizes[bdry_idx] - 1)) != 0)
+ return (KERN_INVALID_ARGUMENT);
+ }
+
new_entry = vm_map_entry_clone(map, entry);
/*
* Split off the back portion. Insert the new entry AFTER this one,
* so that this entry has the specified ending address.
*/
- new_entry->start = end;
+ new_entry->start = endaddr;
vm_map_entry_link(map, new_entry);
+
+ return (KERN_SUCCESS);
}
/*
@@ -2501,12 +2550,17 @@
if (vm_map_lookup_entry(map, start, &entry) && entry->end >= end &&
(entry->eflags & MAP_ENTRY_COW) == 0 &&
entry->object.vm_object == NULL) {
- vm_map_clip_start(map, entry, start);
- vm_map_clip_end(map, entry, end);
+ result = vm_map_clip_start(map, entry, start);
+ if (result != KERN_SUCCESS)
+ goto unlock;
+ result = vm_map_clip_end(map, entry, end);
+ if (result != KERN_SUCCESS)
+ goto unlock;
entry->object.sub_map = submap;
entry->eflags |= MAP_ENTRY_IS_SUB_MAP;
result = KERN_SUCCESS;
}
+unlock:
vm_map_unlock(map);
if (result != KERN_SUCCESS) {
@@ -2693,11 +2747,18 @@
* of this loop early and let the next loop simplify the entries, since
* some may now be mergeable.
*/
- rv = KERN_SUCCESS;
- vm_map_clip_start(map, first_entry, start);
+ rv = vm_map_clip_start(map, first_entry, start);
+ if (rv != KERN_SUCCESS) {
+ vm_map_unlock(map);
+ return (rv);
+ }
for (entry = first_entry; entry->start < end;
entry = vm_map_entry_succ(entry)) {
- vm_map_clip_end(map, entry, end);
+ rv = vm_map_clip_end(map, entry, end);
+ if (rv != KERN_SUCCESS) {
+ vm_map_unlock(map);
+ return (rv);
+ }
if (set_max ||
((new_prot & ~entry->protection) & VM_PROT_WRITE) == 0 ||
@@ -2817,6 +2878,7 @@
int behav)
{
vm_map_entry_t entry, prev_entry;
+ int rv;
bool modify_map;
/*
@@ -2862,13 +2924,22 @@
* We clip the vm_map_entry so that behavioral changes are
* limited to the specified address range.
*/
- for (entry = vm_map_lookup_clip_start(map, start, &prev_entry);
- entry->start < end;
- prev_entry = entry, entry = vm_map_entry_succ(entry)) {
+ rv = vm_map_lookup_clip_start(map, start, &entry, &prev_entry);
+ if (rv != KERN_SUCCESS) {
+ vm_map_unlock(map);
+ return (vm_mmap_to_errno(rv));
+ }
+
+ for (; entry->start < end; prev_entry = entry,
+ entry = vm_map_entry_succ(entry)) {
if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0)
continue;
- vm_map_clip_end(map, entry, end);
+ rv = vm_map_clip_end(map, entry, end);
+ if (rv != KERN_SUCCESS) {
+ vm_map_unlock(map);
+ return (vm_mmap_to_errno(rv));
+ }
switch (behav) {
case MADV_NORMAL:
@@ -3003,6 +3074,7 @@
vm_inherit_t new_inheritance)
{
vm_map_entry_t entry, prev_entry;
+ int rv;
switch (new_inheritance) {
case VM_INHERIT_NONE:
@@ -3013,14 +3085,19 @@
default:
return (KERN_INVALID_ARGUMENT);
}
+ rv = KERN_SUCCESS;
if (start == end)
- return (KERN_SUCCESS);
+ return (rv);
vm_map_lock(map);
VM_MAP_RANGE_CHECK(map, start, end);
- for (entry = vm_map_lookup_clip_start(map, start, &prev_entry);
- entry->start < end;
- prev_entry = entry, entry = vm_map_entry_succ(entry)) {
- vm_map_clip_end(map, entry, end);
+ rv = vm_map_lookup_clip_start(map, start, &entry, &prev_entry);
+ if (rv != KERN_SUCCESS)
+ goto unlock;
+ for (; entry->start < end; prev_entry = entry,
+ entry = vm_map_entry_succ(entry)) {
+ rv = vm_map_clip_end(map, entry, end);
+ if (rv != KERN_SUCCESS)
+ goto unlock;
if ((entry->eflags & MAP_ENTRY_GUARD) == 0 ||
new_inheritance != VM_INHERIT_ZERO)
entry->inheritance = new_inheritance;
@@ -3028,7 +3105,8 @@
}
vm_map_try_merge_entries(map, prev_entry, entry);
vm_map_unlock(map);
- return (KERN_SUCCESS);
+unlock:
+ return (rv);
}
/*
@@ -3127,8 +3205,13 @@
next_entry : NULL;
continue;
}
- vm_map_clip_start(map, entry, start);
- vm_map_clip_end(map, entry, end);
+ rv = vm_map_clip_start(map, entry, start);
+ if (rv != KERN_SUCCESS)
+ break;
+ rv = vm_map_clip_end(map, entry, end);
+ if (rv != KERN_SUCCESS)
+ break;
+
/*
* Mark the entry in case the map lock is released. (See
* above.)
@@ -3335,8 +3418,13 @@
next_entry : NULL;
continue;
}
- vm_map_clip_start(map, entry, start);
- vm_map_clip_end(map, entry, end);
+ rv = vm_map_clip_start(map, entry, start);
+ if (rv != KERN_SUCCESS)
+ break;
+ rv = vm_map_clip_end(map, entry, end);
+ if (rv != KERN_SUCCESS)
+ break;
+
/*
* Mark the entry in case the map lock is released. (See
* above.)
@@ -3737,7 +3825,8 @@
int
vm_map_delete(vm_map_t map, vm_offset_t start, vm_offset_t end)
{
- vm_map_entry_t entry, next_entry;
+ vm_map_entry_t entry, next_entry, scratch_entry;
+ int rv;
VM_MAP_ASSERT_LOCKED(map);
@@ -3748,8 +3837,10 @@
* Find the start of the region, and clip it.
* Step through all entries in this region.
*/
- for (entry = vm_map_lookup_clip_start(map, start, &entry);
- entry->start < end; entry = next_entry) {
+ rv = vm_map_lookup_clip_start(map, start, &entry, &scratch_entry);
+ if (rv != KERN_SUCCESS)
+ return (rv);
+ for (; entry->start < end; entry = next_entry) {
/*
* Wait for wiring or unwiring of an entry to complete.
* Also wait for any system wirings to disappear on
@@ -3773,13 +3864,19 @@
* Specifically, the entry may have been
* clipped, merged, or deleted.
*/
- next_entry = vm_map_lookup_clip_start(map,
- saved_start, &next_entry);
+ rv = vm_map_lookup_clip_start(map, saved_start,
+ &next_entry, &scratch_entry);
+ if (rv != KERN_SUCCESS)
+ break;
} else
next_entry = entry;
continue;
}
- vm_map_clip_end(map, entry, end);
+
+ /* XXXKIB or delete to the upper superpage boundary ? */
+ rv = vm_map_clip_end(map, entry, end);
+ if (rv != KERN_SUCCESS)
+ break;
next_entry = vm_map_entry_succ(entry);
/*
@@ -3809,7 +3906,7 @@
*/
vm_map_entry_delete(map, entry);
}
- return (KERN_SUCCESS);
+ return (rv);
}
/*
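With largepage mappings a map entry can no longer be split at an arbitrary address, so vm_map_clip_start() and vm_map_clip_end() now return a Mach status instead of void (failing, for example, when the requested boundary does not suit the entry), and every caller has to check the result and back out while still holding the map lock. A compressed, purely illustrative version of the caller pattern used throughout this file (the function name is hypothetical, not part of the diff):

	static int
	example_clip_range(vm_map_t map, vm_offset_t start, vm_offset_t end)
	{
		vm_map_entry_t entry;
		int rv;

		vm_map_lock(map);
		if (!vm_map_lookup_entry(map, start, &entry)) {
			vm_map_unlock(map);
			return (KERN_INVALID_ADDRESS);
		}
		/* Clipping may now fail, e.g. for largepage entries. */
		rv = vm_map_clip_start(map, entry, start);
		if (rv == KERN_SUCCESS)
			rv = vm_map_clip_end(map, entry, end);
		vm_map_unlock(map);
		return (rv);
	}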
Index: sys/vm/vm_mmap.c
===================================================================
--- sys/vm/vm_mmap.c
+++ sys/vm/vm_mmap.c
@@ -218,14 +218,14 @@
struct file *fp;
struct proc *p;
off_t pos;
- vm_offset_t addr;
+ vm_offset_t addr, orig_addr;
vm_size_t len, pageoff, size;
vm_prot_t cap_maxprot;
int align, error, fd, flags, max_prot, prot;
cap_rights_t rights;
mmap_check_fp_fn check_fp_fn;
- addr = mrp->mr_hint;
+ orig_addr = addr = mrp->mr_hint;
len = mrp->mr_len;
prot = mrp->mr_prot;
flags = mrp->mr_flags;
@@ -421,6 +421,8 @@
if (error != 0)
goto done;
}
+ if (fp->f_ops == &shm_ops_large)
+ addr = orig_addr;
/* This relies on VM_PROT_* matching PROT_*. */
error = fo_mmap(fp, &vms->vm_map, &addr, size, prot,
max_prot & cap_maxprot, flags, pos, td);
@@ -1510,6 +1512,39 @@
return (error);
}
+int
+kern_mmap_racct_check(struct thread *td, vm_map_t map, vm_size_t size)
+{
+ int error;
+
+ RACCT_PROC_LOCK(td->td_proc);
+ if (map->size + size > lim_cur(td, RLIMIT_VMEM)) {
+ RACCT_PROC_UNLOCK(td->td_proc);
+ return (ENOMEM);
+ }
+ if (racct_set(td->td_proc, RACCT_VMEM, map->size + size)) {
+ RACCT_PROC_UNLOCK(td->td_proc);
+ return (ENOMEM);
+ }
+ if (!old_mlock && map->flags & MAP_WIREFUTURE) {
+ if (ptoa(pmap_wired_count(map->pmap)) + size >
+ lim_cur(td, RLIMIT_MEMLOCK)) {
+ racct_set_force(td->td_proc, RACCT_VMEM, map->size);
+ RACCT_PROC_UNLOCK(td->td_proc);
+ return (ENOMEM);
+ }
+ error = racct_set(td->td_proc, RACCT_MEMLOCK,
+ ptoa(pmap_wired_count(map->pmap)) + size);
+ if (error != 0) {
+ racct_set_force(td->td_proc, RACCT_VMEM, map->size);
+ RACCT_PROC_UNLOCK(td->td_proc);
+ return (error);
+ }
+ }
+ RACCT_PROC_UNLOCK(td->td_proc);
+ return (0);
+}
+
/*
* Internal version of mmap that maps a specific VM object into an
* map. Called by mmap for MAP_ANON, vm_mmap, shm_mmap, and vn_mmap.
@@ -1519,39 +1554,15 @@
vm_prot_t maxprot, int flags, vm_object_t object, vm_ooffset_t foff,
boolean_t writecounted, struct thread *td)
{
- boolean_t curmap, fitit;
vm_offset_t max_addr;
int docow, error, findspace, rv;
+ bool curmap, fitit;
curmap = map == &td->td_proc->p_vmspace->vm_map;
if (curmap) {
- RACCT_PROC_LOCK(td->td_proc);
- if (map->size + size > lim_cur(td, RLIMIT_VMEM)) {
- RACCT_PROC_UNLOCK(td->td_proc);
- return (ENOMEM);
- }
- if (racct_set(td->td_proc, RACCT_VMEM, map->size + size)) {
- RACCT_PROC_UNLOCK(td->td_proc);
- return (ENOMEM);
- }
- if (!old_mlock && map->flags & MAP_WIREFUTURE) {
- if (ptoa(pmap_wired_count(map->pmap)) + size >
- lim_cur(td, RLIMIT_MEMLOCK)) {
- racct_set_force(td->td_proc, RACCT_VMEM,
- map->size);
- RACCT_PROC_UNLOCK(td->td_proc);
- return (ENOMEM);
- }
- error = racct_set(td->td_proc, RACCT_MEMLOCK,
- ptoa(pmap_wired_count(map->pmap)) + size);
- if (error != 0) {
- racct_set_force(td->td_proc, RACCT_VMEM,
- map->size);
- RACCT_PROC_UNLOCK(td->td_proc);
- return (error);
- }
- }
- RACCT_PROC_UNLOCK(td->td_proc);
+ error = kern_mmap_racct_check(td, map, size);
+ if (error != 0)
+ return (error);
}
/*
Index: sys/vm/vm_object.h
===================================================================
--- sys/vm/vm_object.h
+++ sys/vm/vm_object.h
@@ -173,6 +173,17 @@
struct pctrie swp_blks;
vm_ooffset_t writemappings;
} swp;
+
+ /*
+ * Phys pager
+ */
+ struct {
+ struct phys_pager_ops *ops;
+ union {
+ void *data_ptr;
+ uintptr_t data_val;
+ };
+ } phys;
} un_pager;
struct ucred *cred;
vm_ooffset_t charge;
Index: sys/vm/vm_object.c
===================================================================
--- sys/vm/vm_object.c
+++ sys/vm/vm_object.c
@@ -289,6 +289,7 @@
kernel_object->flags |= OBJ_COLORED;
kernel_object->pg_color = (u_short)atop(VM_MIN_KERNEL_ADDRESS);
#endif
+ kernel_object->un_pager.phys.ops = &default_phys_pg_ops;
/*
* The lock portion of struct vm_object must be type stable due
Index: sys/vm/vm_page.c
===================================================================
--- sys/vm/vm_page.c
+++ sys/vm/vm_page.c
@@ -3117,9 +3117,12 @@
return (vm_severe_waiters + vm_min_waiters + vm_pageproc_waiters);
}
-void
-vm_wait_doms(const domainset_t *wdoms)
+int
+vm_wait_doms(const domainset_t *wdoms, int mflags)
{
+ int error;
+
+ error = 0;
/*
* We use racey wakeup synchronization to avoid expensive global
@@ -3132,8 +3135,8 @@
if (curproc == pageproc) {
mtx_lock(&vm_domainset_lock);
vm_pageproc_waiters++;
- msleep(&vm_pageproc_waiters, &vm_domainset_lock, PVM | PDROP,
- "pageprocwait", 1);
+ error = msleep(&vm_pageproc_waiters, &vm_domainset_lock,
+ PVM | PDROP | mflags, "pageprocwait", 1);
} else {
/*
* XXX Ideally we would wait only until the allocation could
@@ -3143,11 +3146,12 @@
mtx_lock(&vm_domainset_lock);
if (vm_page_count_min_set(wdoms)) {
vm_min_waiters++;
- msleep(&vm_min_domains, &vm_domainset_lock,
- PVM | PDROP, "vmwait", 0);
+ error = msleep(&vm_min_domains, &vm_domainset_lock,
+ PVM | PDROP | mflags, "vmwait", 0);
} else
mtx_unlock(&vm_domainset_lock);
}
+ return (error);
}
/*
@@ -3178,20 +3182,12 @@
panic("vm_wait in early boot");
DOMAINSET_ZERO(&wdom);
DOMAINSET_SET(vmd->vmd_domain, &wdom);
- vm_wait_doms(&wdom);
+ vm_wait_doms(&wdom, 0);
}
}
-/*
- * vm_wait:
- *
- * Sleep until free pages are available for allocation in the
- * affinity domains of the obj. If obj is NULL, the domain set
- * for the calling thread is used.
- * Called in various places after failed memory allocations.
- */
-void
-vm_wait(vm_object_t obj)
+static int
+vm_wait_flags(vm_object_t obj, int mflags)
{
struct domainset *d;
@@ -3206,7 +3202,27 @@
if (d == NULL)
d = curthread->td_domain.dr_policy;
- vm_wait_doms(&d->ds_mask);
+ return (vm_wait_doms(&d->ds_mask, mflags));
+}
+
+/*
+ * vm_wait:
+ *
+ * Sleep until free pages are available for allocation in the
+ * affinity domains of the obj. If obj is NULL, the domain set
+ * for the calling thread is used.
+ * Called in various places after failed memory allocations.
+ */
+void
+vm_wait(vm_object_t obj)
+{
+ (void)vm_wait_flags(obj, 0);
+}
+
+int
+vm_wait_intr(vm_object_t obj)
+{
+ return (vm_wait_flags(obj, PCATCH));
}
/*
Index: sys/vm/vm_pageout.h
===================================================================
--- sys/vm/vm_pageout.h
+++ sys/vm/vm_pageout.h
@@ -97,6 +97,7 @@
*/
void vm_wait(vm_object_t obj);
+int vm_wait_intr(vm_object_t obj);
void vm_waitpfault(struct domainset *, int timo);
void vm_wait_domain(int domain);
void vm_wait_min(void);
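vm_wait_intr() is the interruptible counterpart of vm_wait(): it propagates the msleep() error, so a caller that may sleep for a long time while reserving memory (as a largepage allocation can) may be aborted by a signal. A minimal sketch of the intended caller pattern (hypothetical fragment, not taken from this diff):

	vm_page_t m;
	int error;

	VM_OBJECT_WLOCK(object);
	while ((m = vm_page_alloc(object, pindex, VM_ALLOC_NORMAL)) == NULL) {
		/* Drop the object lock around the interruptible sleep. */
		VM_OBJECT_WUNLOCK(object);
		error = vm_wait_intr(object);
		if (error != 0)
			return (error);		/* interrupted by a signal */
		VM_OBJECT_WLOCK(object);
	}
	/* ... use m, then VM_OBJECT_WUNLOCK(object). */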
Index: sys/vm/vm_pager.h
===================================================================
--- sys/vm/vm_pager.h
+++ sys/vm/vm_pager.h
@@ -229,5 +229,22 @@
vm_object_t cdev_pager_lookup(void *handle);
void cdev_pager_free_page(vm_object_t object, vm_page_t m);
+struct phys_pager_ops {
+ int (*phys_pg_getpages)(vm_object_t vm_obj, vm_page_t *m, int count,
+ int *rbehind, int *rahead);
+ int (*phys_pg_populate)(vm_object_t vm_obj, vm_pindex_t pidx,
+ int fault_type, vm_prot_t max_prot, vm_pindex_t *first,
+ vm_pindex_t *last);
+ boolean_t (*phys_pg_haspage)(vm_object_t obj, vm_pindex_t pindex,
+ int *before, int *after);
+ void (*phys_pg_ctor)(vm_object_t vm_obj, vm_prot_t prot,
+ vm_ooffset_t foff, struct ucred *cred);
+ void (*phys_pg_dtor)(vm_object_t vm_obj);
+};
+extern struct phys_pager_ops default_phys_pg_ops;
+vm_object_t phys_pager_allocate(void *handle, struct phys_pager_ops *ops,
+ void *data, vm_ooffset_t size, vm_prot_t prot, vm_ooffset_t foff,
+ struct ucred *cred);
+
#endif /* _KERNEL */
#endif /* _VM_PAGER_ */
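The phys pager now dispatches through a per-object ops vector (stored in un_pager.phys together with a private data word), so a subsystem such as the largepage shm code can supply its own populate/haspage/ctor/dtor handlers while reusing the rest of the pager. A rough sketch of a hypothetical consumer (all foo_* names are illustrative, not part of this diff):

	static int
	foo_phys_populate(vm_object_t obj, vm_pindex_t pidx, int fault_type,
	    vm_prot_t max_prot, vm_pindex_t *first, vm_pindex_t *last)
	{
		/* Busy the backing page(s) for the faulted range here. */
		*first = *last = pidx;
		return (VM_PAGER_OK);
	}

	static boolean_t
	foo_phys_haspage(vm_object_t obj, vm_pindex_t pindex, int *before,
	    int *after)
	{
		if (before != NULL)
			*before = 0;
		if (after != NULL)
			*after = 0;
		return (TRUE);
	}

	static void
	foo_phys_dtor(vm_object_t obj)
	{
		/* Release whatever was stashed in obj->un_pager.phys.data_ptr. */
	}

	static struct phys_pager_ops foo_phys_ops = {
		.phys_pg_populate = foo_phys_populate,
		.phys_pg_haspage = foo_phys_haspage,
		.phys_pg_dtor = foo_phys_dtor,
	};

	/* Attach the private data and the ops vector to a new object. */
	obj = phys_pager_allocate(NULL, &foo_phys_ops, data, size,
	    VM_PROT_DEFAULT, 0, cred);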
Index: usr.bin/posixshmcontrol/posixshmcontrol.c
===================================================================
--- usr.bin/posixshmcontrol/posixshmcontrol.c
+++ usr.bin/posixshmcontrol/posixshmcontrol.c
@@ -30,8 +30,10 @@
__FBSDID("$FreeBSD$");
#include <sys/param.h>
+#include <sys/filio.h>
#include <sys/mman.h>
#include <sys/stat.h>
+#include <sys/syscall.h>
#include <sys/sysctl.h>
#include <sys/user.h>
#include <err.h>
@@ -50,7 +52,7 @@
{
fprintf(stderr, "Usage:\n"
- "posixshmcontrol create [-m <mode>] <path> ...\n"
+ "posixshmcontrol create [-m <mode>] [-l largepage] <path> ...\n"
"posixshmcontrol rm <path> ...\n"
"posixshmcontrol ls [-h] [-n]\n"
"posixshmcontrol dump <path> ...\n"
@@ -59,14 +61,28 @@
}
static int
-create_one_shm(const char *path, long mode)
+create_one_shm(const char *path, long mode, int idx)
{
- int fd;
+ struct shm_largepage_conf slc;
+ int error, fd;
- fd = shm_open(path, O_RDWR | O_CREAT, mode);
- if (fd == -1) {
- warn("create %s", path);
- return (1);
+ if (idx == -1) {
+ fd = shm_open(path, O_RDWR | O_CREAT, mode);
+ if (fd == -1) {
+ warn("create %s", path);
+ return (1);
+ }
+ } else {
+ fd = syscall(SYS_shm_open2, path, O_CREAT | O_RDWR, 0666,
+ SHM_LARGEPAGE, (void *)NULL);
+ if (fd == -1) {
+ warn("shm_open2 %s", path);
+ return (1);
+ }
+ memset(&slc, 0, sizeof(slc));
+ slc.psind = idx;
+ slc.alloc_policy = SHM_LARGEPAGE_ALLOC_DEFAULT;
+ error = ioctl(fd, FIOSHMLPGCNF, &slc);
+ if (error == -1)
+ err(1, "FIOSHMLPGCNF");
}
close(fd);
return (0);
@@ -76,11 +92,15 @@
create_shm(int argc, char **argv)
{
char *end;
+ size_t *pagesizes;
long mode;
- int c, i, ret, ret1;
+ uint64_t pgsz;
+ int c, i, idx, pn, ret, ret1;
+ bool printed;
mode = 0600;
- while ((c = getopt(argc, argv, "m:")) != -1) {
+ idx = -1;
+ while ((c = getopt(argc, argv, "l:m:")) != -1) {
switch (c) {
case 'm':
errno = 0;
@@ -90,6 +110,42 @@
if (*end != '\0')
errx(1, "non-integer mode");
break;
+ case 'l':
+ if (expand_number(optarg, &pgsz) == -1)
+ err(1, "size:");
+ pn = getpagesizes(NULL, 0);
+ if (pn == -1)
+ err(1, "getpagesizes");
+ pagesizes = malloc(sizeof(size_t) * pn);
+ if (pagesizes == NULL)
+ err(1, "malloc");
+ if (getpagesizes(pagesizes, pn) == -1)
+ err(1, "gtpagesizes");
+ for (idx = 0; idx < pn; idx++) {
+ if (pagesizes[idx] == pgsz)
+ break;
+ }
+ if (idx == pn) {
+ fprintf(stderr,
+ "pagesize should be superpagesize, supported sizes:");
+ printed = false;
+ for (i = 0; i < pn; i++) {
+ if (pagesizes[i] == 0 ||
+ pagesizes[i] == (size_t)
+ getpagesize())
+ continue;
+ printed = true;
+ fprintf(stderr, " %zu", pagesizes[i]);
+ }
+ if (!printed)
+ fprintf(stderr, " none");
+ fprintf(stderr, "\n");
+ exit(1);
+ }
+ if (pgsz == (uint64_t)getpagesize())
+ errx(1, "pagesize should be large");
+ free(pagesizes);
+ break;
case '?':
default:
usage();
@@ -101,7 +157,7 @@
argv += optind;
ret = 0;
for (i = 0; i < argc; i++) {
- ret1 = create_one_shm(argv[i], mode);
+ ret1 = create_one_shm(argv[i], mode, idx);
if (ret1 != 0 && ret == 0)
ret = ret1;
}
@@ -349,6 +405,9 @@
(long)st.st_ctim.tv_nsec);
printf("birth\t%ld.%09ld\n", (long)st.st_birthtim.tv_sec,
(long)st.st_birthtim.tv_nsec);
+ if (st.st_blocks != 0)
+ printf("pagesz\t%jd\n", roundup((uintmax_t)st.st_size,
+ PAGE_SIZE) / st.st_blocks);
}
close(fd);
return (ret);
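For completeness, the userspace side that posixshmcontrol now exercises: create the object with SHM_LARGEPAGE through shm_open2(), pick a superpage size reported by getpagesizes(), configure it with the FIOSHMLPGCNF ioctl, and only then size and map it. A trimmed, illustrative sketch (error handling reduced to early returns; the function name is hypothetical):

	#include <sys/param.h>
	#include <sys/filio.h>
	#include <sys/ioctl.h>
	#include <sys/mman.h>
	#include <sys/syscall.h>
	#include <fcntl.h>
	#include <string.h>
	#include <unistd.h>

	void *
	map_largepage_shm(const char *path, size_t len)
	{
		struct shm_largepage_conf slc;
		size_t pagesizes[MAXPAGESIZES], rlen;
		int fd;

		/* Index 0 is the base page; index 1 is the first superpage size. */
		if (getpagesizes(pagesizes, MAXPAGESIZES) < 2)
			return (MAP_FAILED);
		fd = syscall(SYS_shm_open2, path, O_CREAT | O_RDWR, 0600,
		    SHM_LARGEPAGE, (void *)NULL);
		if (fd == -1)
			return (MAP_FAILED);
		memset(&slc, 0, sizeof(slc));
		slc.psind = 1;
		slc.alloc_policy = SHM_LARGEPAGE_ALLOC_DEFAULT;
		if (ioctl(fd, FIOSHMLPGCNF, &slc) == -1)
			return (MAP_FAILED);
		/* The object size must be a multiple of the chosen page size. */
		rlen = roundup2(len, pagesizes[1]);
		if (ftruncate(fd, rlen) == -1)
			return (MAP_FAILED);
		return (mmap(NULL, rlen, PROT_READ | PROT_WRITE, MAP_SHARED,
		    fd, 0));
	}

With the new -l option the same setup is available from the command line; for example, posixshmcontrol create -l 2m /lptest should create /lptest backed by 2MB superpages on amd64.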