D24652: Non-transparent superpages support
D24652.id71372.diff (61 KB)
Index: TODO
===================================================================
--- /dev/null
+++ TODO
@@ -0,0 +1,12 @@
+- sysctl for total allocated superpages memory
+- per-user limit on the total superpages allocations
+- posix shm API instead (?) of /dev/hugetlb
+- handle 1G PG_PS in other places of pmap
+- make pmap_superpagesizes[] per-pmap ?
+- more test programs
+
+<kib> 1. I either add a new pager type or allow to specify populate method, and make sure that vm_fault_populate() can cope
+<kib> 2. I will not expose pmap_enter_largepage() but I still want to keep it
+<kib> 3. there will be a new SHM flag to shm_open2() that opens special large-page shm, with phys pager backing, and some defaults
+<kib> 4. I do not need even a new mmap flag, if vm_mmap detects this object it should do the right thing, I just need to stash psind somewhere
+<kib> 5. shm pages will be instantiated with ftruncate() which must be done before mmap
\ No newline at end of file
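The notes above sketch the shm-based design that the rest of this diff implements: a SHM_LARGEPAGE flag for shm_open2(), a phys-pager-backed object, a FIOSHMLPGCNF ioctl to pick the page size index, and ftruncate() before mmap(). Below is a hypothetical userland sketch of that flow, not part of the patch; the direct syscall(SYS_shm_open2, ...) invocation and the choice of psind 1 (pagesizes[1], the 2M amd64 superpage) are assumptions, since the diff adds no libc wrapper.

/*
 * Hypothetical sketch: create an anonymous largepage shm object, select the
 * superpage size, allocate backing pages with ftruncate(), then map it.
 */
#include <sys/types.h>
#include <sys/filio.h>		/* FIOSHMLPGCNF, added by this diff */
#include <sys/ioctl.h>
#include <sys/mman.h>		/* SHM_LARGEPAGE, struct shm_largepage_conf */
#include <sys/syscall.h>
#include <err.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	struct shm_largepage_conf conf;
	size_t len;
	void *p;
	int fd;

	/* Anonymous shm object with the new SHM_LARGEPAGE shmflag. */
	fd = syscall(SYS_shm_open2, SHM_ANON, O_RDWR, 0600, SHM_LARGEPAGE,
	    NULL);
	if (fd == -1)
		err(1, "shm_open2");

	/* psind 1 is pagesizes[1], the 2M superpage on amd64. */
	memset(&conf, 0, sizeof(conf));
	conf.psind = 1;
	conf.alloc_policy = SHM_LARGEPAGE_ALLOC_DEFAULT;
	if (ioctl(fd, FIOSHMLPGCNF, &conf) == -1)
		err(1, "FIOSHMLPGCNF");

	/* Backing superpages are allocated here, before mmap() (note 5). */
	len = 2 * 1024 * 1024;
	if (ftruncate(fd, (off_t)len) == -1)
		err(1, "ftruncate");

	p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED)
		err(1, "mmap");
	printf("2M largepage mapping at %p\n", p);
	return (0);
}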
Index: sys/amd64/amd64/pmap.c
===================================================================
--- sys/amd64/amd64/pmap.c
+++ sys/amd64/amd64/pmap.c
@@ -2136,6 +2136,11 @@
KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0,
("pmap_init: can't assign to pagesizes[1]"));
pagesizes[1] = NBPDR;
+ if ((amd_feature & AMDID_PAGE1GB) != 0) {
+ KASSERT(MAXPAGESIZES > 2 && pagesizes[2] == 0,
+ ("pmap_init: can't assign to pagesizes[2]"));
+ pagesizes[2] = NBPDP;
+ }
}
/*
@@ -3780,6 +3785,19 @@
* one or two pages may be held during the wait, only to be released
* afterwards. This conservative approach is easily argued to avoid
* race conditions.
+ *
+ * The page table page index (ptepindex) for address va is defined as
+ * follows:
+ * - for the page table (last level), ptepindex = pmap_pde_pindex(va) =
+ *   va >> PDRSHIFT, in other words, it is just the index of the PDE.
+ * - for the page directory page, ptepindex = NUPDE (number of userland PD
+ *   entries) + (pmap_pde_index(va) >> NPDEPGSHIFT),
+ *   i.e. the index of the PDPE is put after the last index of the PDE,
+ * - for the page directory pointer page, ptepindex = NUPDE + NUPDPE +
+ *   (pmap_pde_index(va) >> (NPDEPGSHIFT + NPML4EPGSHIFT)),
+ *   i.e. the index of the pml4e is put after the last index of the PDPE.
+ * In other words, it is the sequential number of the corresponding paging
+ * entry in the order where all entries of the same rank are put together,
+ * and ranks are put from deepest to root.
*/
static vm_page_t
_pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp)
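As an editorial aside on the ptepindex encoding described in the comment above, here is a small user-space illustration (not part of the patch) that recomputes the three indices for one user address. It reads the formulas with the full linear PDE index (pmap_pde_pindex(va)) and assumes the pre-LA57 amd64 constants (256 user PML4 slots, 512 entries per paging-structure page); the printed values only show how the three ranks are laid out one after another.

#include <stdint.h>
#include <stdio.h>

#define PDRSHIFT	21		/* log2(2M); one PDE maps 2M */
#define NPDEPGSHIFT	9		/* 512 PDEs per PD page */
#define NPML4EPGSHIFT	9		/* 512 PML4Es per PML4 page */
#define NUPDPE		(256UL * 512)	/* userland PDP entries (assumed) */
#define NUPDE		(NUPDPE * 512)	/* userland PD entries (assumed) */

int
main(void)
{
	uint64_t va = 0x00007f1234600000UL;	/* arbitrary user address */
	uint64_t pde_idx, pdpe_idx, pml4e_idx;

	/* Rank 0: index of the PDE, i.e. of the page table page. */
	pde_idx = va >> PDRSHIFT;
	/* Rank 1: index of the PDPE, placed after all PDE indices. */
	pdpe_idx = NUPDE + (pde_idx >> NPDEPGSHIFT);
	/* Rank 2: index of the PML4E, placed after all PDPE indices. */
	pml4e_idx = NUPDE + NUPDPE +
	    (pde_idx >> (NPDEPGSHIFT + NPML4EPGSHIFT));

	printf("ptepindex: pt %#jx, pd %#jx, pdp %#jx\n",
	    (uintmax_t)pde_idx, (uintmax_t)pdpe_idx, (uintmax_t)pml4e_idx);
	return (0);
}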
@@ -5395,6 +5413,7 @@
pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
struct rwlock *lock;
+ vm_page_t mt;
vm_offset_t va_next;
pml4_entry_t *pml4e;
pdp_entry_t *pdpe;
@@ -5447,13 +5466,28 @@
}
pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
+ va_next = (sva + NBPDP) & ~PDPMASK;
if ((*pdpe & PG_V) == 0) {
- va_next = (sva + NBPDP) & ~PDPMASK;
if (va_next < sva)
va_next = eva;
continue;
}
+ KASSERT((*pdpe & PG_PS) == 0 || va_next <= eva,
+ ("pmap_remove of non-transient 1G page "
+ "pdpe %#lx sva %#lx eva %#lx va_next %#lx",
+ *pdpe, sva, eva, va_next));
+ if ((*pdpe & PG_PS) != 0) {
+ MPASS(pmap != kernel_pmap); /* XXXKIB */
+ MPASS((*pdpe & (PG_MANAGED | PG_G)) == 0);
+ anyvalid = 1;
+ *pdpe = 0;
+ pmap_resident_count_dec(pmap, NBPDP / PAGE_SIZE);
+ mt = PHYS_TO_VM_PAGE(*pmap_pml4e(pmap, sva) & PG_FRAME);
+ pmap_unwire_ptp(pmap, sva, mt, &free);
+ continue;
+ }
+
/*
* Calculate index for next page table.
*/
@@ -5669,11 +5703,13 @@
void
pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
{
+ vm_page_t m;
vm_offset_t va_next;
pml4_entry_t *pml4e;
pdp_entry_t *pdpe;
pd_entry_t ptpaddr, *pde;
pt_entry_t *pte, PG_G, PG_M, PG_RW, PG_V;
+ pt_entry_t obits, pbits;
boolean_t anychanged;
KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot));
@@ -5724,13 +5760,44 @@
}
pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
+ va_next = (sva + NBPDP) & ~PDPMASK;
if ((*pdpe & PG_V) == 0) {
- va_next = (sva + NBPDP) & ~PDPMASK;
if (va_next < sva)
va_next = eva;
continue;
}
+ KASSERT((*pdpe & PG_PS) == 0 || va_next <= eva,
+ ("pmap_remove of non-transient 1G page "
+ "pdpe %#lx sva %#lx eva %#lx va_next %#lx",
+ *pdpe, sva, eva, va_next));
+ if ((*pdpe & PG_PS) != 0) {
+retry_pdpe:
+ /*
+ * Must not change protection lazily, we do
+ * not handle page faults on 1G superpages.
+ */
+ obits = pbits = *pdpe;
+ MPASS((pbits & (PG_MANAGED | PG_G)) == 0);
+ MPASS(pmap != kernel_pmap); /* XXXKIB */
+ if ((prot & VM_PROT_WRITE) == 0)
+ pbits &= ~(PG_RW | PG_M);
+ else
+ pbits |= PG_RW | PG_M;
+ if ((prot & VM_PROT_EXECUTE) == 0)
+ pbits |= pg_nx;
+ else
+ pbits &= ~pg_nx;
+
+ if (pbits != obits) {
+ if (!atomic_cmpset_long(pdpe, obits, pbits))
+ /* PG_PS cannot be cleared under us, */
+ goto retry_pdpe;
+ anychanged = TRUE;
+ }
+ continue;
+ }
+
va_next = (sva + NBPDR) & ~PDRMASK;
if (va_next < sva)
va_next = eva;
@@ -5773,9 +5840,6 @@
for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
sva += PAGE_SIZE) {
- pt_entry_t obits, pbits;
- vm_page_t m;
-
retry:
obits = pbits = *pte;
if ((pbits & PG_V) == 0)
@@ -5950,6 +6014,95 @@
}
#endif /* VM_NRESERVLEVEL > 0 */
+static int
+pmap_enter_largepage(pmap_t pmap, vm_offset_t va, pt_entry_t newpte, int flags,
+ int psind)
+{
+ vm_page_t mp;
+ pt_entry_t *pml4e, *pdpe, *pde, pten, PG_V;
+ vm_pindex_t ptepindex;
+
+ PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+ KASSERT(psind > 0 && psind < MAXPAGESIZES, ("XXX"));
+ KASSERT(((newpte & PG_FRAME) & (pagesizes[psind] - 1)) == 0,
+ ("XXX"));
+ KASSERT((va & (pagesizes[psind] - 1)) == 0,
+ ("XXX"));
+ KASSERT(va < VM_MAXUSER_ADDRESS, ("XXX")); /* XXXKIB */
+ KASSERT(va + pagesizes[psind] < VM_MAXUSER_ADDRESS,
+ ("XXX")); /* XXXKIB */
+
+ PG_V = pmap_valid_bit(pmap);
+
+restart:
+ pten = newpte;
+ if (va < VM_MAXUSER_ADDRESS && pmap->pm_type == PT_X86)
+ pten |= pmap_pkru_get(pmap, va);
+
+ ptepindex = pmap_pde_pindex(va);
+
+ if (psind == 2) { /* 1G */
+ pml4e = pmap_pml4e(pmap, va);
+ if ((*pml4e & PG_V) == 0) {
+ mp = _pmap_allocpte(pmap, NUPDE + NUPDPE +
+ ((ptepindex - NUPDE) >> NPML4EPGSHIFT), NULL);
+ if (mp == NULL) {
+ if ((flags & PMAP_ENTER_NOSLEEP) != 0)
+ return (KERN_RESOURCE_SHORTAGE);
+ PMAP_UNLOCK(pmap);
+ vm_wait(NULL);
+ PMAP_LOCK(pmap);
+
+ /*
+ * Restart at least to recalculate the pkru
+ * key. Our caller must keep the map locked
+ * so no paging structure can be validated
+ * under us.
+ */
+ goto restart;
+ }
+ } else {
+ mp = PHYS_TO_VM_PAGE(*pml4e & PG_FRAME);
+ mp->ref_count++;
+ }
+ pdpe = pmap_pdpe(pmap, va);
+ KASSERT(pdpe != NULL, ("XXX"));
+ KASSERT((*pdpe & PG_V) == 0 || ((*pdpe & PG_PS) != 0 &&
+ (*pdpe & PG_FRAME) == (newpte & PG_FRAME)),
+ ("XXX"));
+ *pdpe = newpte;
+ } else /* (psind == 1) */ { /* 2M, keep for now */
+ pde = pmap_pde(pmap, va);
+ if (pde == NULL) {
+ mp = _pmap_allocpte(pmap, NUPDE +
+ (ptepindex >> NPDPEPGSHIFT), NULL);
+ if (mp == NULL) {
+ if ((flags & PMAP_ENTER_NOSLEEP) != 0)
+ return (KERN_RESOURCE_SHORTAGE);
+ PMAP_UNLOCK(pmap);
+ vm_wait(NULL);
+ PMAP_LOCK(pmap);
+ goto restart;
+ }
+ pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mp));
+ pde = &pde[pmap_pde_index(va)];
+ } else {
+ pdpe = pmap_pdpe(pmap, va);
+ MPASS(pdpe != NULL && (*pdpe & PG_V) != 0);
+ mp = PHYS_TO_VM_PAGE(*pdpe & PG_FRAME);
+ mp->ref_count++;
+ }
+ KASSERT(pde != NULL, ("XXX"));
+ KASSERT((*pde & PG_V) == 0 || ((*pde & PG_PS) != 0 &&
+ (*pde & PG_FRAME) == (newpte & PG_FRAME)),
+ ("XXX"));
+ *pde = newpte;
+ }
+ pmap_resident_count_inc(pmap, pagesizes[psind] / PAGE_SIZE);
+
+ return (KERN_SUCCESS);
+}
+
/*
* Insert the given physical page (p) at
* the specified virtual address (v) in the
@@ -6029,6 +6182,12 @@
lock = NULL;
PMAP_LOCK(pmap);
+ if ((flags & PMAP_ENTER_LARGEPAGE) != 0) {
+ KASSERT((m->oflags & VPO_UNMANAGED) != 0, ("XXX"));
+ rv = pmap_enter_largepage(pmap, va, newpte | PG_PS, flags,
+ psind);
+ goto out;
+ }
if (psind == 1) {
/* Assert the required virtual and physical alignment. */
KASSERT((va & PDRMASK) == 0, ("pmap_enter: va unaligned"));
Index: sys/dev/hugetlb/hugetlb.h
===================================================================
--- /dev/null
+++ sys/dev/hugetlb/hugetlb.h
@@ -0,0 +1,36 @@
+/**
+ *
+ */
+
+#ifndef _SYS_DEV_HUGETLB_H
+#define _SYS_DEV_HUGETLB_H
+
+#include <sys/types.h>
+#include <sys/ioccom.h>
+
+#define _PATH_DEVHUGETLB "/dev/hugetlb"
+
+struct hugetlb_sizes {
+ u_int sizes_nitems;
+ u_int *sizes;
+};
+
+struct hugetlb_mmap {
+ int domain;
+ u_int superpage_index;
+ u_int flags;
+ u_int prot;
+ size_t size;
+ void *addr;
+};
+
+#define HUGETLB_MMAP_FIXED 0x0001
+#define HUGETLB_MMAP_EXCL 0x0002
+#define HUGETLB_MMAP_CORE 0x0004
+#define HUGETLB_MMAP_NOWAIT 0x0008
+#define HUGETLB_MMAP_WAITHARD 0x0010
+
+#define HUGETLB_SIZES _IOWR('H', 1, struct hugetlb_sizes)
+#define HUGETLB_MMAP _IOWR('H', 2, struct hugetlb_mmap)
+
+#endif
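The header above is the whole proposed /dev/hugetlb ioctl surface. A hypothetical userland sketch follows, not part of the patch; the two-call HUGETLB_SIZES pattern, the include path, and the choice of the last (largest) reported size are assumptions.

#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <dev/hugetlb/hugetlb.h>	/* header added above; install path assumed */
#include <err.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int
main(void)
{
	struct hugetlb_sizes hs;
	struct hugetlb_mmap hm;
	u_int *sizes;
	int fd;

	fd = open(_PATH_DEVHUGETLB, O_RDWR);
	if (fd == -1)
		err(1, "open(%s)", _PATH_DEVHUGETLB);

	/* First call with sizes == NULL only reports the count. */
	hs.sizes_nitems = 0;
	hs.sizes = NULL;
	if (ioctl(fd, HUGETLB_SIZES, &hs) == -1)
		err(1, "HUGETLB_SIZES");
	if (hs.sizes_nitems == 0)
		errx(1, "no superpage sizes supported");
	sizes = calloc(hs.sizes_nitems, sizeof(u_int));
	if (sizes == NULL)
		err(1, "calloc");
	hs.sizes = sizes;
	if (ioctl(fd, HUGETLB_SIZES, &hs) == -1)
		err(1, "HUGETLB_SIZES");

	/* Map one superpage of the largest reported size, anywhere. */
	memset(&hm, 0, sizeof(hm));
	hm.domain = -1;				/* any NUMA domain */
	hm.superpage_index = hs.sizes_nitems - 1;
	hm.flags = 0;				/* default (sleeping) allocation */
	hm.prot = PROT_READ | PROT_WRITE;
	hm.size = sizes[hm.superpage_index];
	hm.addr = NULL;				/* kernel picks an aligned address */
	if (ioctl(fd, HUGETLB_MMAP, &hm) == -1)
		err(1, "HUGETLB_MMAP");
	printf("mapped %zu bytes at %p\n", hm.size, hm.addr);
	return (0);
}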
Index: sys/dev/hugetlb/hugetlb.c
===================================================================
--- /dev/null
+++ sys/dev/hugetlb/hugetlb.c
@@ -0,0 +1,374 @@
+/**
+ *
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/abi_compat.h>
+#include <sys/conf.h>
+#include <sys/fcntl.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mman.h>
+#include <sys/module.h>
+#include <sys/proc.h>
+#include <sys/resourcevar.h>
+#include <sys/rwlock.h>
+#include <sys/syscallsubr.h>
+#include <sys/systm.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/pmap.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_map.h>
+#include <vm/vm_object.h>
+#include <vm/vm_page.h>
+#include <vm/vm_pageout.h>
+#include <vm/vm_pager.h>
+
+#include <dev/hugetlb/hugetlb.h>
+
+static int
+hugetlb_sizes_handler(struct thread *td, struct hugetlb_sizes *hs)
+{
+ u_int i;
+ int error;
+
+ error = 0;
+ if (hs->sizes != NULL) {
+ for (i = 0; i < pmap_superpagesize_nitems &&
+ i < hs->sizes_nitems; i++) {
+ error = copyout(&pmap_superpagesize[i], hs->sizes + i,
+ sizeof(u_int));
+ if (error != 0)
+ break;
+ }
+ }
+ hs->sizes_nitems = pmap_superpagesize_nitems;
+ return (error);
+}
+
+static int
+hugetlb_mmap_alloc_obj(struct thread *td, vm_object_t *objp, u_long sp_size,
+ u_int sp_index, vm_offset_t size, int domain, u_int flags)
+{
+ vm_object_t obj;
+ vm_page_t m;
+ vm_offset_t a;
+ u_int aflags, i;
+ int error;
+
+ obj = vm_pager_allocate(OBJT_PHYS, NULL, OFF_TO_IDX(size), VM_PROT_ALL,
+ 0, td->td_ucred);
+
+ aflags = VM_ALLOC_NORMAL | VM_ALLOC_ZERO;
+ if ((flags & (HUGETLB_MMAP_NOWAIT | HUGETLB_MMAP_WAITHARD)) == 0)
+ aflags |= VM_ALLOC_WAITFAIL;
+ a = 0;
+restart:
+ VM_OBJECT_WLOCK(obj);
+ for (; a < size; a += sp_size) {
+ if (domain == -1) {
+ m = vm_page_alloc_contig(obj, a / PAGE_SIZE,
+ aflags,
+ sp_size / PAGE_SIZE, 0, ~0,
+ sp_size, 0, VM_MEMATTR_DEFAULT);
+ } else {
+ m = vm_page_alloc_contig_domain(obj, a / PAGE_SIZE,
+ domain, aflags,
+ sp_size / PAGE_SIZE, 0, ~0,
+ sp_size, 0, VM_MEMATTR_DEFAULT);
+ }
+ if (m == NULL) {
+ VM_OBJECT_WUNLOCK(obj);
+ if ((flags & HUGETLB_MMAP_NOWAIT) != 0) {
+ error = ENOMEM;
+ goto fail;
+ }
+ if ((flags & HUGETLB_MMAP_WAITHARD) != 0) {
+ if (domain == -1) {
+ if (!vm_page_reclaim_contig(aflags,
+ sp_size / PAGE_SIZE,
+ 0, ~0, sp_size, 0))
+ /* XXXKIB */
+ vm_wait_domain(domain);
+ } else {
+ if (!vm_page_reclaim_contig_domain(
+ domain, aflags,
+ sp_size / PAGE_SIZE,
+ 0, ~0, sp_size, 0))
+ vm_wait_domain(domain);
+ }
+ } else {
+ vm_wait(obj);
+ }
+ error = thread_check_susp(td, false);
+ if (error != 0)
+ goto fail;
+ goto restart;
+ }
+ for (i = 0; i < sp_size / PAGE_SIZE; i++) {
+ if ((m[i].flags & PG_ZERO) == 0)
+ pmap_zero_page(&m[i]);
+ vm_page_valid(&m[i]);
+ vm_page_xunbusy(&m[i]);
+ }
+ }
+ VM_OBJECT_WUNLOCK(obj);
+
+ *objp = obj;
+ return (0);
+
+fail:
+ vm_object_deallocate(obj);
+ return (error);
+}
+
+static int
+hugetlb_mmap_handler(struct thread *td, struct hugetlb_mmap *hm)
+{
+ vm_map_t map;
+ pmap_t pmap;
+ vm_object_t obj;
+ vm_map_entry_t next_entry, prev_entry;
+ vm_page_t m;
+ vm_offset_t addr, sp_mask;
+ vm_pindex_t pi;
+ int error, max_prot, prot, rv, try;
+
+ map = &td->td_proc->p_vmspace->vm_map;
+ pmap = vmspace_pmap(td->td_proc->p_vmspace);
+
+ if ((hm->flags & ~(HUGETLB_MMAP_FIXED | HUGETLB_MMAP_CORE |
+ HUGETLB_MMAP_NOWAIT | HUGETLB_MMAP_WAITHARD)) != 0 ||
+ (hm->flags & (HUGETLB_MMAP_NOWAIT | HUGETLB_MMAP_WAITHARD)) ==
+ (HUGETLB_MMAP_NOWAIT | HUGETLB_MMAP_WAITHARD) ||
+ hm->superpage_index >= pmap_superpagesize_nitems ||
+ (hm->domain != -1 && (hm->domain < 0 || hm->domain >= vm_ndomains)))
+ return (EINVAL);
+ sp_mask = pmap_superpagesize[hm->superpage_index] - 1;
+ addr = (vm_offset_t)hm->addr;
+ if (hm->size == 0 || (hm->size & sp_mask) != 0 ||
+ (addr != 0 && (addr & sp_mask) != 0) ||
+ addr + hm->size < addr || addr + hm->size > VM_MAXUSER_ADDRESS)
+ return (EINVAL);
+ if ((hm->prot & ~(_PROT_ALL | PROT_MAX(_PROT_ALL))) != 0)
+ return (EINVAL);
+ max_prot = PROT_MAX_EXTRACT(hm->prot);
+ prot = PROT_EXTRACT(hm->prot);
+ if (max_prot != 0 && (max_prot & prot) != prot)
+ return (ENOTSUP);
+ if (prot == 0)
+ return (EINVAL);
+ if (max_prot == 0)
+ max_prot = kern_mmap_maxprot(td->td_proc, prot);
+
+ /* XXXKIB limit checks ? */
+
+ error = hugetlb_mmap_alloc_obj(td, &obj,
+ pmap_superpagesize[hm->superpage_index],
+ hm->superpage_index, hm->size, hm->domain, hm->flags);
+ if (error != 0)
+ return (error);
+
+ vm_map_lock(map);
+ if ((hm->flags & HUGETLB_MMAP_FIXED) == 0) {
+ try = 1;
+ if (addr == 0) {
+ addr = roundup2((vm_offset_t)td->td_proc->p_vmspace->
+ vm_daddr + lim_max(td, RLIMIT_DATA),
+ pmap_superpagesize[hm->superpage_index]);
+ }
+again:
+ rv = vm_map_find_aligned(map, &addr, hm->size, vm_map_max(map),
+ pmap_superpagesize[hm->superpage_index]);
+ if (rv != KERN_SUCCESS) {
+ if (try == 1) {
+ try = 2;
+ addr = vm_map_min(map);
+ if ((addr & sp_mask) != 0)
+ addr = (addr + sp_mask) & ~sp_mask;
+ goto again;
+ }
+ goto fail;
+ }
+ } else if ((hm->flags & HUGETLB_MMAP_EXCL) == 0) {
+ vm_map_delete(map, addr, addr + hm->size);
+ } else {
+ rv = KERN_NO_SPACE;
+ if (vm_map_lookup_entry(map, addr, &prev_entry))
+ goto fail;
+ next_entry = vm_map_entry_succ(prev_entry);
+ if (next_entry->start < addr + hm->size)
+ goto fail;
+ }
+
+ /*
+ * Insert the mapping into pmap before creating the map entry.
+ * If buggy userspace accesses the allocated region before we
+ * have populated the page tables, the fault must not instantiate
+ * any pte.
+ */
+ for (pi = 0; pi < OFF_TO_IDX(hm->size);
+ pi += OFF_TO_IDX(pmap_superpagesize[hm->superpage_index])) {
+ VM_OBJECT_RLOCK(obj);
+ m = vm_page_lookup(obj, pi);
+ VM_OBJECT_RUNLOCK(obj);
+ MPASS(m != NULL);
+ rv = pmap_enter_hugetlb(pmap, hm->superpage_index, m,
+ addr + IDX_TO_OFF(pi), prot);
+ if (rv != KERN_SUCCESS)
+ goto fail1;
+ }
+
+ rv = vm_map_insert(map, obj, 0, addr, addr + hm->size, prot,
+ max_prot, MAP_INHERIT_SHARE |
+ ((hm->superpage_index + 1) << MAP_SPLIT_BOUNDARY_SHIFT) |
+ ((hm->flags & HUGETLB_MMAP_CORE) != 0 ? 0 : MAP_DISABLE_COREDUMP));
+ if (rv != KERN_SUCCESS)
+ goto fail1;
+ vm_map_unlock(map);
+ hm->addr = (void *)addr;
+ return (0);
+
+fail1:
+ pmap_remove(pmap, addr, addr + hm->size);
+fail:
+ vm_map_unlock(map);
+ vm_object_deallocate(obj);
+ return (vm_mmap_to_errno(rv));
+}
+
+#ifdef COMPAT_FREEBSD32
+struct hugetlb_mmap32 {
+ int domain;
+ u_int superpage_index;
+ u_int flags;
+ u_int prot;
+ uint32_t size;
+ uint32_t addr;
+};
+
+#define HUGETLB_MMAP32 _IOWR('H', 2, struct hugetlb_mmap32)
+#endif
+
+static int
+hugetlb_ioctl(struct cdev *dev __unused, u_long cmd, caddr_t data,
+ int fflag, struct thread *td)
+{
+#ifdef COMPAT_FREEBSD32
+ struct hugetlb_mmap32 *hm32;
+ struct hugetlb_mmap hm;
+#endif
+ int error;
+
+ switch (cmd) {
+ case HUGETLB_SIZES:
+ error = hugetlb_sizes_handler(td, (struct hugetlb_sizes *)data);
+ break;
+ case HUGETLB_MMAP:
+ if ((fflag & FWRITE) == 0)
+ return (EPERM);
+ error = hugetlb_mmap_handler(td, (struct hugetlb_mmap *)data);
+ break;
+#ifdef COMPAT_FREEBSD32
+ case HUGETLB_MMAP32:
+ if ((fflag & FWRITE) == 0)
+ return (EPERM);
+ hm32 = (struct hugetlb_mmap32 *)data;
+ CP(*hm32, hm, domain);
+ CP(*hm32, hm, superpage_index);
+ CP(*hm32, hm, flags);
+ CP(*hm32, hm, prot);
+ CP(*hm32, hm, size);
+ PTRIN_CP(*hm32, hm, addr);
+ error = hugetlb_mmap_handler(td, &hm);
+ if (error == 0)
+ PTROUT_CP(hm, *hm32, addr);
+ break;
+#endif
+ default:
+ error = ENOTTY;
+ break;
+ }
+ return (error);
+}
+
+static struct cdevsw hugetlb_cdevsw = {
+ .d_version = D_VERSION,
+ .d_ioctl = hugetlb_ioctl,
+};
+static struct cdev *hugetlb_cdev;
+
+static int
+hugetlb_init(void)
+{
+ struct make_dev_args mda;
+ u_int i;
+ int error;
+
+ if (pmap_superpagesize_nitems > 0) {
+ if (bootverbose) {
+ printf("hugetlb device: %u pagesizes:",
+ pmap_superpagesize_nitems);
+ for (i = 0; i < pmap_superpagesize_nitems; i++)
+ printf(" %#lx", pmap_superpagesize[i]);
+ printf("\n");
+ }
+ } else {
+ printf("hugetlb: superpages are not supported\n");
+ return (ENOTTY);
+ }
+
+ make_dev_args_init(&mda);
+ mda.mda_devsw = &hugetlb_cdevsw;
+ mda.mda_uid = UID_ROOT;
+ mda.mda_gid = GID_WHEEL;
+ mda.mda_mode = 0644;
+ error = make_dev_s(&mda, &hugetlb_cdev, "hugetlb");
+ if (error != 0) {
+ printf("could not create /dev/hugetlb, error %d\n", error);
+ return (error);
+ }
+
+ return (0);
+}
+
+static void
+hugetlb_fini(void)
+{
+ if (hugetlb_cdev != NULL)
+ destroy_dev(hugetlb_cdev);
+}
+
+static int
+hugetlb_modload(struct module *module, int cmd, void *arg __unused)
+{
+ int error;
+
+ switch (cmd) {
+ case MOD_LOAD:
+ error = hugetlb_init();
+ if (error != 0)
+ hugetlb_fini();
+ break;
+ case MOD_UNLOAD:
+ hugetlb_fini();
+ error = 0;
+ break;
+ default:
+ error = 0;
+ break;
+ }
+ return (error);
+}
+
+static moduledata_t hugetlb_mod = {
+ "hugetlb",
+ hugetlb_modload,
+};
+DECLARE_MODULE(hugetlb, hugetlb_mod, SI_SUB_VFS, SI_ORDER_MIDDLE);
+MODULE_VERSION(hugetlb, 1);
Index: sys/dev/ksyms/ksyms.c
===================================================================
--- sys/dev/ksyms/ksyms.c
+++ sys/dev/ksyms/ksyms.c
@@ -41,6 +41,7 @@
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/resourcevar.h>
+#include <sys/rwlock.h>
#include <sys/stat.h>
#include <sys/sx.h>
#include <sys/uio.h>
@@ -51,6 +52,8 @@
#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_object.h>
+#include <vm/vm_page.h>
+#include <vm/vm_pager.h>
#include "linker_if.h"
@@ -442,8 +445,8 @@
ksyms_size_calc(&ts);
elfsz = sizeof(struct ksyms_hdr) + ts.ts_symsz + ts.ts_strsz;
- object = vm_object_allocate(OBJT_PHYS,
- OFF_TO_IDX(round_page(elfsz)));
+ object = vm_pager_allocate(OBJT_PHYS, NULL, round_page(elfsz),
+ VM_PROT_ALL, 0, td->td_ucred);
sc->sc_obj = object;
sc->sc_objsz = elfsz;
Index: sys/dev/xen/gntdev/gntdev.c
===================================================================
--- sys/dev/xen/gntdev/gntdev.c
+++ sys/dev/xen/gntdev/gntdev.c
@@ -1068,7 +1068,8 @@
vm_object_t mem_obj;
struct gntdev_gref *gref;
- mem_obj = vm_object_allocate(OBJT_PHYS, size);
+ mem_obj = vm_pager_allocate(OBJT_PHYS, NULL, size, VM_PROT_ALL, 0,
+ curthread->td_ucred);
if (mem_obj == NULL)
return (ENOMEM);
Index: sys/kern/kern_umtx.c
===================================================================
--- sys/kern/kern_umtx.c
+++ sys/kern/kern_umtx.c
@@ -3933,7 +3933,7 @@
reg = uma_zalloc(umtx_shm_reg_zone, M_WAITOK | M_ZERO);
reg->ushm_refcnt = 1;
bcopy(key, &reg->ushm_key, sizeof(*key));
- reg->ushm_obj = shm_alloc(td->td_ucred, O_RDWR);
+ reg->ushm_obj = shm_alloc(td->td_ucred, O_RDWR, false);
reg->ushm_cred = crhold(cred);
error = shm_dotruncate(reg->ushm_obj, PAGE_SIZE);
if (error != 0) {
Index: sys/kern/link_elf.c
===================================================================
--- sys/kern/link_elf.c
+++ sys/kern/link_elf.c
@@ -1089,7 +1089,8 @@
ef = (elf_file_t) lf;
#ifdef SPARSE_MAPPING
- ef->object = vm_object_allocate(OBJT_PHYS, atop(mapsize));
+ ef->object = vm_pager_allocate(OBJT_PHYS, NULL, mapsize, VM_PROT_ALL,
+ 0, thread0.td_ucred);
if (ef->object == NULL) {
error = ENOMEM;
goto out;
Index: sys/kern/link_elf_obj.c
===================================================================
--- sys/kern/link_elf_obj.c
+++ sys/kern/link_elf_obj.c
@@ -34,16 +34,17 @@
#include <sys/param.h>
#include <sys/systm.h>
+#include <sys/fcntl.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
+#include <sys/linker.h>
#include <sys/mutex.h>
#include <sys/mount.h>
-#include <sys/proc.h>
#include <sys/namei.h>
-#include <sys/fcntl.h>
+#include <sys/proc.h>
+#include <sys/rwlock.h>
#include <sys/vnode.h>
-#include <sys/linker.h>
#include <machine/elf.h>
@@ -53,11 +54,13 @@
#include <vm/vm.h>
#include <vm/vm_param.h>
-#include <vm/vm_object.h>
-#include <vm/vm_kern.h>
-#include <vm/vm_extern.h>
#include <vm/pmap.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_kern.h>
#include <vm/vm_map.h>
+#include <vm/vm_object.h>
+#include <vm/vm_page.h>
+#include <vm/vm_pager.h>
#include <sys/link_elf.h>
@@ -905,7 +908,8 @@
* This stuff needs to be in a single chunk so that profiling etc
* can get the bounds and gdb can associate offsets with modules
*/
- ef->object = vm_object_allocate(OBJT_PHYS, atop(round_page(mapsize)));
+ ef->object = vm_pager_allocate(OBJT_PHYS, NULL, round_page(mapsize),
+ VM_PROT_ALL, 0, thread0.td_ucred);
if (ef->object == NULL) {
error = ENOMEM;
goto out;
Index: sys/kern/uipc_shm.c
===================================================================
--- sys/kern/uipc_shm.c
+++ sys/kern/uipc_shm.c
@@ -159,7 +159,7 @@
.fo_get_seals = shm_get_seals,
.fo_add_seals = shm_add_seals,
.fo_fallocate = shm_fallocate,
- .fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE
+ .fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE,
};
FEATURE(posix_shm, "POSIX shared memory");
@@ -242,6 +242,75 @@
return (error);
}
+static int
+shm_largepage_phys_populate(vm_object_t object, vm_pindex_t pidx,
+ int fault_type, vm_prot_t max_prot, vm_pindex_t *first, vm_pindex_t *last)
+{
+ struct shmfd *shmfd;
+ vm_page_t m;
+
+ shmfd = object->un_pager.phys.data;
+ if (shmfd->shm_lp_psind == 0 || pidx >= object->size)
+ return (VM_PAGER_FAIL);
+ *first = rounddown(pidx, pagesizes[shmfd->shm_lp_psind] / PAGE_SIZE);
+
+ /*
+ * We only busy the first page in the superpage run. It is
+ * useless to busy the whole run since we only remove full
+ * superpages, and it takes too long to busy e.g. the 512 * 512 ==
+ * 262144 pages constituting a 1G amd64 superpage.
+ */
+ m = vm_page_grab(object, *first, VM_ALLOC_NORMAL);
+
+ *last = roundup(pidx, pagesizes[shmfd->shm_lp_psind] / PAGE_SIZE);
+ return (VM_PAGER_OK);
+}
+
+static boolean_t
+shm_largepage_phys_haspage(vm_object_t object, vm_pindex_t pindex,
+ int *before, int *after)
+{
+ struct shmfd *shmfd;
+
+ shmfd = object->un_pager.phys.data;
+ if (shmfd->shm_lp_psind == 0 || pindex >= object->size)
+ return (FALSE);
+ if (before != NULL) {
+ *before = pindex - rounddown(pindex, pagesizes[
+ shmfd->shm_lp_psind] / PAGE_SIZE);
+ }
+ if (after != NULL) {
+ *after = roundup(pindex, pagesizes[shmfd->shm_lp_psind] /
+ PAGE_SIZE) - pindex;
+ }
+ return (TRUE);
+}
+
+static void
+shm_largepage_phys_ctor(vm_object_t object, vm_prot_t prot,
+ vm_ooffset_t foff, struct ucred *cred)
+{
+}
+
+static void
+shm_largepage_phys_dtor(vm_object_t object)
+{
+}
+
+static struct phys_pager_ops shm_largepage_phys_ops = {
+ .phys_pg_populate = shm_largepage_phys_populate,
+ .phys_pg_haspage = shm_largepage_phys_haspage,
+ .phys_pg_ctor = shm_largepage_phys_ctor,
+ .phys_pg_dtor = shm_largepage_phys_dtor,
+};
+
+static inline bool
+shm_largepage(struct shmfd *shmfd)
+{
+ return (shmfd->shm_object->type == OBJT_PHYS &&
+ shmfd->shm_object->un_pager.phys.ops == &shm_largepage_phys_ops);
+}
+
static int
shm_seek(struct file *fp, off_t offset, int whence, struct thread *td)
{
@@ -359,7 +428,11 @@
shm_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred,
struct thread *td)
{
+ struct shmfd *shmfd;
+ struct shm_largepage_conf *conf;
+ void *rl_cookie;
+ shmfd = fp->f_data;
switch (com) {
case FIONBIO:
case FIOASYNC:
@@ -368,6 +441,27 @@
* just like it would on an unlinked regular file
*/
return (0);
+ case FIOSHMLPGCNF:
+ if (!shm_largepage(shmfd))
+ return (ENOTTY);
+ conf = data;
+ if (shmfd->shm_lp_psind != 0 &&
+ conf->psind != shmfd->shm_lp_psind)
+ return (EINVAL);
+ if (conf->psind <= 0 || conf->psind >= MAXPAGESIZES ||
+ pagesizes[conf->psind] == 0)
+ return (EINVAL);
+ if (conf->alloc_policy != SHM_LARGEPAGE_ALLOC_DEFAULT &&
+ conf->alloc_policy != SHM_LARGEPAGE_ALLOC_NOWAIT &&
+ conf->alloc_policy != SHM_LARGEPAGE_ALLOC_HARD)
+ return (EINVAL);
+
+ rl_cookie = rangelock_wlock(&shmfd->shm_rl, 0, OFF_MAX,
+ &shmfd->shm_mtx);
+ shmfd->shm_lp_psind = conf->psind;
+ shmfd->shm_lp_alloc_policy = conf->alloc_policy;
+ rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx);
+ return (0);
default:
return (ENOTTY);
}
@@ -571,6 +665,98 @@
return (0);
}
+static int
+shm_dotruncate_largepage(struct shmfd *shmfd, off_t length, void *rl_cookie)
+{
+ vm_object_t object;
+ vm_page_t m;
+ vm_pindex_t a, a1, newobjsz, oldobjsz;
+ int aflags, error, i;
+
+ KASSERT(length >= 0, ("shm_dotruncate: length < 0"));
+ object = shmfd->shm_object;
+ VM_OBJECT_ASSERT_WLOCKED(object);
+ rangelock_cookie_assert(rl_cookie, RA_WLOCKED);
+
+ oldobjsz = object->size;
+ newobjsz = OFF_TO_IDX(length);
+ if (length == shmfd->shm_size)
+ return (0);
+ if (shmfd->shm_lp_psind == 0 && length != 0)
+ return (EINVAL);
+ if ((length & (pagesizes[shmfd->shm_lp_psind] - 1)) != 0)
+ return (EINVAL);
+
+ if (length < shmfd->shm_size) {
+ if ((shmfd->shm_seals & F_SEAL_SHRINK) != 0)
+ return (EPERM);
+ if (shmfd->shm_kmappings > 0)
+ return (EBUSY);
+ return (ENOTSUP); /* Pages are unmanaged. */
+#if 0
+ vm_object_page_remove(object, newobjsz, oldobjsz, 0);
+ object->size = newobjsz;
+ shmfd->shm_size = length;
+ return (0);
+#endif
+ }
+
+ aflags = VM_ALLOC_NORMAL | VM_ALLOC_ZERO;
+ if (shmfd->shm_lp_alloc_policy == SHM_LARGEPAGE_ALLOC_NOWAIT)
+ aflags |= VM_ALLOC_WAITFAIL;
+
+ a = oldobjsz;
+ for (; a < newobjsz;
+ a += OFF_TO_IDX(pagesizes[shmfd->shm_lp_psind])) {
+ m = vm_page_alloc_contig(object, a, aflags,
+ pagesizes[shmfd->shm_lp_psind] / PAGE_SIZE, 0, ~0,
+ pagesizes[shmfd->shm_lp_psind], 0,
+ VM_MEMATTR_DEFAULT);
+ if (m == NULL) {
+ VM_OBJECT_WUNLOCK(object);
+ if (shmfd->shm_lp_alloc_policy ==
+ SHM_LARGEPAGE_ALLOC_NOWAIT) {
+ error = ENOMEM;
+ goto fail;
+ }
+ if (shmfd->shm_lp_alloc_policy ==
+ SHM_LARGEPAGE_ALLOC_HARD) {
+ if (!vm_page_reclaim_contig(aflags,
+ pagesizes[shmfd->shm_lp_psind] /
+ PAGE_SIZE, 0, ~0,
+ pagesizes[shmfd->shm_lp_psind], 0))
+ vm_wait(object);
+ } else {
+ vm_wait(object);
+ }
+ error = thread_check_susp(curthread, false);
+ if (error != 0)
+ goto fail;
+ VM_OBJECT_WLOCK(object);
+ continue;
+ }
+ for (i = 0; i < pagesizes[shmfd->shm_lp_psind] / PAGE_SIZE;
+ i++) {
+ if ((m[i].flags & PG_ZERO) == 0)
+ pmap_zero_page(&m[i]);
+ vm_page_valid(&m[i]);
+ vm_page_xunbusy(&m[i]);
+ }
+ }
+ object->size = newobjsz;
+ shmfd->shm_size = length;
+ return (0);
+
+fail:
+ VM_OBJECT_WLOCK(object);
+ /* The caller unlocks the object; return with it locked. */
+ for (a1 = oldobjsz; a1 < a; a1++) {
+ m = vm_page_lookup(object, a1);
+ vm_page_free(m);
+ }
+ return (error);
+}
+
int
shm_dotruncate(struct shmfd *shmfd, off_t length)
{
@@ -580,7 +766,10 @@
rl_cookie = rangelock_wlock(&shmfd->shm_rl, 0, OFF_MAX,
&shmfd->shm_mtx);
VM_OBJECT_WLOCK(shmfd->shm_object);
- error = shm_dotruncate_locked(shmfd, length, rl_cookie);
+ if (shm_largepage(shmfd))
+ error = shm_dotruncate_largepage(shmfd, length, rl_cookie);
+ else
+ error = shm_dotruncate_locked(shmfd, length, rl_cookie);
VM_OBJECT_WUNLOCK(shmfd->shm_object);
rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx);
return (error);
@@ -591,7 +780,7 @@
* routines.
*/
struct shmfd *
-shm_alloc(struct ucred *ucred, mode_t mode)
+shm_alloc(struct ucred *ucred, mode_t mode, bool largepage)
{
struct shmfd *shmfd;
@@ -600,8 +789,15 @@
shmfd->shm_uid = ucred->cr_uid;
shmfd->shm_gid = ucred->cr_gid;
shmfd->shm_mode = mode;
- shmfd->shm_object = vm_pager_allocate(OBJT_SWAP, NULL,
- shmfd->shm_size, VM_PROT_DEFAULT, 0, ucred);
+ if (largepage) {
+ shmfd->shm_object = phys_pager_allocate(NULL,
+ &shm_largepage_phys_ops, shmfd, shmfd->shm_size,
+ VM_PROT_DEFAULT, 0, ucred);
+ shmfd->shm_lp_alloc_policy = SHM_LARGEPAGE_ALLOC_DEFAULT;
+ } else {
+ shmfd->shm_object = vm_pager_allocate(OBJT_SWAP, NULL,
+ shmfd->shm_size, VM_PROT_DEFAULT, 0, ucred);
+ }
KASSERT(shmfd->shm_object != NULL, ("shm_create: vm_pager_allocate"));
vfs_timestamp(&shmfd->shm_birthtime);
shmfd->shm_atime = shmfd->shm_mtime = shmfd->shm_ctime =
@@ -752,8 +948,9 @@
Fnv32_t fnv;
mode_t cmode;
int error, fd, initial_seals;
+ bool largepage;
- if ((shmflags & ~SHM_ALLOW_SEALING) != 0)
+ if ((shmflags & ~(SHM_ALLOW_SEALING | SHM_LARGEPAGE)) != 0)
return (EINVAL);
initial_seals = F_SEAL_SEAL;
@@ -777,6 +974,8 @@
if ((flags & ~(O_ACCMODE | O_CREAT | O_EXCL | O_TRUNC | O_CLOEXEC)) != 0)
return (EINVAL);
+ largepage = (shmflags & SHM_LARGEPAGE) != 0;
+
/*
* Currently only F_SEAL_SEAL may be set when creating or opening shmfd.
* If the decision is made later to allow additional seals, care must be
@@ -810,7 +1009,7 @@
fdrop(fp, td);
return (EINVAL);
}
- shmfd = shm_alloc(td->td_ucred, cmode);
+ shmfd = shm_alloc(td->td_ucred, cmode, largepage);
shmfd->shm_seals = initial_seals;
} else {
error = shm_copyin_path(td, userpath, &path);
@@ -832,7 +1031,8 @@
path);
if (error == 0) {
#endif
- shmfd = shm_alloc(td->td_ucred, cmode);
+ shmfd = shm_alloc(td->td_ucred, cmode,
+ largepage);
shmfd->shm_seals = initial_seals;
shm_insert(path, fnv, shmfd);
#ifdef MAC
@@ -1114,7 +1314,95 @@
return (error);
}
-int
+static int
+shm_mmap_large(struct shmfd *shmfd, vm_map_t map, vm_offset_t *addr,
+ vm_size_t size, vm_prot_t prot, vm_prot_t max_prot, int flags,
+ vm_ooffset_t foff, bool writecounted, struct thread *td)
+{
+ struct vmspace *vms;
+ vm_map_entry_t next_entry, prev_entry;
+ vm_offset_t mask, maxaddr;
+ int docow, error, rv, try;
+ bool curmap;
+
+ if (shmfd->shm_lp_psind == 0)
+ return (EINVAL);
+
+ vms = td->td_proc->p_vmspace;
+ curmap = map == &vms->vm_map;
+ if (curmap) {
+ error = kern_mmap_racct_check(td, map, size);
+ if (error != 0)
+ return (error);
+ }
+
+ docow = shmfd->shm_lp_psind << MAP_SPLIT_BOUNDARY_SHIFT;
+ if ((flags & MAP_NOCORE) != 0)
+ docow |= MAP_DISABLE_COREDUMP;
+ if ((flags & MAP_SHARED) != 0)
+ docow |= MAP_INHERIT_SHARE;
+ if (writecounted)
+ docow |= MAP_WRITECOUNT;
+
+ mask = pagesizes[shmfd->shm_lp_psind] - 1;
+ if ((foff & mask) != 0)
+ return (EINVAL);
+ maxaddr = vm_map_max(map);
+#ifdef MAP_32BIT
+ if ((flags & MAP_32BIT) != 0 && maxaddr > MAP_32BIT_MAX_ADDR)
+ maxaddr = MAP_32BIT_MAX_ADDR;
+#endif
+ if (size == 0 || (size & mask) != 0 ||
+ (*addr != 0 && ((*addr & mask) != 0 ||
+ *addr + size < *addr || *addr + size > maxaddr)))
+ return (EINVAL);
+
+ vm_map_lock(map);
+ if ((flags & MAP_FIXED) == 0) {
+ try = 1;
+ if (curmap && (*addr == 0 ||
+ (*addr >= round_page((vm_offset_t)vms->vm_taddr) &&
+ *addr < round_page((vm_offset_t)vms->vm_daddr +
+ lim_max(td, RLIMIT_DATA))))) {
+ *addr = roundup2((vm_offset_t)vms->vm_daddr +
+ lim_max(td, RLIMIT_DATA),
+ pagesizes[shmfd->shm_lp_psind]);
+ }
+again:
+ rv = vm_map_find_aligned(map, addr, size, maxaddr,
+ pagesizes[shmfd->shm_lp_psind]);
+ if (rv != KERN_SUCCESS) {
+ if (try == 1) {
+ try = 2;
+ *addr = vm_map_min(map);
+ if ((*addr & mask) != 0)
+ *addr = (*addr + mask) & ~mask;
+ goto again;
+ }
+ error = vm_mmap_to_errno(rv);
+ goto fail;
+ }
+ } else if ((flags & MAP_EXCL) == 0) {
+ vm_map_delete(map, *addr, *addr + size);
+ } else {
+ error = ENOSPC;
+ if (vm_map_lookup_entry(map, *addr, &prev_entry))
+ goto fail;
+ next_entry = vm_map_entry_succ(prev_entry);
+ if (next_entry->start < *addr + size)
+ goto fail;
+ }
+
+ rv = vm_map_insert(map, shmfd->shm_object, foff, *addr, *addr + size,
+ prot, max_prot, docow);
+ if (rv != KERN_SUCCESS)
+ error = vm_mmap_to_errno(rv);
+fail:
+ vm_map_unlock(map);
+ return (error);
+}
+
+static int
shm_mmap(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t objsize,
vm_prot_t prot, vm_prot_t cap_maxprot, int flags,
vm_ooffset_t foff, struct thread *td)
@@ -1186,8 +1474,15 @@
if (writecnt)
vm_pager_update_writecount(shmfd->shm_object, 0, objsize);
- error = vm_mmap_object(map, addr, objsize, prot, maxprot, flags,
- shmfd->shm_object, foff, writecnt, td);
+ if (shm_largepage(shmfd)) {
+ error = shm_mmap_large(shmfd, map, addr, objsize, prot,
+ maxprot, flags, foff, writecnt, td);
+ } else if ((flags & MAP_LARGEPAGE) != 0) {
+ error = EINVAL;
+ } else {
+ error = vm_mmap_object(map, addr, objsize, prot, maxprot, flags,
+ shmfd->shm_object, foff, writecnt, td);
+ }
if (error != 0) {
if (writecnt)
vm_pager_release_writecount(shmfd->shm_object, 0,
Index: sys/modules/hugetlb/Makefile
===================================================================
--- /dev/null
+++ sys/modules/hugetlb/Makefile
@@ -0,0 +1,8 @@
+# $FreeBSD$
+
+.PATH: ${SRCTOP}/sys/dev/hugetlb
+
+KMOD= hugetlb
+SRCS= hugetlb.c
+
+.include <bsd.kmod.mk>
Index: sys/sys/filio.h
===================================================================
--- sys/sys/filio.h
+++ sys/sys/filio.h
@@ -70,6 +70,7 @@
};
/* Get the file's bmap info for the logical block bn. */
#define FIOBMAP2 _IOWR('f', 99, struct fiobmap2_arg)
+#define FIOSHMLPGCNF _IOW('f', 100, struct shm_largepage_conf)
#ifdef _KERNEL
#ifdef COMPAT_FREEBSD32
Index: sys/sys/mman.h
===================================================================
--- sys/sys/mman.h
+++ sys/sys/mman.h
@@ -107,6 +107,8 @@
#ifdef __LP64__
#define MAP_32BIT 0x00080000 /* map in the low 2GB of address space */
#endif
+#define MAP_LARGEPAGE 0x00100000 /* ensure that mapping uses large TLB
+ entries */
/*
* Request specific alignment (n == log2 of the desired alignment).
@@ -190,6 +192,17 @@
* shmflags for shm_open2()
*/
#define SHM_ALLOW_SEALING 0x00000001
+#define SHM_LARGEPAGE 0x00000002
+
+#define SHM_LARGEPAGE_ALLOC_DEFAULT 0
+#define SHM_LARGEPAGE_ALLOC_NOWAIT 1
+#define SHM_LARGEPAGE_ALLOC_HARD 2
+
+struct shm_largepage_conf {
+ int psind;
+ int alloc_policy;
+ int pad[10];
+};
/*
* Flags for memfd_create().
@@ -279,6 +292,10 @@
struct mtx shm_mtx;
int shm_seals;
+
+ /* largepage config */
+ int shm_lp_psind;
+ int shm_lp_alloc_policy;
};
#endif
@@ -287,12 +304,15 @@
int shm_unmap(struct file *fp, void *mem, size_t size);
int shm_access(struct shmfd *shmfd, struct ucred *ucred, int flags);
-struct shmfd *shm_alloc(struct ucred *ucred, mode_t mode);
+struct shmfd *shm_alloc(struct ucred *ucred, mode_t mode, bool largepage);
struct shmfd *shm_hold(struct shmfd *shmfd);
void shm_drop(struct shmfd *shmfd);
int shm_dotruncate(struct shmfd *shmfd, off_t length);
extern struct fileops shm_ops;
+
+#define MAP_32BIT_MAX_ADDR ((vm_offset_t)1 << 31)
+
#else /* !_KERNEL */
__BEGIN_DECLS
Index: sys/sys/syscallsubr.h
===================================================================
--- sys/sys/syscallsubr.h
+++ sys/sys/syscallsubr.h
@@ -62,6 +62,7 @@
struct stat;
struct thr_param;
struct uio;
+struct vm_map;
typedef int (*mmap_check_fp_fn)(struct file *, int, int, int);
@@ -195,8 +196,10 @@
size_t len);
int kern_mmap(struct thread *td, uintptr_t addr, size_t len, int prot,
int flags, int fd, off_t pos);
-int kern_mmap_req(struct thread *td, const struct mmap_req *mrp);
+int kern_mmap_racct_check(struct thread *td, struct vm_map *map,
+ vm_size_t size);
int kern_mmap_maxprot(struct proc *p, int prot);
+int kern_mmap_req(struct thread *td, const struct mmap_req *mrp);
int kern_mprotect(struct thread *td, uintptr_t addr, size_t size, int prot);
int kern_msgctl(struct thread *, int, int, struct msqid_ds *);
int kern_msgrcv(struct thread *, int, void *, size_t, long, int, long *);
Index: sys/vm/phys_pager.c
===================================================================
--- sys/vm/phys_pager.c
+++ sys/vm/phys_pager.c
@@ -51,6 +51,20 @@
/* protect access to phys_pager_object_list */
static struct mtx phys_pager_mtx;
+static int default_phys_pager_getpages(vm_object_t object, vm_page_t *m,
+ int count, int *rbehind, int *rahead);
+static int default_phys_pager_populate(vm_object_t object, vm_pindex_t pidx,
+ int fault_type, vm_prot_t max_prot, vm_pindex_t *first, vm_pindex_t *last);
+static boolean_t default_phys_pager_haspage(vm_object_t object,
+ vm_pindex_t pindex, int *before, int *after);
+struct phys_pager_ops default_phys_pg_ops = {
+ .phys_pg_getpages = default_phys_pager_getpages,
+ .phys_pg_populate = default_phys_pager_populate,
+ .phys_pg_haspage = default_phys_pager_haspage,
+ .phys_pg_ctor = NULL,
+ .phys_pg_dtor = NULL,
+};
+
static void
phys_pager_init(void)
{
@@ -59,12 +73,13 @@
mtx_init(&phys_pager_mtx, "phys_pager list", NULL, MTX_DEF);
}
-static vm_object_t
-phys_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot,
- vm_ooffset_t foff, struct ucred *cred)
+vm_object_t
+phys_pager_allocate(void *handle, struct phys_pager_ops *ops, void *data,
+ vm_ooffset_t size, vm_prot_t prot, vm_ooffset_t foff, struct ucred *cred)
{
vm_object_t object, object1;
vm_pindex_t pindex;
+ bool init;
/*
* Offset should be page aligned.
@@ -73,6 +88,7 @@
return (NULL);
pindex = OFF_TO_IDX(foff + PAGE_MASK + size);
+ init = true;
if (handle != NULL) {
mtx_lock(&phys_pager_mtx);
@@ -97,11 +113,15 @@
*/
if (pindex > object->size)
object->size = pindex;
+ init = false;
} else {
object = object1;
object1 = NULL;
object->handle = handle;
- vm_object_set_flag(object, OBJ_POPULATE);
+ object->un_pager.phys.ops = ops;
+ object->un_pager.phys.data = data;
+ if (ops->phys_pg_populate != NULL)
+ vm_object_set_flag(object, OBJ_POPULATE);
TAILQ_INSERT_TAIL(&phys_pager_object_list,
object, pager_object_list);
}
@@ -113,12 +133,25 @@
vm_object_deallocate(object1);
} else {
object = vm_object_allocate(OBJT_PHYS, pindex);
- vm_object_set_flag(object, OBJ_POPULATE);
+ object->un_pager.phys.ops = ops;
+ object->un_pager.phys.data = data;
+ if (ops->phys_pg_populate != NULL)
+ vm_object_set_flag(object, OBJ_POPULATE);
}
+ if (init && ops->phys_pg_ctor != NULL)
+ ops->phys_pg_ctor(object, prot, foff, cred);
return (object);
}
+static vm_object_t
+phys_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot,
+ vm_ooffset_t foff, struct ucred *ucred)
+{
+ return (phys_pager_allocate(handle, &default_phys_pg_ops, NULL,
+ size, prot, foff, ucred));
+}
+
static void
phys_pager_dealloc(vm_object_t object)
{
@@ -130,16 +163,18 @@
mtx_unlock(&phys_pager_mtx);
VM_OBJECT_WLOCK(object);
}
- object->handle = NULL;
object->type = OBJT_DEAD;
+ if (object->un_pager.phys.ops->phys_pg_dtor != NULL)
+ object->un_pager.phys.ops->phys_pg_dtor(object);
+ object->handle = NULL;
}
/*
* Fill as many pages as vm_fault has allocated for us.
*/
static int
-phys_pager_getpages(vm_object_t object, vm_page_t *m, int count, int *rbehind,
- int *rahead)
+default_phys_pager_getpages(vm_object_t object, vm_page_t *m, int count,
+ int *rbehind, int *rahead)
{
int i;
@@ -161,6 +196,14 @@
return (VM_PAGER_OK);
}
+static int
+phys_pager_getpages(vm_object_t object, vm_page_t *m, int count, int *rbehind,
+ int *rahead)
+{
+ return (object->un_pager.phys.ops->phys_pg_getpages(object, m,
+ count, rbehind, rahead));
+}
+
/*
* Implement a pretty aggressive clustered getpages strategy. Hint that
* everything in an entire 4MB window should be prefaulted at once.
@@ -185,7 +228,7 @@
#define PHYSALLOC 16
static int
-phys_pager_populate(vm_object_t object, vm_pindex_t pidx,
+default_phys_pager_populate(vm_object_t object, vm_pindex_t pidx,
int fault_type __unused, vm_prot_t max_prot __unused, vm_pindex_t *first,
vm_pindex_t *last)
{
@@ -216,6 +259,14 @@
return (VM_PAGER_OK);
}
+static int
+phys_pager_populate(vm_object_t object, vm_pindex_t pidx, int fault_type,
+ vm_prot_t max_prot, vm_pindex_t *first, vm_pindex_t *last)
+{
+ return (object->un_pager.phys.ops->phys_pg_populate(object, pidx,
+ fault_type, max_prot, first, last));
+}
+
static void
phys_pager_putpages(vm_object_t object, vm_page_t *m, int count, boolean_t sync,
int *rtvals)
@@ -225,7 +276,7 @@
}
static boolean_t
-phys_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before,
+default_phys_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before,
int *after)
{
vm_pindex_t base, end;
@@ -239,6 +290,14 @@
return (TRUE);
}
+static boolean_t
+phys_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before,
+ int *after)
+{
+ return (object->un_pager.phys.ops->phys_pg_haspage(object, pindex,
+ before, after));
+}
+
struct pagerops physpagerops = {
.pgo_init = phys_pager_init,
.pgo_alloc = phys_pager_alloc,
Index: sys/vm/pmap.h
===================================================================
--- sys/vm/pmap.h
+++ sys/vm/pmap.h
@@ -106,6 +106,7 @@
*/
#define PMAP_ENTER_NOSLEEP 0x00000100
#define PMAP_ENTER_WIRED 0x00000200
+#define PMAP_ENTER_LARGEPAGE 0x00000400
#define PMAP_ENTER_RESERVED 0xFF000000
/*
@@ -171,5 +172,8 @@
#define pmap_resident_count(pm) ((pm)->pm_stats.resident_count)
#define pmap_wired_count(pm) ((pm)->pm_stats.wired_count)
+extern u_long pmap_superpagesize[];
+extern u_int pmap_superpagesize_nitems;
+
#endif /* _KERNEL */
#endif /* _PMAP_VM_ */
Index: sys/vm/vm_fault.c
===================================================================
--- sys/vm/vm_fault.c
+++ sys/vm/vm_fault.c
@@ -424,7 +424,7 @@
vm_offset_t vaddr;
vm_page_t m;
vm_pindex_t map_first, map_last, pager_first, pager_last, pidx;
- int i, npages, psind, rv;
+ int bdry_idx, i, npages, psind, rv;
MPASS(fs->object == fs->first_object);
VM_OBJECT_ASSERT_WLOCKED(fs->first_object);
@@ -469,15 +469,41 @@
MPASS(pager_last < fs->first_object->size);
vm_fault_restore_map_lock(fs);
+ bdry_idx = (fs->entry->eflags & MAP_ENTRY_SPLIT_BOUNDARY_MASK) >>
+ MAP_ENTRY_SPLIT_BOUNDARY_SHIFT;
if (fs->map->timestamp != fs->map_generation) {
- vm_fault_populate_cleanup(fs->first_object, pager_first,
- pager_last);
+ if (bdry_idx == 0) {
+ vm_fault_populate_cleanup(fs->first_object, pager_first,
+ pager_last);
+ } else {
+ m = vm_page_lookup(fs->first_object, pager_first);
+ vm_page_xunbusy(m);
+ }
return (KERN_RESTART);
}
/*
* The map is unchanged after our last unlock. Process the fault.
*
+ * First, the special case of largepage mappings, where the
+ * pager's populate method only busies the first page in the
+ * superpage run.
+ */
+ if (bdry_idx != 0) {
+ m = vm_page_lookup(fs->first_object, pager_first);
+ vm_fault_populate_check_page(m);
+ /* XXX assert alignment for entry */
+ VM_OBJECT_WUNLOCK(fs->first_object);
+ vaddr = fs->entry->start + IDX_TO_OFF(pager_first) -
+ fs->entry->offset;
+ rv = pmap_enter(fs->map->pmap, vaddr, m, fs->prot,
+ fs->fault_type | PMAP_ENTER_LARGEPAGE, bdry_idx);
+ MPASS(rv == KERN_SUCCESS);
+ VM_OBJECT_WLOCK(fs->first_object);
+ vm_page_xunbusy(m);
+ goto out;
+ }
+
+ /*
* The range [pager_first, pager_last] that is given to the
* pager is only a hint. The pager may populate any range
* within the object that includes the requested page index.
@@ -543,6 +569,7 @@
vm_page_xunbusy(&m[i]);
}
}
+out:
curthread->td_ru.ru_majflt++;
return (KERN_SUCCESS);
}
Index: sys/vm/vm_map.h
===================================================================
--- sys/vm/vm_map.h
+++ sys/vm/vm_map.h
@@ -149,6 +149,10 @@
#define MAP_ENTRY_STACK_GAP_UP 0x00040000
#define MAP_ENTRY_HEADER 0x00080000
+#define MAP_ENTRY_SPLIT_BOUNDARY_MASK 0x00300000
+
+#define MAP_ENTRY_SPLIT_BOUNDARY_SHIFT 20
+
#ifdef _KERNEL
static __inline u_char
vm_map_entry_behavior(vm_map_entry_t entry)
@@ -363,6 +367,9 @@
#define MAP_CREATE_STACK_GAP_UP 0x00010000
#define MAP_CREATE_STACK_GAP_DN 0x00020000
#define MAP_VN_EXEC 0x00040000
+#define MAP_SPLIT_BOUNDARY_MASK 0x00180000
+
+#define MAP_SPLIT_BOUNDARY_SHIFT 19
/*
* vm_fault option flags
@@ -451,6 +458,8 @@
vm_offset_t, int, vm_prot_t, vm_prot_t, int);
int vm_map_find_min(vm_map_t, vm_object_t, vm_ooffset_t, vm_offset_t *,
vm_size_t, vm_offset_t, vm_offset_t, int, vm_prot_t, vm_prot_t, int);
+int vm_map_find_aligned(vm_map_t map, vm_offset_t *addr, vm_size_t length,
+ vm_offset_t max_addr, vm_offset_t alignment);
int vm_map_fixed(vm_map_t, vm_object_t, vm_ooffset_t, vm_offset_t, vm_size_t,
vm_prot_t, vm_prot_t, int);
vm_offset_t vm_map_findspace(vm_map_t, vm_offset_t, vm_size_t);
Index: sys/vm/vm_map.c
===================================================================
--- sys/vm/vm_map.c
+++ sys/vm/vm_map.c
@@ -1603,13 +1603,17 @@
struct ucred *cred;
vm_eflags_t protoeflags;
vm_inherit_t inheritance;
+ u_long bdry;
+ u_int bidx;
VM_MAP_ASSERT_LOCKED(map);
KASSERT(object != kernel_object ||
(cow & MAP_COPY_ON_WRITE) == 0,
("vm_map_insert: kernel object and COW"));
- KASSERT(object == NULL || (cow & MAP_NOFAULT) == 0,
- ("vm_map_insert: paradoxical MAP_NOFAULT request"));
+ KASSERT(object == NULL || (cow & MAP_NOFAULT) == 0 ||
+ (cow & MAP_SPLIT_BOUNDARY_MASK) != 0,
+ ("vm_map_insert: paradoxical MAP_NOFAULT request, obj %p cow %#x",
+ object, cow));
KASSERT((prot & ~max) == 0,
("prot %#x is not subset of max_prot %#x", prot, max));
@@ -1665,6 +1669,17 @@
inheritance = VM_INHERIT_SHARE;
else
inheritance = VM_INHERIT_DEFAULT;
+ if ((cow & MAP_SPLIT_BOUNDARY_MASK) != 0) {
+ /* This magically ignores index 0, for usual page size. */
+ bidx = (cow & MAP_SPLIT_BOUNDARY_MASK) >>
+ MAP_SPLIT_BOUNDARY_SHIFT;
+ if (bidx >= MAXPAGESIZES)
+ return (KERN_INVALID_ARGUMENT);
+ bdry = pagesizes[bidx] - 1;
+ if ((start & bdry) != 0 || (end & bdry) != 0)
+ return (KERN_INVALID_ARGUMENT);
+ protoeflags |= bidx << MAP_ENTRY_SPLIT_BOUNDARY_SHIFT;
+ }
cred = NULL;
if ((cow & (MAP_ACC_NO_CHARGE | MAP_NOFAULT | MAP_CREATE_GUARD)) != 0)
@@ -1959,8 +1974,6 @@
&aslr_restarts, 0,
"Number of aslr failures");
-#define MAP_32BIT_MAX_ADDR ((vm_offset_t)1 << 31)
-
/*
* Searches for the specified amount of free space in the given map with the
* specified alignment. Performs an address-ordered, first-fit search from
@@ -2028,6 +2041,19 @@
}
}
+int
+vm_map_find_aligned(vm_map_t map, vm_offset_t *addr, vm_size_t length,
+ vm_offset_t max_addr, vm_offset_t alignment)
+{
+ /* XXXKIB ASLR eh ? */
+ *addr = vm_map_findspace(map, *addr, length);
+ if (*addr + length > vm_map_max(map) ||
+ (max_addr != 0 && *addr + length > max_addr))
+ return (KERN_NO_SPACE);
+ return (vm_map_alignspace(map, NULL, 0, addr, length, max_addr,
+ alignment));
+}
+
/*
* vm_map_find finds an unallocated region in the target address
* map with the given length. The search is defined to be
@@ -2370,19 +2396,6 @@
return (new_entry);
}
-/*
- * vm_map_clip_start: [ internal use only ]
- *
- * Asserts that the given entry begins at or after
- * the specified address; if necessary,
- * it splits the entry into two.
- */
-#define vm_map_clip_start(map, entry, startaddr) \
-{ \
- if (startaddr > entry->start) \
- _vm_map_clip_start(map, entry, startaddr); \
-}
-
/*
* This routine is called only when it is known that
* the entry must be split.
@@ -2406,6 +2419,30 @@
vm_map_entry_link(map, new_entry);
}
+/*
+ * vm_map_clip_start: [ internal use only ]
+ *
+ * Asserts that the given entry begins at or after
+ * the specified address; if necessary,
+ * it splits the entry into two.
+ */
+static inline int
+vm_map_clip_start(vm_map_t map, vm_map_entry_t entry, vm_offset_t startaddr)
+{
+ int bdry_idx;
+
+ if (startaddr <= entry->start)
+ return (KERN_SUCCESS);
+ bdry_idx = (entry->eflags & MAP_ENTRY_SPLIT_BOUNDARY_MASK) >>
+ MAP_ENTRY_SPLIT_BOUNDARY_SHIFT;
+ if (bdry_idx != 0) {
+ if ((startaddr & (pagesizes[bdry_idx] - 1)) != 0)
+ return (KERN_INVALID_ARGUMENT);
+ }
+ _vm_map_clip_start(map, entry, startaddr);
+ return (KERN_SUCCESS);
+}
+
/*
* vm_map_lookup_clip_start:
*
@@ -2413,32 +2450,23 @@
* the interior of the entry. Return entry after 'start', and in
* prev_entry set the entry before 'start'.
*/
-static inline vm_map_entry_t
+static inline int
vm_map_lookup_clip_start(vm_map_t map, vm_offset_t start,
- vm_map_entry_t *prev_entry)
+ vm_map_entry_t *res_entry, vm_map_entry_t *prev_entry)
{
vm_map_entry_t entry;
+ int rv;
if (vm_map_lookup_entry(map, start, prev_entry)) {
entry = *prev_entry;
- vm_map_clip_start(map, entry, start);
+ rv = vm_map_clip_start(map, entry, start);
+ if (rv != KERN_SUCCESS)
+ return (rv);
*prev_entry = vm_map_entry_pred(entry);
} else
entry = vm_map_entry_succ(*prev_entry);
- return (entry);
-}
-
-/*
- * vm_map_clip_end: [ internal use only ]
- *
- * Asserts that the given entry ends at or before
- * the specified address; if necessary,
- * it splits the entry into two.
- */
-#define vm_map_clip_end(map, entry, endaddr) \
-{ \
- if ((endaddr) < (entry->end)) \
- _vm_map_clip_end((map), (entry), (endaddr)); \
+ *res_entry = entry;
+ return (KERN_SUCCESS);
}
/*
@@ -2464,6 +2492,30 @@
vm_map_entry_link(map, new_entry);
}
+/*
+ * vm_map_clip_end: [ internal use only ]
+ *
+ * Asserts that the given entry ends at or before
+ * the specified address; if necessary,
+ * it splits the entry into two.
+ */
+static inline int
+vm_map_clip_end(vm_map_t map, vm_map_entry_t entry, vm_offset_t endaddr)
+{
+ int bdry_idx;
+
+ if (endaddr >= entry->end)
+ return (KERN_SUCCESS);
+ bdry_idx = (entry->eflags & MAP_ENTRY_SPLIT_BOUNDARY_MASK) >>
+ MAP_ENTRY_SPLIT_BOUNDARY_SHIFT;
+ if (bdry_idx != 0) {
+ if ((endaddr & (pagesizes[bdry_idx] - 1)) != 0)
+ return (KERN_INVALID_ARGUMENT);
+ }
+ _vm_map_clip_end(map, entry, endaddr);
+ return (KERN_SUCCESS);
+}
+
/*
* vm_map_submap: [ kernel use only ]
*
@@ -2503,12 +2555,17 @@
if (vm_map_lookup_entry(map, start, &entry) && entry->end >= end &&
(entry->eflags & MAP_ENTRY_COW) == 0 &&
entry->object.vm_object == NULL) {
- vm_map_clip_start(map, entry, start);
- vm_map_clip_end(map, entry, end);
+ result = vm_map_clip_start(map, entry, start);
+ if (result != KERN_SUCCESS)
+ goto unlock;
+ result = vm_map_clip_end(map, entry, end);
+ if (result != KERN_SUCCESS)
+ goto unlock;
entry->object.sub_map = submap;
entry->eflags |= MAP_ENTRY_IS_SUB_MAP;
result = KERN_SUCCESS;
}
+unlock:
vm_map_unlock(map);
if (result != KERN_SUCCESS) {
@@ -2695,11 +2752,18 @@
* of this loop early and let the next loop simplify the entries, since
* some may now be mergeable.
*/
- rv = KERN_SUCCESS;
- vm_map_clip_start(map, first_entry, start);
+ rv = vm_map_clip_start(map, first_entry, start);
+ if (rv != KERN_SUCCESS) {
+ vm_map_unlock(map);
+ return (rv);
+ }
for (entry = first_entry; entry->start < end;
entry = vm_map_entry_succ(entry)) {
- vm_map_clip_end(map, entry, end);
+ rv = vm_map_clip_end(map, entry, end);
+ if (rv != KERN_SUCCESS) {
+ vm_map_unlock(map);
+ return (rv);
+ }
if (set_max ||
((new_prot & ~entry->protection) & VM_PROT_WRITE) == 0 ||
@@ -2819,6 +2883,7 @@
int behav)
{
vm_map_entry_t entry, prev_entry;
+ int rv;
bool modify_map;
/*
@@ -2864,13 +2929,22 @@
* We clip the vm_map_entry so that behavioral changes are
* limited to the specified address range.
*/
- for (entry = vm_map_lookup_clip_start(map, start, &prev_entry);
- entry->start < end;
- prev_entry = entry, entry = vm_map_entry_succ(entry)) {
+ rv = vm_map_lookup_clip_start(map, start, &entry, &prev_entry);
+ if (rv != KERN_SUCCESS) {
+ vm_map_unlock(map);
+ return (vm_mmap_to_errno(rv));
+ }
+
+ for (; entry->start < end; prev_entry = entry,
+ entry = vm_map_entry_succ(entry)) {
if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0)
continue;
- vm_map_clip_end(map, entry, end);
+ rv = vm_map_clip_end(map, entry, end);
+ if (rv != KERN_SUCCESS) {
+ vm_map_unlock(map);
+ return (vm_mmap_to_errno(rv));
+ }
switch (behav) {
case MADV_NORMAL:
@@ -3005,6 +3079,7 @@
vm_inherit_t new_inheritance)
{
vm_map_entry_t entry, prev_entry;
+ int rv;
switch (new_inheritance) {
case VM_INHERIT_NONE:
@@ -3015,14 +3090,19 @@
default:
return (KERN_INVALID_ARGUMENT);
}
+ rv = KERN_SUCCESS;
if (start == end)
- return (KERN_SUCCESS);
+ return (rv);
vm_map_lock(map);
VM_MAP_RANGE_CHECK(map, start, end);
- for (entry = vm_map_lookup_clip_start(map, start, &prev_entry);
- entry->start < end;
- prev_entry = entry, entry = vm_map_entry_succ(entry)) {
- vm_map_clip_end(map, entry, end);
+ rv = vm_map_lookup_clip_start(map, start, &entry, &prev_entry);
+ if (rv != KERN_SUCCESS)
+ goto unlock;
+ for (; entry->start < end; prev_entry = entry,
+ entry = vm_map_entry_succ(entry)) {
+ rv = vm_map_clip_end(map, entry, end);
+ if (rv != KERN_SUCCESS)
+ goto unlock;
if ((entry->eflags & MAP_ENTRY_GUARD) == 0 ||
new_inheritance != VM_INHERIT_ZERO)
entry->inheritance = new_inheritance;
@@ -3030,7 +3110,8 @@
}
vm_map_try_merge_entries(map, prev_entry, entry);
+unlock:
vm_map_unlock(map);
- return (KERN_SUCCESS);
+ return (rv);
}
/*
@@ -3129,8 +3210,13 @@
next_entry : NULL;
continue;
}
- vm_map_clip_start(map, entry, start);
- vm_map_clip_end(map, entry, end);
+ rv = vm_map_clip_start(map, entry, start);
+ if (rv != KERN_SUCCESS)
+ break;
+ rv = vm_map_clip_end(map, entry, end);
+ if (rv != KERN_SUCCESS)
+ break;
+
/*
* Mark the entry in case the map lock is released. (See
* above.)
@@ -3337,8 +3423,13 @@
next_entry : NULL;
continue;
}
- vm_map_clip_start(map, entry, start);
- vm_map_clip_end(map, entry, end);
+ rv = vm_map_clip_start(map, entry, start);
+ if (rv != KERN_SUCCESS)
+ break;
+ rv = vm_map_clip_end(map, entry, end);
+ if (rv != KERN_SUCCESS)
+ break;
+
/*
* Mark the entry in case the map lock is released. (See
* above.)
@@ -3743,18 +3834,22 @@
int
vm_map_delete(vm_map_t map, vm_offset_t start, vm_offset_t end)
{
- vm_map_entry_t entry, next_entry;
+ vm_map_entry_t entry, next_entry, scratch_entry;
+ int rv;
VM_MAP_ASSERT_LOCKED(map);
+ rv = KERN_SUCCESS;
if (start == end)
- return (KERN_SUCCESS);
+ return (rv);
/*
* Find the start of the region, and clip it.
* Step through all entries in this region.
*/
- for (entry = vm_map_lookup_clip_start(map, start, &entry);
- entry->start < end; entry = next_entry) {
+ rv = vm_map_lookup_clip_start(map, start, &entry, &scratch_entry);
+ if (rv != KERN_SUCCESS)
+ return (rv);
+ for (; entry->start < end; entry = next_entry) {
/*
* Wait for wiring or unwiring of an entry to complete.
* Also wait for any system wirings to disappear on
@@ -3778,13 +3873,19 @@
* Specifically, the entry may have been
* clipped, merged, or deleted.
*/
- next_entry = vm_map_lookup_clip_start(map,
- saved_start, &next_entry);
+ rv = vm_map_lookup_clip_start(map, saved_start,
+ &next_entry, &scratch_entry);
+ if (rv != KERN_SUCCESS)
+ break;
} else
next_entry = entry;
continue;
}
- vm_map_clip_end(map, entry, end);
+
+ /* XXXKIB or delete to the upper superpage boundary ? */
+ rv = vm_map_clip_end(map, entry, end);
+ if (rv != KERN_SUCCESS)
+ break;
next_entry = vm_map_entry_succ(entry);
/*
@@ -3814,7 +3915,7 @@
*/
vm_map_entry_delete(map, entry);
}
- return (KERN_SUCCESS);
+ return (rv);
}
/*
Index: sys/vm/vm_mmap.c
===================================================================
--- sys/vm/vm_mmap.c
+++ sys/vm/vm_mmap.c
@@ -285,7 +285,7 @@
}
if ((flags & ~(MAP_SHARED | MAP_PRIVATE | MAP_FIXED | MAP_HASSEMAPHORE |
MAP_STACK | MAP_NOSYNC | MAP_ANON | MAP_EXCL | MAP_NOCORE |
- MAP_PREFAULT_READ | MAP_GUARD |
+ MAP_PREFAULT_READ | MAP_GUARD | MAP_LARGEPAGE |
#ifdef MAP_32BIT
MAP_32BIT |
#endif
@@ -305,6 +305,10 @@
#endif
MAP_ALIGNMENT_MASK)) != 0))
return (EINVAL);
+ if ((flags & MAP_LARGEPAGE) != 0 && (flags & ~(MAP_LARGEPAGE |
+ MAP_SHARED | MAP_PRIVATE | MAP_FIXED | MAP_EXCL | MAP_NOCORE |
+ MAP_32BIT)) != 0)
+ return (EINVAL);
/*
* Align the file position to a page boundary,
@@ -368,10 +372,10 @@
* There should really be a pmap call to determine a reasonable
* location.
*/
- if (addr == 0 ||
+ if ((flags & MAP_LARGEPAGE) == 0 && (addr == 0 ||
(addr >= round_page((vm_offset_t)vms->vm_taddr) &&
addr < round_page((vm_offset_t)vms->vm_daddr +
- lim_max(td, RLIMIT_DATA))))
+ lim_max(td, RLIMIT_DATA)))))
addr = round_page((vm_offset_t)vms->vm_daddr +
lim_max(td, RLIMIT_DATA));
}
@@ -418,6 +422,10 @@
error = EINVAL;
goto done;
}
+ if ((flags & MAP_LARGEPAGE) != 0 && fp->f_ops != &shm_ops) {
+ error = EINVAL;
+ goto done;
+ }
if (check_fp_fn != NULL) {
error = check_fp_fn(fp, prot, max_prot & cap_maxprot,
flags);
@@ -1511,6 +1519,39 @@
return (error);
}
+int
+kern_mmap_racct_check(struct thread *td, vm_map_t map, vm_size_t size)
+{
+ int error;
+
+ RACCT_PROC_LOCK(td->td_proc);
+ if (map->size + size > lim_cur(td, RLIMIT_VMEM)) {
+ RACCT_PROC_UNLOCK(td->td_proc);
+ return (ENOMEM);
+ }
+ if (racct_set(td->td_proc, RACCT_VMEM, map->size + size)) {
+ RACCT_PROC_UNLOCK(td->td_proc);
+ return (ENOMEM);
+ }
+ if (!old_mlock && map->flags & MAP_WIREFUTURE) {
+ if (ptoa(pmap_wired_count(map->pmap)) + size >
+ lim_cur(td, RLIMIT_MEMLOCK)) {
+ racct_set_force(td->td_proc, RACCT_VMEM, map->size);
+ RACCT_PROC_UNLOCK(td->td_proc);
+ return (ENOMEM);
+ }
+ error = racct_set(td->td_proc, RACCT_MEMLOCK,
+ ptoa(pmap_wired_count(map->pmap)) + size);
+ if (error != 0) {
+ racct_set_force(td->td_proc, RACCT_VMEM, map->size);
+ RACCT_PROC_UNLOCK(td->td_proc);
+ return (error);
+ }
+ }
+ RACCT_PROC_UNLOCK(td->td_proc);
+ return (0);
+}
+
/*
* Internal version of mmap that maps a specific VM object into an
* map. Called by mmap for MAP_ANON, vm_mmap, shm_mmap, and vn_mmap.
@@ -1520,39 +1561,15 @@
vm_prot_t maxprot, int flags, vm_object_t object, vm_ooffset_t foff,
boolean_t writecounted, struct thread *td)
{
- boolean_t curmap, fitit;
vm_offset_t max_addr;
int docow, error, findspace, rv;
+ bool curmap, fitit;
curmap = map == &td->td_proc->p_vmspace->vm_map;
if (curmap) {
- RACCT_PROC_LOCK(td->td_proc);
- if (map->size + size > lim_cur(td, RLIMIT_VMEM)) {
- RACCT_PROC_UNLOCK(td->td_proc);
- return (ENOMEM);
- }
- if (racct_set(td->td_proc, RACCT_VMEM, map->size + size)) {
- RACCT_PROC_UNLOCK(td->td_proc);
- return (ENOMEM);
- }
- if (!old_mlock && map->flags & MAP_WIREFUTURE) {
- if (ptoa(pmap_wired_count(map->pmap)) + size >
- lim_cur(td, RLIMIT_MEMLOCK)) {
- racct_set_force(td->td_proc, RACCT_VMEM,
- map->size);
- RACCT_PROC_UNLOCK(td->td_proc);
- return (ENOMEM);
- }
- error = racct_set(td->td_proc, RACCT_MEMLOCK,
- ptoa(pmap_wired_count(map->pmap)) + size);
- if (error != 0) {
- racct_set_force(td->td_proc, RACCT_VMEM,
- map->size);
- RACCT_PROC_UNLOCK(td->td_proc);
- return (error);
- }
- }
- RACCT_PROC_UNLOCK(td->td_proc);
+ error = kern_mmap_racct_check(td, map, size);
+ if (error != 0)
+ return (error);
}
/*
Index: sys/vm/vm_object.h
===================================================================
--- sys/vm/vm_object.h
+++ sys/vm/vm_object.h
@@ -173,6 +173,14 @@
struct pctrie swp_blks;
vm_ooffset_t writemappings;
} swp;
+
+ /*
+ * Phys pager
+ */
+ struct {
+ struct phys_pager_ops *ops;
+ void *data;
+ } phys;
} un_pager;
struct ucred *cred;
vm_ooffset_t charge;
Index: sys/vm/vm_object.c
===================================================================
--- sys/vm/vm_object.c
+++ sys/vm/vm_object.c
@@ -289,6 +289,7 @@
kernel_object->flags |= OBJ_COLORED;
kernel_object->pg_color = (u_short)atop(VM_MIN_KERNEL_ADDRESS);
#endif
+ kernel_object->un_pager.phys.ops = &default_phys_pg_ops;
/*
* The lock portion of struct vm_object must be type stable due
Index: sys/vm/vm_pager.h
===================================================================
--- sys/vm/vm_pager.h
+++ sys/vm/vm_pager.h
@@ -229,5 +229,22 @@
vm_object_t cdev_pager_lookup(void *handle);
void cdev_pager_free_page(vm_object_t object, vm_page_t m);
+struct phys_pager_ops {
+ int (*phys_pg_getpages)(vm_object_t vm_obj, vm_page_t *m, int count,
+ int *rbehind, int *rahead);
+ int (*phys_pg_populate)(vm_object_t vm_obj, vm_pindex_t pidx,
+ int fault_type, vm_prot_t max_prot, vm_pindex_t *first,
+ vm_pindex_t *last);
+ boolean_t (*phys_pg_haspage)(vm_object_t obj, vm_pindex_t pindex,
+ int *before, int *after);
+ void (*phys_pg_ctor)(vm_object_t vm_obj, vm_prot_t prot,
+ vm_ooffset_t foff, struct ucred *cred);
+ void (*phys_pg_dtor)(vm_object_t vm_obj);
+};
+extern struct phys_pager_ops default_phys_pg_ops;
+vm_object_t phys_pager_allocate(void *handle, struct phys_pager_ops *ops,
+ void *data, vm_ooffset_t size, vm_prot_t prot, vm_ooffset_t foff,
+ struct ucred *cred);
+
#endif /* _KERNEL */
#endif /* _VM_PAGER_ */
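To round off the phys_pager_ops hook declared above, here is a hedged kernel-side sketch of how a consumer might plug in its own ops, modeled on shm_largepage_phys_ops earlier in this diff. All foo_* names are illustrative, and the populate method assumes the pages were pre-allocated and marked valid at object setup time, as the largepage shm code does in ftruncate().

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/ucred.h>

#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>

static int
foo_phys_populate(vm_object_t obj, vm_pindex_t pidx, int fault_type,
    vm_prot_t max_prot, vm_pindex_t *first, vm_pindex_t *last)
{
	vm_page_t m;

	if (pidx >= obj->size)
		return (VM_PAGER_FAIL);
	/* Only look up (and busy) a page that already exists and is valid. */
	m = vm_page_grab(obj, pidx, VM_ALLOC_NORMAL | VM_ALLOC_NOCREAT);
	if (m == NULL)
		return (VM_PAGER_FAIL);
	*first = *last = pidx;
	return (VM_PAGER_OK);
}

static boolean_t
foo_phys_haspage(vm_object_t obj, vm_pindex_t pidx, int *before, int *after)
{
	if (before != NULL)
		*before = 0;
	if (after != NULL)
		*after = 0;
	return (pidx < obj->size ? TRUE : FALSE);
}

static struct phys_pager_ops foo_phys_ops = {
	.phys_pg_populate = foo_phys_populate,
	.phys_pg_haspage = foo_phys_haspage,
	/* .phys_pg_ctor and .phys_pg_dtor may be left NULL, as in the diff. */
};

/* The per-consumer cookie rides in obj->un_pager.phys.data. */
static vm_object_t
foo_object_create(void *softc, vm_ooffset_t size, struct ucred *cred)
{
	return (phys_pager_allocate(NULL, &foo_phys_ops, softc, size,
	    VM_PROT_DEFAULT, 0, cred));
}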