Page Menu
Home
FreeBSD
Search
Configure Global Search
Log In
Files
F153059591
D24652.id71243.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Flag For Later
Award Token
Size
31 KB
Referenced Files
None
Subscribers
None
D24652.id71243.diff
View Options
Index: TODO
===================================================================
--- /dev/null
+++ TODO
@@ -0,0 +1,6 @@
+- sysctl for total allocated superpages memory
+- per-user limit on the total superpages allocations
+- posix shm API
+- handle 1G PG_PS in other places of pmap
+- make pmap_superpagesizes[] per-pmap ?
+- more test programs
\ No newline at end of file
Index: include/Makefile
===================================================================
--- include/Makefile
+++ include/Makefile
@@ -43,7 +43,7 @@
LSUBDIRS= cam/ata cam/mmc cam/nvme cam/scsi \
dev/acpica dev/agp dev/an dev/ciss dev/filemon dev/firewire \
- dev/hwpmc dev/hyperv \
+ dev/hugetlb dev/hwpmc dev/hyperv \
dev/ic dev/iicbus dev/io dev/mfi dev/mmc dev/nvme \
dev/ofw dev/pbio dev/pci ${_dev_powermac_nvram} dev/ppbus dev/pwm \
dev/smbus dev/speaker dev/tcp_log dev/veriexec dev/vkbd dev/wi \
Index: sys/amd64/amd64/pmap.c
===================================================================
--- sys/amd64/amd64/pmap.c
+++ sys/amd64/amd64/pmap.c
@@ -322,6 +322,9 @@
#define PV_STAT(x) do { } while (0)
#endif
+u_long pmap_superpagesize[] = { NBPDR, NBPDP };
+u_int pmap_superpagesize_nitems;
+
#undef pa_index
#define pa_index(pa) ({ \
KASSERT((pa) <= vm_phys_segs[vm_phys_nsegs - 1].end, \
@@ -2190,6 +2193,12 @@
VM_PAGE_TO_PHYS(m);
}
}
+
+ if (pg_ps_enabled) {
+ pmap_superpagesize_nitems++;
+ if ((amd_feature & AMDID_PAGE1GB) != 0)
+ pmap_superpagesize_nitems++;
+ }
}
SYSCTL_UINT(_vm_pmap, OID_AUTO, large_map_pml4_entries,
@@ -3780,6 +3789,19 @@
* one or two pages may be held during the wait, only to be released
* afterwards. This conservative approach is easily argued to avoid
* race conditions.
+ *
+ * The page index (ptepindex) of the page table entry for address va is
+ * - for page table (last level), ptepindex = pmap_pde_pindex(va) =
+ * = va >> PDRSHIFT, in other words, it is just the index of the PDE.
+ * - for page directory page, ptepindex = NUPDE (number of userland PD
+ * entries) + (pmap_pde_index(va) >> NPDEPGSHIFT)
+ * i.e. index of PDPE is put after the last index of PDE,
+ * - for page directory pointer page, ptepindex = NUPDE + NUPDPE +
+ * (pmap_pde_index(va) >> (NPDEPGSHIFT + NPML4EPGSHIFT)),
+ * i.e. index of pml4e is put after the last index of PDPE.
+ In other words, it is the sequential number of the corresponding paging entry
+ * in the order where all entries of the same rank are put together, then
+ * ranks are put from deepest to root.
*/
static vm_page_t
_pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp)
@@ -5395,6 +5417,7 @@
pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
struct rwlock *lock;
+ vm_page_t mt;
vm_offset_t va_next;
pml4_entry_t *pml4e;
pdp_entry_t *pdpe;
@@ -5447,13 +5470,28 @@
}
pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
+ va_next = (sva + NBPDP) & ~PDPMASK;
if ((*pdpe & PG_V) == 0) {
- va_next = (sva + NBPDP) & ~PDPMASK;
if (va_next < sva)
va_next = eva;
continue;
}
+ KASSERT((*pdpe & PG_PS) == 0 || va_next <= eva,
+ ("pmap_remove of non-transient 1G page "
+ "pdpe %#lx sva %#lx eva %#lx va_next %#lx",
+ *pdpe, sva, eva, va_next));
+ if ((*pdpe & PG_PS) != 0) {
+ MPASS(pmap != kernel_pmap); /* XXXKIB */
+ MPASS((*pdpe & (PG_MANAGED | PG_G)) == 0);
+ anyvalid = 1;
+ *pdpe = 0;
+ pmap_resident_count_dec(pmap, NBPDP / PAGE_SIZE);
+ mt = PHYS_TO_VM_PAGE(*pmap_pml4e(pmap, sva) & PG_FRAME);
+ pmap_unwire_ptp(pmap, sva, mt, &free);
+ continue;
+ }
+
/*
* Calculate index for next page table.
*/
@@ -5669,11 +5707,13 @@
void
pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
{
+ vm_page_t m;
vm_offset_t va_next;
pml4_entry_t *pml4e;
pdp_entry_t *pdpe;
pd_entry_t ptpaddr, *pde;
pt_entry_t *pte, PG_G, PG_M, PG_RW, PG_V;
+ pt_entry_t obits, pbits;
boolean_t anychanged;
KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot));
@@ -5724,13 +5764,44 @@
}
pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
+ va_next = (sva + NBPDP) & ~PDPMASK;
if ((*pdpe & PG_V) == 0) {
- va_next = (sva + NBPDP) & ~PDPMASK;
if (va_next < sva)
va_next = eva;
continue;
}
+ KASSERT((*pdpe & PG_PS) == 0 || va_next <= eva,
+ ("pmap_protect of non-transient 1G page "
+ "pdpe %#lx sva %#lx eva %#lx va_next %#lx",
+ *pdpe, sva, eva, va_next));
+ if ((*pdpe & PG_PS) != 0) {
+retry_pdpe:
+ /*
+ * Must not change protection lazily, we do
+ * not handle page faults on 1G superpages.
+ */
+ obits = pbits = *pdpe;
+ MPASS((pbits & (PG_MANAGED | PG_G)) == 0);
+ MPASS(pmap != kernel_pmap); /* XXXKIB */
+ if ((prot & VM_PROT_WRITE) == 0)
+ pbits &= ~(PG_RW | PG_M);
+ else
+ pbits |= PG_RW | PG_M;
+ if ((prot & VM_PROT_EXECUTE) == 0)
+ pbits |= pg_nx;
+ else
+ pbits &= ~pg_nx;
+
+ if (pbits != obits) {
+ if (!atomic_cmpset_long(pdpe, obits, pbits))
+ /* PG_PS cannot be cleared under us. */
+ goto retry_pdpe;
+ anychanged = TRUE;
+ }
+ continue;
+ }
+
va_next = (sva + NBPDR) & ~PDRMASK;
if (va_next < sva)
va_next = eva;
@@ -5773,9 +5844,6 @@
for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
sva += PAGE_SIZE) {
- pt_entry_t obits, pbits;
- vm_page_t m;
-
retry:
obits = pbits = *pte;
if ((pbits & PG_V) == 0)
@@ -10613,6 +10681,102 @@
NULL, 0, sysctl_kmaps, "A",
"Dump kernel address layout");
+int
+pmap_enter_hugetlb(pmap_t pmap, u_int sp_index, vm_page_t m, vm_offset_t va,
+ vm_prot_t prot)
+{
+ vm_page_t mp;
+ pt_entry_t *pml4e, *pdpe, *pde;
+ pt_entry_t newpte, PG_G, PG_A, PG_M, PG_RW, PG_V;
+ vm_pindex_t ptepindex;
+
+ KASSERT(sp_index < pmap_superpagesize_nitems, ("XXX"));
+ KASSERT((m->oflags & VPO_UNMANAGED) != 0, ("XXX"));
+ KASSERT((VM_PAGE_TO_PHYS(m) & (pmap_superpagesize[sp_index] - 1)) == 0,
+ ("XXX"));
+ KASSERT((va & (pmap_superpagesize[sp_index] - 1)) == 0,
+ ("XXX"));
+ KASSERT(va < VM_MAXUSER_ADDRESS, ("XXX")); /* XXXKIB */
+ KASSERT(va + pmap_superpagesize[sp_index] < VM_MAXUSER_ADDRESS,
+ ("XXX")); /* XXXKIB */
+
+ PG_A = pmap_accessed_bit(pmap);
+ PG_G = pmap_global_bit(pmap);
+ PG_M = pmap_modified_bit(pmap);
+ PG_V = pmap_valid_bit(pmap);
+ PG_RW = pmap_rw_bit(pmap);
+
+restart:
+ newpte = VM_PAGE_TO_PHYS(m) | PG_A | PG_PS | PG_V;
+ if ((prot & VM_PROT_WRITE) != 0)
+ newpte |= PG_RW | PG_M;
+ if ((prot & VM_PROT_EXECUTE) == 0)
+ newpte |= pg_nx;
+ if (pmap == kernel_pmap)
+ newpte |= PG_G;
+ PMAP_LOCK(pmap);
+ if (va < VM_MAXUSER_ADDRESS) {
+ newpte |= PG_U;
+ if (pmap->pm_type == PT_X86)
+ newpte |= pmap_pkru_get(pmap, va);
+ }
+ newpte |= pmap_cache_bits(pmap, m->md.pat_mode, true);
+
+ ptepindex = pmap_pde_pindex(va);
+
+ if (sp_index == 1) { /* 1G */
+ pml4e = pmap_pml4e(pmap, va);
+ if ((*pml4e & PG_V) == 0) {
+ mp = _pmap_allocpte(pmap, NUPDE + NUPDPE +
+ ((ptepindex - NUPDE) >> NPML4EPGSHIFT), NULL);
+ if (mp == NULL) {
+ PMAP_UNLOCK(pmap);
+ vm_wait(NULL);
+
+ /*
+ * Restart at least to recalculate the pkru
+ * key. Our caller must keep the map locked
+ * so no paging structure can be validated
+ * under us.
+ */
+ goto restart;
+ }
+ } else {
+ mp = PHYS_TO_VM_PAGE(*pml4e & PG_FRAME);
+ mp->ref_count++;
+ }
+ pdpe = pmap_pdpe(pmap, va);
+ KASSERT(pdpe != NULL, ("XXX"));
+ KASSERT((*pdpe & PG_V) == 0, ("XXX"));
+ *pdpe = newpte;
+ } else /* (sp_index == 0) */ { /* 2M */
+ pde = pmap_pde(pmap, va);
+ if (pde == NULL) {
+ mp = _pmap_allocpte(pmap, NUPDE +
+ (ptepindex >> NPDPEPGSHIFT), NULL);
+ if (mp == NULL) {
+ PMAP_UNLOCK(pmap);
+ vm_wait(NULL);
+ goto restart;
+ }
+ pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mp));
+ pde = &pde[pmap_pde_index(va)];
+ } else {
+ pdpe = pmap_pdpe(pmap, va);
+ MPASS(pdpe != NULL && (*pdpe & PG_V) != 0);
+ mp = PHYS_TO_VM_PAGE(*pdpe & PG_FRAME);
+ mp->ref_count++;
+ }
+ KASSERT(pde != NULL, ("XXX"));
+ KASSERT((*pde & PG_V) == 0, ("XXX"));
+ *pde = newpte;
+ }
+ pmap_resident_count_inc(pmap, pmap_superpagesize[sp_index] / PAGE_SIZE);
+ PMAP_UNLOCK(pmap);
+
+ return (KERN_SUCCESS);
+}
+
#ifdef DDB
DB_SHOW_COMMAND(pte, pmap_print_pte)
{
Index: sys/dev/hugetlb/hugetlb.h
===================================================================
--- /dev/null
+++ sys/dev/hugetlb/hugetlb.h
@@ -0,0 +1,36 @@
+/**
+ *
+ */
+
+#ifndef _SYS_DEV_HUGETLB_H
+#define _SYS_DEV_HUGETLB_H
+
+#include <sys/types.h>
+#include <sys/ioccom.h>
+
+#define _PATH_DEVHUGETLB "/dev/hugetlb"
+
+struct hugetlb_sizes {
+ u_int sizes_nitems;
+ u_int *sizes;
+};
+
+struct hugetlb_mmap {
+ int domain;
+ u_int superpage_index;
+ u_int flags;
+ u_int prot;
+ size_t size;
+ void *addr;
+};
+
+#define HUGETLB_MMAP_FIXED 0x0001
+#define HUGETLB_MMAP_EXCL 0x0002
+#define HUGETLB_MMAP_CORE 0x0004
+#define HUGETLB_MMAP_NOWAIT 0x0008
+#define HUGETLB_MMAP_WAITHARD 0x0010
+
+#define HUGETLB_SIZES _IOWR('H', 1, struct hugetlb_sizes)
+#define HUGETLB_MMAP _IOWR('H', 2, struct hugetlb_mmap)
+
+#endif
Index: sys/dev/hugetlb/hugetlb.c
===================================================================
--- /dev/null
+++ sys/dev/hugetlb/hugetlb.c
@@ -0,0 +1,374 @@
+/**
+ *
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/abi_compat.h>
+#include <sys/conf.h>
+#include <sys/fcntl.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mman.h>
+#include <sys/module.h>
+#include <sys/proc.h>
+#include <sys/resourcevar.h>
+#include <sys/rwlock.h>
+#include <sys/syscallsubr.h>
+#include <sys/systm.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/pmap.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_map.h>
+#include <vm/vm_object.h>
+#include <vm/vm_page.h>
+#include <vm/vm_pageout.h>
+#include <vm/vm_pager.h>
+
+#include <dev/hugetlb/hugetlb.h>
+
+static int
+hugetlb_sizes_handler(struct thread *td, struct hugetlb_sizes *hs)
+{
+ u_int i;
+ int error;
+
+ error = 0;
+ if (hs->sizes != NULL) {
+ for (i = 0; i < pmap_superpagesize_nitems &&
+ i < hs->sizes_nitems; i++) {
+ error = copyout(&pmap_superpagesize[i], hs->sizes + i,
+ sizeof(u_int));
+ if (error != 0)
+ break;
+ }
+ }
+ hs->sizes_nitems = pmap_superpagesize_nitems;
+ return (error);
+}
+
+static int
+hugetlb_mmap_alloc_obj(struct thread *td, vm_object_t *objp, u_long sp_size,
+ u_int sp_index, vm_offset_t size, int domain, u_int flags)
+{
+ vm_object_t obj;
+ vm_page_t m;
+ vm_offset_t a;
+ u_int aflags, i;
+ int error;
+
+ obj = vm_pager_allocate(OBJT_PHYS, NULL, OFF_TO_IDX(size), VM_PROT_ALL,
+ 0, td->td_ucred);
+
+ aflags = VM_ALLOC_NORMAL | VM_ALLOC_ZERO;
+ if ((flags & (HUGETLB_MMAP_NOWAIT | HUGETLB_MMAP_WAITHARD)) == 0)
+ aflags |= VM_ALLOC_WAITFAIL;
+ a = 0;
+restart:
+ VM_OBJECT_WLOCK(obj);
+ for (; a < size; a += sp_size) {
+ if (domain == -1) {
+ m = vm_page_alloc_contig(obj, a / PAGE_SIZE,
+ aflags,
+ sp_size / PAGE_SIZE, 0, ~0,
+ sp_size, 0, VM_MEMATTR_DEFAULT);
+ } else {
+ m = vm_page_alloc_contig_domain(obj, a / PAGE_SIZE,
+ domain, aflags,
+ sp_size / PAGE_SIZE, 0, ~0,
+ sp_size, 0, VM_MEMATTR_DEFAULT);
+ }
+ if (m == NULL) {
+ VM_OBJECT_WUNLOCK(obj);
+ if ((flags & HUGETLB_MMAP_NOWAIT) != 0) {
+ error = ENOMEM;
+ goto fail;
+ }
+ if ((flags & HUGETLB_MMAP_WAITHARD) != 0) {
+ if (domain == -1) {
+ if (!vm_page_reclaim_contig(aflags,
+ sp_size / PAGE_SIZE,
+ 0, ~0, sp_size, 0))
+ /* XXXKIB: domain == -1 in this branch */
+ vm_wait(obj);
+ } else {
+ if (!vm_page_reclaim_contig_domain(
+ domain, aflags,
+ sp_size / PAGE_SIZE,
+ 0, ~0, sp_size, 0))
+ vm_wait_domain(domain);
+ }
+ } else {
+ vm_wait(obj);
+ }
+ error = thread_check_susp(td, false);
+ if (error != 0)
+ goto fail;
+ goto restart;
+ }
+ for (i = 0; i < sp_size / PAGE_SIZE; i++) {
+ if ((m[i].flags & PG_ZERO) == 0)
+ pmap_zero_page(&m[i]);
+ vm_page_valid(&m[i]);
+ vm_page_xunbusy(&m[i]);
+ }
+ }
+ VM_OBJECT_WUNLOCK(obj);
+
+ *objp = obj;
+ return (0);
+
+fail:
+ vm_object_deallocate(obj);
+ return (error);
+}
+
+static int
+hugetlb_mmap_handler(struct thread *td, struct hugetlb_mmap *hm)
+{
+ vm_map_t map;
+ pmap_t pmap;
+ vm_object_t obj;
+ vm_map_entry_t next_entry, prev_entry;
+ vm_page_t m;
+ vm_offset_t addr, sp_mask;
+ vm_pindex_t pi;
+ int error, max_prot, prot, rv, try;
+
+ map = &td->td_proc->p_vmspace->vm_map;
+ pmap = vmspace_pmap(td->td_proc->p_vmspace);
+
+ if ((hm->flags & ~(HUGETLB_MMAP_FIXED | HUGETLB_MMAP_CORE |
+ HUGETLB_MMAP_NOWAIT | HUGETLB_MMAP_WAITHARD)) != 0 ||
+ (hm->flags & (HUGETLB_MMAP_NOWAIT | HUGETLB_MMAP_WAITHARD)) ==
+ (HUGETLB_MMAP_NOWAIT | HUGETLB_MMAP_WAITHARD) ||
+ hm->superpage_index >= pmap_superpagesize_nitems ||
+ (hm->domain != -1 && (hm->domain < 0 || hm->domain >= vm_ndomains)))
+ return (EINVAL);
+ sp_mask = pmap_superpagesize[hm->superpage_index] - 1;
+ addr = (vm_offset_t)hm->addr;
+ if (hm->size == 0 || (hm->size & sp_mask) != 0 ||
+ (addr != 0 && (addr & sp_mask) != 0) ||
+ addr + hm->size < addr || addr + hm->size > VM_MAXUSER_ADDRESS)
+ return (EINVAL);
+ if ((hm->prot & ~(_PROT_ALL | PROT_MAX(_PROT_ALL))) != 0)
+ return (EINVAL);
+ max_prot = PROT_MAX_EXTRACT(hm->prot);
+ prot = PROT_EXTRACT(hm->prot);
+ if (max_prot != 0 && (max_prot & prot) != prot)
+ return (ENOTSUP);
+ if (prot == 0)
+ return (EINVAL);
+ if (max_prot == 0)
+ max_prot = kern_mmap_maxprot(td->td_proc, prot);
+
+ /* XXXKIB limit checks ? */
+
+ error = hugetlb_mmap_alloc_obj(td, &obj,
+ pmap_superpagesize[hm->superpage_index],
+ hm->superpage_index, hm->size, hm->domain, hm->flags);
+ if (error != 0)
+ return (error);
+
+ vm_map_lock(map);
+ if ((hm->flags & HUGETLB_MMAP_FIXED) == 0) {
+ try = 1;
+ if (addr == 0) {
+ addr = roundup2((vm_offset_t)td->td_proc->p_vmspace->
+ vm_daddr + lim_max(td, RLIMIT_DATA),
+ pmap_superpagesize[hm->superpage_index]);
+ }
+again:
+ rv = vm_map_find_aligned(map, &addr, hm->size, vm_map_max(map),
+ pmap_superpagesize[hm->superpage_index]);
+ if (rv != KERN_SUCCESS) {
+ if (try == 1) {
+ try = 2;
+ addr = vm_map_min(map);
+ if ((addr & sp_mask) != 0)
+ addr = (addr + sp_mask) & ~sp_mask;
+ goto again;
+ }
+ goto fail;
+ }
+ } else if ((hm->flags & HUGETLB_MMAP_EXCL) == 0) {
+ vm_map_delete(map, addr, addr + hm->size);
+ } else {
+ rv = KERN_NO_SPACE;
+ if (vm_map_lookup_entry(map, addr, &prev_entry))
+ goto fail;
+ next_entry = vm_map_entry_succ(prev_entry);
+ if (next_entry->start < addr + hm->size)
+ goto fail;
+ }
+
+ /*
+ * Insert the mapping into pmap before creating the map entry.
+ * If buggy userspace accesses the allocated region before we
+ * populated page tables, fault must not instantiate any pte.
+ */
+ for (pi = 0; pi < OFF_TO_IDX(hm->size);
+ pi += OFF_TO_IDX(pmap_superpagesize[hm->superpage_index])) {
+ VM_OBJECT_RLOCK(obj);
+ m = vm_page_lookup(obj, pi);
+ VM_OBJECT_RUNLOCK(obj);
+ MPASS(m != NULL);
+ rv = pmap_enter_hugetlb(pmap, hm->superpage_index, m,
+ addr + IDX_TO_OFF(pi), prot);
+ if (rv != KERN_SUCCESS)
+ goto fail1;
+ }
+
+ rv = vm_map_insert(map, obj, 0, addr, addr + hm->size, prot,
+ max_prot, MAP_INHERIT_SHARE |
+ ((hm->superpage_index + 1) << MAP_SPLIT_BOUNDARY_SHIFT) |
+ ((hm->flags & HUGETLB_MMAP_CORE) != 0 ? 0 : MAP_DISABLE_COREDUMP));
+ if (rv != KERN_SUCCESS)
+ goto fail1;
+ vm_map_unlock(map);
+ hm->addr = (void *)addr;
+ return (0);
+
+fail1:
+ pmap_remove(pmap, addr, addr + hm->size);
+fail:
+ vm_map_unlock(map);
+ vm_object_deallocate(obj);
+ return (vm_mmap_to_errno(rv));
+}
+
+#ifdef COMPAT_FREEBSD32
+struct hugetlb_mmap32 {
+ int domain;
+ u_int superpage_index;
+ u_int flags;
+ u_int prot;
+ uint32_t size;
+ uint32_t addr;
+};
+
+#define HUGETLB_MMAP32 _IOWR('H', 2, struct hugetlb_mmap32)
+#endif
+
+static int
+hugetlb_ioctl(struct cdev *dev __unused, u_long cmd, caddr_t data,
+ int fflag, struct thread *td)
+{
+#ifdef COMPAT_FREEBSD32
+ struct hugetlb_mmap32 *hm32;
+ struct hugetlb_mmap hm;
+#endif
+ int error;
+
+ switch (cmd) {
+ case HUGETLB_SIZES:
+ error = hugetlb_sizes_handler(td, (struct hugetlb_sizes *)data);
+ break;
+ case HUGETLB_MMAP:
+ if ((fflag & FWRITE) == 0)
+ return (EPERM);
+ error = hugetlb_mmap_handler(td, (struct hugetlb_mmap *)data);
+ break;
+#ifdef COMPAT_FREEBSD32
+ case HUGETLB_MMAP32:
+ if ((fflag & FWRITE) == 0)
+ return (EPERM);
+ hm32 = (struct hugetlb_mmap32 *)data;
+ CP(*hm32, hm, domain);
+ CP(*hm32, hm, superpage_index);
+ CP(*hm32, hm, flags);
+ CP(*hm32, hm, prot);
+ CP(*hm32, hm, size);
+ PTRIN_CP(*hm32, hm, addr);
+ error = hugetlb_mmap_handler(td, &hm);
+ if (error == 0)
+ PTROUT_CP(hm, *hm32, addr);
+ break;
+#endif
+ default:
+ error = ENOTTY;
+ break;
+ }
+ return (error);
+}
+
+static struct cdevsw hugetlb_cdevsw = {
+ .d_version = D_VERSION,
+ .d_ioctl = hugetlb_ioctl,
+};
+static struct cdev *hugetlb_cdev;
+
+static int
+hugetlb_init(void)
+{
+ struct make_dev_args mda;
+ u_int i;
+ int error;
+
+ if (pmap_superpagesize_nitems > 0) {
+ if (bootverbose) {
+ printf("hugetlb device: %u pagesizes:",
+ pmap_superpagesize_nitems);
+ for (i = 0; i < pmap_superpagesize_nitems; i++)
+ printf(" %#lx", pmap_superpagesize[i]);
+ printf("\n");
+ }
+ } else {
+ printf("hugetlb: superpages are not supported\n");
+ return (ENOTTY);
+ }
+
+ make_dev_args_init(&mda);
+ mda.mda_devsw = &hugetlb_cdevsw;
+ mda.mda_uid = UID_ROOT;
+ mda.mda_gid = GID_WHEEL;
+ mda.mda_mode = 0644;
+ error = make_dev_s(&mda, &hugetlb_cdev, "hugetlb");
+ if (error != 0) {
+ printf("could not create /dev/hugetlb, error %d\n", error);
+ return (error);
+ }
+
+ return (0);
+}
+
+static void
+hugetlb_fini(void)
+{
+ if (hugetlb_cdev != NULL)
+ destroy_dev(hugetlb_cdev);
+}
+
+static int
+hugetlb_modload(struct module *module, int cmd, void *arg __unused)
+{
+ int error;
+
+ switch (cmd) {
+ case MOD_LOAD:
+ error = hugetlb_init();
+ if (error != 0)
+ hugetlb_fini();
+ break;
+ case MOD_UNLOAD:
+ hugetlb_fini();
+ error = 0;
+ break;
+ default:
+ error = 0;
+ break;
+ }
+ return (error);
+}
+
+static moduledata_t hugetlb_mod = {
+ "hugetlb",
+ hugetlb_modload,
+};
+DECLARE_MODULE(hugetlb, hugetlb_mod, SI_SUB_VFS, SI_ORDER_MIDDLE);
+MODULE_VERSION(hugetlb, 1);
Index: sys/modules/Makefile
===================================================================
--- sys/modules/Makefile
+++ sys/modules/Makefile
@@ -137,6 +137,7 @@
hwpmc \
${_hwpmc_mips24k} \
${_hwpmc_mips74k} \
+ ${_hugetlb} \
${_hyperv} \
i2c \
${_iavf} \
@@ -697,6 +698,7 @@
_amdgpio= amdgpio
_ccp= ccp
_efirt= efirt
+_hugetlb= hugetlb
_iavf= iavf
_ioat= ioat
_ixl= ixl
Index: sys/modules/hugetlb/Makefile
===================================================================
--- /dev/null
+++ sys/modules/hugetlb/Makefile
@@ -0,0 +1,8 @@
+# $FreeBSD$
+
+.PATH: ${SRCTOP}/sys/dev/hugetlb
+
+KMOD= hugetlb
+SRCS= hugetlb.c
+
+.include <bsd.kmod.mk>
Index: sys/vm/pmap.h
===================================================================
--- sys/vm/pmap.h
+++ sys/vm/pmap.h
@@ -130,6 +130,8 @@
vm_page_t mb[], vm_offset_t b_offset, int xfersize);
int pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m,
vm_prot_t prot, u_int flags, int8_t psind);
+int pmap_enter_hugetlb(pmap_t pmap, u_int sp_index, vm_page_t m,
+ vm_offset_t va, vm_prot_t prot);
void pmap_enter_object(pmap_t pmap, vm_offset_t start,
vm_offset_t end, vm_page_t m_start, vm_prot_t prot);
void pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m,
@@ -171,5 +173,8 @@
#define pmap_resident_count(pm) ((pm)->pm_stats.resident_count)
#define pmap_wired_count(pm) ((pm)->pm_stats.wired_count)
+extern u_long pmap_superpagesize[];
+extern u_int pmap_superpagesize_nitems;
+
#endif /* _KERNEL */
#endif /* _PMAP_VM_ */
Index: sys/vm/vm_fault.c
===================================================================
--- sys/vm/vm_fault.c
+++ sys/vm/vm_fault.c
@@ -763,6 +763,12 @@
return (result);
}
+ if ((fs->entry->eflags & MAP_ENTRY_SPLIT_BOUNDARY_MASK) != 0) {
+ vm_map_unlock_read(fs->map);
+ unlock_vp(fs);
+ return (KERN_PROTECTION_FAILURE);
+ }
+
fs->map_generation = fs->map->timestamp;
if (fs->entry->eflags & MAP_ENTRY_NOFAULT) {
Index: sys/vm/vm_map.h
===================================================================
--- sys/vm/vm_map.h
+++ sys/vm/vm_map.h
@@ -149,6 +149,10 @@
#define MAP_ENTRY_STACK_GAP_UP 0x00040000
#define MAP_ENTRY_HEADER 0x00080000
+#define MAP_ENTRY_SPLIT_BOUNDARY_MASK 0x00300000
+
+#define MAP_ENTRY_SPLIT_BOUNDARY_SHIFT 20
+
#ifdef _KERNEL
static __inline u_char
vm_map_entry_behavior(vm_map_entry_t entry)
@@ -363,6 +367,9 @@
#define MAP_CREATE_STACK_GAP_UP 0x00010000
#define MAP_CREATE_STACK_GAP_DN 0x00020000
#define MAP_VN_EXEC 0x00040000
+#define MAP_SPLIT_BOUNDARY_MASK 0x00180000
+
+#define MAP_SPLIT_BOUNDARY_SHIFT 19
/*
* vm_fault option flags
@@ -451,6 +458,8 @@
vm_offset_t, int, vm_prot_t, vm_prot_t, int);
int vm_map_find_min(vm_map_t, vm_object_t, vm_ooffset_t, vm_offset_t *,
vm_size_t, vm_offset_t, vm_offset_t, int, vm_prot_t, vm_prot_t, int);
+int vm_map_find_aligned(vm_map_t map, vm_offset_t *addr, vm_size_t length,
+ vm_offset_t max_addr, vm_offset_t alignment);
int vm_map_fixed(vm_map_t, vm_object_t, vm_ooffset_t, vm_offset_t, vm_size_t,
vm_prot_t, vm_prot_t, int);
vm_offset_t vm_map_findspace(vm_map_t, vm_offset_t, vm_size_t);
Index: sys/vm/vm_map.c
===================================================================
--- sys/vm/vm_map.c
+++ sys/vm/vm_map.c
@@ -1603,13 +1603,17 @@
struct ucred *cred;
vm_eflags_t protoeflags;
vm_inherit_t inheritance;
+ u_long bdry;
+ u_int bidx;
VM_MAP_ASSERT_LOCKED(map);
KASSERT(object != kernel_object ||
(cow & MAP_COPY_ON_WRITE) == 0,
("vm_map_insert: kernel object and COW"));
- KASSERT(object == NULL || (cow & MAP_NOFAULT) == 0,
- ("vm_map_insert: paradoxical MAP_NOFAULT request"));
+ KASSERT(object == NULL || (cow & MAP_NOFAULT) == 0 ||
+ (cow & MAP_SPLIT_BOUNDARY_MASK) != 0,
+ ("vm_map_insert: paradoxical MAP_NOFAULT request, obj %p cow %#x",
+ object, cow));
KASSERT((prot & ~max) == 0,
("prot %#x is not subset of max_prot %#x", prot, max));
@@ -1665,6 +1669,16 @@
inheritance = VM_INHERIT_SHARE;
else
inheritance = VM_INHERIT_DEFAULT;
+ if ((cow & MAP_SPLIT_BOUNDARY_MASK) != 0) {
+ bidx = ((cow & MAP_SPLIT_BOUNDARY_MASK) >>
+ MAP_SPLIT_BOUNDARY_SHIFT) - 1;
+ if (bidx >= pmap_superpagesize_nitems)
+ return (KERN_INVALID_ARGUMENT);
+ bdry = pmap_superpagesize[bidx] - 1;
+ if ((start & bdry) != 0 || (end & bdry) != 0)
+ return (KERN_INVALID_ARGUMENT);
+ protoeflags |= (bidx + 1) << MAP_ENTRY_SPLIT_BOUNDARY_SHIFT;
+ }
cred = NULL;
if ((cow & (MAP_ACC_NO_CHARGE | MAP_NOFAULT | MAP_CREATE_GUARD)) != 0)
@@ -2028,6 +2042,19 @@
}
}
+int
+vm_map_find_aligned(vm_map_t map, vm_offset_t *addr, vm_size_t length,
+ vm_offset_t max_addr, vm_offset_t alignment)
+{
+ /* XXXKIB ASLR eh ? */
+ *addr = vm_map_findspace(map, *addr, length);
+ if (*addr + length > vm_map_max(map) ||
+ (max_addr != 0 && *addr + length > max_addr))
+ return (KERN_NO_SPACE);
+ return (vm_map_alignspace(map, NULL, 0, addr, length, max_addr,
+ alignment));
+}
+
/*
* vm_map_find finds an unallocated region in the target address
* map with the given length. The search is defined to be
@@ -2370,19 +2397,6 @@
return (new_entry);
}
-/*
- * vm_map_clip_start: [ internal use only ]
- *
- * Asserts that the given entry begins at or after
- * the specified address; if necessary,
- * it splits the entry into two.
- */
-#define vm_map_clip_start(map, entry, startaddr) \
-{ \
- if (startaddr > entry->start) \
- _vm_map_clip_start(map, entry, startaddr); \
-}
-
/*
* This routine is called only when it is known that
* the entry must be split.
@@ -2406,6 +2420,31 @@
vm_map_entry_link(map, new_entry);
}
+/*
+ * vm_map_clip_start: [ internal use only ]
+ *
+ * Asserts that the given entry begins at or after
+ * the specified address; if necessary,
+ * it splits the entry into two.
+ */
+static inline int
+vm_map_clip_start(vm_map_t map, vm_map_entry_t entry, vm_offset_t startaddr)
+{
+ int bdry_idx;
+
+ if (startaddr <= entry->start)
+ return (KERN_SUCCESS);
+ bdry_idx = (entry->eflags & MAP_ENTRY_SPLIT_BOUNDARY_MASK) >>
+ MAP_ENTRY_SPLIT_BOUNDARY_SHIFT;
+ if (bdry_idx != 0) {
+ bdry_idx--;
+ if ((startaddr & (pmap_superpagesize[bdry_idx] - 1)) != 0)
+ return (KERN_INVALID_ARGUMENT);
+ }
+ _vm_map_clip_start(map, entry, startaddr);
+ return (KERN_SUCCESS);
+}
+
/*
* vm_map_lookup_clip_start:
*
@@ -2413,32 +2452,23 @@
* the interior of the entry. Return entry after 'start', and in
* prev_entry set the entry before 'start'.
*/
-static inline vm_map_entry_t
+static inline int
vm_map_lookup_clip_start(vm_map_t map, vm_offset_t start,
- vm_map_entry_t *prev_entry)
+ vm_map_entry_t *res_entry, vm_map_entry_t *prev_entry)
{
vm_map_entry_t entry;
+ int rv;
if (vm_map_lookup_entry(map, start, prev_entry)) {
entry = *prev_entry;
- vm_map_clip_start(map, entry, start);
+ rv = vm_map_clip_start(map, entry, start);
+ if (rv != KERN_SUCCESS)
+ return (rv);
*prev_entry = vm_map_entry_pred(entry);
} else
entry = vm_map_entry_succ(*prev_entry);
- return (entry);
-}
-
-/*
- * vm_map_clip_end: [ internal use only ]
- *
- * Asserts that the given entry ends at or before
- * the specified address; if necessary,
- * it splits the entry into two.
- */
-#define vm_map_clip_end(map, entry, endaddr) \
-{ \
- if ((endaddr) < (entry->end)) \
- _vm_map_clip_end((map), (entry), (endaddr)); \
+ *res_entry = entry;
+ return (KERN_SUCCESS);
}
/*
@@ -2464,6 +2494,31 @@
vm_map_entry_link(map, new_entry);
}
+/*
+ * vm_map_clip_end: [ internal use only ]
+ *
+ * Asserts that the given entry ends at or before
+ * the specified address; if necessary,
+ * it splits the entry into two.
+ */
+static inline int
+vm_map_clip_end(vm_map_t map, vm_map_entry_t entry, vm_offset_t endaddr)
+{
+ int bdry_idx;
+
+ if (endaddr >= entry->end)
+ return (KERN_SUCCESS);
+ bdry_idx = (entry->eflags & MAP_ENTRY_SPLIT_BOUNDARY_MASK) >>
+ MAP_ENTRY_SPLIT_BOUNDARY_SHIFT;
+ if (bdry_idx != 0) {
+ bdry_idx--;
+ if ((endaddr & (pmap_superpagesize[bdry_idx] - 1)) != 0)
+ return (KERN_INVALID_ARGUMENT);
+ }
+ _vm_map_clip_end(map, entry, endaddr);
+ return (KERN_SUCCESS);
+}
+
/*
* vm_map_submap: [ kernel use only ]
*
@@ -2503,12 +2558,17 @@
if (vm_map_lookup_entry(map, start, &entry) && entry->end >= end &&
(entry->eflags & MAP_ENTRY_COW) == 0 &&
entry->object.vm_object == NULL) {
- vm_map_clip_start(map, entry, start);
- vm_map_clip_end(map, entry, end);
+ result = vm_map_clip_start(map, entry, start);
+ if (result != KERN_SUCCESS)
+ goto unlock;
+ result = vm_map_clip_end(map, entry, end);
+ if (result != KERN_SUCCESS)
+ goto unlock;
entry->object.sub_map = submap;
entry->eflags |= MAP_ENTRY_IS_SUB_MAP;
result = KERN_SUCCESS;
}
+unlock:
vm_map_unlock(map);
if (result != KERN_SUCCESS) {
@@ -2695,11 +2755,18 @@
* of this loop early and let the next loop simplify the entries, since
* some may now be mergeable.
*/
- rv = KERN_SUCCESS;
- vm_map_clip_start(map, first_entry, start);
+ rv = vm_map_clip_start(map, first_entry, start);
+ if (rv != KERN_SUCCESS) {
+ vm_map_unlock(map);
+ return (rv);
+ }
for (entry = first_entry; entry->start < end;
entry = vm_map_entry_succ(entry)) {
- vm_map_clip_end(map, entry, end);
+ rv = vm_map_clip_end(map, entry, end);
+ if (rv != KERN_SUCCESS) {
+ vm_map_unlock(map);
+ return (rv);
+ }
if (set_max ||
((new_prot & ~entry->protection) & VM_PROT_WRITE) == 0 ||
@@ -2819,6 +2886,7 @@
int behav)
{
vm_map_entry_t entry, prev_entry;
+ int rv;
bool modify_map;
/*
@@ -2864,13 +2932,22 @@
* We clip the vm_map_entry so that behavioral changes are
* limited to the specified address range.
*/
- for (entry = vm_map_lookup_clip_start(map, start, &prev_entry);
- entry->start < end;
- prev_entry = entry, entry = vm_map_entry_succ(entry)) {
+ rv = vm_map_lookup_clip_start(map, start, &entry, &prev_entry);
+ if (rv != KERN_SUCCESS) {
+ vm_map_unlock(map);
+ return (vm_mmap_to_errno(rv));
+ }
+
+ for (; entry->start < end; prev_entry = entry,
+ entry = vm_map_entry_succ(entry)) {
if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0)
continue;
- vm_map_clip_end(map, entry, end);
+ rv = vm_map_clip_end(map, entry, end);
+ if (rv != KERN_SUCCESS) {
+ vm_map_unlock(map);
+ return (vm_mmap_to_errno(rv));
+ }
switch (behav) {
case MADV_NORMAL:
@@ -3005,6 +3082,7 @@
vm_inherit_t new_inheritance)
{
vm_map_entry_t entry, prev_entry;
+ int rv;
switch (new_inheritance) {
case VM_INHERIT_NONE:
@@ -3015,14 +3093,19 @@
default:
return (KERN_INVALID_ARGUMENT);
}
+ rv = KERN_SUCCESS;
if (start == end)
- return (KERN_SUCCESS);
+ return (rv);
vm_map_lock(map);
VM_MAP_RANGE_CHECK(map, start, end);
- for (entry = vm_map_lookup_clip_start(map, start, &prev_entry);
- entry->start < end;
- prev_entry = entry, entry = vm_map_entry_succ(entry)) {
- vm_map_clip_end(map, entry, end);
+ rv = vm_map_lookup_clip_start(map, start, &entry, &prev_entry);
+ if (rv != KERN_SUCCESS)
+ goto unlock;
+ for (; entry->start < end; prev_entry = entry,
+ entry = vm_map_entry_succ(entry)) {
+ rv = vm_map_clip_end(map, entry, end);
+ if (rv != KERN_SUCCESS)
+ goto unlock;
if ((entry->eflags & MAP_ENTRY_GUARD) == 0 ||
new_inheritance != VM_INHERIT_ZERO)
entry->inheritance = new_inheritance;
@@ -3030,7 +3113,8 @@
}
vm_map_try_merge_entries(map, prev_entry, entry);
+unlock:
vm_map_unlock(map);
- return (KERN_SUCCESS);
+ return (rv);
}
/*
@@ -3129,8 +3213,13 @@
next_entry : NULL;
continue;
}
- vm_map_clip_start(map, entry, start);
- vm_map_clip_end(map, entry, end);
+ rv = vm_map_clip_start(map, entry, start);
+ if (rv != KERN_SUCCESS)
+ break;
+ rv = vm_map_clip_end(map, entry, end);
+ if (rv != KERN_SUCCESS)
+ break;
+
/*
* Mark the entry in case the map lock is released. (See
* above.)
@@ -3337,8 +3426,13 @@
next_entry : NULL;
continue;
}
- vm_map_clip_start(map, entry, start);
- vm_map_clip_end(map, entry, end);
+ rv = vm_map_clip_start(map, entry, start);
+ if (rv != KERN_SUCCESS)
+ break;
+ rv = vm_map_clip_end(map, entry, end);
+ if (rv != KERN_SUCCESS)
+ break;
+
/*
* Mark the entry in case the map lock is released. (See
* above.)
@@ -3743,18 +3837,22 @@
int
vm_map_delete(vm_map_t map, vm_offset_t start, vm_offset_t end)
{
- vm_map_entry_t entry, next_entry;
+ vm_map_entry_t entry, next_entry, scratch_entry;
+ int rv;
VM_MAP_ASSERT_LOCKED(map);
+ rv = KERN_SUCCESS;
if (start == end)
- return (KERN_SUCCESS);
+ return (rv);
/*
* Find the start of the region, and clip it.
* Step through all entries in this region.
*/
- for (entry = vm_map_lookup_clip_start(map, start, &entry);
- entry->start < end; entry = next_entry) {
+ rv = vm_map_lookup_clip_start(map, start, &entry, &scratch_entry);
+ if (rv != KERN_SUCCESS)
+ return (rv);
+ for (; entry->start < end; entry = next_entry) {
/*
* Wait for wiring or unwiring of an entry to complete.
* Also wait for any system wirings to disappear on
@@ -3778,13 +3876,19 @@
* Specifically, the entry may have been
* clipped, merged, or deleted.
*/
- next_entry = vm_map_lookup_clip_start(map,
- saved_start, &next_entry);
+ rv = vm_map_lookup_clip_start(map, saved_start,
+ &next_entry, &scratch_entry);
+ if (rv != KERN_SUCCESS)
+ break;
} else
next_entry = entry;
continue;
}
- vm_map_clip_end(map, entry, end);
+
+ /* XXXKIB or delete to the upper superpage boundary ? */
+ rv = vm_map_clip_end(map, entry, end);
+ if (rv != KERN_SUCCESS)
+ break;
next_entry = vm_map_entry_succ(entry);
/*
@@ -3814,7 +3918,7 @@
*/
vm_map_entry_delete(map, entry);
}
- return (KERN_SUCCESS);
+ return (rv);
}
/*
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Sun, Apr 19, 9:47 PM (12 h, 7 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
31801391
Default Alt Text
D24652.id71243.diff (31 KB)
Attached To
Mode
D24652: Non-transparent superpages support.
Attached
Detach File
Event Timeline
Log In to Comment