Index: sys/amd64/amd64/pmap.c =================================================================== --- sys/amd64/amd64/pmap.c +++ sys/amd64/amd64/pmap.c @@ -143,6 +143,7 @@ #include #include #include +#include #include #include @@ -409,6 +410,10 @@ static int pmap_flags = PMAP_PDE_SUPERPAGE; /* flags for x86 pmaps */ +static vmem_t large_vmem; +static struct mtx lm_lock; +static u_int lm_ents; + int pmap_pcid_enabled = 1; SYSCTL_INT(_vm_pmap, OID_AUTO, pcid_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &pmap_pcid_enabled, 0, "Is TLB Context ID enabled ?"); @@ -651,6 +656,7 @@ static void pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va, pd_entry_t pde); static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode); +static vm_page_t pmap_large_map_getpage_unlocked(void); static void pmap_pde_attr(pd_entry_t *pde, int cache_bits, int mask); #if VM_NRESERVLEVEL > 0 static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, @@ -1306,7 +1312,7 @@ pmap_init(void) { struct pmap_preinit_mapping *ppim; - vm_page_t mpte; + vm_page_t m, mpte; vm_size_t s; int error, i, pv_npg, ret, skz63; @@ -1433,6 +1439,27 @@ (vmem_addr_t *)&qframe); if (error != 0) panic("qframe allocation failed"); + + lm_ents = 8; + TUNABLE_INT_FETCH("vm.pmap.large_map_pml4_entries", &lm_ents); + if (lm_ents > LMEPML4I - LMSPML4I + 1) + lm_ents = LMEPML4I - LMSPML4I + 1; + if (bootverbose) + printf("pmap: large map %u PML4 slots (%lu Gb)\n", + lm_ents, (u_long)lm_ents * (NBPML4 / 1024 / 1024 / 1024)); + if (lm_ents != 0 && vmem_init(&large_vmem, "large", + LARGEMAP_MIN_ADDRESS, (vmem_size_t)lm_ents * NBPML4, PAGE_SIZE, 0, + M_WAITOK) == NULL) { + printf("pmap: cannot create large map, %d PML4 slots\n", + lm_ents); + lm_ents = 0; + } + for (i = 0; i < lm_ents; i++) { + m = pmap_large_map_getpage_unlocked(); + kernel_pmap->pm_pml4[LMSPML4I + i] = X86_PG_V | X86_PG_RW | + X86_PG_A | X86_PG_M | pg_nx | VM_PAGE_TO_PHYS(m); + } + mtx_init(&lm_lock, "lm", NULL, MTX_DEF); } static SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD, 0, @@ -2188,8 +2215,8 @@ if ((cpu_feature & CPUID_SS) != 0 && !force) ; /* If "Self Snoop" is supported and allowed, do nothing. */ - else if ((cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0 && - eva - sva < PMAP_CLFLUSH_THRESHOLD) { + else if ((cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0 /* && + eva - sva < PMAP_CLFLUSH_THRESHOLD XXXKIB wbinvd has wrong semantic */) { /* * XXX: Some CPUs fault, hang, or trash the local APIC * registers if we use CLFLUSH on the local APIC @@ -2210,8 +2237,8 @@ for (; sva < eva; sva += cpu_clflush_line_size) clflushopt(sva); sfence(); - } else if ((cpu_feature & CPUID_CLFSH) != 0 && - eva - sva < PMAP_CLFLUSH_THRESHOLD) { + } else if ((cpu_feature & CPUID_CLFSH) != 0 /* && + eva - sva < PMAP_CLFLUSH_THRESHOLD */) { if (pmap_kextract(sva) == lapic_paddr) return; /* @@ -2274,6 +2301,76 @@ } } +void +pmap_flush_cache_range(vm_offset_t sva, vm_offset_t eva) +{ + + /* + * Unlike pmap_invalidate_cache_range(), we do not limit the + * size of the flushing region. The preference is to keep the + * cache populated by the cost of the CPU time for the thread + * requesting the flush. 
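+ * When CLWB is available, dirty lines are written back without being
+ * invalidated, so the data stays cached for subsequent readers;
+ * otherwise we fall back to pmap_invalidate_cache_range().  The loop
+ * also calls maybe_yield() so that flushing a large region does not
+ * monopolize the CPU.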
+ */ + + KASSERT((sva & PAGE_MASK) == 0, + ("pmap_flush_cache_range: sva not page-aligned")); + KASSERT((eva & PAGE_MASK) == 0, + ("pmap_flush_cache_range: eva not page-aligned")); + + if ((cpu_stdext_feature & CPUID_STDEXT_CLWB) == 0) { + pmap_invalidate_cache_range(sva, eva, TRUE); + return; + } + + if (pmap_kextract(sva) == lapic_paddr) + return; + + sfence(); + for (; sva < eva; sva += cpu_clflush_line_size) { + clwb(sva); + maybe_yield(); + } + sfence(); +} + +void +pmap_flush_cache_phys_range(vm_paddr_t spa, vm_paddr_t epa, vm_memattr_t mattr) +{ + pt_entry_t *pte; + vm_offset_t vaddr; + int error, pte_bits; + + KASSERT((spa & PAGE_MASK) == 0, + ("pmap_flush_cache_phys_range: spa not page-aligned")); + KASSERT((epa & PAGE_MASK) == 0, + ("pmap_flush_cache_phys_range: epa not page-aligned")); + + if (spa < dmaplimit) { + pmap_flush_cache_range(PHYS_TO_DMAP(spa), PHYS_TO_DMAP(spa) + + (dmaplimit > epa ? dmaplimit : epa)); + if (dmaplimit > epa) + return; + spa = dmaplimit; + } + + pte_bits = pmap_cache_bits(kernel_pmap, mattr, 0) | X86_PG_RW | + X86_PG_V; + error = vmem_alloc(kernel_arena, PAGE_SIZE, M_BESTFIT | M_WAITOK, + &vaddr); + KASSERT(error == 0, ("vmem_alloc failed: %d", error)); + pte = vtopte(vaddr); + for (; spa < epa; spa += PAGE_SIZE) { + sched_pin(); + pte_store(pte, spa | pte_bits); + invlpg(vaddr); + /* XXXKIB sfences inside flush_cache_range are excessive */ + pmap_flush_cache_range(vaddr, vaddr + PAGE_SIZE); + sched_unpin(); + maybe_yield(); + } + vmem_free(kernel_arena, vaddr, PAGE_SIZE); +} + /* * Routine: pmap_extract * Function: @@ -2680,6 +2777,10 @@ /* install self-referential address mapping entry(s) */ pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pml4pg) | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M; + + /* install large map entries if configured */ + for (i = 0; i < lm_ents; i++) + pm_pml4[LMSPML4I + i] = kernel_pmap->pm_pml4[LMSPML4I + i]; } static void @@ -3026,6 +3127,8 @@ for (i = 0; i < ndmpdpphys; i++)/* Direct Map */ pmap->pm_pml4[DMPML4I + i] = 0; pmap->pm_pml4[PML4PML4I] = 0; /* Recursive Mapping */ + for (i = 0; i < lm_ents; i++) /* Large Map */ + pmap->pm_pml4[LMSPML4I + i] = 0; vm_page_unwire_noq(m); vm_page_free_zero(m); @@ -7970,6 +8073,402 @@ mtx_unlock_spin(&qframe_mtx); } +static vm_page_t +pmap_large_map_getpage_unlocked(void) +{ + vm_page_t m; + + m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | + VM_ALLOC_NOBUSY | VM_ALLOC_ZERO); + if (m != NULL && (m->flags & PG_ZERO) == 0) + pmap_zero_page(m); + return (m); +} + +static vm_page_t +pmap_large_map_getpage(void) +{ + vm_page_t m; + + mtx_assert(&lm_lock, MA_OWNED); + for (;;) { + m = pmap_large_map_getpage_unlocked(); + if (m != NULL) + return (m); + mtx_unlock(&lm_lock); + vm_wait(NULL); + mtx_lock(&lm_lock); + } +} + +static pdp_entry_t * +pmap_large_map_pdpe(vm_offset_t va) +{ + pdp_entry_t *pdpe; + vm_pindex_t pml4_idx; + vm_paddr_t mphys; + + pml4_idx = pmap_pml4e_index(va); + KASSERT(LMSPML4I <= pml4_idx && pml4_idx < LMSPML4I + lm_ents, + ("pmap_large_map_pdpe: va %#jx out of range idx %#jx LMSPML4I " + "%#jx lm_ents %d", + (uintmax_t)va, (uintmax_t)pml4_idx, LMSPML4I, lm_ents)); + KASSERT((kernel_pmap->pm_pml4[pml4_idx] & X86_PG_V) != 0, + ("pmap_large_map_pdpe: invalid pml4 for va %#jx idx %#jx " + "LMSPML4I %#jx lm_ents %d", + (uintmax_t)va, (uintmax_t)pml4_idx, LMSPML4I, lm_ents)); + mphys = kernel_pmap->pm_pml4[pml4_idx] & ~(PAGE_MASK | pg_nx); + pdpe = (pdp_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pdpe_index(va); + return (pdpe); +} + +static pd_entry_t * 
+pmap_large_map_pde(vm_offset_t va) +{ + pdp_entry_t *pdpe; + pd_entry_t *pde; + vm_page_t m; + vm_pindex_t pd_idx; + vm_paddr_t mphys; + + pdpe = pmap_large_map_pdpe(va); + if (*pdpe == 0) { + m = pmap_large_map_getpage(); + mphys = VM_PAGE_TO_PHYS(m); + *pdpe = mphys | X86_PG_RW | X86_PG_V | X86_PG_G | pg_nx; + } else { + MPASS((*pdpe & X86_PG_PS) == 0); + mphys = *pdpe & ~(PAGE_MASK | pg_nx); + } + + pde = (pd_entry_t *)PHYS_TO_DMAP(mphys); + pd_idx = pmap_pde_index(va); + pde += pd_idx; + return (pde); +} + +static pt_entry_t * +pmap_large_map_pte(vm_offset_t va) +{ + pd_entry_t *pde; + pt_entry_t *pte; + vm_page_t m; + vm_paddr_t mphys; + + pde = pmap_large_map_pde(va); + if (*pde == 0) { + m = pmap_large_map_getpage(); + mphys = VM_PAGE_TO_PHYS(m); + *pde = mphys | X86_PG_RW | X86_PG_V | X86_PG_G | pg_nx; + PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pde))->wire_count++; + } else { + MPASS((*pde & X86_PG_PS) == 0); + mphys = *pde & ~(PAGE_MASK | pg_nx); + } + + pte = (pt_entry_t *)PHYS_TO_DMAP(mphys); + pte += pmap_pte_index(va); + + return (pte); +} + +static int +pmap_large_map_getva(vm_size_t len, vm_offset_t align, vm_offset_t phase, + vmem_addr_t *vmem_res) +{ + + return (vmem_xalloc(&large_vmem, len, align, phase, 0, VMEM_ADDR_MIN, + VMEM_ADDR_MAX, M_NOWAIT | M_FIRSTFIT, vmem_res)); +} + +int +pmap_large_map(vm_paddr_t spa, vm_size_t len, void **addr, + vm_memattr_t mattr) +{ + pdp_entry_t *pdpe; + pd_entry_t *pde; + pt_entry_t *pte; + vm_offset_t va, inc; + vmem_addr_t vmem_res; + vm_paddr_t pa; + int error; + + if (spa + len < spa) + return (EINVAL); + + /* See if DMAP can serve. */ + if (spa + len < dmaplimit) { + va = PHYS_TO_DMAP(spa); + pmap_change_attr(va, len, mattr); + *addr = (void *)va; + return (0); + } + + /* + * No, allocate KVA. Fit the address with best possible + * alignment for superpages. Fall back to worse align if + * failed. + */ + error = ENOMEM; + if ((amd_feature & AMDID_PAGE1GB) != 0 && rounddown2(spa + len, + NBPDP) >= roundup2(spa, NBPDP) + NBPDP) + error = pmap_large_map_getva(len, NBPDP, spa & PDPMASK, + &vmem_res); + if (error != 0 && rounddown2(spa + len, NBPDR) >= roundup2(spa, + NBPDR) + NBPDR) + error = pmap_large_map_getva(len, NBPDR, spa & PDRMASK, + &vmem_res); + if (error != 0) + error = pmap_large_map_getva(len, PAGE_SIZE, 0, &vmem_res); + if (error != 0) + return (error); + + /* + * Fill pagetable. PG_M is not pre-set, we scan modified bits + * in the pagetable to minimize flushing. No need to + * invalidate TLB, since we only update invalid entries. 
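+ *
+ * Each iteration installs the largest page size that fits: a 1G
+ * mapping when the CPU supports it and both pa and va are aligned to
+ * NBPDP with at least NBPDP bytes remaining, else a 2M mapping under
+ * the analogous NBPDR conditions, else a 4K page.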
+ */ + mtx_lock(&lm_lock); + for (pa = spa, va = vmem_res; len > 0; pa += inc, va += inc, + len -= inc) { + if ((amd_feature & AMDID_PAGE1GB) != 0 && len >= NBPDP && + (pa & PDPMASK) == 0 && (va & PDPMASK) == 0) { + pdpe = pmap_large_map_pdpe(va); + MPASS(*pdpe == 0); + *pdpe = pa | pg_g | X86_PG_PS | X86_PG_RW | + X86_PG_V | X86_PG_A | pg_nx | + pmap_cache_bits(kernel_pmap, mattr, TRUE); + inc = NBPDP; + } else if (len >= NBPDR && (pa & PDRMASK) == 0 && + (va & PDRMASK) == 0) { + pde = pmap_large_map_pde(va); + MPASS(*pde == 0); + *pde = pa | pg_g | X86_PG_PS | X86_PG_RW | + X86_PG_V | X86_PG_A | pg_nx | + pmap_cache_bits(kernel_pmap, mattr, TRUE); + PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pde))-> + wire_count++; + inc = NBPDR; + } else { + pte = pmap_large_map_pte(va); + MPASS(*pte == 0); + *pte = pa | pg_g | X86_PG_RW | X86_PG_V | + X86_PG_A | pg_nx | pmap_cache_bits(kernel_pmap, + mattr, FALSE); + PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pte))-> + wire_count++; + inc = PAGE_SIZE; + } + } + mtx_unlock(&lm_lock); + MPASS(len == 0); + + *addr = (void *)vmem_res; + return (0); +} + +void +pmap_large_unmap(void *svaa, vm_size_t len) +{ + vm_offset_t sva, va; + vm_size_t inc; + pdp_entry_t *pdpe, pdp; + pd_entry_t *pde, pd; + pt_entry_t *pte; + vm_page_t m; + struct spglist spgf; + + SLIST_INIT(&spgf); + sva = (vm_offset_t)svaa; + KASSERT(LARGEMAP_MIN_ADDRESS <= sva && + sva < LARGEMAP_MAX_ADDRESS + NBPML4 * (u_long)lm_ents, + ("not largemap range %#lx %#lx", (u_long)svaa, (u_long)svaa + len)); + mtx_lock(&lm_lock); + for (va = sva; va < sva + len; va += inc) { + pdpe = pmap_large_map_pdpe(va); + pdp = *pdpe; + KASSERT((pdp & X86_PG_V) != 0, + ("invalid pdp va %#lx pdpe %#lx pdp %#lx", va, + (u_long)pdpe, pdp)); + if ((pdp & X86_PG_PS) != 0) { + KASSERT((amd_feature & AMDID_PAGE1GB) != 0, + ("no 1G pages, va %#lx pdpe %#lx pdp %#lx", va, + (u_long)pdpe, pdp)); + KASSERT((va & PDPMASK) == 0, + ("PDPMASK bit set, va %#lx pdpe %#lx pdp %#lx", va, + (u_long)pdpe, pdp)); + KASSERT(len <= NBPDP, + ("len < NBPDP, sva %#lx va %#lx pdpe %#lx pdp %#lx " + "len %#lx", sva, va, (u_long)pdpe, pdp, len)); + inc = NBPDP; + *pdpe = 0; + continue; + } + pde = pmap_large_map_pde(va); + pd = *pde; + KASSERT((pd & X86_PG_V) != 0, + ("invalid pd va %#lx pde %#lx pd %#lx", va, + (u_long)pde, pd)); + if ((pd & X86_PG_PS) != 0) { + KASSERT((va & PDRMASK) == 0, + ("PDRMASK bit set, va %#lx pde %#lx pd %#lx", va, + (u_long)pde, pd)); + KASSERT(len <= NBPDR, + ("len < NBPDR, sva %#lx va %#lx pde %#lx pd %#lx " + "len %#lx", sva, va, (u_long)pde, pd, len)); + pde_store(pde, 0); + inc = NBPDR; + m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pde)); + m->wire_count--; + if (m->wire_count == 0) { + *pdpe = 0; + SLIST_INSERT_HEAD(&spgf, m, plinks.s.ss); + } + continue; + } + pte = pmap_large_map_pte(va); + KASSERT((*pte & X86_PG_V) != 0, + ("invalid pte va %#lx pte %#lx pt %#lx", va, + (u_long)pte, *pte)); + pte_clear(pte); + m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pte)); + m->wire_count--; + if (m->wire_count == 0) { + *pde = 0; + SLIST_INSERT_HEAD(&spgf, m, plinks.s.ss); + m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pde)); + m->wire_count--; + if (m->wire_count == 0) { + *pdpe = 0; + SLIST_INSERT_HEAD(&spgf, m, plinks.s.ss); + } + } + inc = PAGE_SIZE; + } + mtx_unlock(&lm_lock); + pmap_invalidate_range(kernel_pmap, sva, len); + vm_page_free_pages_toq(&spgf, false); + vmem_free(&large_vmem, sva, len); +} + +static void +pmap_large_map_wb_fence(void) +{ + + if (cpu_vendor_id != CPU_VENDOR_INTEL) + mfence(); + else if 
((cpu_stdext_feature & (CPUID_STDEXT_CLWB | + CPUID_STDEXT_CLFLUSHOPT)) == 0) + /* clflush is strongly enough ordered */ + sfence(); +} + +static void +pmap_large_map_flush_range(vm_offset_t va, vm_size_t len) +{ + + for (; len > 0; len -= cpu_clflush_line_size, + va += cpu_clflush_line_size) { + if ((cpu_stdext_feature & CPUID_STDEXT_CLWB) != 0) + clwb(va); + else if ((cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0) + clflushopt(va); + else if ((cpu_feature & CPUID_CLFSH) != 0) + clflush(va); + } +} + +/* + * Write-back cache lines for the given address range. + * + * Must be called only on the range or sub-range returned from + * pmap_large_map(). Must not be called on the coalesced ranges. + * + * Does nothing on CPUs without CLWB, CLFLUSHOPT, or CLFLUSH + * instructions support. + */ +void +pmap_large_map_wb(vm_offset_t sva, vm_size_t len) +{ + volatile u_long *pe; + u_long p; + vm_offset_t va, eva; + vm_size_t inc; + bool seen_other; + + KASSERT(sva >= LARGEMAP_MIN_ADDRESS && + sva + len < LARGEMAP_MIN_ADDRESS + lm_ents * NBPML4, + ("pmap_large_map_wb: not largemap %#lx %#lx", sva, len)); + if ((cpu_feature & CPUID_CLFSH) == 0) + return; /* Interface is of no use */ + eva = sva + len; + pmap_large_map_wb_fence(); + for (va = sva; va < eva; va += inc) { + inc = 0; + if ((amd_feature & AMDID_PAGE1GB) != 0) { + pe = (volatile u_long *)pmap_large_map_pdpe(va); + p = *pe; + if ((p & X86_PG_PS) != 0) + inc = NBPDP; + } + if (inc == 0) { + pe = (volatile u_long *)pmap_large_map_pde(va); + p = *pe; + if ((p & X86_PG_PS) != 0) + inc = NBPDR; + } + if (inc == 0) { + pe = (volatile u_long *)pmap_large_map_pte(va); + p = *pe; + inc = PAGE_SIZE; + } + seen_other = false; + for (;;) { + if ((p & X86_PG_AVAIL1) != 0) { + /* + * Spin-wait for the end of a parallel + * write-back. + */ + cpu_spinwait(); + p = *pe; + + /* + * If we saw other write-back + * occuring, we cannot rely on PG_M to + * indicate state of the cache. The + * PG_M bit is cleared before the + * flush to avoid ignoring new writes, + * and writes which are relevant for + * us might happen after. + */ + seen_other = true; + continue; + } + + if ((p & X86_PG_M) != 0 || seen_other) { + if (!atomic_fcmpset_long(pe, &p, + (p & ~X86_PG_M) | X86_PG_AVAIL1)) + /* + * If we saw PG_M without + * PG_AVAIL1, and then on the + * next attempt we do not + * observe neither PG_M nor + * PG_AVAIL1, other write-back + * started after us and + * finished before us. We can + * rely on it doing our work. + */ + continue; + pmap_large_map_flush_range(va, inc); + atomic_clear_long(pe, X86_PG_AVAIL1); + } + break; + } + maybe_yield(); + } + pmap_large_map_wb_fence(); +} + static vm_page_t pmap_pti_alloc_page(void) { Index: sys/amd64/include/cpufunc.h =================================================================== --- sys/amd64/include/cpufunc.h +++ sys/amd64/include/cpufunc.h @@ -115,6 +115,13 @@ __asm __volatile(".byte 0x66;clflush %0" : : "m" (*(char *)addr)); } +static __inline void +clwb(u_long addr) +{ + + __asm __volatile("clwb %0" : : "m" (*(char *)addr)); +} + static __inline void clts(void) { Index: sys/amd64/include/pmap.h =================================================================== --- sys/amd64/include/pmap.h +++ sys/amd64/include/pmap.h @@ -216,6 +216,10 @@ #define KPML4I (NPML4EPG-1) #define KPDPI (NPDPEPG-2) /* kernbase at -2GB */ +/* Large map: index of the first and max last pml4 entry */ +#define LMSPML4I (PML4PML4I + 1) +#define LMEPML4I (DMPML4I - 1) + /* * XXX doesn't really belong here I guess... 
*/ @@ -413,11 +417,16 @@ int pmap_cache_bits(pmap_t pmap, int mode, boolean_t is_pde); int pmap_change_attr(vm_offset_t, vm_size_t, int); void pmap_demote_DMAP(vm_paddr_t base, vm_size_t len, boolean_t invalidate); +void pmap_flush_cache_range(vm_offset_t, vm_offset_t); +void pmap_flush_cache_phys_range(vm_paddr_t, vm_paddr_t, vm_memattr_t); void pmap_init_pat(void); void pmap_kenter(vm_offset_t va, vm_paddr_t pa); void *pmap_kenter_temporary(vm_paddr_t pa, int i); vm_paddr_t pmap_kextract(vm_offset_t); void pmap_kremove(vm_offset_t); +int pmap_large_map(vm_paddr_t, vm_size_t, void **, vm_memattr_t); +void pmap_large_map_wb(vm_offset_t sva, vm_size_t len); +void pmap_large_unmap(void *sva, vm_size_t len); void *pmap_mapbios(vm_paddr_t, vm_size_t); void *pmap_mapdev(vm_paddr_t, vm_size_t); void *pmap_mapdev_attr(vm_paddr_t, vm_size_t, int); Index: sys/amd64/include/vmparam.h =================================================================== --- sys/amd64/include/vmparam.h +++ sys/amd64/include/vmparam.h @@ -156,7 +156,9 @@ * 0x0000000000000000 - 0x00007fffffffffff user map * 0x0000800000000000 - 0xffff7fffffffffff does not exist (hole) * 0xffff800000000000 - 0xffff804020100fff recursive page table (512GB slot) - * 0xffff804020101000 - 0xfffff7ffffffffff unused + * 0xffff804020100fff - 0xffff807fffffffff unused + * 0xffff808000000000 - 0xffff847fffffffff large map (can be tuned up) + * 0xffff848000000000 - 0xfffff7ffffffffff unused (large map extends there) * 0xfffff80000000000 - 0xfffffbffffffffff 4TB direct map * 0xfffffc0000000000 - 0xfffffdffffffffff unused * 0xfffffe0000000000 - 0xffffffffffffffff 2TB kernel map @@ -173,6 +175,9 @@ #define DMAP_MIN_ADDRESS KVADDR(DMPML4I, 0, 0, 0) #define DMAP_MAX_ADDRESS KVADDR(DMPML4I + NDMPML4E, 0, 0, 0) +#define LARGEMAP_MIN_ADDRESS KVADDR(LMSPML4I, 0, 0, 0) +#define LARGEMAP_MAX_ADDRESS KVADDR(LMEPML4I + 1, 0, 0, 0) + #define KERNBASE KVADDR(KPML4I, KPDPI, 0, 0) #define UPT_MAX_ADDRESS KVADDR(PML4PML4I, PML4PML4I, PML4PML4I, PML4PML4I) Index: sys/kern/subr_vmem.c =================================================================== --- sys/kern/subr_vmem.c +++ sys/kern/subr_vmem.c @@ -73,22 +73,13 @@ #include #include #include +#include #include #include #include int vmem_startup_count(void); -#define VMEM_OPTORDER 5 -#define VMEM_OPTVALUE (1 << VMEM_OPTORDER) -#define VMEM_MAXORDER \ - (VMEM_OPTVALUE - 1 + sizeof(vmem_size_t) * NBBY - VMEM_OPTORDER) - -#define VMEM_HASHSIZE_MIN 16 -#define VMEM_HASHSIZE_MAX 131072 - -#define VMEM_QCACHE_IDX_MAX 16 - #define VMEM_FITMASK (M_BESTFIT | M_FIRSTFIT) #define VMEM_FLAGS \ @@ -96,8 +87,6 @@ #define BT_FLAGS (M_NOWAIT | M_WAITOK | M_USE_RESERVE | M_NOVM) -#define QC_NAME_MAX 16 - /* * Data structures private to vmem. 
*/ @@ -105,59 +94,8 @@ typedef struct vmem_btag bt_t; -TAILQ_HEAD(vmem_seglist, vmem_btag); -LIST_HEAD(vmem_freelist, vmem_btag); -LIST_HEAD(vmem_hashlist, vmem_btag); - -struct qcache { - uma_zone_t qc_cache; - vmem_t *qc_vmem; - vmem_size_t qc_size; - char qc_name[QC_NAME_MAX]; -}; -typedef struct qcache qcache_t; #define QC_POOL_TO_QCACHE(pool) ((qcache_t *)(pool->pr_qcache)) -#define VMEM_NAME_MAX 16 - -/* vmem arena */ -struct vmem { - struct mtx_padalign vm_lock; - struct cv vm_cv; - char vm_name[VMEM_NAME_MAX+1]; - LIST_ENTRY(vmem) vm_alllist; - struct vmem_hashlist vm_hash0[VMEM_HASHSIZE_MIN]; - struct vmem_freelist vm_freelist[VMEM_MAXORDER]; - struct vmem_seglist vm_seglist; - struct vmem_hashlist *vm_hashlist; - vmem_size_t vm_hashsize; - - /* Constant after init */ - vmem_size_t vm_qcache_max; - vmem_size_t vm_quantum_mask; - vmem_size_t vm_import_quantum; - int vm_quantum_shift; - - /* Written on alloc/free */ - LIST_HEAD(, vmem_btag) vm_freetags; - int vm_nfreetags; - int vm_nbusytag; - vmem_size_t vm_inuse; - vmem_size_t vm_size; - vmem_size_t vm_limit; - - /* Used on import. */ - vmem_import_t *vm_importfn; - vmem_release_t *vm_releasefn; - void *vm_arg; - - /* Space exhaustion callback. */ - vmem_reclaim_t *vm_reclaimfn; - - /* quantum cache */ - qcache_t vm_qcache[VMEM_QCACHE_IDX_MAX]; -}; - /* boundary tag */ struct vmem_btag { TAILQ_ENTRY(vmem_btag) bt_seglist; Index: sys/sys/_vmem.h =================================================================== --- /dev/null +++ sys/sys/_vmem.h @@ -0,0 +1,101 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c)2006,2007,2008,2009 YAMAMOTO Takashi, + * Copyright (c) 2013 EMC Corp. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _SYS__VMEM_H_ +#define _SYS__VMEM_H_ + +#define QC_NAME_MAX 16 + +struct qcache { + uma_zone_t qc_cache; + vmem_t *qc_vmem; + vmem_size_t qc_size; + char qc_name[QC_NAME_MAX]; +}; + +typedef struct qcache qcache_t; + +TAILQ_HEAD(vmem_seglist, vmem_btag); +LIST_HEAD(vmem_freelist, vmem_btag); +LIST_HEAD(vmem_hashlist, vmem_btag); + +#define VMEM_OPTORDER 5 +#define VMEM_OPTVALUE (1 << VMEM_OPTORDER) +#define VMEM_MAXORDER \ + (VMEM_OPTVALUE - 1 + sizeof(vmem_size_t) * NBBY - VMEM_OPTORDER) + +#define VMEM_QCACHE_IDX_MAX 16 + +#define VMEM_HASHSIZE_MIN 16 +#define VMEM_HASHSIZE_MAX 131072 + +#define VMEM_NAME_MAX 16 + +/* vmem arena */ +struct vmem { + struct mtx_padalign vm_lock; + struct cv vm_cv; + char vm_name[VMEM_NAME_MAX+1]; + LIST_ENTRY(vmem) vm_alllist; + struct vmem_hashlist vm_hash0[VMEM_HASHSIZE_MIN]; + struct vmem_freelist vm_freelist[VMEM_MAXORDER]; + struct vmem_seglist vm_seglist; + struct vmem_hashlist *vm_hashlist; + vmem_size_t vm_hashsize; + + /* Constant after init */ + vmem_size_t vm_qcache_max; + vmem_size_t vm_quantum_mask; + vmem_size_t vm_import_quantum; + int vm_quantum_shift; + + /* Written on alloc/free */ + LIST_HEAD(, vmem_btag) vm_freetags; + int vm_nfreetags; + int vm_nbusytag; + vmem_size_t vm_inuse; + vmem_size_t vm_size; + vmem_size_t vm_limit; + + /* Used on import. */ + vmem_import_t *vm_importfn; + vmem_release_t *vm_releasefn; + void *vm_arg; + + /* Space exhaustion callback. */ + vmem_reclaim_t *vm_reclaimfn; + + /* quantum cache */ + qcache_t vm_qcache[VMEM_QCACHE_IDX_MAX]; +}; + + +#endif
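
A minimal usage sketch of the new KPI, for reviewers. The consumer function
example_write_persistent() and its spa/len arguments are hypothetical, and the
sketch assumes the physical range lies above the direct map (spa >= dmaplimit),
since in this revision pmap_large_map_wb() and pmap_large_unmap() assert that
their argument is a large-map address.

#include <sys/param.h>
#include <sys/systm.h>
#include <vm/vm.h>
#include <vm/pmap.h>

/*
 * Hypothetical consumer: map a physical range that is not covered by
 * the direct map, dirty it, write the cache lines back, and unmap.
 */
static int
example_write_persistent(vm_paddr_t spa, vm_size_t len)
{
	void *va;
	int error;

	/* Establish a write-back cached kernel mapping of [spa, spa + len). */
	error = pmap_large_map(spa, len, &va, VM_MEMATTR_WRITE_BACK);
	if (error != 0)
		return (error);

	memset(va, 0, len);

	/* Write dirty lines back; valid only on addresses from pmap_large_map(). */
	pmap_large_map_wb((vm_offset_t)va, len);

	pmap_large_unmap(va, len);
	return (0);
}

The write-back precedes the unmap because pmap_large_unmap() only tears down
the page tables and invalidates the TLB; it does not flush the data cache.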