Index: libexec/rtld-elf/map_object.c =================================================================== --- libexec/rtld-elf/map_object.c +++ libexec/rtld-elf/map_object.c @@ -208,7 +208,9 @@ base_addr = (caddr_t) base_vaddr; base_flags = __getosreldate() >= P_OSREL_MAP_GUARD ? MAP_GUARD : MAP_PRIVATE | MAP_ANON | MAP_NOCORE; - if (npagesizes > 1 && rtld_round_page(segs[0]->p_filesz) >= pagesizes[1]) + if (npagesizes > 1 && rtld_round_page(segs[0]->p_filesz) >= pagesizes[1] && + rtld_round_page(segs[nsegs]->p_offset + segs[nsegs]->p_filesz) >= + pagesizes[MAX(1, npagesizes - 2)]) base_flags |= MAP_ALIGNED_SUPER; if (base_vaddr != 0) base_flags |= MAP_FIXED | MAP_EXCL; Index: share/man/man7/arch.7 =================================================================== --- share/man/man7/arch.7 +++ share/man/man7/arch.7 @@ -224,7 +224,7 @@ .Ss Page Size .Bl -column -offset indent "Architecture" "Page Sizes" .It Sy Architecture Ta Sy Page Sizes -.It aarch64 Ta 4K, 2M, 1G +.It aarch64 Ta 4K, 64K, 2M, 1G .It amd64 Ta 4K, 2M, 1G .It armv6 Ta 4K, 1M .It armv7 Ta 4K, 1M Index: sys/arm64/arm64/pmap.c =================================================================== --- sys/arm64/arm64/pmap.c +++ sys/arm64/arm64/pmap.c @@ -1646,11 +1646,14 @@ if (superpages_enabled) { KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0, ("pmap_init: can't assign to pagesizes[1]")); - pagesizes[1] = L2_SIZE; + pagesizes[1] = L3C_SIZE; + KASSERT(MAXPAGESIZES > 2 && pagesizes[2] == 0, + ("pmap_init: can't assign to pagesizes[2]")); + pagesizes[2] = L2_SIZE; if (L1_BLOCKS_SUPPORTED) { - KASSERT(MAXPAGESIZES > 2 && pagesizes[2] == 0, - ("pmap_init: can't assign to pagesizes[2]")); - pagesizes[2] = L1_SIZE; + KASSERT(MAXPAGESIZES > 3 && pagesizes[3] == 0, + ("pmap_init: can't assign to pagesizes[3]")); + pagesizes[3] = L1_SIZE; } } @@ -4959,7 +4962,7 @@ pmap_enter_largepage(pmap_t pmap, vm_offset_t va, pt_entry_t pte, int flags, int psind) { - pd_entry_t *l0p, *l1p, *l2p, newpte, origpte; + pd_entry_t *l0p, *l1p, *l2p, *l3p, newpte, origpte, *tl3p; vm_page_t mp; PMAP_LOCK_ASSERT(pmap, MA_OWNED); @@ -4973,7 +4976,7 @@ newpte = pte; if (!pmap_bti_same(pmap, va, va + pagesizes[psind], &newpte)) return (KERN_PROTECTION_FAILURE); - if (psind == 2) { + if (psind == 3) { PMAP_ASSERT_L1_BLOCKS_SUPPORTED; l0p = pmap_l0(pmap, va); @@ -5005,7 +5008,7 @@ ("va %#lx changing 1G phys page l1 %#lx newpte %#lx", va, origpte, newpte)); pmap_store(l1p, newpte); - } else /* (psind == 1) */ { + } else if (psind == 2) { l2p = pmap_l2(pmap, va); if (l2p == NULL) { mp = _pmap_alloc_l3(pmap, pmap_l1_pindex(va), NULL); @@ -5034,6 +5037,63 @@ ("va %#lx changing 2M phys page l2 %#lx newpte %#lx", va, origpte, newpte)); pmap_store(l2p, newpte); + } else /* (psind == 1) */ { + l1p = pmap_l1(pmap, va); + if (l1p != NULL && pmap_load(l1p) != 0) { + KASSERT((pmap_load(l1p) & ATTR_DESCR_MASK) != L1_BLOCK, + ("va %#lx unexpected 1G phys page l1 %#lx", + va, pmap_load(l1p))); + l2p = pmap_l1_to_l2(l1p, va); + if (pmap_load(l2p) != 0) { + KASSERT((pmap_load(l2p) & ATTR_DESCR_MASK) != + L2_BLOCK, + ("va %#lx unexpected 2M phys page l2 %#lx", + va, pmap_load(l2p))); + l3p = pmap_l2_to_l3(l2p, va); + if (pmap_load(l3p) == 0) { + mp = PTE_TO_VM_PAGE(pmap_load(l2p)); + mp->ref_count += L3C_ENTRIES; + } + } else { + mp = _pmap_alloc_l3(pmap, pmap_l2_pindex(va), + NULL); + if (mp == NULL) { + if ((flags & PMAP_ENTER_NOSLEEP) != 0) + return (KERN_RESOURCE_SHORTAGE); + PMAP_UNLOCK(pmap); + vm_wait(NULL); + PMAP_LOCK(pmap); + goto restart; + } + mp->ref_count += L3C_ENTRIES - 1; + l3p = (pd_entry_t *)PHYS_TO_DMAP( + VM_PAGE_TO_PHYS(mp)); + l3p = &l3p[pmap_l3_index(va)]; + } + } else { + mp = _pmap_alloc_l3(pmap, pmap_l2_pindex(va), NULL); + if (mp == NULL) { + if ((flags & PMAP_ENTER_NOSLEEP) != 0) + return (KERN_RESOURCE_SHORTAGE); + PMAP_UNLOCK(pmap); + vm_wait(NULL); + PMAP_LOCK(pmap); + goto restart; + } + mp->ref_count += L3C_ENTRIES - 1; + l3p = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mp)); + l3p = &l3p[pmap_l3_index(va)]; + } + for (tl3p = l3p; tl3p < &l3p[L3C_ENTRIES]; tl3p++) { + origpte = pmap_load(tl3p); + KASSERT((origpte & ATTR_DESCR_VALID) == 0 || + ((origpte & ATTR_CONTIGUOUS) != 0 && + PTE_TO_PHYS(origpte) == PTE_TO_PHYS(newpte)), + ("va %#lx changing 64K phys page l3 %#lx newpte %#lx", + va, origpte, newpte)); + pmap_store(tl3p, newpte); + newpte += L3_SIZE; + } } dsb(ishst); @@ -5072,7 +5132,7 @@ vm_paddr_t opa, pa; vm_page_t mpte, om; bool nosleep; - int lvl, rv; + int full_lvl, lvl, rv; KASSERT(ADDR_IS_CANONICAL(va), ("%s: Address not in canonical form: %lx", __func__, va)); @@ -5129,23 +5189,45 @@ KASSERT((m->oflags & VPO_UNMANAGED) != 0, ("managed largepage va %#lx flags %#x", va, flags)); new_l3 &= ~L3_PAGE; - if (psind == 2) { + if (psind == 3) { PMAP_ASSERT_L1_BLOCKS_SUPPORTED; new_l3 |= L1_BLOCK; - } else /* (psind == 1) */ + } else if (psind == 2) new_l3 |= L2_BLOCK; + else /* (psind == 1) */ + new_l3 |= ATTR_CONTIGUOUS; rv = pmap_enter_largepage(pmap, va, new_l3, flags, psind); goto out; } - if (psind == 1) { + if (psind == 2) { /* Assert the required virtual and physical alignment. */ KASSERT((va & L2_OFFSET) == 0, ("pmap_enter: va unaligned")); - KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind")); + KASSERT(m->psind > 1, ("pmap_enter: m->psind < psind")); rv = pmap_enter_l2(pmap, va, (new_l3 & ~L3_PAGE) | L2_BLOCK, flags, m, &lock); goto out; } mpte = NULL; + if (psind == 1) { + KASSERT((va & L3C_OFFSET) == 0, ("pmap_enter: va unaligned")); + KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind")); + rv = pmap_enter_l3c(pmap, va, new_l3 | ATTR_CONTIGUOUS, flags, + m, &mpte, &lock); +#if VM_NRESERVLEVEL > 0 + /* + * Attempt L2 promotion, if both a level 1 reservation and the + * PTP are fully populated. + */ + if (rv == KERN_SUCCESS && + (mpte == NULL || mpte->ref_count == NL3PG) && + (m->flags & PG_FICTITIOUS) == 0 && + vm_reserv_level_iffullpop(m) == 1) { + pde = pmap_l2(pmap, va); + (void)pmap_promote_l2(pmap, pde, va, mpte, &lock); + } +#endif + goto out; + } /* * In the case that a page table page is not @@ -5365,15 +5447,14 @@ * are aligned with each other and an underlying reservation has the * neighboring L3 pages allocated. The first condition is simply an * optimization that recognizes some eventual promotion failures early - * at a lower run-time cost. Then, if both the page table page and - * the reservation are fully populated, attempt L2 promotion. + * at a lower run-time cost. Then, if both a level 1 reservation and + * the PTP are fully populated, attempt L2 promotion. */ if ((va & L3C_OFFSET) == (pa & L3C_OFFSET) && (m->flags & PG_FICTITIOUS) == 0 && - vm_reserv_is_populated(m, L3C_ENTRIES) && + (full_lvl = vm_reserv_level_iffullpop(m)) >= 0 && pmap_promote_l3c(pmap, l3, va) && - (mpte == NULL || mpte->ref_count == NL3PG) && - vm_reserv_level_iffullpop(m) == 0) + full_lvl == 1 && (mpte == NULL || mpte->ref_count == NL3PG)) (void)pmap_promote_l2(pmap, pde, va, mpte, &lock); #endif @@ -5665,6 +5746,8 @@ ("pmap_enter_l3c: va is not aligned")); KASSERT(!VA_IS_CLEANMAP(va) || (l3e & ATTR_SW_MANAGED) == 0, ("pmap_enter_l3c: managed mapping within the clean submap")); + KASSERT((l3e & ATTR_CONTIGUOUS) != 0, + ("pmap_enter_l3c: l3e is missing ATTR_CONTIGUOUS")); /* * If the L3 PTP is not resident, we attempt to create it here. @@ -5871,14 +5954,12 @@ while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { va = start + ptoa(diff); if ((va & L2_OFFSET) == 0 && va + L2_SIZE <= end && - m->psind == 1 && pmap_ps_enabled(pmap) && + m->psind == 2 && pmap_ps_enabled(pmap) && ((rv = pmap_enter_l2_rx(pmap, va, m, prot, &lock)) == KERN_SUCCESS || rv == KERN_NO_SPACE)) m = &m[L2_SIZE / PAGE_SIZE - 1]; else if ((va & L3C_OFFSET) == 0 && va + L3C_SIZE <= end && - (VM_PAGE_TO_PHYS(m) & L3C_OFFSET) == 0 && - vm_reserv_is_populated(m, L3C_ENTRIES) && - pmap_ps_enabled(pmap) && + m->psind >= 1 && pmap_ps_enabled(pmap) && ((rv = pmap_enter_l3c_rx(pmap, va, m, &mpte, prot, &lock)) == KERN_SUCCESS || rv == KERN_NO_SPACE)) m = &m[L3C_ENTRIES - 1]; @@ -5930,7 +6011,7 @@ { pt_entry_t *l1, *l2, *l3, l3_val; vm_paddr_t pa; - int lvl; + int full_lvl, lvl; KASSERT(!VA_IS_CLEANMAP(va) || (m->oflags & VPO_UNMANAGED) != 0, @@ -6061,18 +6142,17 @@ * are aligned with each other and an underlying reservation has the * neighboring L3 pages allocated. The first condition is simply an * optimization that recognizes some eventual promotion failures early - * at a lower run-time cost. Then, attempt L2 promotion, if both the - * PTP and the reservation are fully populated. + * at a lower run-time cost. Then, attempt L2 promotion, if both a + * level 1 reservation and the PTP are fully populated. */ if ((prot & VM_PROT_NO_PROMOTE) == 0 && (va & L3C_OFFSET) == (pa & L3C_OFFSET) && (m->flags & PG_FICTITIOUS) == 0 && - vm_reserv_is_populated(m, L3C_ENTRIES) && + (full_lvl = vm_reserv_level_iffullpop(m)) >= 0 && pmap_promote_l3c(pmap, l3, va) && - (mpte == NULL || mpte->ref_count == NL3PG) && - vm_reserv_level_iffullpop(m) == 0) { + full_lvl == 1 && (mpte == NULL || mpte->ref_count == NL3PG)) { if (l2 == NULL) - l2 = pmap_pde(pmap, va, &lvl); + l2 = pmap_l2(pmap, va); /* * If promotion succeeds, then the next call to this function @@ -8564,7 +8644,7 @@ { pt_entry_t *pte, tpte; vm_paddr_t mask, pa; - int lvl, val; + int lvl, psind, val; bool managed; PMAP_ASSERT_STAGE1(pmap); @@ -8576,21 +8656,22 @@ switch (lvl) { case 3: mask = L3_OFFSET; + psind = (tpte & ATTR_CONTIGUOUS) != 0 ? 1 : 0; break; case 2: mask = L2_OFFSET; + psind = 2; break; case 1: mask = L1_OFFSET; + psind = 3; break; default: panic("pmap_mincore: invalid level %d", lvl); } managed = (tpte & ATTR_SW_MANAGED) != 0; - val = MINCORE_INCORE; - if (lvl != 3) - val |= MINCORE_PSIND(3 - lvl); + val = MINCORE_INCORE | MINCORE_PSIND(psind); if ((managed && pmap_pte_dirty(pmap, tpte)) || (!managed && (tpte & ATTR_S1_AP_RW_BIT) == ATTR_S1_AP(ATTR_S1_AP_RW))) val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; @@ -9126,18 +9207,37 @@ { vm_offset_t superpage_offset; - if (size < L2_SIZE) + if (size < L3C_SIZE) return; if (object != NULL && (object->flags & OBJ_COLORED) != 0) offset += ptoa(object->pg_color); + + /* + * Considering the object's physical alignment, is the mapping large + * enough to encompass an L2 (2MB/32MB) superpage ... + */ superpage_offset = offset & L2_OFFSET; - if (size - ((L2_SIZE - superpage_offset) & L2_OFFSET) < L2_SIZE || - (*addr & L2_OFFSET) == superpage_offset) + if (size - ((L2_SIZE - superpage_offset) & L2_OFFSET) >= L2_SIZE) { + /* + * If the virtual and physical alignments differ, then + * increase the virtual address so that the alignments match. + */ + if ((*addr & L2_OFFSET) < superpage_offset) + *addr = (*addr & ~L2_OFFSET) + superpage_offset; + else if ((*addr & L2_OFFSET) > superpage_offset) + *addr = ((*addr + L2_OFFSET) & ~L2_OFFSET) + + superpage_offset; return; - if ((*addr & L2_OFFSET) < superpage_offset) - *addr = (*addr & ~L2_OFFSET) + superpage_offset; - else - *addr = ((*addr + L2_OFFSET) & ~L2_OFFSET) + superpage_offset; + } + /* ... or an L3C (64KB/2MB) superpage? */ + superpage_offset = offset & L3C_OFFSET; + if (size - ((L3C_SIZE - superpage_offset) & L3C_OFFSET) >= L3C_SIZE) { + if ((*addr & L3C_OFFSET) < superpage_offset) + *addr = (*addr & ~L3C_OFFSET) + superpage_offset; + else if ((*addr & L3C_OFFSET) > superpage_offset) + *addr = ((*addr + L3C_OFFSET) & ~L3C_OFFSET) + + superpage_offset; + } } /** Index: sys/arm64/include/param.h =================================================================== --- sys/arm64/include/param.h +++ sys/arm64/include/param.h @@ -97,7 +97,7 @@ #define PAGE_SIZE (1 << PAGE_SHIFT) #define PAGE_MASK (PAGE_SIZE - 1) -#define MAXPAGESIZES 3 /* maximum number of supported page sizes */ +#define MAXPAGESIZES 4 /* maximum number of supported page sizes */ #ifndef KSTACK_PAGES #if defined(KASAN) || defined(KMSAN) Index: sys/arm64/include/vmparam.h =================================================================== --- sys/arm64/include/vmparam.h +++ sys/arm64/include/vmparam.h @@ -112,25 +112,34 @@ #endif /* - * Enable superpage reservations: 1 level. + * Enable superpage reservations: 2 levels. */ #ifndef VM_NRESERVLEVEL -#define VM_NRESERVLEVEL 1 +#define VM_NRESERVLEVEL 2 #endif /* - * Level 0 reservations consist of 512 pages when PAGE_SIZE is 4KB, and - * 2048 pages when PAGE_SIZE is 16KB. + * Level 0 reservations consist of 16 pages when PAGE_SIZE is 4KB, and 128 + * pages when PAGE_SIZE is 16KB. Level 1 reservations consist of 32 64KB + * pages when PAGE_SIZE is 4KB, and 16 2M pages when PAGE_SIZE is 16KB. */ -#ifndef VM_LEVEL_0_ORDER #if PAGE_SIZE == PAGE_SIZE_4K -#define VM_LEVEL_0_ORDER 9 +#ifndef VM_LEVEL_0_ORDER +#define VM_LEVEL_0_ORDER 4 +#endif +#ifndef VM_LEVEL_1_ORDER +#define VM_LEVEL_1_ORDER 5 +#endif #elif PAGE_SIZE == PAGE_SIZE_16K -#define VM_LEVEL_0_ORDER 11 +#ifndef VM_LEVEL_0_ORDER +#define VM_LEVEL_0_ORDER 7 +#endif +#ifndef VM_LEVEL_1_ORDER +#define VM_LEVEL_1_ORDER 4 +#endif #else #error Unsupported page size #endif -#endif /** * Address space layout. Index: sys/kern/imgact_elf.c =================================================================== --- sys/kern/imgact_elf.c +++ sys/kern/imgact_elf.c @@ -1360,8 +1360,12 @@ if ((map->flags & MAP_ASLR) != 0) { maxv1 = maxv / 2 + addr / 2; error = __CONCAT(rnd_, __elfN(base))(map, addr, maxv1, - (MAXPAGESIZES > 1 && pagesizes[1] != 0) ? - pagesizes[1] : pagesizes[0], &anon_loc); +#if VM_NRESERVLEVEL > 0 + (MAXPAGESIZES > VM_NRESERVLEVEL && + pagesizes[VM_NRESERVLEVEL] != 0) ? + pagesizes[VM_NRESERVLEVEL] : +#endif + pagesizes[0], &anon_loc); if (error != 0) goto ret; map->anon_loc = anon_loc; Index: sys/kern/kern_mib.c =================================================================== --- sys/kern/kern_mib.c +++ sys/kern/kern_mib.c @@ -242,7 +242,7 @@ SYSCTL_LONG(_hw, OID_AUTO, availpages, CTLFLAG_RD, &physmem, 0, "Amount of physical memory (in pages)"); -u_long pagesizes[MAXPAGESIZES] = { PAGE_SIZE }; +u_long __read_mostly pagesizes[MAXPAGESIZES] = { PAGE_SIZE }; static int sysctl_hw_pagesizes(SYSCTL_HANDLER_ARGS) Index: sys/kern/kern_proc.c =================================================================== --- sys/kern/kern_proc.c +++ sys/kern/kern_proc.c @@ -2542,6 +2542,7 @@ vm_offset_t addr; vm_paddr_t pa; vm_pindex_t pi, pi_adv, pindex; + int incore; *super = false; *resident_count = 0; @@ -2577,10 +2578,15 @@ } m_adv = NULL; if (m->psind != 0 && addr + pagesizes[1] <= entry->end && - (addr & (pagesizes[1] - 1)) == 0 && - (pmap_mincore(map->pmap, addr, &pa) & MINCORE_SUPER) != 0) { + (addr & (pagesizes[1] - 1)) == 0 && (incore = + pmap_mincore(map->pmap, addr, &pa) & MINCORE_SUPER) != 0) { *super = true; - pi_adv = atop(pagesizes[1]); + /* + * The virtual page might be smaller than the physical + * page, so we use the page size reported by the pmap + * rather than m->psind. + */ + pi_adv = atop(pagesizes[incore >> MINCORE_PSIND_SHIFT]); } else { /* * We do not test the found page on validity. Index: sys/kern/uipc_shm.c =================================================================== --- sys/kern/uipc_shm.c +++ sys/kern/uipc_shm.c @@ -1589,9 +1589,16 @@ if (align == 0) { align = pagesizes[shmfd->shm_lp_psind]; } else if (align == MAP_ALIGNED_SUPER) { - if (shmfd->shm_lp_psind != 1) + /* shmfd->shm_lp_psind < 1 is handled above. */ + if ( +#if VM_NRESERVLEVEL > 0 + shmfd->shm_lp_psind > VM_NRESERVLEVEL +#else + shmfd->shm_lp_psind > 1 +#endif + ) return (EINVAL); - align = pagesizes[1]; + align = pagesizes[shmfd->shm_lp_psind]; } else { align >>= MAP_ALIGNMENT_SHIFT; align = 1ULL << align; Index: sys/sys/mman.h =================================================================== --- sys/sys/mman.h +++ sys/sys/mman.h @@ -175,7 +175,9 @@ #define MINCORE_REFERENCED_OTHER 0x8 /* Page has been referenced */ #define MINCORE_MODIFIED_OTHER 0x10 /* Page has been modified */ #define MINCORE_SUPER 0x60 /* Page is a "super" page */ -#define MINCORE_PSIND(i) (((i) << 5) & MINCORE_SUPER) /* Page size */ +#define MINCORE_PSIND_SHIFT 5 +#define MINCORE_PSIND(i) (((i) << MINCORE_PSIND_SHIFT) & MINCORE_SUPER) + /* Page size */ /* * Anonymous object constant for shm_open(). Index: sys/vm/vm_domainset.c =================================================================== --- sys/vm/vm_domainset.c +++ sys/vm/vm_domainset.c @@ -77,6 +77,9 @@ * reservation boundary. */ pindex += obj->pg_color; +#if VM_NRESERVLEVEL > 1 + pindex >>= VM_LEVEL_1_ORDER; +#endif pindex >>= VM_LEVEL_0_ORDER; } else #endif Index: sys/vm/vm_fault.c =================================================================== --- sys/vm/vm_fault.c +++ sys/vm/vm_fault.c @@ -380,31 +380,50 @@ psind = 0; #if VM_NRESERVLEVEL > 0 if ((m->flags & PG_FICTITIOUS) == 0 && - (m_super = vm_reserv_to_superpage(m)) != NULL && - rounddown2(vaddr, pagesizes[m_super->psind]) >= fs->entry->start && - roundup2(vaddr + 1, pagesizes[m_super->psind]) <= fs->entry->end && - (vaddr & (pagesizes[m_super->psind] - 1)) == (VM_PAGE_TO_PHYS(m) & - (pagesizes[m_super->psind] - 1)) && - pmap_ps_enabled(fs->map->pmap)) { - flags = PS_ALL_VALID; - if ((fs->prot & VM_PROT_WRITE) != 0) { - /* - * Create a superpage mapping allowing write access - * only if none of the constituent pages are busy and - * all of them are already dirty (except possibly for - * the page that was faulted on). - */ - flags |= PS_NONE_BUSY; - if ((fs->first_object->flags & OBJ_UNMANAGED) == 0) - flags |= PS_ALL_DIRTY; - } - if (vm_page_ps_test(m_super, flags, m)) { - m_map = m_super; - psind = m_super->psind; - vaddr = rounddown2(vaddr, pagesizes[psind]); - /* Preset the modified bit for dirty superpages. */ - if ((flags & PS_ALL_DIRTY) != 0) - fs->fault_type |= VM_PROT_WRITE; + (m_super = vm_reserv_to_superpage(m)) != NULL) { + psind = m_super->psind; + KASSERT(psind > 0, + ("psind %d of m_super %p < 1", psind, m_super)); + for (;;) { + if (rounddown2(vaddr, pagesizes[psind]) >= + fs->entry->start && roundup2(vaddr + 1, + pagesizes[psind]) <= fs->entry->end && + (vaddr & (pagesizes[psind] - 1)) == + (VM_PAGE_TO_PHYS(m) & (pagesizes[psind] - 1)) && + pmap_ps_enabled(fs->map->pmap)) { + flags = PS_ALL_VALID; + if ((fs->prot & VM_PROT_WRITE) != 0) { + /* + * Create a superpage mapping allowing + * write access only if none of the + * constituent pages are busy and all + * of them are already dirty (except + * possibly for the page that was + * faulted on). + */ + flags |= PS_NONE_BUSY; + if ((fs->first_object->flags & + OBJ_UNMANAGED) == 0) + flags |= PS_ALL_DIRTY; + } + if (vm_page_ps_test(m_super, psind, flags, m)) { + m_map = m_super; + vaddr = rounddown2(vaddr, + pagesizes[psind]); + /* + * Preset the modified bit for dirty + * superpages. + */ + if ((flags & PS_ALL_DIRTY) != 0) + fs->fault_type |= VM_PROT_WRITE; + break; + } + } + psind--; + if (psind == 0) + break; + m_super += rounddown2(m - m_super, + atop(pagesizes[psind])); } } #endif @@ -615,10 +634,13 @@ vaddr = fs->entry->start + IDX_TO_OFF(pidx) - fs->entry->offset; psind = m->psind; - if (psind > 0 && ((vaddr & (pagesizes[psind] - 1)) != 0 || - pidx + OFF_TO_IDX(pagesizes[psind]) - 1 > pager_last || - !pmap_ps_enabled(fs->map->pmap))) - psind = 0; + while (psind > 0) { + if ((vaddr & (pagesizes[psind] - 1)) == 0 && pidx + + OFF_TO_IDX(pagesizes[psind]) - 1 <= pager_last && + pmap_ps_enabled(fs->map->pmap)) + break; + psind--; + } npages = atop(pagesizes[psind]); for (i = 0; i < npages; i++) { Index: sys/vm/vm_glue.c =================================================================== --- sys/vm/vm_glue.c +++ sys/vm/vm_glue.c @@ -106,7 +106,10 @@ #include -#if VM_NRESERVLEVEL > 0 +#if VM_NRESERVLEVEL > 1 +#define KVA_KSTACK_QUANTUM_SHIFT (VM_LEVEL_1_ORDER + VM_LEVEL_0_ORDER + \ + PAGE_SHIFT) +#elif VM_NRESERVLEVEL > 0 #define KVA_KSTACK_QUANTUM_SHIFT (VM_LEVEL_0_ORDER + PAGE_SHIFT) #else #define KVA_KSTACK_QUANTUM_SHIFT (8 + PAGE_SHIFT) Index: sys/vm/vm_kern.c =================================================================== --- sys/vm/vm_kern.c +++ sys/vm/vm_kern.c @@ -120,7 +120,10 @@ #endif "Max kernel address"); -#if VM_NRESERVLEVEL > 0 +#if VM_NRESERVLEVEL > 1 +#define KVA_QUANTUM_SHIFT (VM_LEVEL_1_ORDER + VM_LEVEL_0_ORDER + \ + PAGE_SHIFT) +#elif VM_NRESERVLEVEL > 0 #define KVA_QUANTUM_SHIFT (VM_LEVEL_0_ORDER + PAGE_SHIFT) #else /* On non-superpage architectures we want large import sizes. */ Index: sys/vm/vm_map.c =================================================================== --- sys/vm/vm_map.c +++ sys/vm/vm_map.c @@ -1993,8 +1993,8 @@ return (result); } -static const int aslr_pages_rnd_64[2] = {0x1000, 0x10}; -static const int aslr_pages_rnd_32[2] = {0x100, 0x4}; +static const int aslr_pages_rnd_64[3] = {0x1000, 0x1000, 0x10}; +static const int aslr_pages_rnd_32[3] = {0x100, 0x100, 0x4}; static int cluster_anon = 1; SYSCTL_INT(_vm, OID_AUTO, cluster_anon, CTLFLAG_RW, @@ -2190,9 +2190,14 @@ * Find space for allocation, including * gap needed for later randomization. */ - pidx = MAXPAGESIZES > 1 && pagesizes[1] != 0 && - (find_space == VMFS_SUPER_SPACE || find_space == - VMFS_OPTIMAL_SPACE) ? 1 : 0; + pidx = 0; +#if VM_NRESERVLEVEL > 0 + if ((find_space == VMFS_SUPER_SPACE || + find_space == VMFS_OPTIMAL_SPACE) && + MAXPAGESIZES > VM_NRESERVLEVEL && + pagesizes[VM_NRESERVLEVEL] != 0) + pidx = VM_NRESERVLEVEL; +#endif gap = vm_map_max(map) > MAP_32BIT_MAX_ADDR && (max_addr == 0 || max_addr > MAP_32BIT_MAX_ADDR) ? aslr_pages_rnd_64[pidx] : aslr_pages_rnd_32[pidx]; @@ -2656,6 +2661,7 @@ vm_offset_t start; vm_page_t p, p_start; vm_pindex_t mask, psize, threshold, tmpidx; + int psind; if ((prot & (VM_PROT_READ | VM_PROT_EXECUTE)) == 0 || object == NULL) return; @@ -2710,13 +2716,17 @@ p_start = p; } /* Jump ahead if a superpage mapping is possible. */ - if (p->psind > 0 && ((addr + ptoa(tmpidx)) & - (pagesizes[p->psind] - 1)) == 0) { - mask = atop(pagesizes[p->psind]) - 1; - if (tmpidx + mask < psize && - vm_page_ps_test(p, PS_ALL_VALID, NULL)) { - p += mask; - threshold += mask; + for (psind = p->psind; psind > 0; psind--) { + if (((addr + ptoa(tmpidx)) & + (pagesizes[psind] - 1)) == 0) { + mask = atop(pagesizes[psind]) - 1; + if (tmpidx + mask < psize && + vm_page_ps_test(p, psind, + PS_ALL_VALID, NULL)) { + p += mask; + threshold += mask; + break; + } } } } else if (p_start != NULL) { Index: sys/vm/vm_page.h =================================================================== --- sys/vm/vm_page.h +++ sys/vm/vm_page.h @@ -657,7 +657,7 @@ bool vm_page_pqstate_commit(vm_page_t m, vm_page_astate_t *old, vm_page_astate_t new); vm_page_t vm_page_prev(vm_page_t m); -bool vm_page_ps_test(vm_page_t m, int flags, vm_page_t skip_m); +bool vm_page_ps_test(vm_page_t m, int psind, int flags, vm_page_t skip_m); void vm_page_putfake(vm_page_t m); void vm_page_readahead_finish(vm_page_t m); int vm_page_reclaim_contig(int req, u_long npages, vm_paddr_t low, Index: sys/vm/vm_page.c =================================================================== --- sys/vm/vm_page.c +++ sys/vm/vm_page.c @@ -5507,7 +5507,7 @@ * (super)page and false otherwise. */ bool -vm_page_ps_test(vm_page_t m, int flags, vm_page_t skip_m) +vm_page_ps_test(vm_page_t m, int psind, int flags, vm_page_t skip_m) { vm_object_t object; int i, npages; @@ -5516,7 +5516,9 @@ if (skip_m != NULL && skip_m->object != object) return (false); VM_OBJECT_ASSERT_LOCKED(object); - npages = atop(pagesizes[m->psind]); + KASSERT(psind <= m->psind, + ("psind %d > psind %d of m %p", psind, m->psind, m)); + npages = atop(pagesizes[psind]); /* * The physically contiguous pages that make up a superpage, i.e., a Index: sys/vm/vm_reserv.c =================================================================== --- sys/vm/vm_reserv.c +++ sys/vm/vm_reserv.c @@ -77,6 +77,29 @@ #if VM_NRESERVLEVEL > 0 +/* + * Temporarily simulate two-level reservations. Effectively, VM_LEVEL_0_* is + * level 1, and VM_SUBLEVEL_0_* is level 0. + */ +#if VM_NRESERVLEVEL == 2 +#undef VM_NRESERVLEVEL +#define VM_NRESERVLEVEL 1 +#if VM_LEVEL_0_ORDER == 4 +#undef VM_LEVEL_0_ORDER +#define VM_LEVEL_0_ORDER (4 + VM_LEVEL_1_ORDER) +#define VM_SUBLEVEL_0_NPAGES (1 << 4) +#elif VM_LEVEL_0_ORDER == 7 +#undef VM_LEVEL_0_ORDER +#define VM_LEVEL_0_ORDER (7 + VM_LEVEL_1_ORDER) +#define VM_SUBLEVEL_0_NPAGES (1 << 7) +#else +#error "Unsupported level 0 reservation size" +#endif +#define VM_LEVEL_0_PSIND 2 +#else +#define VM_LEVEL_0_PSIND 1 +#endif + #ifndef VM_LEVEL_0_ORDER_MAX #define VM_LEVEL_0_ORDER_MAX VM_LEVEL_0_ORDER #endif @@ -381,6 +404,27 @@ vm_reserv_object_unlock(object); } +#ifdef VM_SUBLEVEL_0_NPAGES +static inline bool +vm_reserv_is_sublevel_full(vm_reserv_t rv, int index) +{ + _Static_assert(VM_SUBLEVEL_0_NPAGES == 16 || + VM_SUBLEVEL_0_NPAGES == 128, + "vm_reserv_is_sublevel_full: unsupported VM_SUBLEVEL_0_NPAGES"); + /* An equivalent bit_ntest() compiles to more instructions. */ + switch (VM_SUBLEVEL_0_NPAGES) { + case 16: + return (((uint16_t *)rv->popmap)[index / 16] == UINT16_MAX); + case 128: + index = rounddown2(index, 128) / 64; + return (((uint64_t *)rv->popmap)[index] == UINT64_MAX && + ((uint64_t *)rv->popmap)[index + 1] == UINT64_MAX); + default: + __unreachable(); + } +} +#endif + /* * Reduces the given reservation's population count. If the population count * becomes zero, the reservation is destroyed. Additionally, moves the @@ -406,11 +450,15 @@ ("vm_reserv_depopulate: reserv %p's domain is corrupted %d", rv, rv->domain)); if (rv->popcnt == VM_LEVEL_0_NPAGES) { - KASSERT(rv->pages->psind == 1, + KASSERT(rv->pages->psind == VM_LEVEL_0_PSIND, ("vm_reserv_depopulate: reserv %p is already demoted", rv)); - rv->pages->psind = 0; + rv->pages->psind = VM_LEVEL_0_PSIND - 1; } +#ifdef VM_SUBLEVEL_0_NPAGES + if (vm_reserv_is_sublevel_full(rv, index)) + rv->pages[rounddown2(index, VM_SUBLEVEL_0_NPAGES)].psind = 0; +#endif bit_clear(rv->popmap, index); rv->popcnt--; if ((unsigned)(ticks - rv->lasttick) >= PARTPOPSLOP || @@ -522,12 +570,17 @@ index)); KASSERT(rv->popcnt < VM_LEVEL_0_NPAGES, ("vm_reserv_populate: reserv %p is already full", rv)); - KASSERT(rv->pages->psind == 0, + KASSERT(rv->pages->psind >= 0 && + rv->pages->psind < VM_LEVEL_0_PSIND, ("vm_reserv_populate: reserv %p is already promoted", rv)); KASSERT(rv->domain < vm_ndomains, ("vm_reserv_populate: reserv %p's domain is corrupted %d", rv, rv->domain)); bit_set(rv->popmap, index); +#ifdef VM_SUBLEVEL_0_NPAGES + if (vm_reserv_is_sublevel_full(rv, index)) + rv->pages[rounddown2(index, VM_SUBLEVEL_0_NPAGES)].psind = 1; +#endif rv->popcnt++; if ((unsigned)(ticks - rv->lasttick) < PARTPOPSLOP && rv->inpartpopq && rv->popcnt != VM_LEVEL_0_NPAGES) @@ -542,10 +595,10 @@ rv->inpartpopq = TRUE; TAILQ_INSERT_TAIL(&vm_rvd[rv->domain].partpop, rv, partpopq); } else { - KASSERT(rv->pages->psind == 0, + KASSERT(rv->pages->psind == VM_LEVEL_0_PSIND - 1, ("vm_reserv_populate: reserv %p is already promoted", rv)); - rv->pages->psind = 1; + rv->pages->psind = VM_LEVEL_0_PSIND; } vm_reserv_domain_unlock(rv->domain); } @@ -889,13 +942,18 @@ static void vm_reserv_break(vm_reserv_t rv) { + vm_page_t m; int hi, lo, pos; vm_reserv_assert_locked(rv); CTR5(KTR_VM, "%s: rv %p object %p popcnt %d inpartpop %d", __FUNCTION__, rv, rv->object, rv->popcnt, rv->inpartpopq); vm_reserv_remove(rv); - rv->pages->psind = 0; + m = rv->pages; +#ifdef VM_SUBLEVEL_0_NPAGES + for (; m < rv->pages + VM_LEVEL_0_NPAGES; m += VM_SUBLEVEL_0_NPAGES) +#endif + m->psind = 0; hi = lo = -1; pos = 0; for (;;) { @@ -1089,7 +1147,11 @@ vm_reserv_t rv; rv = vm_reserv_from_page(m); +#ifdef VM_SUBLEVEL_0_NPAGES + return (rv->object != NULL ? 1 : -1); +#else return (rv->object != NULL ? 0 : -1); +#endif } /* @@ -1102,7 +1164,15 @@ vm_reserv_t rv; rv = vm_reserv_from_page(m); - return (rv->popcnt == VM_LEVEL_0_NPAGES ? 0 : -1); + if (rv->popcnt == VM_LEVEL_0_NPAGES) { +#ifdef VM_SUBLEVEL_0_NPAGES + return (1); + } else if (rv->pages != NULL && + vm_reserv_is_sublevel_full(rv, m - rv->pages)) { +#endif + return (0); + } + return (-1); } /* @@ -1357,6 +1427,10 @@ switch (level) { case 0: +#ifdef VM_SUBLEVEL_0_NPAGES + return (VM_SUBLEVEL_0_NPAGES * PAGE_SIZE); + case 1: +#endif return (VM_LEVEL_0_SIZE); case -1: return (PAGE_SIZE); @@ -1432,12 +1506,16 @@ VM_OBJECT_ASSERT_LOCKED(m->object); rv = vm_reserv_from_page(m); - if (rv->object == m->object && rv->popcnt == VM_LEVEL_0_NPAGES) - m = rv->pages; - else - m = NULL; - - return (m); + if (rv->object == m->object) { + if (rv->popcnt == VM_LEVEL_0_NPAGES) + return (rv->pages); +#ifdef VM_SUBLEVEL_0_NPAGES + if (vm_reserv_is_sublevel_full(rv, m - rv->pages)) + return (rv->pages + rounddown2(m - rv->pages, + VM_SUBLEVEL_0_NPAGES)); +#endif + } + return (NULL); } #endif /* VM_NRESERVLEVEL > 0 */