Index: sys/powerpc/aim/mmu_oea64.h =================================================================== --- sys/powerpc/aim/mmu_oea64.h +++ sys/powerpc/aim/mmu_oea64.h @@ -82,6 +82,9 @@ int64_t moea64_pte_unset(struct pvo_entry *); int64_t moea64_pte_clear(struct pvo_entry *, uint64_t); int64_t moea64_pte_synch(struct pvo_entry *); +int64_t moea64_pte_insert_sp(struct pvo_entry *); +int64_t moea64_pte_unset_sp(struct pvo_entry *); +int64_t moea64_pte_replace_sp(struct pvo_entry *); typedef int64_t (*moea64_pte_replace_t)(struct pvo_entry *, int); @@ -89,6 +92,9 @@ typedef int64_t (*moea64_pte_unset_t)(struct pvo_entry *); typedef int64_t (*moea64_pte_clear_t)(struct pvo_entry *, uint64_t); typedef int64_t (*moea64_pte_synch_t)(struct pvo_entry *); +typedef int64_t (*moea64_pte_insert_sp_t)(struct pvo_entry *); +typedef int64_t (*moea64_pte_unset_sp_t)(struct pvo_entry *); +typedef int64_t (*moea64_pte_replace_sp_t)(struct pvo_entry *); struct moea64_funcs { moea64_pte_replace_t pte_replace; @@ -96,6 +102,9 @@ moea64_pte_unset_t pte_unset; moea64_pte_clear_t pte_clear; moea64_pte_synch_t pte_synch; + moea64_pte_insert_sp_t pte_insert_sp; + moea64_pte_unset_sp_t pte_unset_sp; + moea64_pte_replace_sp_t pte_replace_sp; }; extern struct moea64_funcs *moea64_ops; @@ -129,6 +138,7 @@ extern u_long moea64_pteg_count; extern u_long moea64_pteg_mask; extern int n_slbs; +extern bool moea64_has_lp_4k_16m; #endif /* _POWERPC_AIM_MMU_OEA64_H */ Index: sys/powerpc/aim/mmu_oea64.c =================================================================== --- sys/powerpc/aim/mmu_oea64.c +++ sys/powerpc/aim/mmu_oea64.c @@ -80,6 +80,7 @@ #include #include #include +#include #include #include @@ -108,9 +109,6 @@ #define VSID_TO_HASH(vsid) (((vsid) >> 4) & 0xfffff) #define VSID_HASH_MASK 0x0000007fffffffffULL -/* Get physical address from PVO. 
*/ -#define PVO_PADDR(pvo) ((pvo)->pvo_pte.pa & LPTE_RPGN) - /* * Locking semantics: * @@ -231,6 +229,7 @@ uint64_t moea64_large_page_mask = 0; uint64_t moea64_large_page_size = 0; int moea64_large_page_shift = 0; +bool moea64_has_lp_4k_16m = false; /* * PVO calls. @@ -252,6 +251,95 @@ static void moea64_syncicache(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, vm_size_t sz); static void moea64_pmap_init_qpages(void); +static void moea64_remove_locked(pmap_t, vm_offset_t, + vm_offset_t, struct pvo_dlist *); + +/* + * Superpages data and routines. + */ + +/* + * PVO flags (in vaddr) that must match for promotion to succeed. + * Note that protection bits are checked separately, as they reside in + * another field. + */ +#define PVO_FLAGS_PROMOTE (PVO_WIRED | PVO_MANAGED | PVO_PTEGIDX_VALID) + +#define PVO_IS_SP(pvo) (((pvo)->pvo_vaddr & PVO_LARGE) && \ + (pvo)->pvo_pmap != kernel_pmap) + +/* Get physical address from PVO. */ +#define PVO_PADDR(pvo) moea64_pvo_paddr(pvo) + +/* MD page flag indicating that the page is a superpage. 
*/ +#define MDPG_ATTR_SP 0x40000000 + +static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, + "VM/pmap parameters"); + +static int superpages_enabled = 0; +SYSCTL_INT(_vm_pmap, OID_AUTO, superpages_enabled, CTLFLAG_RDTUN, + &superpages_enabled, 0, "Enable support for transparent superpages"); + +static SYSCTL_NODE(_vm_pmap, OID_AUTO, sp, CTLFLAG_RD, 0, + "SP page mapping counters"); + +static u_long sp_demotions; +SYSCTL_ULONG(_vm_pmap_sp, OID_AUTO, demotions, CTLFLAG_RD, + &sp_demotions, 0, "SP page demotions"); + +static u_long sp_mappings; +SYSCTL_ULONG(_vm_pmap_sp, OID_AUTO, mappings, CTLFLAG_RD, + &sp_mappings, 0, "SP page mappings"); + +static u_long sp_p_failures; +SYSCTL_ULONG(_vm_pmap_sp, OID_AUTO, p_failures, CTLFLAG_RD, + &sp_p_failures, 0, "SP page promotion failures"); + +static u_long sp_p_fail_pa; +SYSCTL_ULONG(_vm_pmap_sp, OID_AUTO, p_fail_pa, CTLFLAG_RD, + &sp_p_fail_pa, 0, "SP page promotion failure: PAs don't match"); + +static u_long sp_p_fail_flags; +SYSCTL_ULONG(_vm_pmap_sp, OID_AUTO, p_fail_flags, CTLFLAG_RD, + &sp_p_fail_flags, 0, "SP page promotion failure: page flags don't match"); + +static u_long sp_p_fail_prot; +SYSCTL_ULONG(_vm_pmap_sp, OID_AUTO, p_fail_prot, CTLFLAG_RD, + &sp_p_fail_prot, 0, + "SP page promotion failure: page protections don't match"); + +static u_long sp_p_fail_wimg; +SYSCTL_ULONG(_vm_pmap_sp, OID_AUTO, p_fail_wimg, CTLFLAG_RD, + &sp_p_fail_wimg, 0, "SP page promotion failure: WIMG bits don't match"); + +static u_long sp_promotions; +SYSCTL_ULONG(_vm_pmap_sp, OID_AUTO, promotions, CTLFLAG_RD, + &sp_promotions, 0, "SP page promotions"); + +static bool moea64_ps_enabled(pmap_t); +static void moea64_align_superpage(vm_object_t, vm_ooffset_t, + vm_offset_t *, vm_size_t); + +static int moea64_sp_enter(pmap_t pmap, vm_offset_t va, + vm_page_t m, vm_prot_t prot, u_int flags, int8_t psind); +static struct pvo_entry *moea64_sp_remove(struct pvo_entry *sp, + struct pvo_dlist *tofree); + +static void moea64_sp_promote(pmap_t 
pmap, vm_offset_t va, vm_page_t m); +static void moea64_sp_demote_aligned(struct pvo_entry *sp); +static void moea64_sp_demote(struct pvo_entry *pvo); + +static struct pvo_entry *moea64_sp_unwire(struct pvo_entry *sp); +static struct pvo_entry *moea64_sp_protect(struct pvo_entry *sp, + vm_prot_t prot); + +static int64_t moea64_sp_query(struct pvo_entry *pvo, uint64_t ptebit); +static int64_t moea64_sp_clear(struct pvo_entry *pvo, vm_page_t m, + uint64_t ptebit); + +static __inline bool moea64_sp_pvo_in_range(struct pvo_entry *pvo, + vm_offset_t sva, vm_offset_t eva); /* * Kernel MMU interface @@ -358,6 +446,8 @@ #ifdef __powerpc64__ .page_array_startup = moea64_page_array_startup, #endif + .ps_enabled = moea64_ps_enabled, + .align_superpage = moea64_align_superpage, /* Internal interfaces */ .mapdev = moea64_mapdev, @@ -377,6 +467,26 @@ MMU_DEF(oea64_mmu, "mmu_oea64_base", moea64_methods); +/* + * Get physical address from PVO. + * + * For superpages, the lower bits are not stored on pvo_pte.pa and must be + * obtained from VA. + */ +static __inline vm_paddr_t +moea64_pvo_paddr(struct pvo_entry *pvo) +{ + vm_paddr_t pa; + + pa = (pvo)->pvo_pte.pa & LPTE_RPGN; + + if (PVO_IS_SP(pvo)) { + pa &= ~SP_MASK; /* This is needed to clear LPTE_LP bits. */ + pa |= PVO_VADDR(pvo) & SP_MASK; + } + return (pa); +} + static struct pvo_head * vm_page_to_pvoh(vm_page_t m) { @@ -425,8 +535,10 @@ pvo->pvo_vpn = (uint64_t)((va & ADDR_PIDX) >> ADDR_PIDX_SHFT) | (vsid << 16); - shift = (pvo->pvo_vaddr & PVO_LARGE) ? moea64_large_page_shift : - ADDR_PIDX_SHFT; + if (pmap == kernel_pmap && (pvo->pvo_vaddr & PVO_LARGE) != 0) + shift = moea64_large_page_shift; + else + shift = ADDR_PIDX_SHFT; hash = (vsid & VSID_HASH_MASK) ^ (((uint64_t)va & ADDR_PIDX) >> shift); pvo->pvo_pte.slot = (hash & moea64_pteg_mask) << 3; } @@ -770,6 +882,9 @@ vm_paddr_t kernelphysstart, kernelphysend; int rm_pavail; + /* Level 0 reservations consist of 4096 pages (16MB superpage). 
*/ + vm_level_0_order = 12; + #ifndef __powerpc64__ /* We don't have a direct map since there is no BAT */ hw_direct_map = 0; @@ -1201,6 +1316,17 @@ for (pvo = RB_NFIND(pvo_tree, &pm->pmap_pvo, &key); pvo != NULL && PVO_VADDR(pvo) < eva; pvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo)) { + if (PVO_IS_SP(pvo)) { + if (moea64_sp_pvo_in_range(pvo, sva, eva)) { + pvo = moea64_sp_unwire(pvo); + continue; + } else { + CTR1(KTR_PMAP, "%s: demote before unwire", + __func__); + moea64_sp_demote(pvo); + } + } + if ((pvo->pvo_vaddr & PVO_WIRED) == 0) panic("moea64_unwire: pvo %p is missing PVO_WIRED", pvo); @@ -1441,7 +1567,7 @@ moea64_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, u_int flags, int8_t psind) { - struct pvo_entry *pvo, *oldpvo; + struct pvo_entry *pvo, *oldpvo, *tpvo; struct pvo_head *pvo_head; uint64_t pte_lo; int error; @@ -1453,6 +1579,9 @@ VM_OBJECT_ASSERT_LOCKED(m->object); } + if (psind > 0) + return (moea64_sp_enter(pmap, va, m, prot, flags, psind)); + pvo = alloc_pvo_entry(0); if (pvo == NULL) return (KERN_RESOURCE_SHORTAGE); @@ -1476,6 +1605,15 @@ PMAP_LOCK(pmap); if (pvo->pvo_pmap == NULL) init_pvo_entry(pvo, pmap, va); + + tpvo = moea64_pvo_find_va(pmap, va & ~SP_MASK); + if (tpvo && PVO_IS_SP(tpvo)) { + /* Demote SP before entering a regular page */ + CTR2(KTR_PMAP, "%s: demote before enter: va=%#jx", + __func__, (uintmax_t)va); + moea64_sp_demote_aligned(tpvo); + } + if (prot & VM_PROT_WRITE) if (pmap_bootstrapped && (m->oflags & VPO_UNMANAGED) == 0) @@ -1499,6 +1637,7 @@ PV_PAGE_UNLOCK(m); PMAP_UNLOCK(pmap); free_pvo_entry(pvo); + pvo = NULL; goto out; } else { /* Otherwise, need to kill it first */ @@ -1527,6 +1666,21 @@ vm_page_aflag_set(m, PGA_EXECUTABLE); moea64_syncicache(pmap, va, VM_PAGE_TO_PHYS(m), PAGE_SIZE); } + + /* + * Try to promote pages. + * + * If the VA of the entered page is not aligned with its PA, + * don't try page promotion as it is not possible. + * This reduces the number of promotion failures dramatically. 
+ */ + if (moea64_ps_enabled(pmap) && pmap != kernel_pmap && pvo != NULL && + (pvo->pvo_vaddr & PVO_MANAGED) != 0 && + (va & SP_MASK) == (VM_PAGE_TO_PHYS(m) & SP_MASK) && + (m->flags & PG_FICTITIOUS) == 0 && + vm_reserv_level_iffullpop(m) == 0) + moea64_sp_promote(pmap, va, m); + return (KERN_SUCCESS); } @@ -1585,15 +1739,25 @@ { vm_page_t m; vm_pindex_t diff, psize; + vm_offset_t va; + int8_t psind; VM_OBJECT_ASSERT_LOCKED(m_start->object); psize = atop(end - start); m = m_start; while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { - moea64_enter(pm, start + ptoa(diff), m, prot & - (VM_PROT_READ | VM_PROT_EXECUTE), PMAP_ENTER_NOSLEEP | - PMAP_ENTER_QUICK_LOCKED, 0); + va = start + ptoa(diff); + if ((va & SP_MASK) == 0 && va + SP_SIZE <= end && + m->psind == 1 && moea64_ps_enabled(pm)) + psind = 1; + else + psind = 0; + moea64_enter(pm, va, m, prot & + (VM_PROT_READ | VM_PROT_EXECUTE), + PMAP_ENTER_NOSLEEP | PMAP_ENTER_QUICK_LOCKED, psind); + if (psind == 1) + m = &m[SP_SIZE / PAGE_SIZE - 1]; m = TAILQ_NEXT(m, listq); } } @@ -1707,6 +1871,27 @@ NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM | UMA_ZONE_NOFREE); + /* + * Are large page mappings enabled? + */ + TUNABLE_INT_FETCH("vm.pmap.superpages_enabled", &superpages_enabled); + if (superpages_enabled) { + KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0, + ("moea64_init: can't assign to pagesizes[1]")); + + if (moea64_large_page_size == 0) { + printf("mmu_oea64: HW does not support large pages. " + "Disabling superpages...\n"); + superpages_enabled = 0; + } else if (!moea64_has_lp_4k_16m) { + printf("mmu_oea64: " + "HW does not support mixed 4KB/16MB page sizes. 
" + "Disabling superpages...\n"); + superpages_enabled = 0; + } else + pagesizes[1] = SP_SIZE; + } + if (!hw_direct_map) { uma_zone_set_allocf(moea64_pvo_zone, moea64_uma_page_alloc); } @@ -1786,7 +1971,7 @@ vm_page_assert_busied(m); if (!pmap_page_is_write_mapped(m)) - return + return; powerpc_sync(); PV_PAGE_LOCK(m); @@ -1796,6 +1981,11 @@ PMAP_LOCK(pmap); if (!(pvo->pvo_vaddr & PVO_DEAD) && (pvo->pvo_pte.prot & VM_PROT_WRITE)) { + if (PVO_IS_SP(pvo)) { + CTR1(KTR_PMAP, "%s: demote before remwr", + __func__); + moea64_sp_demote(pvo); + } pvo->pvo_pte.prot &= ~VM_PROT_WRITE; ret = moea64_pte_replace(pvo, MOEA64_PTE_PROT_UPDATE); if (ret < 0) @@ -1844,6 +2034,9 @@ pmap_t pmap; uint64_t lo; + CTR3(KTR_PMAP, "%s: pa=%#jx, ma=%#x", + __func__, (uintmax_t)VM_PAGE_TO_PHYS(m), ma); + if ((m->oflags & VPO_UNMANAGED) != 0) { m->md.mdpg_cache_attrs = ma; return; @@ -1856,6 +2049,11 @@ pmap = pvo->pvo_pmap; PMAP_LOCK(pmap); if (!(pvo->pvo_vaddr & PVO_DEAD)) { + if (PVO_IS_SP(pvo)) { + CTR1(KTR_PMAP, + "%s: demote before set_memattr", __func__); + moea64_sp_demote(pvo); + } pvo->pvo_pte.pa &= ~LPTE_WIMG; pvo->pvo_pte.pa |= lo; refchg = moea64_pte_replace(pvo, MOEA64_PTE_INVALIDATE); @@ -2308,7 +2506,7 @@ moea64_protect(pmap_t pm, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) { - struct pvo_entry *pvo, *tpvo, key; + struct pvo_entry *pvo, key; CTR4(KTR_PMAP, "moea64_protect: pm=%p sva=%#x eva=%#x prot=%#x", pm, sva, eva, prot); @@ -2324,8 +2522,18 @@ PMAP_LOCK(pm); key.pvo_vaddr = sva; for (pvo = RB_NFIND(pvo_tree, &pm->pmap_pvo, &key); - pvo != NULL && PVO_VADDR(pvo) < eva; pvo = tpvo) { - tpvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo); + pvo != NULL && PVO_VADDR(pvo) < eva; + pvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo)) { + if (PVO_IS_SP(pvo)) { + if (moea64_sp_pvo_in_range(pvo, sva, eva)) { + pvo = moea64_sp_protect(pvo, prot); + continue; + } else { + CTR1(KTR_PMAP, "%s: demote before protect", + __func__); + moea64_sp_demote(pvo); + } + } moea64_pvo_protect(pm, pvo, 
prot); } PMAP_UNLOCK(pm); @@ -2426,13 +2634,46 @@ } } +static void +moea64_remove_locked(pmap_t pm, vm_offset_t sva, vm_offset_t eva, + struct pvo_dlist *tofree) +{ + struct pvo_entry *pvo, *tpvo, key; + + PMAP_LOCK_ASSERT(pm, MA_OWNED); + + key.pvo_vaddr = sva; + for (pvo = RB_NFIND(pvo_tree, &pm->pmap_pvo, &key); + pvo != NULL && PVO_VADDR(pvo) < eva; pvo = tpvo) { + if (PVO_IS_SP(pvo)) { + if (moea64_sp_pvo_in_range(pvo, sva, eva)) { + tpvo = moea64_sp_remove(pvo, tofree); + continue; + } else { + CTR1(KTR_PMAP, "%s: demote before remove", + __func__); + moea64_sp_demote(pvo); + } + } + tpvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo); + + /* + * For locking reasons, remove this from the page table and + * pmap, but save delinking from the vm_page for a second + * pass + */ + moea64_pvo_remove_from_pmap(pvo); + SLIST_INSERT_HEAD(tofree, pvo, pvo_dlink); + } +} + /* * Remove the given range of addresses from the specified map. */ void moea64_remove(pmap_t pm, vm_offset_t sva, vm_offset_t eva) { - struct pvo_entry *pvo, *tpvo, key; + struct pvo_entry *pvo; struct pvo_dlist tofree; /* @@ -2441,23 +2682,9 @@ if (pm->pm_stats.resident_count == 0) return; - key.pvo_vaddr = sva; - SLIST_INIT(&tofree); - PMAP_LOCK(pm); - for (pvo = RB_NFIND(pvo_tree, &pm->pmap_pvo, &key); - pvo != NULL && PVO_VADDR(pvo) < eva; pvo = tpvo) { - tpvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo); - - /* - * For locking reasons, remove this from the page table and - * pmap, but save delinking from the vm_page for a second - * pass - */ - moea64_pvo_remove_from_pmap(pvo); - SLIST_INSERT_HEAD(&tofree, pvo, pvo_dlink); - } + moea64_remove_locked(pm, sva, eva, &tofree); PMAP_UNLOCK(pm); while (!SLIST_EMPTY(&tofree)) { @@ -2487,8 +2714,14 @@ pmap = pvo->pvo_pmap; PMAP_LOCK(pmap); wasdead = (pvo->pvo_vaddr & PVO_DEAD); - if (!wasdead) + if (!wasdead) { + if (PVO_IS_SP(pvo)) { + CTR1(KTR_PMAP, "%s: demote before remove_all", + __func__); + moea64_sp_demote(pvo); + } moea64_pvo_remove_from_pmap(pvo); + } 
moea64_pvo_remove_from_page_locked(pvo, m); if (!wasdead) LIST_INSERT_HEAD(&freequeue, pvo, pvo_vlink); @@ -2721,11 +2954,18 @@ struct pvo_entry *pvo; int64_t ret; boolean_t rv; + vm_page_t sp; /* * See if this bit is stored in the page already. + * + * For superpages, the bit is stored in the first vm page. */ - if (m->md.mdpg_attrs & ptebit) + if ((m->md.mdpg_attrs & ptebit) != 0 || + ((sp = PHYS_TO_VM_PAGE(VM_PAGE_TO_PHYS(m) & ~SP_MASK)) != NULL && + (sp->md.mdpg_attrs & (ptebit | MDPG_ATTR_SP)) == + (ptebit | MDPG_ATTR_SP)) + ) return (TRUE); /* @@ -2736,6 +2976,21 @@ powerpc_sync(); PV_PAGE_LOCK(m); LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) { + if (PVO_IS_SP(pvo)) { + ret = moea64_sp_query(pvo, ptebit); + /* + * If SP was not demoted, check its REF/CHG bits here. + */ + if (ret != -1) { + if ((ret & ptebit) != 0) { + rv = TRUE; + break; + } + continue; + } + /* else, fallthrough */ + } + ret = 0; /* @@ -2781,6 +3036,12 @@ count = 0; PV_PAGE_LOCK(m); LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) { + if (PVO_IS_SP(pvo)) { + if ((ret = moea64_sp_clear(pvo, m, ptebit)) != -1) { + count += ret; + continue; + } + } ret = 0; PMAP_LOCK(pvo->pvo_pmap); @@ -3184,3 +3445,758 @@ DEFINE_OEA64_IFUNC(int64_t, pte_clear, (struct pvo_entry *, uint64_t), moea64_null_method) DEFINE_OEA64_IFUNC(int64_t, pte_synch, (struct pvo_entry *), moea64_null_method) +DEFINE_OEA64_IFUNC(int64_t, pte_insert_sp, (struct pvo_entry *), moea64_null_method) +DEFINE_OEA64_IFUNC(int64_t, pte_unset_sp, (struct pvo_entry *), moea64_null_method) +DEFINE_OEA64_IFUNC(int64_t, pte_replace_sp, (struct pvo_entry *), moea64_null_method) + +/* Superpage functions */ + +/* MMU interface */ + +static bool +moea64_ps_enabled(pmap_t pmap) +{ + return (superpages_enabled); +} + +static void +moea64_align_superpage(vm_object_t object, vm_ooffset_t offset, + vm_offset_t *addr, vm_size_t size) +{ + vm_offset_t sp_offset; + + if (size < SP_SIZE) + return; + + CTR4(KTR_PMAP, "%s: offs=%#jx, addr=%p, 
size=%#jx", + __func__, (uintmax_t)offset, addr, (uintmax_t)size); + + if (object != NULL && (object->flags & OBJ_COLORED) != 0) + offset += ptoa(object->pg_color); + sp_offset = offset & SP_MASK; + if (size - ((SP_SIZE - sp_offset) & SP_MASK) < SP_SIZE || + (*addr & SP_MASK) == sp_offset) + return; + if ((*addr & SP_MASK) < sp_offset) + *addr = (*addr & ~SP_MASK) + sp_offset; + else + *addr = ((*addr + SP_MASK) & ~SP_MASK) + sp_offset; +} + +/* Helpers */ + +static __inline void +moea64_pvo_cleanup(struct pvo_dlist *tofree) +{ + struct pvo_entry *pvo; + + /* clean up */ + while (!SLIST_EMPTY(tofree)) { + pvo = SLIST_FIRST(tofree); + SLIST_REMOVE_HEAD(tofree, pvo_dlink); + if (pvo->pvo_vaddr & PVO_DEAD) + moea64_pvo_remove_from_page(pvo); + free_pvo_entry(pvo); + } +} + +static __inline uint16_t +pvo_to_vmpage_flags(struct pvo_entry *pvo) +{ + uint16_t flags; + + flags = 0; + if ((pvo->pvo_pte.prot & VM_PROT_WRITE) != 0) + flags |= PGA_WRITEABLE; + if ((pvo->pvo_pte.prot & VM_PROT_EXECUTE) != 0) + flags |= PGA_EXECUTABLE; + + return (flags); +} + +/* + * Check if the given pvo and its superpage are in sva-eva range. + */ +static __inline bool +moea64_sp_pvo_in_range(struct pvo_entry *pvo, vm_offset_t sva, vm_offset_t eva) +{ + vm_offset_t spva; + + spva = PVO_VADDR(pvo) & ~SP_MASK; + if (spva >= sva && spva + SP_SIZE <= eva) { + /* + * Because this function is intended to be called from loops + * that iterate over ordered pvo entries, if the condition + * above is true then the pvo must be the first of its + * superpage. + */ + KASSERT(PVO_VADDR(pvo) == spva, + ("%s: unexpected unaligned superpage pvo", __func__)); + return (true); + } + return (false); +} + +/* + * Update vm about the REF/CHG bits if the superpage is managed and + * has (or had) write access. 
+ */ +static void +moea64_sp_refchg_process(struct pvo_entry *sp, vm_page_t m, + int64_t sp_refchg, vm_prot_t prot) +{ + vm_page_t m_end; + int64_t refchg; + + if ((sp->pvo_vaddr & PVO_MANAGED) != 0 && (prot & VM_PROT_WRITE) != 0) { + for (m_end = &m[SP_PAGES]; m < m_end; m++) { + refchg = sp_refchg | + atomic_readandclear_32(&m->md.mdpg_attrs); + if (refchg & LPTE_CHG) + vm_page_dirty(m); + if (refchg & LPTE_REF) + vm_page_aflag_set(m, PGA_REFERENCED); + } + } +} + +/* Superpage ops */ + +static int +moea64_sp_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, + vm_prot_t prot, u_int flags, int8_t psind) +{ + struct pvo_entry *pvo, **pvos; + struct pvo_head *pvo_head; + vm_offset_t sva; + vm_page_t sm; + vm_paddr_t pa; + bool sync; + struct pvo_dlist tofree; + int error, i; + uint16_t aflags; + + KASSERT((va & SP_MASK) == 0, ("%s: va %#jx unaligned", + __func__, (uintmax_t)va)); + KASSERT(psind == 1, ("%s: invalid psind: %d", __func__, psind)); + KASSERT(m->psind == 1, ("%s: invalid m->psind: %d", + __func__, m->psind)); + KASSERT(pmap != kernel_pmap, + ("%s: function called with kernel pmap", __func__)); + + CTR5(KTR_PMAP, "%s: va=%#jx, pa=%#jx, prot=%#x, flags=%#x, psind=1", + __func__, (uintmax_t)va, (uintmax_t)VM_PAGE_TO_PHYS(m), + prot, flags); + + SLIST_INIT(&tofree); + + sva = va; + sm = m; + pa = VM_PAGE_TO_PHYS(sm); + + /* Try to allocate all PVOs first, to make failure handling easier. */ + pvos = malloc(SP_PAGES * sizeof(struct pvo_entry *), M_TEMP, M_NOWAIT); + if (pvos == NULL) { + CTR1(KTR_PMAP, "%s: failed to alloc pvo array", __func__); + return (KERN_RESOURCE_SHORTAGE); + } + + for (i = 0; i < SP_PAGES; i++) { + pvos[i] = alloc_pvo_entry(0); + if (pvos[i] == NULL) { + CTR1(KTR_PMAP, "%s: failed to alloc pvo", __func__); + for (i = i - 1; i >= 0; i--) + free_pvo_entry(pvos[i]); + free(pvos, M_TEMP); + return (KERN_RESOURCE_SHORTAGE); + } + } + + PV_PAGE_LOCK(sm); + PMAP_LOCK(pmap); + + /* Note: moea64_remove_locked() also clears cached REF/CHG bits. 
*/ + moea64_remove_locked(pmap, va, va + SP_SIZE, &tofree); + + /* Enter pages */ + for (i = 0; i < SP_PAGES; + i++, va += PAGE_SIZE, pa += PAGE_SIZE, m++) { + pvo = pvos[i]; + + pvo->pvo_pte.prot = prot; + pvo->pvo_pte.pa = (pa & ~LPTE_LP_MASK) | LPTE_LP_4K_16M | + moea64_calc_wimg(pa, pmap_page_get_memattr(m)); + + if ((flags & PMAP_ENTER_WIRED) != 0) + pvo->pvo_vaddr |= PVO_WIRED; + pvo->pvo_vaddr |= PVO_LARGE; + + if ((m->oflags & VPO_UNMANAGED) != 0) + pvo_head = NULL; + else { + pvo_head = &m->md.mdpg_pvoh; + pvo->pvo_vaddr |= PVO_MANAGED; + } + + init_pvo_entry(pvo, pmap, va); + + error = moea64_pvo_enter(pvo, pvo_head, NULL); + /* + * All superpage PVOs were previously removed, so no errors + * should occur while inserting the new ones. + */ + KASSERT(error == 0, ("%s: unexpected error " + "when inserting superpage PVO: %d", + __func__, error)); + } + + PMAP_UNLOCK(pmap); + PV_PAGE_UNLOCK(sm); + + sync = (sm->a.flags & PGA_EXECUTABLE) == 0; + /* Note: moea64_pvo_cleanup() also clears page prot. flags. */ + moea64_pvo_cleanup(&tofree); + pvo = pvos[0]; + + /* Set vm page flags */ + aflags = pvo_to_vmpage_flags(pvo); + if (aflags != 0) + for (m = sm; m < &sm[SP_PAGES]; m++) + vm_page_aflag_set(m, aflags); + + /* + * Flush the page from the instruction cache if this page is + * mapped executable and cacheable. + */ + if (sync && (pvo->pvo_pte.pa & (LPTE_I | LPTE_G | LPTE_NOEXEC)) == 0) + moea64_syncicache(pmap, sva, VM_PAGE_TO_PHYS(sm), SP_SIZE); + + atomic_add_long(&sp_mappings, 1); + CTR3(KTR_PMAP, "%s: SP success for va %#jx in pmap %p", + __func__, (uintmax_t)sva, pmap); + + free(pvos, M_TEMP); + return (KERN_SUCCESS); +} + +static void +moea64_sp_promote(pmap_t pmap, vm_offset_t va, vm_page_t m) +{ + struct pvo_entry *first, *pvo; + vm_paddr_t pa, pa_end; + vm_offset_t sva, va_end; + int64_t sp_refchg; + + /* This CTR may generate a lot of output. 
*/ + /* CTR2(KTR_PMAP, "%s: va=%#jx", __func__, (uintmax_t)va); */ + + va &= ~SP_MASK; + sva = va; + /* Get superpage */ + m = PHYS_TO_VM_PAGE(VM_PAGE_TO_PHYS(m) & ~SP_MASK); + + PV_PAGE_LOCK(m); + PMAP_LOCK(pmap); + + /* + * Check if all pages meet promotion criteria. + * + * XXX In some cases the loop below may be executed for each or most + * of the entered pages of a superpage, which can be expensive + * (although it was not profiled) and need some optimization. + * + * Some cases where this seems to happen are: + * - When a superpage is first entered read-only and later becomes + * read-write. + * - When some of the superpage's virtual addresses map to previously + * wired/cached pages while others map to pages allocated from a + * different physical address range. A common scenario where this + * happens is when mmap'ing a file that is already present in FS + * block cache and doesn't fill a superpage. + */ + first = pvo = moea64_pvo_find_va(pmap, sva); + for (pa = VM_PAGE_TO_PHYS(m), pa_end = pa + SP_SIZE; + pa < pa_end; pa += PAGE_SIZE, va += PAGE_SIZE) { + if (pvo == NULL || (pvo->pvo_vaddr & PVO_DEAD) != 0) { + CTR3(KTR_PMAP, + "%s: NULL or dead PVO: pmap=%p, va=%#jx", + __func__, pmap, (uintmax_t)va); + goto error; + } + if (PVO_PADDR(pvo) != pa) { + CTR5(KTR_PMAP, "%s: PAs don't match: " + "pmap=%p, va=%#jx, pvo_pa=%#jx, exp_pa=%#jx", + __func__, pmap, (uintmax_t)va, + (uintmax_t)PVO_PADDR(pvo), (uintmax_t)pa); + atomic_add_long(&sp_p_fail_pa, 1); + goto error; + } + if ((first->pvo_vaddr & PVO_FLAGS_PROMOTE) != + (pvo->pvo_vaddr & PVO_FLAGS_PROMOTE)) { + CTR5(KTR_PMAP, "%s: PVO flags don't match: " + "pmap=%p, va=%#jx, pvo_flags=%#jx, exp_flags=%#jx", + __func__, pmap, (uintmax_t)va, + (uintmax_t)(pvo->pvo_vaddr & PVO_FLAGS_PROMOTE), + (uintmax_t)(first->pvo_vaddr & PVO_FLAGS_PROMOTE)); + atomic_add_long(&sp_p_fail_flags, 1); + goto error; + } + if (first->pvo_pte.prot != pvo->pvo_pte.prot) { + CTR5(KTR_PMAP, "%s: PVO protections don't match: " + 
"pmap=%p, va=%#jx, pvo_prot=%#x, exp_prot=%#x", + __func__, pmap, (uintmax_t)va, + pvo->pvo_pte.prot, first->pvo_pte.prot); + atomic_add_long(&sp_p_fail_prot, 1); + goto error; + } + if ((first->pvo_pte.pa & LPTE_WIMG) != + (pvo->pvo_pte.pa & LPTE_WIMG)) { + CTR5(KTR_PMAP, "%s: WIMG bits don't match: " + "pmap=%p, va=%#jx, pvo_wimg=%#jx, exp_wimg=%#jx", + __func__, pmap, (uintmax_t)va, + (uintmax_t)(pvo->pvo_pte.pa & LPTE_WIMG), + (uintmax_t)(first->pvo_pte.pa & LPTE_WIMG)); + atomic_add_long(&sp_p_fail_wimg, 1); + goto error; + } + + pvo = RB_NEXT(pvo_tree, &pmap->pmap_pvo, pvo); + } + + /* All OK, promote. */ + + /* + * Handle superpage REF/CHG bits. If REF or CHG is set in + * any page, then it must be set in the superpage. + * + * Instead of querying each page, we take advantage of two facts: + * 1- If a page is being promoted, it was referenced. + * 2- If promoted pages are writable, they were modified. + */ + sp_refchg = LPTE_REF | + ((first->pvo_pte.prot & VM_PROT_WRITE) != 0 ? LPTE_CHG : 0); + + /* Promote pages */ + + for (pvo = first, va_end = PVO_VADDR(pvo) + SP_SIZE; + pvo != NULL && PVO_VADDR(pvo) < va_end; + pvo = RB_NEXT(pvo_tree, &pmap->pmap_pvo, pvo)) { + pvo->pvo_pte.pa &= ~LPTE_LP_MASK; + pvo->pvo_pte.pa |= LPTE_LP_4K_16M; + pvo->pvo_vaddr |= PVO_LARGE; + } + moea64_pte_replace_sp(first); + + /* Send REF/CHG bits to VM */ + moea64_sp_refchg_process(first, m, sp_refchg, first->pvo_pte.prot); + + /* Use first page to cache REF/CHG bits */ + atomic_set_32(&m->md.mdpg_attrs, sp_refchg | MDPG_ATTR_SP); + + PMAP_UNLOCK(pmap); + PV_PAGE_UNLOCK(m); + + atomic_add_long(&sp_mappings, 1); + atomic_add_long(&sp_promotions, 1); + CTR3(KTR_PMAP, "%s: success for va %#jx in pmap %p", + __func__, (uintmax_t)sva, pmap); + return; + +error: + atomic_add_long(&sp_p_failures, 1); + PMAP_UNLOCK(pmap); + PV_PAGE_UNLOCK(m); +} + +static void +moea64_sp_demote_aligned(struct pvo_entry *sp) +{ + struct pvo_entry *pvo; + vm_offset_t va, va_end; + vm_paddr_t pa; + 
vm_page_t m; + pmap_t pmap; + int64_t refchg; + + CTR2(KTR_PMAP, "%s: va=%#jx", __func__, (uintmax_t)PVO_VADDR(sp)); + + pmap = sp->pvo_pmap; + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + + pvo = sp; + + /* Demote pages */ + + va = PVO_VADDR(pvo); + pa = PVO_PADDR(pvo); + m = PHYS_TO_VM_PAGE(pa); + + for (pvo = sp, va_end = va + SP_SIZE; + pvo != NULL && PVO_VADDR(pvo) < va_end; + pvo = RB_NEXT(pvo_tree, &pmap->pmap_pvo, pvo), + va += PAGE_SIZE, pa += PAGE_SIZE) { + KASSERT(pvo && PVO_VADDR(pvo) == va, + ("%s: missing PVO for va %#jx", __func__, (uintmax_t)va)); + + pvo->pvo_vaddr &= ~PVO_LARGE; + pvo->pvo_pte.pa &= ~LPTE_RPGN; + pvo->pvo_pte.pa |= pa; + + } + refchg = moea64_pte_replace_sp(sp); + + /* Clear SP flag */ + atomic_clear_32(&m->md.mdpg_attrs, MDPG_ATTR_SP); + + /* + * Handle superpage REF/CHG bits. A bit set in the superpage + * means all pages should consider it set. + */ + moea64_sp_refchg_process(sp, m, refchg, sp->pvo_pte.prot); + + atomic_add_long(&sp_demotions, 1); + CTR3(KTR_PMAP, "%s: success for va %#jx in pmap %p", + __func__, (uintmax_t)PVO_VADDR(sp), pmap); +} + +/* + * Demote the superpage that contains the given PVO. + * + * 'pvo' may be any PVO of the superpage: if its VA is not + * superpage-aligned, the PVO at the aligned VA is looked up and + * passed to moea64_sp_demote_aligned() instead. + * + * The pmap lock of pvo->pvo_pmap must be held by the caller. + */ +static void +moea64_sp_demote(struct pvo_entry *pvo) +{ + vm_offset_t spva; + + PMAP_LOCK_ASSERT(pvo->pvo_pmap, MA_OWNED); + + spva = PVO_VADDR(pvo) & ~SP_MASK; + if (PVO_VADDR(pvo) != spva) { + pvo = moea64_pvo_find_va(pvo->pvo_pmap, spva); + /* + * Report the saved VA: when this assertion fires pvo is + * NULL and must not be dereferenced to build the message. + */ + KASSERT(pvo != NULL, ("%s: missing PVO for va %#jx", + __func__, (uintmax_t)spva)); + } + moea64_sp_demote_aligned(pvo); +} + +static struct pvo_entry * +moea64_sp_unwire(struct pvo_entry *sp) +{ + struct pvo_entry *pvo, *prev; + vm_offset_t eva; + pmap_t pm; + int64_t ret, refchg; + + CTR2(KTR_PMAP, "%s: va=%#jx", __func__, (uintmax_t)PVO_VADDR(sp)); + + pm = sp->pvo_pmap; + PMAP_LOCK_ASSERT(pm, MA_OWNED); + + eva = PVO_VADDR(sp) + SP_SIZE; + refchg = 0; + for (pvo = sp; pvo != NULL && PVO_VADDR(pvo) < eva; + prev = pvo, pvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo)) { + if ((pvo->pvo_vaddr & PVO_WIRED) == 0) + panic("%s: pvo %p is missing PVO_WIRED", + 
__func__, pvo); + pvo->pvo_vaddr &= ~PVO_WIRED; + + ret = moea64_pte_replace(pvo, 0 /* No invalidation */); + if (ret < 0) + refchg |= LPTE_CHG; + else + refchg |= ret; + + pm->pm_stats.wired_count--; + } + + /* Send REF/CHG bits to VM */ + moea64_sp_refchg_process(sp, PHYS_TO_VM_PAGE(PVO_PADDR(sp)), + refchg, sp->pvo_pte.prot); + + return (prev); +} + +static struct pvo_entry * +moea64_sp_protect(struct pvo_entry *sp, vm_prot_t prot) +{ + struct pvo_entry *pvo, *prev; + vm_offset_t eva; + pmap_t pm; + vm_page_t m, m_end; + int64_t ret, refchg; + vm_prot_t oldprot; + + CTR3(KTR_PMAP, "%s: va=%#jx, prot=%x", + __func__, (uintmax_t)PVO_VADDR(sp), prot); + + pm = sp->pvo_pmap; + PMAP_LOCK_ASSERT(pm, MA_OWNED); + + oldprot = sp->pvo_pte.prot; + m = PHYS_TO_VM_PAGE(PVO_PADDR(sp)); + KASSERT(m != NULL, ("%s: missing vm page for pa %#jx", + __func__, (uintmax_t)PVO_PADDR(sp))); + eva = PVO_VADDR(sp) + SP_SIZE; + refchg = 0; + + for (pvo = sp; pvo != NULL && PVO_VADDR(pvo) < eva; + prev = pvo, pvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo)) { + pvo->pvo_pte.prot = prot; + /* + * If the PVO is in the page table, update mapping + */ + ret = moea64_pte_replace(pvo, MOEA64_PTE_PROT_UPDATE); + if (ret < 0) + refchg |= LPTE_CHG; + else + refchg |= ret; + } + + /* Send REF/CHG bits to VM */ + moea64_sp_refchg_process(sp, m, refchg, oldprot); + + /* Handle pages that became executable */ + if ((m->a.flags & PGA_EXECUTABLE) == 0 && + (sp->pvo_pte.pa & (LPTE_I | LPTE_G | LPTE_NOEXEC)) == 0) { + if ((m->oflags & VPO_UNMANAGED) == 0) + for (m_end = &m[SP_PAGES]; m < m_end; m++) + vm_page_aflag_set(m, PGA_EXECUTABLE); + moea64_syncicache(pm, PVO_VADDR(sp), PVO_PADDR(sp), SP_SIZE); + } + + return (prev); +} + +static struct pvo_entry * +moea64_sp_remove(struct pvo_entry *sp, struct pvo_dlist *tofree) +{ + struct pvo_entry *pvo, *tpvo; + vm_offset_t eva; + pmap_t pm; + + CTR2(KTR_PMAP, "%s: va=%#jx", __func__, (uintmax_t)PVO_VADDR(sp)); + + pm = sp->pvo_pmap; + PMAP_LOCK_ASSERT(pm, 
MA_OWNED); + + eva = PVO_VADDR(sp) + SP_SIZE; + for (pvo = sp; pvo != NULL && PVO_VADDR(pvo) < eva; pvo = tpvo) { + tpvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo); + + /* + * For locking reasons, remove this from the page table and + * pmap, but save delinking from the vm_page for a second + * pass + */ + moea64_pvo_remove_from_pmap(pvo); + SLIST_INSERT_HEAD(tofree, pvo, pvo_dlink); + } + + /* Clear SP bit */ + atomic_clear_32(&PHYS_TO_VM_PAGE(PVO_PADDR(sp))->md.mdpg_attrs, + MDPG_ATTR_SP); + + return (tpvo); +} + +static int64_t +moea64_sp_query_locked(struct pvo_entry *pvo, uint64_t ptebit) +{ + int64_t refchg, ret; + vm_offset_t eva; + vm_page_t m; + pmap_t pmap; + struct pvo_entry *sp; + + pmap = pvo->pvo_pmap; + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + + /* Get first SP PVO */ + if ((PVO_VADDR(pvo) & SP_MASK) != 0) { + sp = moea64_pvo_find_va(pmap, PVO_VADDR(pvo) & ~SP_MASK); + KASSERT(sp != NULL, ("%s: missing PVO for va %#jx", + __func__, (uintmax_t)(PVO_VADDR(pvo) & ~SP_MASK))); + } else + sp = pvo; + eva = PVO_VADDR(sp) + SP_SIZE; + + refchg = 0; + for (pvo = sp; pvo != NULL && PVO_VADDR(pvo) < eva; + pvo = RB_NEXT(pvo_tree, &pmap->pmap_pvo, pvo)) { + ret = moea64_pte_synch(pvo); + if (ret > 0) { + refchg |= ret & (LPTE_CHG | LPTE_REF); + if ((refchg & ptebit) != 0) + break; + } + } + + /* Save results */ + if (refchg != 0) { + m = PHYS_TO_VM_PAGE(PVO_PADDR(sp)); + atomic_set_32(&m->md.mdpg_attrs, refchg | MDPG_ATTR_SP); + } + + return (refchg); +} + +static int64_t +moea64_sp_query(struct pvo_entry *pvo, uint64_t ptebit) +{ + int64_t refchg; + pmap_t pmap; + + pmap = pvo->pvo_pmap; + PMAP_LOCK(pmap); + + /* + * Check if SP was demoted/removed before pmap lock was acquired. 
+ */ + if (!PVO_IS_SP(pvo) || (pvo->pvo_vaddr & PVO_DEAD) != 0) { + CTR2(KTR_PMAP, "%s: demoted/removed: pa=%#jx", + __func__, (uintmax_t)PVO_PADDR(pvo)); + PMAP_UNLOCK(pmap); + return (-1); + } + + refchg = moea64_sp_query_locked(pvo, ptebit); + PMAP_UNLOCK(pmap); + + CTR4(KTR_PMAP, "%s: va=%#jx, pa=%#jx: refchg=%#jx", + __func__, (uintmax_t)PVO_VADDR(pvo), + (uintmax_t)PVO_PADDR(pvo), (uintmax_t)refchg); + + return (refchg); +} + +static int64_t +moea64_sp_pvo_clear(struct pvo_entry *pvo, uint64_t ptebit) +{ + int64_t refchg, ret; + pmap_t pmap; + struct pvo_entry *sp; + vm_offset_t eva; + vm_page_t m; + + pmap = pvo->pvo_pmap; + PMAP_LOCK(pmap); + + /* + * Check if SP was demoted/removed before pmap lock was acquired. + */ + if (!PVO_IS_SP(pvo) || (pvo->pvo_vaddr & PVO_DEAD) != 0) { + CTR2(KTR_PMAP, "%s: demoted/removed: pa=%#jx", + __func__, (uintmax_t)PVO_PADDR(pvo)); + PMAP_UNLOCK(pmap); + return (-1); + } + + /* Get first SP PVO */ + if ((PVO_VADDR(pvo) & SP_MASK) != 0) { + sp = moea64_pvo_find_va(pmap, PVO_VADDR(pvo) & ~SP_MASK); + KASSERT(sp != NULL, ("%s: missing PVO for va %#jx", + __func__, (uintmax_t)(PVO_VADDR(pvo) & ~SP_MASK))); + } else + sp = pvo; + eva = PVO_VADDR(sp) + SP_SIZE; + + refchg = 0; + for (pvo = sp; pvo != NULL && PVO_VADDR(pvo) < eva; + pvo = RB_NEXT(pvo_tree, &pmap->pmap_pvo, pvo)) { + ret = moea64_pte_clear(pvo, ptebit); + if (ret > 0) + refchg |= ret & (LPTE_CHG | LPTE_REF); + } + + m = PHYS_TO_VM_PAGE(PVO_PADDR(sp)); + atomic_clear_32(&m->md.mdpg_attrs, ptebit); + PMAP_UNLOCK(pmap); + + CTR4(KTR_PMAP, "%s: va=%#jx, pa=%#jx: refchg=%#jx", + __func__, (uintmax_t)PVO_VADDR(sp), + (uintmax_t)PVO_PADDR(sp), (uintmax_t)refchg); + + return (refchg); +} + +static int64_t +moea64_sp_clear(struct pvo_entry *pvo, vm_page_t m, uint64_t ptebit) +{ + int64_t count, ret; + pmap_t pmap; + + count = 0; + pmap = pvo->pvo_pmap; + + /* + * Since this reference bit is shared by 4096 4KB pages, it + * should not be cleared every time it is tested. 
Apply a + * simple "hash" function on the physical page number, the + * virtual superpage number, and the pmap address to select + * one 4KB page out of the 4096 on which testing the + * reference bit will result in clearing that reference bit. + * This function is designed to avoid the selection of the + * same 4KB page for every 16MB page mapping. + * + * Always leave the reference bit of a wired mapping set, as + * the current state of its reference bit won't affect page + * replacement. + */ + if (ptebit == LPTE_REF && (((VM_PAGE_TO_PHYS(m) >> PAGE_SHIFT) ^ + (PVO_VADDR(pvo) >> SP_SHIFT) ^ (uintptr_t)pmap) & + (SP_PAGES - 1)) == 0 && (pvo->pvo_vaddr & PVO_WIRED) == 0) { + if ((ret = moea64_sp_pvo_clear(pvo, ptebit)) == -1) + return (-1); + + if ((ret & ptebit) != 0) + count++; + + /* + * If this page was not selected by the hash function, then assume + * its REF bit was set. + */ + } else if (ptebit == LPTE_REF) { + count++; + + /* + * To clear the CHG bit of a single SP page, first it must be demoted. + * But if no CHG bit is set, no bit clear and thus no SP demotion is + * needed. + */ + } else { + CTR4(KTR_PMAP, "%s: ptebit=%#jx, va=%#jx, pa=%#jx", + __func__, (uintmax_t)ptebit, (uintmax_t)PVO_VADDR(pvo), + (uintmax_t)PVO_PADDR(pvo)); + + PMAP_LOCK(pmap); + + /* + * Make sure SP wasn't demoted/removed before pmap lock + * was acquired. + */ + if (!PVO_IS_SP(pvo) || (pvo->pvo_vaddr & PVO_DEAD) != 0) { + CTR2(KTR_PMAP, "%s: demoted/removed: pa=%#jx", + __func__, (uintmax_t)PVO_PADDR(pvo)); + PMAP_UNLOCK(pmap); + return (-1); + } + + ret = moea64_sp_query_locked(pvo, ptebit); + if ((ret & ptebit) != 0) + count++; + else { + PMAP_UNLOCK(pmap); + return (0); + } + + moea64_sp_demote(pvo); + moea64_pte_clear(pvo, ptebit); + + /* + * Write protect the mapping to a single page so that a + * subsequent write access may repromote. 
+ */ + if ((pvo->pvo_vaddr & PVO_WIRED) == 0) + moea64_pvo_protect(pmap, pvo, + pvo->pvo_pte.prot & ~VM_PROT_WRITE); + + PMAP_UNLOCK(pmap); + } + + return (count); +} Index: sys/powerpc/aim/moea64_native.c =================================================================== --- sys/powerpc/aim/moea64_native.c +++ sys/powerpc/aim/moea64_native.c @@ -132,11 +132,32 @@ /* POWER9 only permits a 64k partition table size. */ #define PART_SIZE 0x10000 +/* Actual page sizes (to be used with tlbie, when L=0) */ +#define AP_4K 0x00 +#define AP_16M 0x80 + +#define LPTE_KERNEL_VSID_BIT (KERNEL_VSID_BIT << \ + (16 - (ADDR_API_SHFT64 - ADDR_PIDX_SHFT))) + +/* Abbreviated Virtual Address Page - high bits */ +#define LPTE_AVA_PGNHI_MASK 0x0000000000000F80ULL +#define LPTE_AVA_PGNHI_SHIFT 7 + +/* Effective Address Page - low bits */ +#define EA_PAGELO_MASK 0x7ffULL +#define EA_PAGELO_SHIFT 11 + static bool moea64_crop_tlbie; static bool moea64_need_lock; +/* + * The tlbie instruction has two forms: an old one used by PowerISA + * 2.03 and prior, and a newer one used by PowerISA 2.06 and later. + * We need to support both. + */ static __inline void -TLBIE(uint64_t vpn) { +TLBIE(uint64_t vpn, uint64_t oldptehi) +{ #ifndef __powerpc64__ register_t vpn_hi, vpn_lo; register_t msr; @@ -153,22 +174,32 @@ while (!atomic_cmpset_int(&tlbie_lock, 0, 1)); isync(); /* Flush instruction queue once lock acquired */ - if (moea64_crop_tlbie) + if (moea64_crop_tlbie) { vpn &= ~(0xffffULL << 48); +#ifdef __powerpc64__ + if ((oldptehi & LPTE_BIG) != 0) + __asm __volatile("tlbie %0, 1" :: "r"(vpn) : + "memory"); + else + __asm __volatile("tlbie %0, 0" :: "r"(vpn) : + "memory"); + __asm __volatile("eieio; tlbsync; ptesync" ::: + "memory"); + goto done; +#endif + } } #ifdef __powerpc64__ /* - * Explicitly clobber r0. The tlbie instruction has two forms: an old - * one used by PowerISA 2.03 and prior, and a newer one used by PowerISA - * 2.06 (maybe 2.05?) and later. 
We need to support both, and it just - * so happens that since we use 4k pages we can simply zero out r0, and - * clobber it, and the assembler will interpret the single-operand form - * of tlbie as having RB set, and everything else as 0. The RS operand - * in the newer form is in the same position as the L(page size) bit of - * the old form, so a slong as RS is 0, we're good on both sides. + * If this page has LPTE_BIG set and is from userspace, then + * it must be a superpage with 4KB base/16MB actual page size. */ - __asm __volatile("li 0, 0 \n tlbie %0" :: "r"(vpn) : "r0", "memory"); + if ((oldptehi & LPTE_BIG) != 0 && + (oldptehi & LPTE_KERNEL_VSID_BIT) == 0) + vpn |= AP_16M; + + __asm __volatile("li 0, 0 \n tlbie %0, 0" :: "r"(vpn) : "r0", "memory"); __asm __volatile("eieio; tlbsync; ptesync" ::: "memory"); #else vpn_hi = (uint32_t)(vpn >> 32); @@ -194,6 +225,7 @@ intr_restore(intr); #endif +done: /* No barriers or special ops -- taken care of by ptesync above */ if (need_lock) tlbie_lock = 0; @@ -224,6 +256,9 @@ static int64_t moea64_pte_clear_native(struct pvo_entry *, uint64_t); static int64_t moea64_pte_replace_native(struct pvo_entry *, int); static int64_t moea64_pte_unset_native(struct pvo_entry *); +static int64_t moea64_pte_insert_sp_native(struct pvo_entry *); +static int64_t moea64_pte_unset_sp_native(struct pvo_entry *); +static int64_t moea64_pte_replace_sp_native(struct pvo_entry *); /* * Utility routines. 
@@ -245,10 +280,13 @@ static struct moea64_funcs moea64_native_funcs = { .pte_synch = moea64_pte_synch_native, - .pte_clear = moea64_pte_clear_native, - .pte_unset = moea64_pte_unset_native, - .pte_replace = moea64_pte_replace_native, - .pte_insert = moea64_pte_insert_native, + .pte_clear = moea64_pte_clear_native, + .pte_unset = moea64_pte_unset_native, + .pte_replace = moea64_pte_replace_native, + .pte_insert = moea64_pte_insert_native, + .pte_insert_sp = moea64_pte_insert_sp_native, + .pte_unset_sp = moea64_pte_unset_sp_native, + .pte_replace_sp = moea64_pte_replace_sp_native, }; MMU_DEF_INHERIT(oea64_mmu_native, MMU_TYPE_G5, moea64_native_methods, oea64_mmu); @@ -321,7 +359,7 @@ rw_runlock(&moea64_eviction_lock); critical_enter(); - TLBIE(pvo->pvo_vpn); + TLBIE(pvo->pvo_vpn, properpt.pte_hi); critical_exit(); } else { rw_runlock(&moea64_eviction_lock); @@ -332,21 +370,10 @@ return (ptelo & (LPTE_REF | LPTE_CHG)); } -static int64_t -moea64_pte_unset_native(struct pvo_entry *pvo) +static __inline int64_t +moea64_pte_unset_locked(volatile struct lpte *pt, uint64_t vpn) { - volatile struct lpte *pt = moea64_pteg_table + pvo->pvo_pte.slot; - uint64_t ptelo, pvo_ptevpn; - - pvo_ptevpn = moea64_pte_vpn_from_pvo_vpn(pvo); - - rw_rlock(&moea64_eviction_lock); - if ((be64toh(pt->pte_hi & LPTE_AVPN_MASK)) != pvo_ptevpn) { - /* Evicted */ - STAT_MOEA64(moea64_pte_overflow--); - rw_runlock(&moea64_eviction_lock); - return (-1); - } + uint64_t ptelo; /* * Invalidate the pte, briefly locking it to collect RC bits. 
No @@ -356,11 +383,10 @@ critical_enter(); pt->pte_hi = be64toh((pt->pte_hi & ~LPTE_VALID) | LPTE_LOCKED); PTESYNC(); - TLBIE(pvo->pvo_vpn); + TLBIE(vpn, pt->pte_hi); ptelo = be64toh(pt->pte_lo); *((volatile int32_t *)(&pt->pte_hi) + 1) = 0; /* Release lock */ critical_exit(); - rw_runlock(&moea64_eviction_lock); /* Keep statistics */ STAT_MOEA64(moea64_pte_valid--); @@ -368,6 +394,29 @@ return (ptelo & (LPTE_CHG | LPTE_REF)); } +static int64_t +moea64_pte_unset_native(struct pvo_entry *pvo) +{ + volatile struct lpte *pt = moea64_pteg_table + pvo->pvo_pte.slot; + int64_t ret; + uint64_t pvo_ptevpn; + + pvo_ptevpn = moea64_pte_vpn_from_pvo_vpn(pvo); + + rw_rlock(&moea64_eviction_lock); + + if ((be64toh(pt->pte_hi & LPTE_AVPN_MASK)) != pvo_ptevpn) { + /* Evicted */ + STAT_MOEA64(moea64_pte_overflow--); + ret = -1; + } else + ret = moea64_pte_unset_locked(pt, pvo->pvo_vpn); + + rw_runlock(&moea64_eviction_lock); + + return (ret); +} + static int64_t moea64_pte_replace_inval_native(struct pvo_entry *pvo, volatile struct lpte *pt) @@ -394,7 +443,7 @@ critical_enter(); pt->pte_hi = be64toh((pt->pte_hi & ~LPTE_VALID) | LPTE_LOCKED); PTESYNC(); - TLBIE(pvo->pvo_vpn); + TLBIE(pvo->pvo_vpn, pt->pte_hi); ptelo = be64toh(pt->pte_lo); EIEIO(); pt->pte_lo = htobe64(properpt.pte_lo); @@ -702,7 +751,7 @@ va |= (oldptehi & LPTE_AVPN_MASK) << (ADDR_API_SHFT64 - ADDR_PIDX_SHFT); PTESYNC(); - TLBIE(va); + TLBIE(va, oldptehi); STAT_MOEA64(moea64_pte_valid--); STAT_MOEA64(moea64_pte_overflow++); } @@ -722,26 +771,18 @@ return (k); } -static int64_t -moea64_pte_insert_native(struct pvo_entry *pvo) +static __inline int64_t +moea64_pte_insert_locked(struct pvo_entry *pvo, struct lpte *insertpt, + uint64_t mask) { - struct lpte insertpt; uintptr_t slot; - /* Initialize PTE */ - moea64_pte_from_pvo(pvo, &insertpt); - - /* Make sure further insertion is locked out during evictions */ - rw_rlock(&moea64_eviction_lock); - /* * First try primary hash. 
*/ - pvo->pvo_pte.slot &= ~7ULL; /* Base slot address */ - slot = moea64_insert_to_pteg_native(&insertpt, pvo->pvo_pte.slot, - LPTE_VALID | LPTE_WIRED | LPTE_LOCKED); + slot = moea64_insert_to_pteg_native(insertpt, pvo->pvo_pte.slot, + mask | LPTE_WIRED | LPTE_LOCKED); if (slot != -1) { - rw_runlock(&moea64_eviction_lock); pvo->pvo_pte.slot = slot; return (0); } @@ -750,50 +791,52 @@ * Now try secondary hash. */ pvo->pvo_vaddr ^= PVO_HID; - insertpt.pte_hi ^= LPTE_HID; + insertpt->pte_hi ^= LPTE_HID; pvo->pvo_pte.slot ^= (moea64_pteg_mask << 3); - slot = moea64_insert_to_pteg_native(&insertpt, pvo->pvo_pte.slot, - LPTE_VALID | LPTE_WIRED | LPTE_LOCKED); + slot = moea64_insert_to_pteg_native(insertpt, pvo->pvo_pte.slot, + mask | LPTE_WIRED | LPTE_LOCKED); if (slot != -1) { - rw_runlock(&moea64_eviction_lock); pvo->pvo_pte.slot = slot; return (0); } - /* - * Out of luck. Find a PTE to sacrifice. - */ + return (-1); +} - /* Lock out all insertions for a bit */ - if (!rw_try_upgrade(&moea64_eviction_lock)) { - rw_runlock(&moea64_eviction_lock); - rw_wlock(&moea64_eviction_lock); - } +static int64_t +moea64_pte_insert_native(struct pvo_entry *pvo) +{ + struct lpte insertpt; + int64_t ret; - slot = moea64_insert_to_pteg_native(&insertpt, pvo->pvo_pte.slot, - LPTE_WIRED | LPTE_LOCKED); - if (slot != -1) { - rw_wunlock(&moea64_eviction_lock); - pvo->pvo_pte.slot = slot; - return (0); - } + /* Initialize PTE */ + moea64_pte_from_pvo(pvo, &insertpt); - /* Try other hash table. Now we're getting desperate... 
*/ - pvo->pvo_vaddr ^= PVO_HID; - insertpt.pte_hi ^= LPTE_HID; - pvo->pvo_pte.slot ^= (moea64_pteg_mask << 3); - slot = moea64_insert_to_pteg_native(&insertpt, pvo->pvo_pte.slot, - LPTE_WIRED | LPTE_LOCKED); - if (slot != -1) { + /* Make sure further insertion is locked out during evictions */ + rw_rlock(&moea64_eviction_lock); + + pvo->pvo_pte.slot &= ~7ULL; /* Base slot address */ + ret = moea64_pte_insert_locked(pvo, &insertpt, LPTE_VALID); + if (ret == -1) { + /* + * Out of luck. Find a PTE to sacrifice. + */ + + /* Lock out all insertions for a bit */ + if (!rw_try_upgrade(&moea64_eviction_lock)) { + rw_runlock(&moea64_eviction_lock); + rw_wlock(&moea64_eviction_lock); + } + /* Don't evict large pages */ + ret = moea64_pte_insert_locked(pvo, &insertpt, LPTE_BIG); rw_wunlock(&moea64_eviction_lock); - pvo->pvo_pte.slot = slot; - return (0); - } + /* No freeable slots in either PTEG? We're hosed. */ + if (ret == -1) + panic("moea64_pte_insert: overflow"); + } else + rw_runlock(&moea64_eviction_lock); - /* No freeable slots in either PTEG? We're hosed. 
*/ - rw_wunlock(&moea64_eviction_lock); - panic("moea64_pte_insert: overflow"); - return (-1); + return (0); } static void * @@ -814,3 +857,134 @@ dctx->ptex = ptex_end; return (__DEVOLATILE(struct lpte *, moea64_pteg_table) + ptex); } + +static __inline uint64_t +moea64_vpn_from_pte(uint64_t ptehi, uintptr_t slot) +{ + uint64_t pgn, pgnlo, vsid; + + vsid = (ptehi & LPTE_AVA_MASK) >> LPTE_VSID_SHIFT; + if ((ptehi & LPTE_HID) != 0) + slot ^= (moea64_pteg_mask << 3); + pgnlo = ((vsid & VSID_HASH_MASK) ^ (slot >> 3)) & EA_PAGELO_MASK; + pgn = ((ptehi & LPTE_AVA_PGNHI_MASK) << (EA_PAGELO_SHIFT - + LPTE_AVA_PGNHI_SHIFT)) | pgnlo; + return ((vsid << 16) | pgn); +} + +static __inline int64_t +moea64_pte_unset_sp_locked(struct pvo_entry *pvo) +{ + volatile struct lpte *pt; + uint64_t ptehi, refchg, vpn; + vm_offset_t eva; + pmap_t pm; + + pm = pvo->pvo_pmap; + refchg = 0; + eva = PVO_VADDR(pvo) + SP_SIZE; + + for (; pvo != NULL && PVO_VADDR(pvo) < eva; + pvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo)) { + pt = moea64_pteg_table + pvo->pvo_pte.slot; + ptehi = be64toh(pt->pte_hi); + if ((ptehi & LPTE_AVPN_MASK) != + moea64_pte_vpn_from_pvo_vpn(pvo)) { + /* Evicted: invalidate new entry */ + STAT_MOEA64(moea64_pte_overflow--); + vpn = moea64_vpn_from_pte(ptehi, pvo->pvo_pte.slot); + CTR1(KTR_PMAP, "Evicted page in pte_unset_sp: vpn=%jx", + (uintmax_t)vpn); + /* Assume evicted page was modified */ + refchg |= LPTE_CHG; + } else + vpn = pvo->pvo_vpn; + + refchg |= moea64_pte_unset_locked(pt, vpn); + } + + return (refchg); +} + +static int64_t +moea64_pte_unset_sp_native(struct pvo_entry *pvo) +{ + uint64_t refchg; + + PMAP_LOCK_ASSERT(pvo->pvo_pmap, MA_OWNED); + KASSERT((PVO_VADDR(pvo) & SP_MASK) == 0, + ("%s: va %#jx unaligned", __func__, (uintmax_t)PVO_VADDR(pvo))); + + rw_rlock(&moea64_eviction_lock); + refchg = moea64_pte_unset_sp_locked(pvo); + rw_runlock(&moea64_eviction_lock); + + return (refchg); +} + +static __inline int64_t +moea64_pte_insert_sp_locked(struct pvo_entry 
*pvo) +{ + struct lpte insertpt; + int64_t ret; + vm_offset_t eva; + pmap_t pm; + + pm = pvo->pvo_pmap; + eva = PVO_VADDR(pvo) + SP_SIZE; + + for (; pvo != NULL && PVO_VADDR(pvo) < eva; + pvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo)) { + moea64_pte_from_pvo(pvo, &insertpt); + pvo->pvo_pte.slot &= ~7ULL; /* Base slot address */ + + ret = moea64_pte_insert_locked(pvo, &insertpt, LPTE_VALID); + if (ret == -1) { + /* Lock out all insertions for a bit */ + if (!rw_try_upgrade(&moea64_eviction_lock)) { + rw_runlock(&moea64_eviction_lock); + rw_wlock(&moea64_eviction_lock); + } + /* Don't evict large pages */ + ret = moea64_pte_insert_locked(pvo, &insertpt, + LPTE_BIG); + rw_downgrade(&moea64_eviction_lock); + /* No freeable slots in either PTEG? We're hosed. */ + if (ret == -1) + panic("moea64_pte_insert_sp: overflow"); + } + } + + return (0); +} + +static int64_t +moea64_pte_insert_sp_native(struct pvo_entry *pvo) +{ + PMAP_LOCK_ASSERT(pvo->pvo_pmap, MA_OWNED); + KASSERT((PVO_VADDR(pvo) & SP_MASK) == 0, + ("%s: va %#jx unaligned", __func__, (uintmax_t)PVO_VADDR(pvo))); + + rw_rlock(&moea64_eviction_lock); + moea64_pte_insert_sp_locked(pvo); + rw_runlock(&moea64_eviction_lock); + + return (0); +} + +static int64_t +moea64_pte_replace_sp_native(struct pvo_entry *pvo) +{ + uint64_t refchg; + + PMAP_LOCK_ASSERT(pvo->pvo_pmap, MA_OWNED); + KASSERT((PVO_VADDR(pvo) & SP_MASK) == 0, + ("%s: va %#jx unaligned", __func__, (uintmax_t)PVO_VADDR(pvo))); + + rw_rlock(&moea64_eviction_lock); + refchg = moea64_pte_unset_sp_locked(pvo); + moea64_pte_insert_sp_locked(pvo); + rw_runlock(&moea64_eviction_lock); + + return (refchg); +} Index: sys/powerpc/include/param.h =================================================================== --- sys/powerpc/include/param.h +++ sys/powerpc/include/param.h @@ -120,6 +120,15 @@ #define L3_PAGE_SIZE (1UL<pvo_vaddr & ~ADDR_POFF) #define PVO_PTEGIDX_GET(pvo) ((pvo)->pvo_vaddr & PVO_PTEGIDX_MASK) #define PVO_PTEGIDX_ISSET(pvo) ((pvo)->pvo_vaddr & 
PVO_PTEGIDX_VALID) Index: sys/powerpc/include/pte.h =================================================================== --- sys/powerpc/include/pte.h +++ sys/powerpc/include/pte.h @@ -111,6 +111,7 @@ /* High quadword: */ #define LPTE_VSID_SHIFT 12 #define LPTE_AVPN_MASK 0xFFFFFFFFFFFFFF80ULL +#define LPTE_AVA_MASK 0x3FFFFFFFFFFFFF80ULL #define LPTE_API 0x0000000000000F80ULL #define LPTE_SWBITS 0x0000000000000078ULL #define LPTE_WIRED 0x0000000000000010ULL @@ -120,8 +121,13 @@ #define LPTE_VALID 0x0000000000000001ULL /* Low quadword: */ +#define LP_4K_16M 0x38 /* 4KB base, 16MB actual page size */ + #define EXTEND_PTE(x) UINT64_C(x) /* make constants 64-bit */ #define LPTE_RPGN 0xfffffffffffff000ULL +#define LPTE_LP_MASK 0x00000000000ff000ULL +#define LPTE_LP_SHIFT 12 +#define LPTE_LP_4K_16M ((unsigned long long)(LP_4K_16M) << LPTE_LP_SHIFT) #define LPTE_REF EXTEND_PTE( PTE_REF ) #define LPTE_CHG EXTEND_PTE( PTE_CHG ) #define LPTE_WIMG EXTEND_PTE( PTE_WIMG ) @@ -139,6 +145,12 @@ #define LPTE_RW LPTE_BW #define LPTE_RO LPTE_BR +/* HPT superpage definitions */ +#define SP_SHIFT (VM_LEVEL_0_ORDER + PAGE_SHIFT) +#define SP_SIZE (1 << SP_SHIFT) +#define SP_MASK (SP_SIZE - 1) +#define SP_PAGES (1 << VM_LEVEL_0_ORDER) + /* POWER ISA 3.0 Radix Table Definitions */ #define RPTE_VALID 0x8000000000000000ULL #define RPTE_LEAF 0x4000000000000000ULL /* is a PTE: always 1 */ Index: sys/powerpc/include/slb.h =================================================================== --- sys/powerpc/include/slb.h +++ sys/powerpc/include/slb.h @@ -64,6 +64,14 @@ #define SLBE_ESID_MASK 0xfffffffff0000000UL /* Effective segment ID mask */ #define SLBE_ESID_SHIFT 28 +/* + * SLB page sizes encoding, as present in property ibm,segment-page-sizes + * of CPU device tree node. + * + * See LoPAPR: CPU Node Properties, section C.6.1.4. 
+ */ +#define SLB_PGSZ_4K_4K 0 + /* Virtual real-mode VSID in LPARs */ #define VSID_VRMA 0x1ffffff Index: sys/powerpc/include/vmparam.h =================================================================== --- sys/powerpc/include/vmparam.h +++ sys/powerpc/include/vmparam.h @@ -185,31 +185,34 @@ #define VM_NFREELIST 1 #define VM_FREELIST_DEFAULT 0 -/* - * The largest allocation size is 4MB. - */ #ifdef __powerpc64__ +/* The largest allocation size is 16MB. */ #define VM_NFREEORDER 13 #else +/* The largest allocation size is 4MB. */ #define VM_NFREEORDER 11 #endif #ifndef VM_NRESERVLEVEL #ifdef __powerpc64__ +/* Enable superpage reservations: 1 level. */ #define VM_NRESERVLEVEL 1 #else -/* - * Disable superpage reservations. - */ +/* Disable superpage reservations. */ #define VM_NRESERVLEVEL 0 #endif #endif -/* - * Level 0 reservations consist of 512 pages. - */ #ifndef VM_LEVEL_0_ORDER -#define VM_LEVEL_0_ORDER 9 +/* Level 0 reservations consist of 512 (RPT) or 4096 (HPT) pages. */ +#define VM_LEVEL_0_ORDER vm_level_0_order +#ifndef __ASSEMBLER__ +extern int vm_level_0_order; +#endif +#endif + +#ifndef VM_LEVEL_0_ORDER_MAX +#define VM_LEVEL_0_ORDER_MAX 12 #endif #ifdef __powerpc64__ Index: sys/powerpc/powernv/platform_powernv.c =================================================================== --- sys/powerpc/powernv/platform_powernv.c +++ sys/powerpc/powernv/platform_powernv.c @@ -142,6 +142,7 @@ phandle_t opal; int res, len, idx; register_t msr; + bool has_lp; /* Ping OPAL again just to make sure */ opal_check(); @@ -225,6 +226,7 @@ sizeof(arr)); len /= 4; idx = 0; + has_lp = false; while (len > 0) { shift = arr[idx]; slb_encoding = arr[idx + 1]; @@ -235,17 +237,21 @@ lp_size = arr[idx]; lp_encoding = arr[idx+1]; if (slb_encoding == SLBV_L && lp_encoding == 0) - break; + has_lp = true; + + if (slb_encoding == SLB_PGSZ_4K_4K && + lp_encoding == LP_4K_16M) + moea64_has_lp_4k_16m = true; idx += 2; len -= 2; nptlp--; } - if (nptlp && slb_encoding == SLBV_L && 
lp_encoding == 0) + if (has_lp && moea64_has_lp_4k_16m) break; } - if (len == 0) + if (!has_lp) panic("Standard large pages (SLB[L] = 1, PTE[LP] = 0) " "not supported by this system."); Index: sys/powerpc/powerpc/pmap_dispatch.c =================================================================== --- sys/powerpc/powerpc/pmap_dispatch.c +++ sys/powerpc/powerpc/pmap_dispatch.c @@ -77,6 +77,8 @@ caddr_t crashdumpmap; int pmap_bootstrapped; +/* Default level 0 reservations consist of 512 pages (2MB superpage). */ +int vm_level_0_order = 9; #ifdef AIM int Index: sys/powerpc/pseries/mmu_phyp.c =================================================================== --- sys/powerpc/pseries/mmu_phyp.c +++ sys/powerpc/pseries/mmu_phyp.c @@ -82,6 +82,9 @@ static int64_t mphyp_pte_clear(struct pvo_entry *pvo, uint64_t ptebit); static int64_t mphyp_pte_unset(struct pvo_entry *pvo); static int64_t mphyp_pte_insert(struct pvo_entry *pvo); +static int64_t mphyp_pte_unset_sp(struct pvo_entry *pvo); +static int64_t mphyp_pte_insert_sp(struct pvo_entry *pvo); +static int64_t mphyp_pte_replace_sp(struct pvo_entry *pvo); static struct pmap_funcs mphyp_methods = { .install = mphyp_install, @@ -95,6 +98,9 @@ .pte_clear = mphyp_pte_clear, .pte_unset = mphyp_pte_unset, .pte_insert = mphyp_pte_insert, + .pte_unset_sp = mphyp_pte_unset_sp, + .pte_insert_sp = mphyp_pte_insert_sp, + .pte_replace_sp = mphyp_pte_replace_sp, }; MMU_DEF_INHERIT(pseries_mmu, "mmu_phyp", mphyp_methods, oea64_mmu); @@ -135,6 +141,7 @@ uint64_t vsid; phandle_t dev, node, root; int idx, len, res; + bool has_lp; rm_init(&mphyp_eviction_lock, "pte eviction"); @@ -199,6 +206,7 @@ sizeof(arr)); len /= 4; idx = 0; + has_lp = false; while (len > 0) { shift = arr[idx]; slb_encoding = arr[idx + 1]; @@ -220,18 +228,22 @@ lp_encoding); if (slb_encoding == SLBV_L && lp_encoding == 0) - break; + has_lp = true; + + if (slb_encoding == SLB_PGSZ_4K_4K && + lp_encoding == LP_4K_16M) + moea64_has_lp_4k_16m = true; idx += 2; len -= 2; 
nptlp--; } dprintf("\n"); - if (nptlp && slb_encoding == SLBV_L && lp_encoding == 0) + if (has_lp && moea64_has_lp_4k_16m) break; } - if (len > 0) { + if (has_lp) { moea64_large_page_shift = shift; moea64_large_page_size = 1ULL << lp_size; moea64_large_page_mask = moea64_large_page_size - 1; @@ -393,7 +405,7 @@ phyp_pft_hcall(H_READ, 0, slot, 0, 0, &pt.pte_hi, &pt.pte_lo, &junk); - if (pt.pte_hi & LPTE_WIRED) + if ((pt.pte_hi & (LPTE_WIRED | LPTE_BIG)) != 0) continue; /* This is a candidate, so remember it */ @@ -414,68 +426,61 @@ return (k); } -static int64_t -mphyp_pte_insert(struct pvo_entry *pvo) +static __inline int64_t +mphyp_pte_insert_locked(struct pvo_entry *pvo, struct lpte *pte) { - struct rm_priotracker track; + struct lpte evicted; + uint64_t index, junk; int64_t result; - struct lpte evicted, pte; - uint64_t index, junk, lastptelo; - - PMAP_LOCK_ASSERT(pvo->pvo_pmap, MA_OWNED); - - /* Initialize PTE */ - moea64_pte_from_pvo(pvo, &pte); - evicted.pte_hi = 0; - - /* Make sure further insertion is locked out during evictions */ - rm_rlock(&mphyp_eviction_lock, &track); /* * First try primary hash. */ pvo->pvo_pte.slot &= ~7UL; /* Base slot address */ - result = phyp_pft_hcall(H_ENTER, 0, pvo->pvo_pte.slot, pte.pte_hi, - pte.pte_lo, &index, &evicted.pte_lo, &junk); + result = phyp_pft_hcall(H_ENTER, 0, pvo->pvo_pte.slot, pte->pte_hi, + pte->pte_lo, &index, &evicted.pte_lo, &junk); if (result == H_SUCCESS) { - rm_runlock(&mphyp_eviction_lock, &track); pvo->pvo_pte.slot = index; return (0); } KASSERT(result == H_PTEG_FULL, ("Page insertion error: %ld " "(ptegidx: %#zx/%#lx, PTE %#lx/%#lx", result, pvo->pvo_pte.slot, - moea64_pteg_count, pte.pte_hi, pte.pte_lo)); + moea64_pteg_count, pte->pte_hi, pte->pte_lo)); /* * Next try secondary hash. 
*/ pvo->pvo_vaddr ^= PVO_HID; - pte.pte_hi ^= LPTE_HID; + pte->pte_hi ^= LPTE_HID; pvo->pvo_pte.slot ^= (moea64_pteg_mask << 3); result = phyp_pft_hcall(H_ENTER, 0, pvo->pvo_pte.slot, - pte.pte_hi, pte.pte_lo, &index, &evicted.pte_lo, &junk); + pte->pte_hi, pte->pte_lo, &index, &evicted.pte_lo, &junk); if (result == H_SUCCESS) { - rm_runlock(&mphyp_eviction_lock, &track); pvo->pvo_pte.slot = index; return (0); } KASSERT(result == H_PTEG_FULL, ("Secondary page insertion error: %ld", result)); - /* - * Out of luck. Find a PTE to sacrifice. - */ + return (-1); +} - /* Lock out all insertions for a bit */ - rm_runlock(&mphyp_eviction_lock, &track); - rm_wlock(&mphyp_eviction_lock); + +static __inline int64_t +mphyp_pte_evict_and_insert_locked(struct pvo_entry *pvo, struct lpte *pte) +{ + struct lpte evicted; + uint64_t index, junk, lastptelo; + int64_t result; + + evicted.pte_hi = 0; index = mphyp_pte_spillable_ident(pvo->pvo_pte.slot, &evicted); if (index == -1L) { /* Try other hash table? */ pvo->pvo_vaddr ^= PVO_HID; - pte.pte_hi ^= LPTE_HID; + pte->pte_hi ^= LPTE_HID; pvo->pvo_pte.slot ^= (moea64_pteg_mask << 3); index = mphyp_pte_spillable_ident(pvo->pvo_pte.slot, &evicted); } @@ -500,18 +505,50 @@ /* * Set the new PTE. 
*/ - result = phyp_pft_hcall(H_ENTER, H_EXACT, index, pte.pte_hi, - pte.pte_lo, &index, &evicted.pte_lo, &junk); - rm_wunlock(&mphyp_eviction_lock); /* All clear */ + result = phyp_pft_hcall(H_ENTER, H_EXACT, index, pte->pte_hi, + pte->pte_lo, &index, &evicted.pte_lo, &junk); pvo->pvo_pte.slot = index; if (result == H_SUCCESS) return (0); + rm_wunlock(&mphyp_eviction_lock); panic("Page replacement error: %ld", result); return (result); } +static int64_t +mphyp_pte_insert(struct pvo_entry *pvo) +{ + struct rm_priotracker track; + int64_t ret; + struct lpte pte; + + PMAP_LOCK_ASSERT(pvo->pvo_pmap, MA_OWNED); + + /* Initialize PTE */ + moea64_pte_from_pvo(pvo, &pte); + + /* Make sure further insertion is locked out during evictions */ + rm_rlock(&mphyp_eviction_lock, &track); + + ret = mphyp_pte_insert_locked(pvo, &pte); + rm_runlock(&mphyp_eviction_lock, &track); + + if (ret == -1) { + /* + * Out of luck. Find a PTE to sacrifice. + */ + + /* Lock out all insertions for a bit */ + rm_wlock(&mphyp_eviction_lock); + ret = mphyp_pte_evict_and_insert_locked(pvo, &pte); + rm_wunlock(&mphyp_eviction_lock); /* All clear */ + } + + return (ret); +} + static void * mphyp_dump_pmap(void *ctx, void *buf, u_long *nbytes) { @@ -541,3 +578,91 @@ dctx->ptex = ptex; return (buf); } + +static int64_t +mphyp_pte_unset_sp(struct pvo_entry *pvo) +{ + struct lpte pte; + uint64_t junk, refchg; + int err; + vm_offset_t eva; + pmap_t pm; + + pm = pvo->pvo_pmap; + PMAP_LOCK_ASSERT(pm, MA_OWNED); + KASSERT((PVO_VADDR(pvo) & SP_MASK) == 0, + ("%s: va %#jx unaligned", __func__, (uintmax_t)PVO_VADDR(pvo))); + + refchg = 0; + eva = PVO_VADDR(pvo) + SP_SIZE; + + for (; pvo != NULL && PVO_VADDR(pvo) < eva; + pvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo)) { + moea64_pte_from_pvo(pvo, &pte); + + err = phyp_pft_hcall(H_REMOVE, H_AVPN, pvo->pvo_pte.slot, + pte.pte_hi & LPTE_AVPN_MASK, 0, &pte.pte_hi, &pte.pte_lo, + &junk); + KASSERT(err == H_SUCCESS || err == H_NOT_FOUND, + ("Error removing page: %d", 
err)); + + if (err == H_NOT_FOUND) + STAT_MOEA64(moea64_pte_overflow--); + refchg |= pte.pte_lo & (LPTE_REF | LPTE_CHG); + } + + return (refchg); +} + +static int64_t +mphyp_pte_insert_sp(struct pvo_entry *pvo) +{ + struct rm_priotracker track; + int64_t ret; + struct lpte pte; + vm_offset_t eva; + pmap_t pm; + + pm = pvo->pvo_pmap; + PMAP_LOCK_ASSERT(pm, MA_OWNED); + KASSERT((PVO_VADDR(pvo) & SP_MASK) == 0, + ("%s: va %#jx unaligned", __func__, (uintmax_t)PVO_VADDR(pvo))); + + eva = PVO_VADDR(pvo) + SP_SIZE; + + /* Make sure further insertion is locked out during evictions */ + rm_rlock(&mphyp_eviction_lock, &track); + + for (; pvo != NULL && PVO_VADDR(pvo) < eva; + pvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo)) { + /* Initialize PTE */ + moea64_pte_from_pvo(pvo, &pte); + + ret = mphyp_pte_insert_locked(pvo, &pte); + if (ret == -1) { + /* + * Out of luck. Find a PTE to sacrifice. + */ + + /* Lock out all insertions for a bit */ + rm_runlock(&mphyp_eviction_lock, &track); + rm_wlock(&mphyp_eviction_lock); + mphyp_pte_evict_and_insert_locked(pvo, &pte); + rm_wunlock(&mphyp_eviction_lock); /* All clear */ + rm_rlock(&mphyp_eviction_lock, &track); + } + } + + rm_runlock(&mphyp_eviction_lock, &track); + return (0); +} + +static int64_t +mphyp_pte_replace_sp(struct pvo_entry *pvo) +{ + int64_t refchg; + + refchg = mphyp_pte_unset_sp(pvo); + mphyp_pte_insert_sp(pvo); + return (refchg); +} Index: sys/vm/vm_fault.c =================================================================== --- sys/vm/vm_fault.c +++ sys/vm/vm_fault.c @@ -497,7 +497,8 @@ pidx += npages, m = vm_page_next(&m[npages - 1])) { vaddr = fs->entry->start + IDX_TO_OFF(pidx) - fs->entry->offset; #if defined(__aarch64__) || defined(__amd64__) || (defined(__arm__) && \ - __ARM_ARCH >= 6) || defined(__i386__) || defined(__riscv) + __ARM_ARCH >= 6) || defined(__i386__) || defined(__riscv) || \ + defined(__powerpc64__) psind = m->psind; if (psind > 0 && ((vaddr & (pagesizes[psind] - 1)) != 0 || pidx + 
OFF_TO_IDX(pagesizes[psind]) - 1 > pager_last ||