Index: sys/powerpc/aim/mmu_oea64.h =================================================================== --- sys/powerpc/aim/mmu_oea64.h +++ sys/powerpc/aim/mmu_oea64.h @@ -129,6 +129,7 @@ extern u_long moea64_pteg_count; extern u_long moea64_pteg_mask; extern int n_slbs; +extern bool moea64_has_lp_4k_16m; #endif /* _POWERPC_AIM_MMU_OEA64_H */ Index: sys/powerpc/aim/mmu_oea64.c =================================================================== --- sys/powerpc/aim/mmu_oea64.c +++ sys/powerpc/aim/mmu_oea64.c @@ -80,6 +80,7 @@ #include #include #include +#include #include #include @@ -108,9 +109,6 @@ #define VSID_TO_HASH(vsid) (((vsid) >> 4) & 0xfffff) #define VSID_HASH_MASK 0x0000007fffffffffULL -/* Get physical address from PVO. */ -#define PVO_PADDR(pvo) ((pvo)->pvo_pte.pa & LPTE_RPGN) - /* * Locking semantics: * @@ -231,6 +229,7 @@ uint64_t moea64_large_page_mask = 0; uint64_t moea64_large_page_size = 0; int moea64_large_page_shift = 0; +bool moea64_has_lp_4k_16m = false; /* * PVO calls. @@ -252,6 +251,96 @@ static void moea64_syncicache(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, vm_size_t sz); static void moea64_pmap_init_qpages(void); +static void moea64_remove_locked(pmap_t, vm_offset_t, + vm_offset_t, struct pvo_dlist *); + +/* + * Superpages data and routines. + */ +#define SP_SHIFT (VM_LEVEL_0_ORDER + PAGE_SHIFT) +#define SP_SIZE (1 << SP_SHIFT) +#define SP_MASK (SP_SIZE - 1) +#define SP_PAGES (1 << VM_LEVEL_0_ORDER) + +/* PVO (vaddr) bits that must match for promotion to succeed. */ +#define PVO_PROMOTE (PVO_WIRED | PVO_MANAGED | PVO_LARGE | \ + PVO_PTEGIDX_VALID) + +#define PVO_IS_SP(pvo) (((pvo)->pvo_vaddr & PVO_LARGE) && \ + (pvo)->pvo_pmap != kernel_pmap) + +/* Get physical address from PVO. */ +#define PVO_PADDR(pvo) moea64_pvo_paddr(pvo) + +/* MD page flag indicating that the page is a superpage. 
*/ +#define MDPG_ATTR_SP 0x40000000 + +static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, + "VM/pmap parameters"); + +static int pg_ps_enabled = 0; +SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN, + &pg_ps_enabled, 0, "Enable support for transparent superpages"); + +static SYSCTL_NODE(_vm_pmap, OID_AUTO, sp, CTLFLAG_RD, 0, + "SP page mapping counters"); + +static u_long sp_demotions; +SYSCTL_ULONG(_vm_pmap_sp, OID_AUTO, demotions, CTLFLAG_RD, + &sp_demotions, 0, "SP page demotions"); + +static u_long sp_mappings; +SYSCTL_ULONG(_vm_pmap_sp, OID_AUTO, mappings, CTLFLAG_RD, + &sp_mappings, 0, "SP page mappings"); + +static u_long sp_p_failures; +SYSCTL_ULONG(_vm_pmap_sp, OID_AUTO, p_failures, CTLFLAG_RD, + &sp_p_failures, 0, "SP page promotion failures"); + +static u_long sp_p_fail_pa; +SYSCTL_ULONG(_vm_pmap_sp, OID_AUTO, p_fail_pa, CTLFLAG_RD, + &sp_p_fail_pa, 0, "SP page promotion failure: PAs don't match"); + +static u_long sp_p_fail_flags; +SYSCTL_ULONG(_vm_pmap_sp, OID_AUTO, p_fail_flags, CTLFLAG_RD, + &sp_p_fail_flags, 0, "SP page promotion failure: page flags don't match"); + +static u_long sp_p_fail_prot; +SYSCTL_ULONG(_vm_pmap_sp, OID_AUTO, p_fail_prot, CTLFLAG_RD, + &sp_p_fail_prot, 0, + "SP page promotion failure: page protections don't match"); + +static u_long sp_p_fail_wimg; +SYSCTL_ULONG(_vm_pmap_sp, OID_AUTO, p_fail_wimg, CTLFLAG_RD, + &sp_p_fail_wimg, 0, "SP page promotion failure: WIMG bits don't match"); + +static u_long sp_promotions; +SYSCTL_ULONG(_vm_pmap_sp, OID_AUTO, promotions, CTLFLAG_RD, + &sp_promotions, 0, "SP page promotions"); + +static bool moea64_ps_enabled(pmap_t); +static void moea64_align_superpage(vm_object_t, vm_ooffset_t, + vm_offset_t *, vm_size_t); + +static int moea64_sp_enter(pmap_t pmap, vm_offset_t va, + vm_page_t m, vm_prot_t prot, u_int flags, int8_t psind); +static struct pvo_entry *moea64_sp_remove(struct pvo_entry *sp, + struct pvo_dlist *tofree); + +static int moea64_sp_promote(pmap_t pmap, vm_offset_t va, vm_page_t m); +static void moea64_sp_demote_aligned(struct pvo_entry *sp); +static void moea64_sp_demote(struct pvo_entry *pvo); + +static struct pvo_entry *moea64_sp_unwire(struct pvo_entry *sp); +static struct pvo_entry *moea64_sp_protect(struct pvo_entry *sp, + vm_prot_t prot); + +static int64_t moea64_sp_query(struct pvo_entry *pvo, uint64_t ptebit); +static int64_t moea64_sp_clear(struct pvo_entry *pvo, vm_page_t m, + uint64_t ptebit); + +static __inline bool moea64_sp_pvo_in_range(struct pvo_entry *pvo, + vm_offset_t sva, vm_offset_t eva); /* * Kernel MMU interface @@ -358,6 +447,8 @@ #ifdef __powerpc64__ .page_array_startup = moea64_page_array_startup, #endif + .ps_enabled = moea64_ps_enabled, + .align_superpage = moea64_align_superpage, /* Internal interfaces */ .mapdev = moea64_mapdev, @@ -377,6 +468,26 @@ MMU_DEF(oea64_mmu, "mmu_oea64_base", moea64_methods); +/* + * Get physical address from PVO. + * + * For superpages, the lower bits are not stored on pvo_pte.pa and must be + * obtained from VA. + */ +static __inline vm_paddr_t +moea64_pvo_paddr(struct pvo_entry *pvo) +{ + vm_paddr_t pa; + + pa = (pvo)->pvo_pte.pa & LPTE_RPGN; + + if (PVO_IS_SP(pvo)) { + pa &= ~SP_MASK; /* This is needed to clear LPTE_LP bits. */ + pa |= PVO_VADDR(pvo) & SP_MASK; + } + return (pa); +} + static struct pvo_head * vm_page_to_pvoh(vm_page_t m) { @@ -425,8 +536,10 @@ pvo->pvo_vpn = (uint64_t)((va & ADDR_PIDX) >> ADDR_PIDX_SHFT) | (vsid << 16); - shift = (pvo->pvo_vaddr & PVO_LARGE) ? 
moea64_large_page_shift : - ADDR_PIDX_SHFT; + if (pmap == kernel_pmap && (pvo->pvo_vaddr & PVO_LARGE) != 0) + shift = moea64_large_page_shift; + else + shift = ADDR_PIDX_SHFT; hash = (vsid & VSID_HASH_MASK) ^ (((uint64_t)va & ADDR_PIDX) >> shift); pvo->pvo_pte.slot = (hash & moea64_pteg_mask) << 3; } @@ -770,6 +883,9 @@ vm_paddr_t kernelphysstart, kernelphysend; int rm_pavail; + /* Level 0 reservations consist of 4096 pages (16MB superpage). */ + vm_level_0_order = 12; + #ifndef __powerpc64__ /* We don't have a direct map since there is no BAT */ hw_direct_map = 0; @@ -1201,6 +1317,17 @@ for (pvo = RB_NFIND(pvo_tree, &pm->pmap_pvo, &key); pvo != NULL && PVO_VADDR(pvo) < eva; pvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo)) { + if (PVO_IS_SP(pvo)) { + if (moea64_sp_pvo_in_range(pvo, sva, eva)) { + pvo = moea64_sp_unwire(pvo); + continue; + } else { + CTR1(KTR_PMAP, "%s: demote before unwire", + __func__); + moea64_sp_demote(pvo); + } + } + if ((pvo->pvo_vaddr & PVO_WIRED) == 0) panic("moea64_unwire: pvo %p is missing PVO_WIRED", pvo); @@ -1441,7 +1568,7 @@ moea64_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, u_int flags, int8_t psind) { - struct pvo_entry *pvo, *oldpvo; + struct pvo_entry *pvo, *oldpvo, *tpvo; struct pvo_head *pvo_head; uint64_t pte_lo; int error; @@ -1453,6 +1580,9 @@ VM_OBJECT_ASSERT_LOCKED(m->object); } + if (psind > 0) + return (moea64_sp_enter(pmap, va, m, prot, flags, psind)); + pvo = alloc_pvo_entry(0); if (pvo == NULL) return (KERN_RESOURCE_SHORTAGE); @@ -1476,6 +1606,15 @@ PMAP_LOCK(pmap); if (pvo->pvo_pmap == NULL) init_pvo_entry(pvo, pmap, va); + + tpvo = moea64_pvo_find_va(pmap, va & ~SP_MASK); + if (tpvo && PVO_IS_SP(tpvo)) { + /* Demote SP before entering a regular page */ + CTR2(KTR_PMAP, "%s: demote before enter: va=%#jx", + __func__, (uintmax_t)va); + moea64_sp_demote_aligned(tpvo); + } + if (prot & VM_PROT_WRITE) if (pmap_bootstrapped && (m->oflags & VPO_UNMANAGED) == 0) @@ -1499,6 +1638,7 @@ PV_PAGE_UNLOCK(m); PMAP_UNLOCK(pmap); free_pvo_entry(pvo); + pvo = NULL; goto out; } else { /* Otherwise, need to kill it first */ @@ -1527,6 +1667,19 @@ vm_page_aflag_set(m, PGA_EXECUTABLE); moea64_syncicache(pmap, va, VM_PAGE_TO_PHYS(m), PAGE_SIZE); } + + /* + * Try to promote pages. + * + * If the VA of the entered page is not aligned with its PA, + * don't try page promotion as it is not possible. + * This reduces the number of promotion failures dramatically. + */ + if (pmap != kernel_pmap && pvo != NULL && + (pvo->pvo_vaddr & PVO_MANAGED) != 0 && + (va & SP_MASK) == (VM_PAGE_TO_PHYS(m) & SP_MASK)) + moea64_sp_promote(pmap, va, m); + return (KERN_SUCCESS); } @@ -1585,15 +1738,25 @@ { vm_page_t m; vm_pindex_t diff, psize; + vm_offset_t va; + int8_t psind; VM_OBJECT_ASSERT_LOCKED(m_start->object); psize = atop(end - start); m = m_start; while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { - moea64_enter(pm, start + ptoa(diff), m, prot & - (VM_PROT_READ | VM_PROT_EXECUTE), PMAP_ENTER_NOSLEEP | - PMAP_ENTER_QUICK_LOCKED, 0); + va = start + ptoa(diff); + if ((va & SP_MASK) == 0 && va + SP_SIZE <= end && + m->psind == 1 && moea64_ps_enabled(pm)) + psind = 1; + else + psind = 0; + moea64_enter(pm, va, m, prot & + (VM_PROT_READ | VM_PROT_EXECUTE), + PMAP_ENTER_NOSLEEP | PMAP_ENTER_QUICK_LOCKED, psind); + if (psind == 1) + m = &m[SP_SIZE / PAGE_SIZE - 1]; m = TAILQ_NEXT(m, listq); } } @@ -1707,6 +1870,27 @@ NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM | UMA_ZONE_NOFREE); + /* + * Are large page mappings enabled? 
+ */ + TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled); + if (pg_ps_enabled) { + KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0, + ("moea64_init: can't assign to pagesizes[1]")); + + if (moea64_large_page_size == 0) { + printf("mmu_oea64: HW does not support large pages. " + "Disabling superpages...\n"); + pg_ps_enabled = 0; + } else if (!moea64_has_lp_4k_16m) { + printf("mmu_oea64: " + "HW does not support mixed 4KB/16MB page sizes. " + "Disabling superpages...\n"); + pg_ps_enabled = 0; + } else + pagesizes[1] = SP_SIZE; + } + if (!hw_direct_map) { uma_zone_set_allocf(moea64_pvo_zone, moea64_uma_page_alloc); } @@ -1786,7 +1970,7 @@ vm_page_assert_busied(m); if (!pmap_page_is_write_mapped(m)) - return + return; powerpc_sync(); PV_PAGE_LOCK(m); @@ -1796,6 +1980,11 @@ PMAP_LOCK(pmap); if (!(pvo->pvo_vaddr & PVO_DEAD) && (pvo->pvo_pte.prot & VM_PROT_WRITE)) { + if (PVO_IS_SP(pvo)) { + CTR1(KTR_PMAP, "%s: demote before remwr", + __func__); + moea64_sp_demote(pvo); + } pvo->pvo_pte.prot &= ~VM_PROT_WRITE; ret = moea64_pte_replace(pvo, MOEA64_PTE_PROT_UPDATE); if (ret < 0) @@ -1844,6 +2033,9 @@ pmap_t pmap; uint64_t lo; + CTR3(KTR_PMAP, "%s: pa=%#jx, ma=%#x", + __func__, (uintmax_t)VM_PAGE_TO_PHYS(m), ma); + if ((m->oflags & VPO_UNMANAGED) != 0) { m->md.mdpg_cache_attrs = ma; return; @@ -1856,6 +2048,11 @@ pmap = pvo->pvo_pmap; PMAP_LOCK(pmap); if (!(pvo->pvo_vaddr & PVO_DEAD)) { + if (PVO_IS_SP(pvo)) { + CTR1(KTR_PMAP, + "%s: demote before set_memattr", __func__); + moea64_sp_demote(pvo); + } pvo->pvo_pte.pa &= ~LPTE_WIMG; pvo->pvo_pte.pa |= lo; refchg = moea64_pte_replace(pvo, MOEA64_PTE_INVALIDATE); @@ -2308,7 +2505,7 @@ moea64_protect(pmap_t pm, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) { - struct pvo_entry *pvo, *tpvo, key; + struct pvo_entry *pvo, key; CTR4(KTR_PMAP, "moea64_protect: pm=%p sva=%#x eva=%#x prot=%#x", pm, sva, eva, prot); @@ -2324,8 +2521,18 @@ PMAP_LOCK(pm); key.pvo_vaddr = sva; for (pvo = RB_NFIND(pvo_tree, &pm->pmap_pvo, &key); - pvo != NULL && PVO_VADDR(pvo) < eva; pvo = tpvo) { - tpvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo); + pvo != NULL && PVO_VADDR(pvo) < eva; + pvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo)) { + if (PVO_IS_SP(pvo)) { + if (moea64_sp_pvo_in_range(pvo, sva, eva)) { + pvo = moea64_sp_protect(pvo, prot); + continue; + } else { + CTR1(KTR_PMAP, "%s: demote before protect", + __func__); + moea64_sp_demote(pvo); + } + } moea64_pvo_protect(pm, pvo, prot); } PMAP_UNLOCK(pm); @@ -2426,13 +2633,46 @@ } } +static void +moea64_remove_locked(pmap_t pm, vm_offset_t sva, vm_offset_t eva, + struct pvo_dlist *tofree) +{ + struct pvo_entry *pvo, *tpvo, key; + + PMAP_LOCK_ASSERT(pm, MA_OWNED); + + key.pvo_vaddr = sva; + for (pvo = RB_NFIND(pvo_tree, &pm->pmap_pvo, &key); + pvo != NULL && PVO_VADDR(pvo) < eva; pvo = tpvo) { + if (PVO_IS_SP(pvo)) { + if (moea64_sp_pvo_in_range(pvo, sva, eva)) { + tpvo = moea64_sp_remove(pvo, tofree); + continue; + } else { + CTR1(KTR_PMAP, "%s: demote before remove", + __func__); + moea64_sp_demote(pvo); + } + } + tpvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo); + + /* + * For locking reasons, remove this from the page table and + * pmap, but save delinking from the vm_page for a second + * pass + */ + moea64_pvo_remove_from_pmap(pvo); + SLIST_INSERT_HEAD(tofree, pvo, pvo_dlink); + } +} + /* * Remove the given range of addresses from the specified map. 
*/ void moea64_remove(pmap_t pm, vm_offset_t sva, vm_offset_t eva) { - struct pvo_entry *pvo, *tpvo, key; + struct pvo_entry *pvo; struct pvo_dlist tofree; /* @@ -2441,23 +2681,9 @@ if (pm->pm_stats.resident_count == 0) return; - key.pvo_vaddr = sva; - SLIST_INIT(&tofree); - PMAP_LOCK(pm); - for (pvo = RB_NFIND(pvo_tree, &pm->pmap_pvo, &key); - pvo != NULL && PVO_VADDR(pvo) < eva; pvo = tpvo) { - tpvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo); - - /* - * For locking reasons, remove this from the page table and - * pmap, but save delinking from the vm_page for a second - * pass - */ - moea64_pvo_remove_from_pmap(pvo); - SLIST_INSERT_HEAD(&tofree, pvo, pvo_dlink); - } + moea64_remove_locked(pm, sva, eva, &tofree); PMAP_UNLOCK(pm); while (!SLIST_EMPTY(&tofree)) { @@ -2487,8 +2713,14 @@ pmap = pvo->pvo_pmap; PMAP_LOCK(pmap); wasdead = (pvo->pvo_vaddr & PVO_DEAD); - if (!wasdead) + if (!wasdead) { + if (PVO_IS_SP(pvo)) { + CTR1(KTR_PMAP, "%s: demote before remove_all", + __func__); + moea64_sp_demote(pvo); + } moea64_pvo_remove_from_pmap(pvo); + } moea64_pvo_remove_from_page_locked(pvo, m); if (!wasdead) LIST_INSERT_HEAD(&freequeue, pvo, pvo_vlink); @@ -2721,11 +2953,18 @@ struct pvo_entry *pvo; int64_t ret; boolean_t rv; + vm_page_t sp; /* * See if this bit is stored in the page already. + * + * For superpages, the bit is stored in the first vm page. */ - if (m->md.mdpg_attrs & ptebit) + if ((m->md.mdpg_attrs & ptebit) != 0 || + ((sp = PHYS_TO_VM_PAGE(VM_PAGE_TO_PHYS(m) & ~SP_MASK)) != NULL && + (sp->md.mdpg_attrs & (ptebit | MDPG_ATTR_SP)) == + (ptebit | MDPG_ATTR_SP)) + ) return (TRUE); /* @@ -2736,6 +2975,21 @@ powerpc_sync(); PV_PAGE_LOCK(m); LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) { + if (PVO_IS_SP(pvo)) { + ret = moea64_sp_query(pvo, ptebit); + /* + * If SP was not demoted, check its REF/CHG bits here. 
+ */ + if (ret != -1) { + if ((ret & ptebit) != 0) { + rv = TRUE; + break; + } + continue; + } + /* else, fallthrough */ + } + ret = 0; /* @@ -2781,6 +3035,12 @@ count = 0; PV_PAGE_LOCK(m); LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) { + if (PVO_IS_SP(pvo)) { + if ((ret = moea64_sp_clear(pvo, m, ptebit)) != -1) { + count += ret; + continue; + } + } ret = 0; PMAP_LOCK(pvo->pvo_pmap); @@ -3183,3 +3443,767 @@ DEFINE_OEA64_IFUNC(int64_t, pte_clear, (struct pvo_entry *, uint64_t), moea64_null_method) DEFINE_OEA64_IFUNC(int64_t, pte_synch, (struct pvo_entry *), moea64_null_method) + +/* Superpage functions */ + +/* MMU interface */ + +static bool +moea64_ps_enabled(pmap_t pmap) +{ + return (pg_ps_enabled); +} + +static void +moea64_align_superpage(vm_object_t object, vm_ooffset_t offset, + vm_offset_t *addr, vm_size_t size) +{ + vm_offset_t sp_offset; + + if (size < SP_SIZE) + return; + + CTR4(KTR_PMAP, "%s: offs=%#jx, addr=%p, size=%#jx", + __func__, (uintmax_t)offset, addr, (uintmax_t)size); + + if (object != NULL && (object->flags & OBJ_COLORED) != 0) + offset += ptoa(object->pg_color); + sp_offset = offset & SP_MASK; + if (size - ((SP_SIZE - sp_offset) & SP_MASK) < SP_SIZE || + (*addr & SP_MASK) == sp_offset) + return; + if ((*addr & SP_MASK) < sp_offset) + *addr = (*addr & ~SP_MASK) + sp_offset; + else + *addr = ((*addr + SP_MASK) & ~SP_MASK) + sp_offset; +} + +/* Helpers */ + +static __inline void +moea64_pvo_cleanup(struct pvo_dlist *tofree) +{ + struct pvo_entry *pvo; + + /* clean up */ + while (!SLIST_EMPTY(tofree)) { + pvo = SLIST_FIRST(tofree); + SLIST_REMOVE_HEAD(tofree, pvo_dlink); + if (pvo->pvo_vaddr & PVO_DEAD) + moea64_pvo_remove_from_page(pvo); + free_pvo_entry(pvo); + } +} + +static __inline uint16_t +pvo_to_vmpage_flags(struct pvo_entry *pvo) +{ + uint16_t flags; + + flags = 0; + if ((pvo->pvo_pte.prot & VM_PROT_WRITE) != 0) + flags |= PGA_WRITEABLE; + if ((pvo->pvo_pte.prot & VM_PROT_EXECUTE) != 0) + flags |= PGA_EXECUTABLE; + + return (flags); +} + +/* + * Check if the given pvo and its superpage are in sva-eva range. + */ +static __inline bool +moea64_sp_pvo_in_range(struct pvo_entry *pvo, vm_offset_t sva, vm_offset_t eva) +{ + vm_offset_t spva; + + spva = PVO_VADDR(pvo) & ~SP_MASK; + if (spva >= sva && spva + SP_SIZE <= eva) { + /* + * Because this function is intended to be called from loops + * that iterate over ordered pvo entries, if the condition + * above is true then the pvo must be the first of its + * superpage. + */ + KASSERT(PVO_VADDR(pvo) == spva, + ("%s: unexpected unaligned superpage pvo", __func__)); + return (true); + } + return (false); +} + +/* + * Update vm about the REF/CHG bits if the superpage is managed and + * has (or had) write access. 
+ */ +static void +moea64_sp_refchg_process(struct pvo_entry *sp, vm_page_t m, + int64_t sp_refchg, vm_prot_t prot) +{ + vm_page_t m_end; + int64_t refchg; + + if ((sp->pvo_vaddr & PVO_MANAGED) != 0 && (prot & VM_PROT_WRITE) != 0) { + for (m_end = &m[SP_PAGES]; m < m_end; m++) { + refchg = sp_refchg | + atomic_readandclear_32(&m->md.mdpg_attrs); + if (refchg & LPTE_CHG) + vm_page_dirty(m); + if (refchg & LPTE_REF) + vm_page_aflag_set(m, PGA_REFERENCED); + } + } +} + +/* Superpage ops */ + +static int +moea64_sp_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, + vm_prot_t prot, u_int flags, int8_t psind) +{ + struct pvo_entry *pvo, **pvos; + struct pvo_head *pvo_head; + vm_offset_t sva; + vm_page_t sm; + vm_paddr_t pa; + bool sync; + struct pvo_dlist tofree; + int error, i; + uint16_t aflags; + + KASSERT((va & SP_MASK) == 0, ("%s: va %#jx unaligned", + __func__, (uintmax_t)va)); + KASSERT(psind == 1, ("%s: invalid psind: %d", __func__, psind)); + KASSERT(m->psind == 1, ("%s: invalid m->psind: %d", + __func__, m->psind)); + KASSERT(pmap != kernel_pmap, + ("%s: function called with kernel pmap", __func__)); + + CTR5(KTR_PMAP, "%s: va=%#jx, pa=%#jx, prot=%#x, flags=%#x, psind=1", + __func__, (uintmax_t)va, (uintmax_t)VM_PAGE_TO_PHYS(m), + prot, flags); + + SLIST_INIT(&tofree); + + sva = va; + sm = m; + pa = VM_PAGE_TO_PHYS(sm); + + /* Try to allocate all PVOs first, to make failure handling easier. */ + pvos = malloc(SP_PAGES * sizeof(struct pvo_entry *), M_TEMP, M_NOWAIT); + if (pvos == NULL) { + CTR1(KTR_PMAP, "%s: failed to alloc pvo array", __func__); + return (KERN_RESOURCE_SHORTAGE); + } + + for (i = 0; i < SP_PAGES; i++) { + pvos[i] = alloc_pvo_entry(0); + if (pvos[i] == NULL) { + CTR1(KTR_PMAP, "%s: failed to alloc pvo", __func__); + for (i = i - 1; i >= 0; i--) + free_pvo_entry(pvos[i]); + free(pvos, M_TEMP); + return (KERN_RESOURCE_SHORTAGE); + } + } + + PV_PAGE_LOCK(sm); + PMAP_LOCK(pmap); + + /* Note: moea64_remove_locked() also clears cached REF/CHG bits. */ + moea64_remove_locked(pmap, va, va + SP_SIZE, &tofree); + + /* Enter pages */ + for (i = 0; i < SP_PAGES; + i++, va += PAGE_SIZE, pa += PAGE_SIZE, m++) { + pvo = pvos[i]; + + pvo->pvo_pte.prot = prot; + pvo->pvo_pte.pa = (pa & ~LPTE_LP_MASK) | LPTE_LP_4K_16M | + moea64_calc_wimg(pa, pmap_page_get_memattr(m)); + + if ((flags & PMAP_ENTER_WIRED) != 0) + pvo->pvo_vaddr |= PVO_WIRED; + pvo->pvo_vaddr |= PVO_LARGE; + + if ((m->oflags & VPO_UNMANAGED) != 0) + pvo_head = NULL; + else { + pvo_head = &m->md.mdpg_pvoh; + pvo->pvo_vaddr |= PVO_MANAGED; + } + + init_pvo_entry(pvo, pmap, va); + + error = moea64_pvo_enter(pvo, pvo_head, NULL); + /* + * All superpage PVOs were previously removed, so no errors + * should occur while inserting the new ones. + */ + KASSERT(error == 0, ("%s: unexpected error " + "when inserting superpage PVO: %d", + __func__, error)); + } + + PMAP_UNLOCK(pmap); + PV_PAGE_UNLOCK(sm); + + sync = (sm->a.flags & PGA_EXECUTABLE) == 0; + /* Note: moea64_pvo_cleanup() also clears page prot. flags. */ + moea64_pvo_cleanup(&tofree); + pvo = pvos[0]; + + /* Set vm page flags */ + aflags = pvo_to_vmpage_flags(pvo); + if (aflags != 0) + for (m = sm; m < &sm[SP_PAGES]; m++) + vm_page_aflag_set(m, aflags); + + /* + * Flush the page from the instruction cache if this page is + * mapped executable and cacheable. 
+ */ + if (sync && (pvo->pvo_pte.pa & (LPTE_I | LPTE_G | LPTE_NOEXEC)) == 0) + moea64_syncicache(pmap, sva, VM_PAGE_TO_PHYS(sm), SP_SIZE); + + atomic_add_long(&sp_mappings, 1); + CTR3(KTR_PMAP, "%s: SP success for va %#jx in pmap %p", + __func__, (uintmax_t)sva, pmap); + + free(pvos, M_TEMP); + return (KERN_SUCCESS); +} + +static int +moea64_sp_promote(pmap_t pmap, vm_offset_t va, vm_page_t m) +{ + struct pvo_entry *first, *pvo; + vm_paddr_t pa, pa_end; + vm_offset_t sva, va_end; + int64_t sp_refchg; + + /* Return if page promotion is not possible. */ + if ((m->flags & PG_FICTITIOUS) != 0 || + vm_reserv_level_iffullpop(m) != 0 || !moea64_ps_enabled(pmap)) + return (1); + + /* This CTR may generate a lot of output. */ + /* CTR2(KTR_PMAP, "%s: va=%#jx", __func__, (uintmax_t)va); */ + + va &= ~SP_MASK; + sva = va; + /* Get superpage */ + m = PHYS_TO_VM_PAGE(VM_PAGE_TO_PHYS(m) & ~SP_MASK); + + PV_PAGE_LOCK(m); + PMAP_LOCK(pmap); + + /* + * Check if all pages meet promotion criteria. + * + * XXX In some cases the loop below may be executed for each or most + * of the entered pages of a superpage, which can be expensive + * (although it was not profiled) and need some optimization. + * + * Some cases where this seems to happen are: + * - When a superpage is first entered read-only and later becomes + * read-write. + * - When some of the superpage's virtual addresses map to previously + * wired/cached pages while others map to pages allocated from a + * different physical address range. A common scenario where this + * happens is when mmap'ing a file that is already present in FS + * block cache and doesn't fill a superpage. + */ + first = pvo = moea64_pvo_find_va(pmap, sva); + for (pa = VM_PAGE_TO_PHYS(m), pa_end = pa + SP_SIZE; + pa < pa_end; pa += PAGE_SIZE, va += PAGE_SIZE) { + if (pvo == NULL || (pvo->pvo_vaddr & PVO_DEAD) != 0) { + CTR3(KTR_PMAP, + "%s: NULL or dead PVO: pmap=%p, va=%#jx", + __func__, pmap, (uintmax_t)va); + goto error; + } + if (PVO_PADDR(pvo) != pa) { + CTR5(KTR_PMAP, "%s: PAs don't match: " + "pmap=%p, va=%#jx, pvo_pa=%#jx, exp_pa=%#jx", + __func__, pmap, (uintmax_t)va, + (uintmax_t)PVO_PADDR(pvo), (uintmax_t)pa); + atomic_add_long(&sp_p_fail_pa, 1); + goto error; + } + if ((first->pvo_vaddr & PVO_PROMOTE) != + (pvo->pvo_vaddr & PVO_PROMOTE)) { + CTR5(KTR_PMAP, "%s: PVO flags don't match: " + "pmap=%p, va=%#jx, pvo_flags=%#jx, exp_flags=%#jx", + __func__, pmap, (uintmax_t)va, + (uintmax_t)(pvo->pvo_vaddr & PVO_PROMOTE), + (uintmax_t)(first->pvo_vaddr & PVO_PROMOTE)); + atomic_add_long(&sp_p_fail_flags, 1); + goto error; + } + if (first->pvo_pte.prot != pvo->pvo_pte.prot) { + CTR5(KTR_PMAP, "%s: PVO protections don't match: " + "pmap=%p, va=%#jx, pvo_prot=%#x, exp_prot=%#x", + __func__, pmap, (uintmax_t)va, + pvo->pvo_pte.prot, first->pvo_pte.prot); + atomic_add_long(&sp_p_fail_prot, 1); + goto error; + } + if ((first->pvo_pte.pa & LPTE_WIMG) != + (pvo->pvo_pte.pa & LPTE_WIMG)) { + CTR5(KTR_PMAP, "%s: WIMG bits don't match: " + "pmap=%p, va=%#jx, pvo_wimg=%#jx, exp_wimg=%#jx", + __func__, pmap, (uintmax_t)va, + (uintmax_t)(pvo->pvo_pte.pa & LPTE_WIMG), + (uintmax_t)(first->pvo_pte.pa & LPTE_WIMG)); + atomic_add_long(&sp_p_fail_wimg, 1); + goto error; + } + + pvo = RB_NEXT(pvo_tree, &pmap->pmap_pvo, pvo); + } + + /* All OK, promote. */ + + /* + * Handle superpage REF/CHG bits. If REF or CHG is set in + * any page, then it must be set in the superpage. 
+ * + * Instead of querying each page, we take advantage of two facts: + * 1- If a page is being promoted, it was referenced. + * 2- If promoted pages are writable, they were modified. + */ + sp_refchg = LPTE_REF | + ((first->pvo_pte.prot & VM_PROT_WRITE) != 0 ? LPTE_CHG : 0); + + /* Promote pages */ + pvo = first; + for (va = PVO_VADDR(pvo), va_end = va + SP_SIZE; + va < va_end; va += PAGE_SIZE) { + KASSERT(pvo && PVO_VADDR(pvo) == va, + ("%s: missing PVO for va %#jx", __func__, (uintmax_t)va)); + pvo->pvo_pte.pa &= ~LPTE_LP_MASK; + pvo->pvo_pte.pa |= LPTE_LP_4K_16M; + pvo->pvo_vaddr |= PVO_LARGE; + + moea64_pte_replace(pvo, MOEA64_PTE_INVALIDATE); + + pvo = RB_NEXT(pvo_tree, &pmap->pmap_pvo, pvo); + } + + /* Send REF/CHG bits to VM */ + moea64_sp_refchg_process(first, m, sp_refchg, first->pvo_pte.prot); + + /* Use first page to cache REF/CHG bits */ + atomic_set_32(&m->md.mdpg_attrs, sp_refchg | MDPG_ATTR_SP); + + PMAP_UNLOCK(pmap); + PV_PAGE_UNLOCK(m); + + atomic_add_long(&sp_mappings, 1); + atomic_add_long(&sp_promotions, 1); + CTR3(KTR_PMAP, "%s: success for va %#jx in pmap %p", + __func__, (uintmax_t)sva, pmap); + + return (0); + +error: + atomic_add_long(&sp_p_failures, 1); + PMAP_UNLOCK(pmap); + PV_PAGE_UNLOCK(m); + return (1); +} + +static void +moea64_sp_demote_aligned(struct pvo_entry *sp) +{ + struct pvo_entry *pvo; + vm_offset_t va, va_end; + vm_paddr_t pa; + vm_page_t m; + pmap_t pmap; + int64_t ret, refchg; + + CTR2(KTR_PMAP, "%s: va=%#jx", __func__, (uintmax_t)PVO_VADDR(sp)); + + pmap = sp->pvo_pmap; + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + + pvo = sp; + pa = PVO_PADDR(pvo); + m = PHYS_TO_VM_PAGE(pa); + refchg = 0; + + /* Demote pages */ + for (va = PVO_VADDR(pvo), va_end = va + SP_SIZE; + va < va_end; va += PAGE_SIZE, pa += PAGE_SIZE) { + KASSERT(pvo && PVO_VADDR(pvo) == va, + ("%s: missing PVO for va %#jx", __func__, (uintmax_t)va)); + pvo->pvo_vaddr &= ~PVO_LARGE; + pvo->pvo_pte.pa &= ~LPTE_RPGN; + pvo->pvo_pte.pa |= pa; + + ret = moea64_pte_replace(pvo, MOEA64_PTE_INVALIDATE); + if (ret < 0) + refchg |= LPTE_CHG; + else + refchg |= ret; + + pvo = RB_NEXT(pvo_tree, &pmap->pmap_pvo, pvo); + } + + /* Clear SP flag */ + atomic_clear_32(&m->md.mdpg_attrs, MDPG_ATTR_SP); + + /* + * Handle superpage REF/CHG bits. A bit set in the superpage + * means all pages should consider it set. 
+ */ + moea64_sp_refchg_process(sp, m, refchg, sp->pvo_pte.prot); + + atomic_add_long(&sp_demotions, 1); + CTR3(KTR_PMAP, "%s: success for va %#jx in pmap %p", + __func__, (uintmax_t)PVO_VADDR(sp), pmap); +} + +static void +moea64_sp_demote(struct pvo_entry *pvo) +{ + PMAP_LOCK_ASSERT(pvo->pvo_pmap, MA_OWNED); + + if ((PVO_VADDR(pvo) & SP_MASK) != 0) { + pvo = moea64_pvo_find_va(pvo->pvo_pmap, + PVO_VADDR(pvo) & ~SP_MASK); + KASSERT(pvo != NULL, ("%s: missing PVO for va %#jx", + __func__, (uintmax_t)(PVO_VADDR(pvo) & ~SP_MASK))); + } + moea64_sp_demote_aligned(pvo); +} + +static struct pvo_entry * +moea64_sp_unwire(struct pvo_entry *sp) +{ + struct pvo_entry *pvo, *prev; + vm_offset_t eva; + pmap_t pm; + int64_t ret, refchg; + + CTR2(KTR_PMAP, "%s: va=%#jx", __func__, (uintmax_t)PVO_VADDR(sp)); + + pm = sp->pvo_pmap; + PMAP_LOCK_ASSERT(pm, MA_OWNED); + + eva = PVO_VADDR(sp) + SP_SIZE; + refchg = 0; + for (pvo = sp; pvo != NULL && PVO_VADDR(pvo) < eva; + prev = pvo, pvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo)) { + if ((pvo->pvo_vaddr & PVO_WIRED) == 0) + panic("%s: pvo %p is missing PVO_WIRED", + __func__, pvo); + pvo->pvo_vaddr &= ~PVO_WIRED; + + ret = moea64_pte_replace(pvo, 0 /* No invalidation */); + if (ret < 0) + refchg |= LPTE_CHG; + else + refchg |= ret; + + pm->pm_stats.wired_count--; + } + + /* Send REF/CHG bits to VM */ + moea64_sp_refchg_process(sp, PHYS_TO_VM_PAGE(PVO_PADDR(sp)), + refchg, sp->pvo_pte.prot); + + return (prev); +} + +static struct pvo_entry * +moea64_sp_protect(struct pvo_entry *sp, vm_prot_t prot) +{ + struct pvo_entry *pvo, *prev; + vm_offset_t eva; + pmap_t pm; + vm_page_t m, m_end; + int64_t ret, refchg; + vm_prot_t oldprot; + + CTR3(KTR_PMAP, "%s: va=%#jx, prot=%x", + __func__, (uintmax_t)PVO_VADDR(sp), prot); + + pm = sp->pvo_pmap; + PMAP_LOCK_ASSERT(pm, MA_OWNED); + + oldprot = sp->pvo_pte.prot; + m = PHYS_TO_VM_PAGE(PVO_PADDR(sp)); + KASSERT(m != NULL, ("%s: missing vm page for pa %#jx", + __func__, (uintmax_t)PVO_PADDR(sp))); + eva = PVO_VADDR(sp) + SP_SIZE; + refchg = 0; + + for (pvo = sp; pvo != NULL && PVO_VADDR(pvo) < eva; + prev = pvo, pvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo)) { + pvo->pvo_pte.prot = prot; + /* + * If the PVO is in the page table, update mapping + */ + ret = moea64_pte_replace(pvo, MOEA64_PTE_PROT_UPDATE); + if (ret < 0) + refchg |= LPTE_CHG; + else + refchg |= ret; + } + + /* Send REF/CHG bits to VM */ + moea64_sp_refchg_process(sp, m, refchg, oldprot); + + /* Handle pages that became executable */ + if ((m->a.flags & PGA_EXECUTABLE) == 0 && + (sp->pvo_pte.pa & (LPTE_I | LPTE_G | LPTE_NOEXEC)) == 0) { + if ((m->oflags & VPO_UNMANAGED) == 0) + for (m_end = &m[SP_PAGES]; m < m_end; m++) + vm_page_aflag_set(m, PGA_EXECUTABLE); + moea64_syncicache(pm, PVO_VADDR(sp), PVO_PADDR(sp), SP_SIZE); + } + + return (prev); +} + +static struct pvo_entry * +moea64_sp_remove(struct pvo_entry *sp, struct pvo_dlist *tofree) +{ + struct pvo_entry *pvo, *tpvo; + vm_offset_t eva; + pmap_t pm; + + CTR2(KTR_PMAP, "%s: va=%#jx", __func__, (uintmax_t)PVO_VADDR(sp)); + + pm = sp->pvo_pmap; + PMAP_LOCK_ASSERT(pm, MA_OWNED); + + eva = PVO_VADDR(sp) + SP_SIZE; + for (pvo = sp; pvo != NULL && PVO_VADDR(pvo) < eva; pvo = tpvo) { + tpvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo); + + /* + * For locking reasons, remove this from the page table and + * pmap, but save delinking from the vm_page for a second + * pass + */ + moea64_pvo_remove_from_pmap(pvo); + SLIST_INSERT_HEAD(tofree, pvo, pvo_dlink); + } + + /* Clear SP bit */ + 
atomic_clear_32(&PHYS_TO_VM_PAGE(PVO_PADDR(sp))->md.mdpg_attrs, + MDPG_ATTR_SP); + + return (tpvo); +} + +static int64_t +moea64_sp_query_locked(struct pvo_entry *pvo, uint64_t ptebit) +{ + int64_t refchg, ret; + vm_offset_t eva; + vm_page_t m; + pmap_t pmap; + struct pvo_entry *sp; + + pmap = pvo->pvo_pmap; + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + + /* Get first SP PVO */ + if ((PVO_VADDR(pvo) & SP_MASK) != 0) { + sp = moea64_pvo_find_va(pmap, PVO_VADDR(pvo) & ~SP_MASK); + KASSERT(sp != NULL, ("%s: missing PVO for va %#jx", + __func__, (uintmax_t)(PVO_VADDR(pvo) & ~SP_MASK))); + } else + sp = pvo; + eva = PVO_VADDR(sp) + SP_SIZE; + + refchg = 0; + for (pvo = sp; pvo != NULL && PVO_VADDR(pvo) < eva; + pvo = RB_NEXT(pvo_tree, &pmap->pmap_pvo, pvo)) { + ret = moea64_pte_synch(pvo); + if (ret > 0) { + refchg |= ret & (LPTE_CHG | LPTE_REF); + if ((refchg & ptebit) != 0) + break; + } + } + + /* Save results */ + if (refchg != 0) { + m = PHYS_TO_VM_PAGE(PVO_PADDR(sp)); + atomic_set_32(&m->md.mdpg_attrs, refchg | MDPG_ATTR_SP); + } + + return (refchg); +} + +static int64_t +moea64_sp_query(struct pvo_entry *pvo, uint64_t ptebit) +{ + int64_t refchg; + pmap_t pmap; + + pmap = pvo->pvo_pmap; + PMAP_LOCK(pmap); + + /* + * Check if SP was demoted/removed before pmap lock was acquired. + */ + if (!PVO_IS_SP(pvo) || (pvo->pvo_vaddr & PVO_DEAD) != 0) { + CTR2(KTR_PMAP, "%s: demoted/removed: pa=%#jx", + __func__, (uintmax_t)PVO_PADDR(pvo)); + PMAP_UNLOCK(pmap); + return (-1); + } + + refchg = moea64_sp_query_locked(pvo, ptebit); + PMAP_UNLOCK(pmap); + + CTR4(KTR_PMAP, "%s: va=%#jx, pa=%#jx: refchg=%#jx", + __func__, (uintmax_t)PVO_VADDR(pvo), + (uintmax_t)PVO_PADDR(pvo), (uintmax_t)refchg); + + return (refchg); +} + +static int64_t +moea64_sp_pvo_clear(struct pvo_entry *pvo, uint64_t ptebit) +{ + int64_t refchg, ret; + pmap_t pmap; + struct pvo_entry *sp; + vm_offset_t eva; + vm_page_t m; + + pmap = pvo->pvo_pmap; + PMAP_LOCK(pmap); + + /* + * Check if SP was demoted/removed before pmap lock was acquired. + */ + if (!PVO_IS_SP(pvo) || (pvo->pvo_vaddr & PVO_DEAD) != 0) { + CTR2(KTR_PMAP, "%s: demoted/removed: pa=%#jx", + __func__, (uintmax_t)PVO_PADDR(pvo)); + PMAP_UNLOCK(pmap); + return (-1); + } + + /* Get first SP PVO */ + if ((PVO_VADDR(pvo) & SP_MASK) != 0) { + sp = moea64_pvo_find_va(pmap, PVO_VADDR(pvo) & ~SP_MASK); + KASSERT(sp != NULL, ("%s: missing PVO for va %#jx", + __func__, (uintmax_t)(PVO_VADDR(pvo) & ~SP_MASK))); + } else + sp = pvo; + eva = PVO_VADDR(sp) + SP_SIZE; + + refchg = 0; + for (pvo = sp; pvo != NULL && PVO_VADDR(pvo) < eva; + pvo = RB_NEXT(pvo_tree, &pmap->pmap_pvo, pvo)) { + ret = moea64_pte_clear(pvo, ptebit); + if (ret > 0) + refchg |= ret & (LPTE_CHG | LPTE_REF); + } + + m = PHYS_TO_VM_PAGE(PVO_PADDR(sp)); + atomic_clear_32(&m->md.mdpg_attrs, ptebit); + PMAP_UNLOCK(pmap); + + CTR4(KTR_PMAP, "%s: va=%#jx, pa=%#jx: refchg=%#jx", + __func__, (uintmax_t)PVO_VADDR(sp), + (uintmax_t)PVO_PADDR(sp), (uintmax_t)refchg); + + return (refchg); +} + +static int64_t +moea64_sp_clear(struct pvo_entry *pvo, vm_page_t m, uint64_t ptebit) +{ + int64_t count, ret; + pmap_t pmap; + + count = 0; + pmap = pvo->pvo_pmap; + + /* + * Since this reference bit is shared by 4096 4KB pages, it + * should not be cleared every time it is tested. Apply a + * simple "hash" function on the physical page number, the + * virtual superpage number, and the pmap address to select + * one 4KB page out of the 4096 on which testing the + * reference bit will result in clearing that reference bit. 
+ * This function is designed to avoid the selection of the + * same 4KB page for every 16MB page mapping. + * + * Always leave the reference bit of a wired mapping set, as + * the current state of its reference bit won't affect page + * replacement. + */ + if (ptebit == LPTE_REF && (((VM_PAGE_TO_PHYS(m) >> PAGE_SHIFT) ^ + (PVO_VADDR(pvo) >> SP_SHIFT) ^ (uintptr_t)pmap) & + (SP_PAGES - 1)) == 0 && (pvo->pvo_vaddr & PVO_WIRED) == 0) { + if ((ret = moea64_sp_pvo_clear(pvo, ptebit)) == -1) + return (-1); + + if ((ret & ptebit) != 0) + count++; + + /* + * If this page was not selected by the hash function, then assume + * its REF bit was set. + */ + } else if (ptebit == LPTE_REF) { + count++; + + /* + * To clear the CHG bit of a single SP page, first it must be demoted. + * But if no CHG bit is set, no bit clear and thus no SP demotion is + * needed. + */ + } else { + CTR4(KTR_PMAP, "%s: ptebit=%#jx, va=%#jx, pa=%#jx", + __func__, (uintmax_t)ptebit, (uintmax_t)PVO_VADDR(pvo), + (uintmax_t)PVO_PADDR(pvo)); + + PMAP_LOCK(pmap); + + /* + * Make sure SP wasn't demoted/removed before pmap lock + * was acquired. + */ + if (!PVO_IS_SP(pvo) || (pvo->pvo_vaddr & PVO_DEAD) != 0) { + CTR2(KTR_PMAP, "%s: demoted/removed: pa=%#jx", + __func__, (uintmax_t)PVO_PADDR(pvo)); + PMAP_UNLOCK(pmap); + return (-1); + } + + ret = moea64_sp_query_locked(pvo, ptebit); + if ((ret & ptebit) != 0) + count++; + else { + PMAP_UNLOCK(pmap); + return (0); + } + + moea64_sp_demote(pvo); + moea64_pte_clear(pvo, ptebit); + + /* + * Write protect the mapping to a single page so that a + * subsequent write access may repromote. + */ + if ((pvo->pvo_vaddr & PVO_WIRED) == 0) + moea64_pvo_protect(pmap, pvo, + pvo->pvo_pte.prot & ~VM_PROT_WRITE); + + PMAP_UNLOCK(pmap); + } + + return (count); +} Index: sys/powerpc/aim/moea64_native.c =================================================================== --- sys/powerpc/aim/moea64_native.c +++ sys/powerpc/aim/moea64_native.c @@ -117,6 +117,7 @@ #include #include +#include #include #include @@ -132,11 +133,65 @@ /* POWER9 only permits a 64k partition table size. */ #define PART_SIZE 0x10000 +/* Actual page sizes (to be used with tlbie, when L=0) */ +#define AP_4K 0x00 +#define AP_16M 0x80 + +#define LPTE_KERNEL_VSID_BIT (KERNEL_VSID_BIT << \ + (16 - (ADDR_API_SHFT64 - ADDR_PIDX_SHFT))) + static bool moea64_crop_tlbie; static bool moea64_need_lock; +#ifdef __powerpc64__ + +/* + * The tlbie instruction has two forms: an old one used by PowerISA + * 2.03 and prior, and a newer one used by PowerISA 2.06 and later. + * We need to support both. + */ + +static void +__tlbie_old(uint64_t vpn, uint64_t oldptehi) +{ + if ((oldptehi & LPTE_BIG) != 0) + __asm __volatile("tlbie %0, 1" :: "r"(vpn) : "memory"); + else + __asm __volatile("tlbie %0, 0" :: "r"(vpn) : "memory"); + __asm __volatile("eieio; tlbsync; ptesync" ::: "memory"); +} + +static void +__tlbie_new(uint64_t vpn, uint64_t oldptehi) +{ + uint64_t rb; + + /* + * If this page has LPTE_BIG set and is from userspace, then + * it must be a superpage with 4KB base/16MB actual page size. 
+ */ + rb = vpn; + if ((oldptehi & LPTE_BIG) != 0 && + (oldptehi & LPTE_KERNEL_VSID_BIT) == 0) + rb |= AP_16M; + + __asm __volatile("li 0, 0 \n tlbie %0, 0" :: "r"(rb) : "r0", "memory"); + __asm __volatile("eieio; tlbsync; ptesync" ::: "memory"); +} + +DEFINE_IFUNC(, void, __tlbie, (uint64_t vpn, uint64_t oldptehi)) +{ + if (cpu_features & PPC_FEATURE_ARCH_2_06) + return (__tlbie_new); + else + return (__tlbie_old); +} + +#endif + static __inline void -TLBIE(uint64_t vpn) { +TLBIE(uint64_t vpn, uint64_t oldptehi) +{ #ifndef __powerpc64__ register_t vpn_hi, vpn_lo; register_t msr; @@ -158,18 +213,7 @@ } #ifdef __powerpc64__ - /* - * Explicitly clobber r0. The tlbie instruction has two forms: an old - * one used by PowerISA 2.03 and prior, and a newer one used by PowerISA - * 2.06 (maybe 2.05?) and later. We need to support both, and it just - * so happens that since we use 4k pages we can simply zero out r0, and - * clobber it, and the assembler will interpret the single-operand form - * of tlbie as having RB set, and everything else as 0. The RS operand - * in the newer form is in the same position as the L(page size) bit of - * the old form, so a slong as RS is 0, we're good on both sides. - */ - __asm __volatile("li 0, 0 \n tlbie %0" :: "r"(vpn) : "r0", "memory"); - __asm __volatile("eieio; tlbsync; ptesync" ::: "memory"); + __tlbie(vpn, oldptehi); #else vpn_hi = (uint32_t)(vpn >> 32); vpn_lo = (uint32_t)vpn; @@ -321,7 +365,7 @@ rw_runlock(&moea64_eviction_lock); critical_enter(); - TLBIE(pvo->pvo_vpn); + TLBIE(pvo->pvo_vpn, properpt.pte_hi); critical_exit(); } else { rw_runlock(&moea64_eviction_lock); @@ -356,7 +400,7 @@ critical_enter(); pt->pte_hi = be64toh((pt->pte_hi & ~LPTE_VALID) | LPTE_LOCKED); PTESYNC(); - TLBIE(pvo->pvo_vpn); + TLBIE(pvo->pvo_vpn, pt->pte_hi); ptelo = be64toh(pt->pte_lo); *((volatile int32_t *)(&pt->pte_hi) + 1) = 0; /* Release lock */ critical_exit(); @@ -394,7 +438,7 @@ critical_enter(); pt->pte_hi = be64toh((pt->pte_hi & ~LPTE_VALID) | LPTE_LOCKED); PTESYNC(); - TLBIE(pvo->pvo_vpn); + TLBIE(pvo->pvo_vpn, pt->pte_hi); ptelo = be64toh(pt->pte_lo); EIEIO(); pt->pte_lo = htobe64(properpt.pte_lo); @@ -702,7 +746,7 @@ va |= (oldptehi & LPTE_AVPN_MASK) << (ADDR_API_SHFT64 - ADDR_PIDX_SHFT); PTESYNC(); - TLBIE(va); + TLBIE(va, oldptehi); STAT_MOEA64(moea64_pte_valid--); STAT_MOEA64(moea64_pte_overflow++); } Index: sys/powerpc/include/param.h =================================================================== --- sys/powerpc/include/param.h +++ sys/powerpc/include/param.h @@ -120,6 +120,15 @@ #define L3_PAGE_SIZE (1UL<pvo_vaddr & ~ADDR_POFF) #define PVO_PTEGIDX_GET(pvo) ((pvo)->pvo_vaddr & PVO_PTEGIDX_MASK) #define PVO_PTEGIDX_ISSET(pvo) ((pvo)->pvo_vaddr & PVO_PTEGIDX_VALID) Index: sys/powerpc/include/pte.h =================================================================== --- sys/powerpc/include/pte.h +++ sys/powerpc/include/pte.h @@ -120,8 +120,13 @@ #define LPTE_VALID 0x0000000000000001ULL /* Low quadword: */ +#define LP_4K_16M 0x38 /* 4KB base, 16MB actual page size */ + #define EXTEND_PTE(x) UINT64_C(x) /* make constants 64-bit */ #define LPTE_RPGN 0xfffffffffffff000ULL +#define LPTE_LP_MASK 0x00000000000ff000ULL +#define LPTE_LP_SHIFT 12 +#define LPTE_LP_4K_16M ((unsigned long long)(LP_4K_16M) << LPTE_LP_SHIFT) #define LPTE_REF EXTEND_PTE( PTE_REF ) #define LPTE_CHG EXTEND_PTE( PTE_CHG ) #define LPTE_WIMG EXTEND_PTE( PTE_WIMG ) Index: sys/powerpc/include/slb.h =================================================================== --- sys/powerpc/include/slb.h 
+++ sys/powerpc/include/slb.h @@ -64,6 +64,14 @@ #define SLBE_ESID_MASK 0xfffffffff0000000UL /* Effective segment ID mask */ #define SLBE_ESID_SHIFT 28 +/* + * SLB page sizes encoding, as present in property ibm,segment-page-sizes + * of CPU device tree node. + * + * See LoPAPR: CPU Node Properties, section C.6.1.4. + */ +#define SLB_PGSZ_4K_4K 0 + /* Virtual real-mode VSID in LPARs */ #define VSID_VRMA 0x1ffffff Index: sys/powerpc/include/vmparam.h =================================================================== --- sys/powerpc/include/vmparam.h +++ sys/powerpc/include/vmparam.h @@ -186,30 +186,41 @@ #define VM_FREELIST_DEFAULT 0 /* - * The largest allocation size is 4MB. + * By default, enable superpages for PPC64, except for BOOKE (that uses + * a different MMU). */ +#if defined(__powerpc64__) && !defined(BOOKE) +#define PPC_SUPERPAGES +#endif + #ifdef __powerpc64__ +/* The largest allocation size is 16MB. */ #define VM_NFREEORDER 13 #else +/* The largest allocation size is 4MB. */ #define VM_NFREEORDER 11 #endif #ifndef VM_NRESERVLEVEL #ifdef __powerpc64__ +/* Enable superpage reservations: 1 level. */ #define VM_NRESERVLEVEL 1 #else -/* - * Disable superpage reservations. - */ +/* Disable superpage reservations. */ #define VM_NRESERVLEVEL 0 #endif #endif -/* - * Level 0 reservations consist of 512 pages. - */ #ifndef VM_LEVEL_0_ORDER -#define VM_LEVEL_0_ORDER 9 +/* Level 0 reservations consist of 512 (RPT) or 4096 (HPT) pages. */ +#define VM_LEVEL_0_ORDER vm_level_0_order +#ifndef __ASSEMBLER__ +extern int vm_level_0_order; +#endif +#endif + +#ifndef VM_LEVEL_0_ORDER_MAX +#define VM_LEVEL_0_ORDER_MAX 12 #endif #ifdef __powerpc64__ Index: sys/powerpc/powernv/platform_powernv.c =================================================================== --- sys/powerpc/powernv/platform_powernv.c +++ sys/powerpc/powernv/platform_powernv.c @@ -142,6 +142,7 @@ phandle_t opal; int res, len, idx; register_t msr; + bool has_lp; /* Ping OPAL again just to make sure */ opal_check(); @@ -225,6 +226,7 @@ sizeof(arr)); len /= 4; idx = 0; + has_lp = false; while (len > 0) { shift = arr[idx]; slb_encoding = arr[idx + 1]; @@ -235,17 +237,21 @@ lp_size = arr[idx]; lp_encoding = arr[idx+1]; if (slb_encoding == SLBV_L && lp_encoding == 0) - break; + has_lp = true; + + if (slb_encoding == SLB_PGSZ_4K_4K && + lp_encoding == LP_4K_16M) + moea64_has_lp_4k_16m = true; idx += 2; len -= 2; nptlp--; } - if (nptlp && slb_encoding == SLBV_L && lp_encoding == 0) + if (has_lp && moea64_has_lp_4k_16m) break; } - if (len == 0) + if (!has_lp) panic("Standard large pages (SLB[L] = 1, PTE[LP] = 0) " "not supported by this system."); Index: sys/powerpc/powerpc/pmap_dispatch.c =================================================================== --- sys/powerpc/powerpc/pmap_dispatch.c +++ sys/powerpc/powerpc/pmap_dispatch.c @@ -77,6 +77,8 @@ caddr_t crashdumpmap; int pmap_bootstrapped; +/* Default level 0 reservations consist of 512 pages (2MB superpage). 
*/ +int vm_level_0_order = 9; #ifdef AIM int Index: sys/powerpc/pseries/mmu_phyp.c =================================================================== --- sys/powerpc/pseries/mmu_phyp.c +++ sys/powerpc/pseries/mmu_phyp.c @@ -135,6 +135,7 @@ uint64_t vsid; phandle_t dev, node, root; int idx, len, res; + bool has_lp; rm_init(&mphyp_eviction_lock, "pte eviction"); @@ -199,6 +200,7 @@ sizeof(arr)); len /= 4; idx = 0; + has_lp = false; while (len > 0) { shift = arr[idx]; slb_encoding = arr[idx + 1]; @@ -220,18 +222,22 @@ lp_encoding); if (slb_encoding == SLBV_L && lp_encoding == 0) - break; + has_lp = true; + + if (slb_encoding == SLB_PGSZ_4K_4K && + lp_encoding == LP_4K_16M) + moea64_has_lp_4k_16m = true; idx += 2; len -= 2; nptlp--; } dprintf("\n"); - if (nptlp && slb_encoding == SLBV_L && lp_encoding == 0) + if (has_lp && moea64_has_lp_4k_16m) break; } - if (len > 0) { + if (has_lp) { moea64_large_page_shift = shift; moea64_large_page_size = 1ULL << lp_size; moea64_large_page_mask = moea64_large_page_size - 1; Index: sys/vm/vm_fault.c =================================================================== --- sys/vm/vm_fault.c +++ sys/vm/vm_fault.c @@ -497,7 +497,8 @@ pidx += npages, m = vm_page_next(&m[npages - 1])) { vaddr = fs->entry->start + IDX_TO_OFF(pidx) - fs->entry->offset; #if defined(__aarch64__) || defined(__amd64__) || (defined(__arm__) && \ - __ARM_ARCH >= 6) || defined(__i386__) || defined(__riscv) + __ARM_ARCH >= 6) || defined(__i386__) || defined(__riscv) || \ + defined(__powerpc64__) psind = m->psind; if (psind > 0 && ((vaddr & (pagesizes[psind] - 1)) != 0 || pidx + OFF_TO_IDX(pagesizes[psind]) - 1 > pager_last ||
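
The sketches below are standalone userland models of a few pieces of the patch above; they are illustrations only, and every concrete value in their main() functions is made up. This first one mirrors the idea behind the new moea64_pvo_paddr()/PVO_PADDR(): for a superpage PVO, the low bits of pvo_pte.pa carry the LPTE_LP encoding rather than part of the physical address, so the 4KB offset within the 16MB superpage is recovered from the virtual address (SP_SHIFT = VM_LEVEL_0_ORDER + PAGE_SHIFT = 24 with 4KB base pages).

#include <stdint.h>
#include <stdio.h>

/* Superpage geometry used by the patch: 4KB base pages, 16MB superpages. */
#define PAGE_SHIFT        12
#define VM_LEVEL_0_ORDER  12    /* 4096 base pages per superpage */
#define SP_SHIFT          (VM_LEVEL_0_ORDER + PAGE_SHIFT)
#define SP_SIZE           (1ULL << SP_SHIFT)    /* 16MB */
#define SP_MASK           (SP_SIZE - 1)

/*
 * Model of moea64_pvo_paddr() for a superpage PVO: the PTE's PA field only
 * carries the 16MB-aligned frame (its low bits hold the LP encoding), so
 * the offset of the 4KB page inside the superpage comes from the VA.
 */
static uint64_t
sp_paddr(uint64_t pte_pa_field, uint64_t va)
{
        uint64_t pa;

        pa = pte_pa_field & ~SP_MASK;   /* drop LP bits and low PA bits */
        pa |= va & SP_MASK;             /* offset within the superpage */
        return (pa);
}

int
main(void)
{
        /* Hypothetical values: frame 0x40000000 with LPTE_LP_4K_16M set. */
        uint64_t pte_pa = 0x40000000ULL | (0x38ULL << 12);
        uint64_t va = 0x10abc000ULL;

        printf("reconstructed pa = %#llx\n",
            (unsigned long long)sp_paddr(pte_pa, va));
        return (0);
}

This also illustrates why the enter path only attempts promotion when (va & SP_MASK) == (pa & SP_MASK): once promoted, the low 24 bits of the physical address are no longer stored in the PTE and must be reconstructible from the VA.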
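
A similar userland model of moea64_align_superpage() (minus the OBJ_COLORED adjustment): when there is room, the requested address is moved forward so that its offset within a 16MB superpage matches the object offset, which is what later allows reservation-backed 4KB mappings to satisfy the VA/PA congruence check above. The inputs in main() are hypothetical.

#include <stdint.h>
#include <stdio.h>

#define SP_SHIFT        24
#define SP_SIZE         (1ULL << SP_SHIFT)
#define SP_MASK         (SP_SIZE - 1)

/*
 * Userland model of moea64_align_superpage(): adjust the mapping address
 * so its offset within a 16MB superpage equals the object offset's.
 */
static uint64_t
align_superpage(uint64_t offset, uint64_t addr, uint64_t size)
{
        uint64_t sp_offset;

        if (size < SP_SIZE)
                return (addr);
        sp_offset = offset & SP_MASK;
        /* No room for a full superpage, or already congruent: keep addr. */
        if (size - ((SP_SIZE - sp_offset) & SP_MASK) < SP_SIZE ||
            (addr & SP_MASK) == sp_offset)
                return (addr);
        if ((addr & SP_MASK) < sp_offset)
                return ((addr & ~SP_MASK) + sp_offset);
        return (((addr + SP_MASK) & ~SP_MASK) + sp_offset);
}

int
main(void)
{
        /* Hypothetical request: 64MB mapping of an object at offset 2MB. */
        printf("aligned addr = %#llx\n", (unsigned long long)
            align_superpage(0x200000, 0x10010000ULL, 64ULL << 20));
        return (0);
}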
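
A model of the LPTE_REF handling in moea64_sp_clear(): rather than clearing the reference bit of all 4096 4KB mappings that share a 16MB superpage, the code picks a single one per (page, pmap) combination with a simple XOR hash and reports the rest as referenced. The addresses and the pmap stand-in below are arbitrary; the loop only demonstrates that exactly one page of a superpage gets selected.

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT      12
#define SP_SHIFT        24
#define SP_PAGES        (1U << (SP_SHIFT - PAGE_SHIFT)) /* 4096 */

/*
 * Model of the selection hash: one 4KB page out of the 4096 sharing a
 * superpage is picked, per pmap, to actually have its reference bit
 * queried and cleared.
 */
static int
sp_ref_selected(uint64_t pa, uint64_t va, uint64_t pmap)
{
        return ((((pa >> PAGE_SHIFT) ^ (va >> SP_SHIFT) ^ pmap) &
            (SP_PAGES - 1)) == 0);
}

int
main(void)
{
        uint64_t base_va = 0x20000000ULL;   /* arbitrary superpage VA */
        uint64_t base_pa = 0x40000000ULL;   /* arbitrary superpage PA */
        uint64_t pmap = 0x1234;             /* stand-in for the pmap pointer */
        unsigned i, selected;

        selected = 0;
        for (i = 0; i < SP_PAGES; i++)
                selected += sp_ref_selected(
                    base_pa + ((uint64_t)i << PAGE_SHIFT),
                    base_va + ((uint64_t)i << PAGE_SHIFT), pmap);
        /* Exactly one page of the superpage should be selected. */
        printf("pages selected: %u\n", selected);
        return (0);
}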
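
A sketch of the ibm,segment-page-sizes scan added to platform_powernv.c and mmu_phyp.c, which decides whether moea64_has_lp_4k_16m may be set: following the patch's parsing loop, each record in the flattened property is { base page shift, SLB encoding, N, then N pairs of { actual page shift, LP encoding } }, and superpages only need the 4KB-segment / 16MB-actual combination (SLB_PGSZ_4K_4K with LP_4K_16M). The property contents in main() are hypothetical.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Encodings used by the patch (see the slb.h and pte.h hunks above). */
#define SLB_PGSZ_4K_4K  0       /* SLB encoding: 4KB segment base page size */
#define LP_4K_16M       0x38    /* PTE LP encoding: 4KB base, 16MB actual */

/*
 * Model of the "ibm,segment-page-sizes" scan: walk the records and look
 * for a 4KB base page size that offers a 16MB actual page size.
 */
static bool
has_lp_4k_16m(const uint32_t *arr, int len)
{
        int idx, i, nptlp;

        for (idx = 0; idx + 3 <= len; idx += 3 + 2 * nptlp) {
                nptlp = (int)arr[idx + 2];
                for (i = 0; i < nptlp && idx + 4 + 2 * i < len; i++)
                        if (arr[idx + 1] == SLB_PGSZ_4K_4K &&
                            arr[idx + 4 + 2 * i] == LP_4K_16M)
                                return (true);
        }
        return (false);
}

int
main(void)
{
        /* Hypothetical property: a 4KB segment size with 4KB and 16MB pencs. */
        uint32_t arr[] = { 12, 0, 2, 12, 0, 24, 0x38, 24, 0x110, 1, 24, 0 };

        printf("4KB/16MB large pages: %s\n",
            has_lp_4k_16m(arr, (int)(sizeof(arr) / sizeof(arr[0]))) ?
            "supported" : "not supported");
        return (0);
}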