Index: sys/powerpc/aim/mmu_oea64.h =================================================================== --- sys/powerpc/aim/mmu_oea64.h +++ sys/powerpc/aim/mmu_oea64.h @@ -129,6 +129,7 @@ extern u_long moea64_pteg_count; extern u_long moea64_pteg_mask; extern int n_slbs; +extern bool moea64_has_lp_4k_16m; #endif /* _POWERPC_AIM_MMU_OEA64_H */ Index: sys/powerpc/aim/mmu_oea64.c =================================================================== --- sys/powerpc/aim/mmu_oea64.c +++ sys/powerpc/aim/mmu_oea64.c @@ -80,6 +80,7 @@ #include #include #include +#include #include #include @@ -228,6 +229,7 @@ uint64_t moea64_large_page_mask = 0; uint64_t moea64_large_page_size = 0; int moea64_large_page_shift = 0; +bool moea64_has_lp_4k_16m = false; /* * PVO calls. @@ -249,6 +251,116 @@ static void moea64_syncicache(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, vm_size_t sz); static void moea64_pmap_init_qpages(void); +static void moea64_remove_locked(pmap_t, vm_offset_t, + vm_offset_t, struct pvo_dlist *); + +/* + * Superpages data and routines. + */ + +#ifdef PPC_SUPERPAGES + +#define SP_SHIFT (VM_LEVEL_0_ORDER + PAGE_SHIFT) +#define SP_SIZE (1 << SP_SHIFT) +#define SP_MASK (SP_SIZE - 1) +#define SP_PAGES (1 << VM_LEVEL_0_ORDER) + +/* PVO (vaddr) bits that must match for promotion to succeed. */ +#define PVO_PROMOTE (PVO_WIRED | PVO_MANAGED | PVO_LARGE | \ + PVO_PTEGIDX_VALID) + +#define PVO_IS_SP(pvo) (((pvo)->pvo_vaddr & PVO_LARGE) && \ + (pvo)->pvo_pmap != kernel_pmap) + +/* Get physical address from PVO. */ +#define PVO_PADDR(pvo) moea64_pvo_paddr(pvo) + +/* MD page flag indicating that the page is a superpage. 
*/ +#define MDPG_ATTR_SP 0x40000000 + +/* data */ + +static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, + "VM/pmap parameters"); + +static int sp_enabled = 0; +SYSCTL_INT(_vm_pmap, OID_AUTO, sp_enabled, CTLFLAG_RDTUN, + &sp_enabled, 0, "Enable support for transparent superpages"); + +static SYSCTL_NODE(_vm_pmap, OID_AUTO, sp, CTLFLAG_RD, 0, + "SP page mapping counters"); + +static u_long sp_demotions; +SYSCTL_ULONG(_vm_pmap_sp, OID_AUTO, demotions, CTLFLAG_RD, + &sp_demotions, 0, + "SP page demotions"); + +static u_long sp_mappings; +SYSCTL_ULONG(_vm_pmap_sp, OID_AUTO, mappings, CTLFLAG_RD, + &sp_mappings, 0, + "SP page mappings"); + +static u_long sp_p_failures; +SYSCTL_ULONG(_vm_pmap_sp, OID_AUTO, p_failures, CTLFLAG_RD, + &sp_p_failures, 0, + "SP page promotion failures"); + +static u_long sp_promotions; +SYSCTL_ULONG(_vm_pmap_sp, OID_AUTO, promotions, CTLFLAG_RD, + &sp_promotions, 0, + "SP page promotions"); + +/* protos */ + +static bool moea64_ps_enabled(pmap_t); +static void moea64_align_superpage(vm_object_t, vm_ooffset_t, + vm_offset_t *, vm_size_t); + +static int moea64_sp_enter(pmap_t pmap, vm_offset_t va, + vm_page_t m, vm_prot_t prot, u_int flags, int8_t psind); +static struct pvo_entry *moea64_sp_remove(struct pvo_entry *sp, + struct pvo_dlist *tofree); + +static int moea64_sp_promote(pmap_t pmap, vm_offset_t va, vm_page_t m); +static void moea64_sp_demote_aligned(struct pvo_entry *sp); +static void moea64_sp_demote(struct pvo_entry *pvo); + +static struct pvo_entry *moea64_sp_unwire(struct pvo_entry *sp); +static struct pvo_entry *moea64_sp_protect(struct pvo_entry *sp, + vm_prot_t prot); + +static int64_t moea64_sp_query(struct pvo_entry *pvo, uint64_t ptebit); +static int64_t moea64_sp_clear(struct pvo_entry *pvo, vm_page_t m, + uint64_t ptebit); + +static __inline bool moea64_sp_pvo_in_range(struct pvo_entry *pvo, + vm_offset_t sva, vm_offset_t eva); + +/* + * Get physical address from PVO. 
+ * + * For superpages, the lower bits are not stored on pvo_pte.pa and must be + * obtained from VA. + */ +static __inline vm_paddr_t +moea64_pvo_paddr(struct pvo_entry *pvo) +{ + vm_paddr_t pa; + + pa = (pvo)->pvo_pte.pa & LPTE_RPGN; + + if (PVO_IS_SP(pvo)) { + pa &= ~SP_MASK; /* This is needed to clear LPTE_LP bits. */ + pa |= PVO_VADDR(pvo) & SP_MASK; + } + return (pa); +} + +#else /* !defined(PPC_SUPERPAGES) */ + +#define PVO_PADDR(pvo) ((pvo)->pvo_pte.pa & LPTE_RPGN) + +#endif /* * Kernel MMU interface @@ -355,6 +467,10 @@ #ifdef __powerpc64__ .page_array_startup = moea64_page_array_startup, #endif +#ifdef PPC_SUPERPAGES + .ps_enabled = moea64_ps_enabled, + .align_superpage = moea64_align_superpage, +#endif /* Internal interfaces */ .mapdev = moea64_mapdev, @@ -422,8 +538,10 @@ pvo->pvo_vpn = (uint64_t)((va & ADDR_PIDX) >> ADDR_PIDX_SHFT) | (vsid << 16); - shift = (pvo->pvo_vaddr & PVO_LARGE) ? moea64_large_page_shift : - ADDR_PIDX_SHFT; + if (pmap == kernel_pmap && (pvo->pvo_vaddr & PVO_LARGE) != 0) + shift = moea64_large_page_shift; + else + shift = ADDR_PIDX_SHFT; hash = (vsid & VSID_HASH_MASK) ^ (((uint64_t)va & ADDR_PIDX) >> shift); pvo->pvo_pte.slot = (hash & moea64_pteg_mask) << 3; } @@ -767,6 +885,9 @@ vm_paddr_t kernelphysstart, kernelphysend; int rm_pavail; + /* Level 0 reservations consist of 4096 pages (16MB superpage). 
*/ + vm_level_0_order = 12; + #ifndef __powerpc64__ /* We don't have a direct map since there is no BAT */ hw_direct_map = 0; @@ -1198,6 +1319,19 @@ for (pvo = RB_NFIND(pvo_tree, &pm->pmap_pvo, &key); pvo != NULL && PVO_VADDR(pvo) < eva; pvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo)) { +#ifdef PPC_SUPERPAGES + if (PVO_IS_SP(pvo)) { + if (moea64_sp_pvo_in_range(pvo, sva, eva)) { + pvo = moea64_sp_unwire(pvo); + continue; + } else { + CTR1(KTR_PMAP, "%s: demote before unwire", + __func__); + moea64_sp_demote(pvo); + } + } +#endif + if ((pvo->pvo_vaddr & PVO_WIRED) == 0) panic("moea64_unwire: pvo %p is missing PVO_WIRED", pvo); @@ -1207,7 +1341,7 @@ (pvo->pvo_pte.prot & VM_PROT_WRITE)) { if (refchg < 0) refchg = LPTE_CHG; - m = PHYS_TO_VM_PAGE(pvo->pvo_pte.pa & LPTE_RPGN); + m = PHYS_TO_VM_PAGE(PVO_PADDR(pvo)); refchg |= atomic_readandclear_32(&m->md.mdpg_attrs); if (refchg & LPTE_CHG) @@ -1442,6 +1576,9 @@ struct pvo_head *pvo_head; uint64_t pte_lo; int error; +#ifdef PPC_SUPERPAGES + struct pvo_entry *tpvo; +#endif if ((m->oflags & VPO_UNMANAGED) == 0) { if ((flags & PMAP_ENTER_QUICK_LOCKED) == 0) @@ -1450,6 +1587,11 @@ VM_OBJECT_ASSERT_LOCKED(m->object); } +#ifdef PPC_SUPERPAGES + if (psind > 0) + return (moea64_sp_enter(pmap, va, m, prot, flags, psind)); +#endif + pvo = alloc_pvo_entry(0); if (pvo == NULL) return (KERN_RESOURCE_SHORTAGE); @@ -1473,6 +1615,17 @@ PMAP_LOCK(pmap); if (pvo->pvo_pmap == NULL) init_pvo_entry(pvo, pmap, va); + +#ifdef PPC_SUPERPAGES + tpvo = moea64_pvo_find_va(pmap, va & ~SP_MASK); + if (tpvo && PVO_IS_SP(tpvo)) { + /* Demote SP before entering a regular page */ + CTR2(KTR_PMAP, "%s: demote before enter: va=%#jx", + __func__, (uintmax_t)va); + moea64_sp_demote_aligned(tpvo); + } +#endif + if (prot & VM_PROT_WRITE) if (pmap_bootstrapped && (m->oflags & VPO_UNMANAGED) == 0) @@ -1496,6 +1649,7 @@ PV_PAGE_UNLOCK(m); PMAP_UNLOCK(pmap); free_pvo_entry(pvo); + pvo = NULL; goto out; } else { /* Otherwise, need to kill it first */ @@ -1524,6 
+1678,14 @@ vm_page_aflag_set(m, PGA_EXECUTABLE); moea64_syncicache(pmap, va, VM_PAGE_TO_PHYS(m), PAGE_SIZE); } + +#ifdef PPC_SUPERPAGES + /* Try to promote pages */ + if (pmap != kernel_pmap && pvo != NULL && + (pvo->pvo_vaddr & PVO_MANAGED) != 0) + moea64_sp_promote(pmap, va, m); +#endif + return (KERN_SUCCESS); } @@ -1582,15 +1744,29 @@ { vm_page_t m; vm_pindex_t diff, psize; + vm_offset_t va; + int8_t psind; VM_OBJECT_ASSERT_LOCKED(m_start->object); psize = atop(end - start); m = m_start; while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { - moea64_enter(pm, start + ptoa(diff), m, prot & - (VM_PROT_READ | VM_PROT_EXECUTE), PMAP_ENTER_NOSLEEP | - PMAP_ENTER_QUICK_LOCKED, 0); + va = start + ptoa(diff); +#ifdef PPC_SUPERPAGES + if ((va & SP_MASK) == 0 && va + SP_SIZE <= end && + m->psind == 1 && moea64_ps_enabled(pm)) + psind = 1; + else +#endif + psind = 0; + moea64_enter(pm, va, m, prot & + (VM_PROT_READ | VM_PROT_EXECUTE), + PMAP_ENTER_NOSLEEP | PMAP_ENTER_QUICK_LOCKED, psind); +#ifdef PPC_SUPERPAGES + if (psind == 1) + m = &m[SP_SIZE / PAGE_SIZE - 1]; +#endif m = TAILQ_NEXT(m, listq); } } @@ -1615,7 +1791,7 @@ if (pvo == NULL) pa = 0; else - pa = (pvo->pvo_pte.pa & LPTE_RPGN) | (va - PVO_VADDR(pvo)); + pa = PVO_PADDR(pvo) | (va - PVO_VADDR(pvo)); PMAP_UNLOCK(pm); return (pa); @@ -1636,7 +1812,7 @@ PMAP_LOCK(pmap); pvo = moea64_pvo_find_va(pmap, va & ~ADDR_POFF); if (pvo != NULL && (pvo->pvo_pte.prot & prot) == prot) { - m = PHYS_TO_VM_PAGE(pvo->pvo_pte.pa & LPTE_RPGN); + m = PHYS_TO_VM_PAGE(PVO_PADDR(pvo)); if (!vm_page_wire_mapped(m)) m = NULL; } @@ -1695,7 +1871,7 @@ extern int elf32_nxstack; void -moea64_init() +moea64_init(void) { CTR0(KTR_PMAP, "moea64_init"); @@ -1704,6 +1880,29 @@ NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM | UMA_ZONE_NOFREE); +#ifdef PPC_SUPERPAGES + /* + * Are large page mappings enabled? 
+ */ + TUNABLE_INT_FETCH("vm.pmap.sp_enabled", &sp_enabled); + if (sp_enabled) { + KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0, + ("moea64_init: can't assign to pagesizes[1]")); + + if (!hw_direct_map) { + printf("mmu_oea64: HW does not support direct map. " + "Disabling superpages...\n"); + sp_enabled = 0; + } else if (!moea64_has_lp_4k_16m) { + printf("mmu_oea64: " + "HW does not support mixed 4KB/16MB page sizes. " + "Disabling superpages...\n"); + sp_enabled = 0; + } else + pagesizes[1] = SP_SIZE; + } +#endif + if (!hw_direct_map) { uma_zone_set_allocf(moea64_pvo_zone, moea64_uma_page_alloc); } @@ -1783,7 +1982,7 @@ vm_page_assert_busied(m); if (!pmap_page_is_write_mapped(m)) - return + return; powerpc_sync(); PV_PAGE_LOCK(m); @@ -1793,6 +1992,13 @@ PMAP_LOCK(pmap); if (!(pvo->pvo_vaddr & PVO_DEAD) && (pvo->pvo_pte.prot & VM_PROT_WRITE)) { +#ifdef PPC_SUPERPAGES + if (PVO_IS_SP(pvo)) { + CTR1(KTR_PMAP, "%s: demote before remwr", + __func__); + moea64_sp_demote(pvo); + } +#endif pvo->pvo_pte.prot &= ~VM_PROT_WRITE; ret = moea64_pte_replace(pvo, MOEA64_PTE_PROT_UPDATE); if (ret < 0) @@ -1841,6 +2047,9 @@ pmap_t pmap; uint64_t lo; + CTR3(KTR_PMAP, "%s: pa=%#jx, ma=%#x", + __func__, (uintmax_t)VM_PAGE_TO_PHYS(m), ma); + if ((m->oflags & VPO_UNMANAGED) != 0) { m->md.mdpg_cache_attrs = ma; return; @@ -1853,6 +2062,13 @@ pmap = pvo->pvo_pmap; PMAP_LOCK(pmap); if (!(pvo->pvo_vaddr & PVO_DEAD)) { +#ifdef PPC_SUPERPAGES + if (PVO_IS_SP(pvo)) { + CTR1(KTR_PMAP, + "%s: demote before set_memattr", __func__); + moea64_sp_demote(pvo); + } +#endif pvo->pvo_pte.pa &= ~LPTE_WIMG; pvo->pvo_pte.pa |= lo; refchg = moea64_pte_replace(pvo, MOEA64_PTE_INVALIDATE); @@ -1943,7 +2159,7 @@ pvo = moea64_pvo_find_va(kernel_pmap, va); KASSERT(pvo != NULL, ("moea64_kextract: no addr found for %#" PRIxPTR, va)); - pa = (pvo->pvo_pte.pa & LPTE_RPGN) | (va - PVO_VADDR(pvo)); + pa = PVO_PADDR(pvo) | (va - PVO_VADDR(pvo)); PMAP_UNLOCK(kernel_pmap); return (pa); } @@ -2269,7 +2485,7 @@ */ oldprot 
= pvo->pvo_pte.prot; pvo->pvo_pte.prot = prot; - pg = PHYS_TO_VM_PAGE(pvo->pvo_pte.pa & LPTE_RPGN); + pg = PHYS_TO_VM_PAGE(PVO_PADDR(pvo)); /* * If the PVO is in the page table, update mapping @@ -2284,7 +2500,7 @@ if ((pg->oflags & VPO_UNMANAGED) == 0) vm_page_aflag_set(pg, PGA_EXECUTABLE); moea64_syncicache(pm, PVO_VADDR(pvo), - pvo->pvo_pte.pa & LPTE_RPGN, PAGE_SIZE); + PVO_PADDR(pvo), PAGE_SIZE); } /* @@ -2305,7 +2521,7 @@ moea64_protect(pmap_t pm, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) { - struct pvo_entry *pvo, *tpvo, key; + struct pvo_entry *pvo, key; CTR4(KTR_PMAP, "moea64_protect: pm=%p sva=%#x eva=%#x prot=%#x", pm, sva, eva, prot); @@ -2321,8 +2537,20 @@ PMAP_LOCK(pm); key.pvo_vaddr = sva; for (pvo = RB_NFIND(pvo_tree, &pm->pmap_pvo, &key); - pvo != NULL && PVO_VADDR(pvo) < eva; pvo = tpvo) { - tpvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo); + pvo != NULL && PVO_VADDR(pvo) < eva; + pvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo)) { +#ifdef PPC_SUPERPAGES + if (PVO_IS_SP(pvo)) { + if (moea64_sp_pvo_in_range(pvo, sva, eva)) { + pvo = moea64_sp_protect(pvo, prot); + continue; + } else { + CTR1(KTR_PMAP, "%s: demote before protect", + __func__); + moea64_sp_demote(pvo); + } + } +#endif moea64_pvo_protect(pm, pvo, prot); } PMAP_UNLOCK(pm); @@ -2423,13 +2651,48 @@ } } +static void +moea64_remove_locked(pmap_t pm, vm_offset_t sva, vm_offset_t eva, + struct pvo_dlist *tofree) +{ + struct pvo_entry *pvo, *tpvo, key; + + PMAP_LOCK_ASSERT(pm, MA_OWNED); + + key.pvo_vaddr = sva; + for (pvo = RB_NFIND(pvo_tree, &pm->pmap_pvo, &key); + pvo != NULL && PVO_VADDR(pvo) < eva; pvo = tpvo) { +#ifdef PPC_SUPERPAGES + if (PVO_IS_SP(pvo)) { + if (moea64_sp_pvo_in_range(pvo, sva, eva)) { + tpvo = moea64_sp_remove(pvo, tofree); + continue; + } else { + CTR1(KTR_PMAP, "%s: demote before remove", + __func__); + moea64_sp_demote(pvo); + } + } +#endif + tpvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo); + + /* + * For locking reasons, remove this from the page table and + * pmap, 
but save delinking from the vm_page for a second + * pass + */ + moea64_pvo_remove_from_pmap(pvo); + SLIST_INSERT_HEAD(tofree, pvo, pvo_dlink); + } +} + /* * Remove the given range of addresses from the specified map. */ void moea64_remove(pmap_t pm, vm_offset_t sva, vm_offset_t eva) { - struct pvo_entry *pvo, *tpvo, key; + struct pvo_entry *pvo; struct pvo_dlist tofree; /* @@ -2438,23 +2701,9 @@ if (pm->pm_stats.resident_count == 0) return; - key.pvo_vaddr = sva; - SLIST_INIT(&tofree); - PMAP_LOCK(pm); - for (pvo = RB_NFIND(pvo_tree, &pm->pmap_pvo, &key); - pvo != NULL && PVO_VADDR(pvo) < eva; pvo = tpvo) { - tpvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo); - - /* - * For locking reasons, remove this from the page table and - * pmap, but save delinking from the vm_page for a second - * pass - */ - moea64_pvo_remove_from_pmap(pvo); - SLIST_INSERT_HEAD(&tofree, pvo, pvo_dlink); - } + moea64_remove_locked(pm, sva, eva, &tofree); PMAP_UNLOCK(pm); while (!SLIST_EMPTY(&tofree)) { @@ -2484,8 +2733,16 @@ pmap = pvo->pvo_pmap; PMAP_LOCK(pmap); wasdead = (pvo->pvo_vaddr & PVO_DEAD); - if (!wasdead) + if (!wasdead) { +#ifdef PPC_SUPERPAGES + if (PVO_IS_SP(pvo)) { + CTR1(KTR_PMAP, "%s: demote before remove_all", + __func__); + moea64_sp_demote(pvo); + } +#endif moea64_pvo_remove_from_pmap(pvo); + } moea64_pvo_remove_from_page_locked(pvo, m); if (!wasdead) LIST_INSERT_HEAD(&freequeue, pvo, pvo_vlink); @@ -2648,7 +2905,7 @@ /* Send RC bits to VM */ if ((pvo->pvo_vaddr & PVO_MANAGED) && (pvo->pvo_pte.prot & VM_PROT_WRITE)) { - pg = PHYS_TO_VM_PAGE(pvo->pvo_pte.pa & LPTE_RPGN); + pg = PHYS_TO_VM_PAGE(PVO_PADDR(pvo)); if (pg != NULL) { refchg |= atomic_readandclear_32(&pg->md.mdpg_attrs); if (refchg & LPTE_CHG) @@ -2674,7 +2931,7 @@ /* * Update vm about page writeability/executability if managed */ - PV_LOCKASSERT(pvo->pvo_pte.pa & LPTE_RPGN); + PV_LOCKASSERT(PVO_PADDR(pvo)); if (pvo->pvo_vaddr & PVO_MANAGED) { if (m != NULL) { LIST_REMOVE(pvo, pvo_vlink); @@ -2692,13 +2949,15 @@ 
moea64_pvo_remove_from_page(struct pvo_entry *pvo) { vm_page_t pg = NULL; + vm_paddr_t pa; + pa = PVO_PADDR(pvo); if (pvo->pvo_vaddr & PVO_MANAGED) - pg = PHYS_TO_VM_PAGE(pvo->pvo_pte.pa & LPTE_RPGN); + pg = PHYS_TO_VM_PAGE(pa); - PV_LOCK(pvo->pvo_pte.pa & LPTE_RPGN); + PV_LOCK(pa); moea64_pvo_remove_from_page_locked(pvo, pg); - PV_UNLOCK(pvo->pvo_pte.pa & LPTE_RPGN); + PV_UNLOCK(pa); } static struct pvo_entry * @@ -2718,11 +2977,22 @@ struct pvo_entry *pvo; int64_t ret; boolean_t rv; +#ifdef PPC_SUPERPAGES + vm_page_t sp; +#endif /* * See if this bit is stored in the page already. + * + * For superpages, the bit is stored in the first vm page. */ - if (m->md.mdpg_attrs & ptebit) + if ((m->md.mdpg_attrs & ptebit) != 0 +#ifdef PPC_SUPERPAGES + || ((sp = PHYS_TO_VM_PAGE(VM_PAGE_TO_PHYS(m) & ~SP_MASK)) != NULL && + (sp->md.mdpg_attrs & (ptebit | MDPG_ATTR_SP)) == + (ptebit | MDPG_ATTR_SP)) +#endif + ) return (TRUE); /* @@ -2733,6 +3003,23 @@ powerpc_sync(); PV_PAGE_LOCK(m); LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) { +#ifdef PPC_SUPERPAGES + if (PVO_IS_SP(pvo)) { + ret = moea64_sp_query(pvo, ptebit); + /* + * If SP was not demoted, check its REF/CHG bits here. 
+ */ + if (ret != -1) { + if ((ret & ptebit) != 0) { + rv = TRUE; + break; + } + continue; + } + /* else, fallthrough */ + } +#endif + ret = 0; /* @@ -2778,6 +3065,14 @@ count = 0; PV_PAGE_LOCK(m); LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) { +#ifdef PPC_SUPERPAGES + if (PVO_IS_SP(pvo)) { + if ((ret = moea64_sp_clear(pvo, m, ptebit)) != -1) { + count += ret; + continue; + } + } +#endif ret = 0; PMAP_LOCK(pvo->pvo_pmap); @@ -2810,7 +3105,7 @@ for (pvo = RB_FIND(pvo_tree, &kernel_pmap->pmap_pvo, &key); ppa < pa + size; ppa += PAGE_SIZE, pvo = RB_NEXT(pvo_tree, &kernel_pmap->pmap_pvo, pvo)) { - if (pvo == NULL || (pvo->pvo_pte.pa & LPTE_RPGN) != ppa) { + if (pvo == NULL || PVO_PADDR(pvo) != ppa) { error = EFAULT; break; } @@ -2886,7 +3181,7 @@ len = MIN(lim - va, sz); pvo = moea64_pvo_find_va(pm, va & ~ADDR_POFF); if (pvo != NULL && !(pvo->pvo_pte.pa & LPTE_I)) { - pa = (pvo->pvo_pte.pa & LPTE_RPGN) | (va & ADDR_POFF); + pa = PVO_PADDR(pvo) | (va & ADDR_POFF); moea64_syncicache(pm, va, pa, len); } va += len; @@ -2905,7 +3200,7 @@ extern struct dump_pa dump_map[PHYS_AVAIL_SZ + 1]; void -moea64_scan_init() +moea64_scan_init(void) { struct pvo_entry *pvo; vm_offset_t va; @@ -2968,7 +3263,7 @@ #ifdef __powerpc64__ static size_t -moea64_scan_pmap() +moea64_scan_pmap(void) { struct pvo_entry *pvo; vm_paddr_t pa, pa_end; @@ -3005,7 +3300,7 @@ } } - pa = pvo->pvo_pte.pa & LPTE_RPGN; + pa = PVO_PADDR(pvo); if (va & PVO_LARGE) { pa_end = pa + lpsize; @@ -3037,7 +3332,7 @@ #else static size_t -moea64_scan_pmap() +moea64_scan_pmap(void) { return (0); } @@ -3180,3 +3475,754 @@ DEFINE_OEA64_IFUNC(int64_t, pte_clear, (struct pvo_entry *, uint64_t), moea64_null_method) DEFINE_OEA64_IFUNC(int64_t, pte_synch, (struct pvo_entry *), moea64_null_method) + +#ifdef PPC_SUPERPAGES + +/* MMU interface */ + +static bool +moea64_ps_enabled(pmap_t pmap) +{ + return (sp_enabled); +} + +static void +moea64_align_superpage(vm_object_t object, vm_ooffset_t offset, + vm_offset_t *addr, 
vm_size_t size) +{ + vm_offset_t sp_offset; + + if (size < SP_SIZE) + return; + + CTR4(KTR_PMAP, "%s: offs=%#jx, addr=%p, size=%#jx", + __func__, (uintmax_t)offset, addr, (uintmax_t)size); + + if (object != NULL && (object->flags & OBJ_COLORED) != 0) + offset += ptoa(object->pg_color); + sp_offset = offset & SP_MASK; + if (size - ((SP_SIZE - sp_offset) & SP_MASK) < SP_SIZE || + (*addr & SP_MASK) == sp_offset) + return; + if ((*addr & SP_MASK) < sp_offset) + *addr = (*addr & ~SP_MASK) + sp_offset; + else + *addr = ((*addr + SP_MASK) & ~SP_MASK) + sp_offset; +} + +/* helpers */ + +static __inline void +moea64_pvo_cleanup(struct pvo_dlist *tofree) +{ + struct pvo_entry *pvo; + + /* clean up */ + while (!SLIST_EMPTY(tofree)) { + pvo = SLIST_FIRST(tofree); + SLIST_REMOVE_HEAD(tofree, pvo_dlink); + if (pvo->pvo_vaddr & PVO_DEAD) + moea64_pvo_remove_from_page(pvo); + free_pvo_entry(pvo); + } +} + +static __inline uint16_t +pvo_to_vmpage_flags(struct pvo_entry *pvo) +{ + uint16_t flags; + + flags = 0; + if ((pvo->pvo_pte.prot & VM_PROT_WRITE) != 0) + flags |= PGA_WRITEABLE; + if ((pvo->pvo_pte.prot & VM_PROT_EXECUTE) != 0) + flags |= PGA_EXECUTABLE; + + return (flags); +} + +/* + * Check if the given pvo and its superpage are in sva-eva range. + */ +static __inline bool +moea64_sp_pvo_in_range(struct pvo_entry *pvo, vm_offset_t sva, vm_offset_t eva) +{ + vm_offset_t spva; + + spva = PVO_VADDR(pvo) & ~SP_MASK; + if (spva >= sva && spva + SP_SIZE <= eva) { + /* + * Because this function is intended to be called from loops + * that iterate over ordered pvo entries, if the condition + * above is true then the pvo must be the first of its + * superpage. + */ + KASSERT(PVO_VADDR(pvo) == spva, + ("%s: unexpected unaligned superpage pvo", __func__)); + return (true); + } + return (false); +} + +/* + * Update vm about the REF/CHG bits if the superpage is managed and + * has (or had) write access. 
+ */ +static void +moea64_sp_refchg_process(struct pvo_entry *sp, vm_page_t m, + int64_t sp_refchg, vm_prot_t prot) +{ + vm_page_t m_end; + int64_t refchg; + + if ((sp->pvo_vaddr & PVO_MANAGED) != 0 && (prot & VM_PROT_WRITE) != 0) { + for (m_end = &m[SP_PAGES]; m < m_end; m++) { + refchg = sp_refchg | + atomic_readandclear_32(&m->md.mdpg_attrs); + if (refchg & LPTE_CHG) + vm_page_dirty(m); + if (refchg & LPTE_REF) + vm_page_aflag_set(m, PGA_REFERENCED); + } + } +} + +/* Superpage ops */ + +static int +moea64_sp_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, + vm_prot_t prot, u_int flags, int8_t psind) +{ + struct pvo_entry *pvo, **pvos; + struct pvo_head *pvo_head; + vm_offset_t sva; + vm_page_t sm; + vm_paddr_t pa; + bool sync; + struct pvo_dlist tofree; + int error, i; + uint16_t aflags; + + KASSERT((va & SP_MASK) == 0, ("%s: va %#jx unaligned", + __func__, (uintmax_t)va)); + KASSERT(psind == 1, ("%s: invalid psind: %d", __func__, psind)); + KASSERT(m->psind == 1, ("%s: invalid m->psind: %d", + __func__, m->psind)); + KASSERT(pmap != kernel_pmap, + ("%s: function called with kernel pmap", __func__)); + + CTR5(KTR_PMAP, "%s: va=%#jx, pa=%#jx, prot=%#x, flags=%#x, psind=1", + __func__, (uintmax_t)va, (uintmax_t)VM_PAGE_TO_PHYS(m), + prot, flags); + + SLIST_INIT(&tofree); + + sva = va; + sm = m; + pa = VM_PAGE_TO_PHYS(sm); + + /* Try to allocate all PVOs first, to make failure handling easier. */ + pvos = malloc(SP_PAGES * sizeof(struct pvo_entry *), M_TEMP, M_NOWAIT); + if (pvos == NULL) { + CTR1(KTR_PMAP, "%s: failed to alloc pvo array", __func__); + return (KERN_RESOURCE_SHORTAGE); + } + + for (i = 0; i < SP_PAGES; i++) { + pvos[i] = alloc_pvo_entry(0); + if (pvos[i] == NULL) { + CTR1(KTR_PMAP, "%s: failed to alloc pvo", __func__); + for (i = i - 1; i >= 0; i--) + free_pvo_entry(pvos[i]); + free(pvos, M_TEMP); + return (KERN_RESOURCE_SHORTAGE); + } + } + + PV_PAGE_LOCK(sm); + PMAP_LOCK(pmap); + + /* Note: moea64_remove_locked() also clears cached REF/CHG bits. 
*/ + moea64_remove_locked(pmap, va, va + SP_SIZE, &tofree); + + /* Enter pages */ + for (i = 0; i < SP_PAGES; + i++, va += PAGE_SIZE, pa += PAGE_SIZE, m++) { + pvo = pvos[i]; + + pvo->pvo_pte.prot = prot; + pvo->pvo_pte.pa = (pa & ~LPTE_LP_MASK) | LPTE_LP_4K_16M | + moea64_calc_wimg(pa, pmap_page_get_memattr(m)); + + if ((flags & PMAP_ENTER_WIRED) != 0) + pvo->pvo_vaddr |= PVO_WIRED; + pvo->pvo_vaddr |= PVO_LARGE; + + if ((m->oflags & VPO_UNMANAGED) != 0) + pvo_head = NULL; + else { + pvo_head = &m->md.mdpg_pvoh; + pvo->pvo_vaddr |= PVO_MANAGED; + } + + init_pvo_entry(pvo, pmap, va); + + error = moea64_pvo_enter(pvo, pvo_head, NULL); + /* + * All superpage PVOs were previously removed, so no errors + * should occur while inserting the new ones. + */ + KASSERT(error == 0, ("%s: unexpected error " + "when inserting superpage PVO: %d", + __func__, error)); + } + + PMAP_UNLOCK(pmap); + PV_PAGE_UNLOCK(sm); + + sync = (sm->a.flags & PGA_EXECUTABLE) == 0; + /* Note: moea64_pvo_cleanup() also clears page prot. flags. */ + moea64_pvo_cleanup(&tofree); + pvo = pvos[0]; + + /* Set vm page flags */ + aflags = pvo_to_vmpage_flags(pvo); + if (aflags != 0) + for (m = sm; m < &sm[SP_PAGES]; m++) + vm_page_aflag_set(m, aflags); + + /* + * Flush the page from the instruction cache if this page is + * mapped executable and cacheable. + */ + if (sync && (pvo->pvo_pte.pa & (LPTE_I | LPTE_G | LPTE_NOEXEC)) == 0) + moea64_syncicache(pmap, sva, VM_PAGE_TO_PHYS(sm), SP_SIZE); + + atomic_add_long(&sp_mappings, 1); + CTR3(KTR_PMAP, "%s: SP success for va %#jx in pmap %p", + __func__, (uintmax_t)sva, pmap); + + free(pvos, M_TEMP); + return (KERN_SUCCESS); +} + +static int +moea64_sp_promote(pmap_t pmap, vm_offset_t va, vm_page_t m) +{ + struct pvo_entry *first, *pvo; + vm_paddr_t pa, pa_end; + vm_offset_t sva, va_end; + int64_t sp_refchg; + + /* Return if page promotion is not possible. 
*/ + if ((m->flags & PG_FICTITIOUS) != 0 || + vm_reserv_level_iffullpop(m) != 0 || !moea64_ps_enabled(pmap)) + return (1); + + /* This CTR may generate a lot of output. */ + /* CTR2(KTR_PMAP, "%s: va=%#jx", __func__, (uintmax_t)va); */ + + va &= ~SP_MASK; + sva = va; + /* Get superpage */ + m = PHYS_TO_VM_PAGE(VM_PAGE_TO_PHYS(m) & ~SP_MASK); + + PV_PAGE_LOCK(m); + PMAP_LOCK(pmap); + + /* + * Check if all pages meet promotion criteria. + * + * XXX In some cases the loop below may be executed for each or most + * of the entered pages of a superpage, which can be expensive + * (although it was not profiled) and need some optimization. + * + * Some cases where this seems to happen are: + * - When a superpage is first entered read-only and later becomes + * read-write. + * - When some of the superpage's virtual addresses map to previously + * wired/cached pages while others map to pages allocated from a + * different physical address range. A common scenario where this + * happens is when mmap'ing a file that is already present in FS + * block cache and doesn't fill a superpage. + */ + first = pvo = moea64_pvo_find_va(pmap, sva); + for (pa = VM_PAGE_TO_PHYS(m), pa_end = pa + SP_SIZE; + pa < pa_end; pa += PAGE_SIZE, va += PAGE_SIZE) { + if (pvo == NULL || (pvo->pvo_vaddr & PVO_DEAD) != 0) { + CTR3(KTR_PMAP, + "%s: null or dead pvo for va %#jx pmap %p", + __func__, (uintmax_t)va, pmap); + goto error; + } + + if (PVO_PADDR(pvo) != pa) { + CTR3(KTR_PMAP, "%s: failure for va %#jx pmap %p " + "(pa doesn't match)", + __func__, (uintmax_t)va, pmap); + goto error; + } + + if ((first->pvo_vaddr & PVO_PROMOTE) != + (pvo->pvo_vaddr & PVO_PROMOTE) || + first->pvo_pte.prot != pvo->pvo_pte.prot || + (first->pvo_pte.pa & LPTE_WIMG) != + (pvo->pvo_pte.pa & LPTE_WIMG)) { + /* This CTR may generate a lot of output. 
*/ + /* + CTR3(KTR_PMAP, "%s: failure for va %#jx pmap %p " + "(pvo flags don't match)", + __func__, (uintmax_t)va, pmap); + */ + goto error; + } + + pvo = RB_NEXT(pvo_tree, &pmap->pmap_pvo, pvo); + } + + /* All OK, promote. */ + + /* + * Handle superpage REF/CHG bits. If REF or CHG is set in + * any page, then it must be set in the superpage. + * + * Instead of querying each page, we take advantage of two facts: + * 1- If a page is being promoted, it was referenced. + * 2- If promoted pages are writable, they were modified. + */ + sp_refchg = LPTE_REF | + ((first->pvo_pte.prot & VM_PROT_WRITE) != 0 ? LPTE_CHG : 0); + + /* Promote pages */ + pvo = first; + for (va = PVO_VADDR(pvo), va_end = va + SP_SIZE; + va < va_end; va += PAGE_SIZE) { + KASSERT(pvo && PVO_VADDR(pvo) == va, + ("%s: missing PVO for va %#jx", __func__, (uintmax_t)va)); + pvo->pvo_pte.pa &= ~LPTE_LP_MASK; + pvo->pvo_pte.pa |= LPTE_LP_4K_16M; + pvo->pvo_vaddr |= PVO_LARGE; + + moea64_pte_replace(pvo, MOEA64_PTE_INVALIDATE); + + pvo = RB_NEXT(pvo_tree, &pmap->pmap_pvo, pvo); + } + + /* Send REF/CHG bits to VM */ + moea64_sp_refchg_process(first, m, sp_refchg, first->pvo_pte.prot); + + /* Use first page to cache REF/CHG bits */ + atomic_set_32(&m->md.mdpg_attrs, sp_refchg | MDPG_ATTR_SP); + + PMAP_UNLOCK(pmap); + PV_PAGE_UNLOCK(m); + + atomic_add_long(&sp_mappings, 1); + atomic_add_long(&sp_promotions, 1); + CTR3(KTR_PMAP, "%s: success for va %#jx in pmap %p", + __func__, (uintmax_t)sva, pmap); + + return (0); + +error: + atomic_add_long(&sp_p_failures, 1); + PMAP_UNLOCK(pmap); + PV_PAGE_UNLOCK(m); + return (1); +} + +static void +moea64_sp_demote_aligned(struct pvo_entry *sp) +{ + struct pvo_entry *pvo; + vm_offset_t va, va_end; + vm_paddr_t pa; + vm_page_t m; + pmap_t pmap; + int64_t ret, refchg; + + CTR2(KTR_PMAP, "%s: va=%#jx", __func__, (uintmax_t)PVO_VADDR(sp)); + + pmap = sp->pvo_pmap; + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + + pvo = sp; + pa = PVO_PADDR(pvo); + m = PHYS_TO_VM_PAGE(pa); + refchg = 
0; + + /* Demote pages */ + for (va = PVO_VADDR(pvo), va_end = va + SP_SIZE; + va < va_end; va += PAGE_SIZE, pa += PAGE_SIZE) { + KASSERT(pvo && PVO_VADDR(pvo) == va, + ("%s: missing PVO for va %#jx", __func__, (uintmax_t)va)); + pvo->pvo_vaddr &= ~PVO_LARGE; + pvo->pvo_pte.pa &= ~LPTE_RPGN; + pvo->pvo_pte.pa |= pa; + + ret = moea64_pte_replace(pvo, MOEA64_PTE_INVALIDATE); + if (ret < 0) + refchg |= LPTE_CHG; + else + refchg |= ret; + + pvo = RB_NEXT(pvo_tree, &pmap->pmap_pvo, pvo); + } + + /* Clear SP flag */ + atomic_clear_32(&m->md.mdpg_attrs, MDPG_ATTR_SP); + + /* + * Handle superpage REF/CHG bits. A bit set in the superpage + * means all pages should consider it set. + */ + moea64_sp_refchg_process(sp, m, refchg, sp->pvo_pte.prot); + + atomic_add_long(&sp_demotions, 1); + CTR3(KTR_PMAP, "%s: success for va %#jx in pmap %p", + __func__, (uintmax_t)PVO_VADDR(sp), pmap); +} +/* Demote the superpage containing pvo, which may be any of its PVOs. */ +static void +moea64_sp_demote(struct pvo_entry *pvo) +{ + vm_offset_t spva = PVO_VADDR(pvo) & ~SP_MASK; /* superpage base VA */ + + PMAP_LOCK_ASSERT(pvo->pvo_pmap, MA_OWNED); + if (PVO_VADDR(pvo) != spva) { + pvo = moea64_pvo_find_va(pvo->pvo_pmap, spva); /* NULL if missing */ + KASSERT(pvo != NULL, ("%s: missing PVO for va %#jx", + __func__, (uintmax_t)spva)); /* spva: pvo is NULL on failure */ + } + moea64_sp_demote_aligned(pvo); +} + +static struct pvo_entry * +moea64_sp_unwire(struct pvo_entry *sp) +{ + struct pvo_entry *pvo, *prev; + vm_offset_t eva; + pmap_t pm; + int64_t ret, refchg; + + CTR2(KTR_PMAP, "%s: va=%#jx", __func__, (uintmax_t)PVO_VADDR(sp)); + + pm = sp->pvo_pmap; + PMAP_LOCK_ASSERT(pm, MA_OWNED); + + eva = PVO_VADDR(sp) + SP_SIZE; + refchg = 0; + for (pvo = sp; pvo != NULL && PVO_VADDR(pvo) < eva; + prev = pvo, pvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo)) { + if ((pvo->pvo_vaddr & PVO_WIRED) == 0) + panic("%s: pvo %p is missing PVO_WIRED", + __func__, pvo); + pvo->pvo_vaddr &= ~PVO_WIRED; + + ret = moea64_pte_replace(pvo, 0 /* No invalidation */); + if (ret < 0) + refchg |= LPTE_CHG; + else + refchg |= ret; + + pm->pm_stats.wired_count--; + } 
+ + /* Send REF/CHG bits to VM */ + moea64_sp_refchg_process(sp, PHYS_TO_VM_PAGE(PVO_PADDR(sp)), + refchg, sp->pvo_pte.prot); + + return (prev); +} + +static struct pvo_entry * +moea64_sp_protect(struct pvo_entry *sp, vm_prot_t prot) +{ + struct pvo_entry *pvo, *prev; + vm_offset_t eva; + pmap_t pm; + vm_page_t m, m_end; + int64_t ret, refchg; + vm_prot_t oldprot; + + CTR3(KTR_PMAP, "%s: va=%#jx, prot=%x", + __func__, (uintmax_t)PVO_VADDR(sp), prot); + + pm = sp->pvo_pmap; + PMAP_LOCK_ASSERT(pm, MA_OWNED); + + oldprot = sp->pvo_pte.prot; + m = PHYS_TO_VM_PAGE(PVO_PADDR(sp)); + KASSERT(m != NULL, ("%s: missing vm page for pa %#jx", + __func__, (uintmax_t)PVO_PADDR(sp))); + eva = PVO_VADDR(sp) + SP_SIZE; + refchg = 0; + + for (pvo = sp; pvo != NULL && PVO_VADDR(pvo) < eva; + prev = pvo, pvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo)) { + pvo->pvo_pte.prot = prot; + /* + * If the PVO is in the page table, update mapping + */ + ret = moea64_pte_replace(pvo, MOEA64_PTE_PROT_UPDATE); + if (ret < 0) + refchg |= LPTE_CHG; + else + refchg |= ret; + } + + /* Send REF/CHG bits to VM */ + moea64_sp_refchg_process(sp, m, refchg, oldprot); + + /* Handle pages that became executable */ + if ((m->a.flags & PGA_EXECUTABLE) == 0 && + (sp->pvo_pte.pa & (LPTE_I | LPTE_G | LPTE_NOEXEC)) == 0) { + if ((m->oflags & VPO_UNMANAGED) == 0) + for (m_end = &m[SP_PAGES]; m < m_end; m++) + vm_page_aflag_set(m, PGA_EXECUTABLE); + moea64_syncicache(pm, PVO_VADDR(sp), PVO_PADDR(sp), SP_SIZE); + } + + return (prev); +} + +static struct pvo_entry * +moea64_sp_remove(struct pvo_entry *sp, struct pvo_dlist *tofree) +{ + struct pvo_entry *pvo, *tpvo; + vm_offset_t eva; + pmap_t pm; + + CTR2(KTR_PMAP, "%s: va=%#jx", __func__, (uintmax_t)PVO_VADDR(sp)); + + pm = sp->pvo_pmap; + PMAP_LOCK_ASSERT(pm, MA_OWNED); + + eva = PVO_VADDR(sp) + SP_SIZE; + for (pvo = sp; pvo != NULL && PVO_VADDR(pvo) < eva; pvo = tpvo) { + tpvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo); + + /* + * For locking reasons, remove this from 
the page table and + * pmap, but save delinking from the vm_page for a second + * pass + */ + moea64_pvo_remove_from_pmap(pvo); + SLIST_INSERT_HEAD(tofree, pvo, pvo_dlink); + } + + /* Clear SP bit */ + atomic_clear_32(&PHYS_TO_VM_PAGE(PVO_PADDR(sp))->md.mdpg_attrs, + MDPG_ATTR_SP); + + return (tpvo); +} + +static int64_t +moea64_sp_query_locked(struct pvo_entry *pvo, uint64_t ptebit) +{ + int64_t refchg, ret; + vm_offset_t eva; + vm_page_t m; + pmap_t pmap; + struct pvo_entry *sp; + + pmap = pvo->pvo_pmap; + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + + /* Get first SP PVO */ + if ((PVO_VADDR(pvo) & SP_MASK) != 0) { + sp = moea64_pvo_find_va(pmap, PVO_VADDR(pvo) & ~SP_MASK); + KASSERT(sp != NULL, ("%s: missing PVO for va %#jx", + __func__, (uintmax_t)(PVO_VADDR(pvo) & ~SP_MASK))); + } else + sp = pvo; + eva = PVO_VADDR(sp) + SP_SIZE; + + refchg = 0; + for (pvo = sp; pvo != NULL && PVO_VADDR(pvo) < eva; + pvo = RB_NEXT(pvo_tree, &pmap->pmap_pvo, pvo)) { + ret = moea64_pte_synch(pvo); + if (ret > 0) { + refchg |= ret & (LPTE_CHG | LPTE_REF); + if ((refchg & ptebit) != 0) + break; + } + } + + /* Save results */ + if (refchg != 0) { + m = PHYS_TO_VM_PAGE(PVO_PADDR(sp)); + atomic_set_32(&m->md.mdpg_attrs, refchg | MDPG_ATTR_SP); + } + + return (refchg); +} + +static int64_t +moea64_sp_query(struct pvo_entry *pvo, uint64_t ptebit) +{ + int64_t refchg; + pmap_t pmap; + + pmap = pvo->pvo_pmap; + PMAP_LOCK(pmap); + + /* + * Check if SP was demoted/removed before pmap lock was acquired. 
+ */ + if (!PVO_IS_SP(pvo) || (pvo->pvo_vaddr & PVO_DEAD) != 0) { + CTR2(KTR_PMAP, "%s: demoted/removed: pa=%#jx", + __func__, (uintmax_t)PVO_PADDR(pvo)); + PMAP_UNLOCK(pmap); + return (-1); + } + + refchg = moea64_sp_query_locked(pvo, ptebit); + PMAP_UNLOCK(pmap); + + CTR4(KTR_PMAP, "%s: va=%#jx, pa=%#jx: refchg=%#jx", + __func__, (uintmax_t)PVO_VADDR(pvo), + (uintmax_t)PVO_PADDR(pvo), (uintmax_t)refchg); + + return (refchg); +} + +static int64_t +moea64_sp_pvo_clear(struct pvo_entry *pvo, uint64_t ptebit) +{ + int64_t refchg, ret; + pmap_t pmap; + struct pvo_entry *sp; + vm_offset_t eva; + vm_page_t m; + + pmap = pvo->pvo_pmap; + PMAP_LOCK(pmap); + + /* + * Check if SP was demoted/removed before pmap lock was acquired. + */ + if (!PVO_IS_SP(pvo) || (pvo->pvo_vaddr & PVO_DEAD) != 0) { + CTR2(KTR_PMAP, "%s: demoted/removed: pa=%#jx", + __func__, (uintmax_t)PVO_PADDR(pvo)); + PMAP_UNLOCK(pmap); + return (-1); + } + + /* Get first SP PVO */ + if ((PVO_VADDR(pvo) & SP_MASK) != 0) { + sp = moea64_pvo_find_va(pmap, PVO_VADDR(pvo) & ~SP_MASK); + KASSERT(sp != NULL, ("%s: missing PVO for va %#jx", + __func__, (uintmax_t)(PVO_VADDR(pvo) & ~SP_MASK))); + } else + sp = pvo; + eva = PVO_VADDR(sp) + SP_SIZE; + + refchg = 0; + for (pvo = sp; pvo != NULL && PVO_VADDR(pvo) < eva; + pvo = RB_NEXT(pvo_tree, &pmap->pmap_pvo, pvo)) { + ret = moea64_pte_clear(pvo, ptebit); + if (ret > 0) + refchg |= ret & (LPTE_CHG | LPTE_REF); + } + + m = PHYS_TO_VM_PAGE(PVO_PADDR(sp)); + atomic_clear_32(&m->md.mdpg_attrs, ptebit); + PMAP_UNLOCK(pmap); + + CTR4(KTR_PMAP, "%s: va=%#jx, pa=%#jx: refchg=%#jx", + __func__, (uintmax_t)PVO_VADDR(sp), + (uintmax_t)PVO_PADDR(sp), (uintmax_t)refchg); + + return (refchg); +} + +static int64_t +moea64_sp_clear(struct pvo_entry *pvo, vm_page_t m, uint64_t ptebit) +{ + int64_t count, ret; + pmap_t pmap; + + count = 0; + pmap = pvo->pvo_pmap; + + /* + * Since this reference bit is shared by 4096 4KB pages, it + * should not be cleared every time it is tested. 
Apply a + * simple "hash" function on the physical page number, the + * virtual superpage number, and the pmap address to select + * one 4KB page out of the 4096 on which testing the + * reference bit will result in clearing that reference bit. + * This function is designed to avoid the selection of the + * same 4KB page for every 16MB page mapping. + * + * Always leave the reference bit of a wired mapping set, as + * the current state of its reference bit won't affect page + * replacement. + */ + if (ptebit == LPTE_REF && (((VM_PAGE_TO_PHYS(m) >> PAGE_SHIFT) ^ + (PVO_VADDR(pvo) >> SP_SHIFT) ^ (uintptr_t)pmap) & + (SP_PAGES - 1)) == 0 && (pvo->pvo_vaddr & PVO_WIRED) == 0) { + if ((ret = moea64_sp_pvo_clear(pvo, ptebit)) == -1) + return (-1); + + if ((ret & ptebit) != 0) + count++; + + /* + * If this page was not selected by the hash function, then assume + * its REF bit was set. + */ + } else if (ptebit == LPTE_REF) { + count++; + + /* + * To clear the CHG bit of a single page within an SP, the SP must + * first be demoted. But if no CHG bit is set, there is nothing to + * clear and thus no SP demotion is needed. + */ + } else { + CTR4(KTR_PMAP, "%s: ptebit=%#jx, va=%#jx, pa=%#jx", + __func__, (uintmax_t)ptebit, (uintmax_t)PVO_VADDR(pvo), + (uintmax_t)PVO_PADDR(pvo)); + + PMAP_LOCK(pmap); + + /* + * Make sure SP wasn't demoted/removed before pmap lock + * was acquired. + */ + if (!PVO_IS_SP(pvo) || (pvo->pvo_vaddr & PVO_DEAD) != 0) { + CTR2(KTR_PMAP, "%s: demoted/removed: pa=%#jx", + __func__, (uintmax_t)PVO_PADDR(pvo)); + PMAP_UNLOCK(pmap); + return (-1); + } + + ret = moea64_sp_query_locked(pvo, ptebit); + if ((ret & ptebit) != 0) + count++; + else { + PMAP_UNLOCK(pmap); + return (0); + } + + moea64_sp_demote(pvo); + moea64_pte_clear(pvo, ptebit); + + /* + * Write protect the mapping to a single page so that a + * subsequent write access may repromote.
+ */ + if ((pvo->pvo_vaddr & PVO_WIRED) == 0) + moea64_pvo_protect(pmap, pvo, + pvo->pvo_pte.prot & ~VM_PROT_WRITE); + + PMAP_UNLOCK(pmap); + } + + return (count); +} + +#endif Index: sys/powerpc/aim/moea64_native.c =================================================================== --- sys/powerpc/aim/moea64_native.c +++ sys/powerpc/aim/moea64_native.c @@ -132,11 +132,26 @@ /* POWER9 only permits a 64k partition table size. */ #define PART_SIZE 0x10000 +/* + * Actual page sizes (to be used with tlbie, when L=0) + * + * Kernel currently uses large pages (L=1, 16M base/16M actual page size) + * for DMAP only, which, unlike user pages, are entered and not touched + * afterwards. Pages used by userspace have a base size of 4K and may be + * promoted to 16M large pages (L=0, 4K base/16M actual page size, a.k.a. + * superpages), resulting in a mix of 4K and 16M pages. As operations with + * large kernel pages don't call TLBIE, it is safe to assume that if + * LPTE_BIG is set, we are dealing with a 4K base/16M actual page + * (i.e., AP_16M). + */ +#define AP_4K 0x00 +#define AP_16M 0x80 + static bool moea64_crop_tlbie; static bool moea64_need_lock; static __inline void -TLBIE(uint64_t vpn) { +TLBIE(uint64_t vpn, uint64_t ap) { #ifndef __powerpc64__ register_t vpn_hi, vpn_lo; register_t msr; @@ -168,7 +183,7 @@ * in the newer form is in the same position as the L(page size) bit of * the old form, so a slong as RS is 0, we're good on both sides. */ - __asm __volatile("li 0, 0 \n tlbie %0" :: "r"(vpn) : "r0", "memory"); + __asm __volatile("li 0, 0 \n tlbie %0" :: "r"(vpn | ap) : "r0", "memory"); __asm __volatile("eieio; tlbsync; ptesync" ::: "memory"); #else vpn_hi = (uint32_t)(vpn >> 32); @@ -321,7 +336,8 @@ rw_runlock(&moea64_eviction_lock); critical_enter(); - TLBIE(pvo->pvo_vpn); + TLBIE(pvo->pvo_vpn, + properpt.pte_hi & LPTE_BIG ?
AP_16M : AP_4K); critical_exit(); } else { rw_runlock(&moea64_eviction_lock); @@ -356,7 +372,7 @@ critical_enter(); pt->pte_hi = be64toh((pt->pte_hi & ~LPTE_VALID) | LPTE_LOCKED); PTESYNC(); - TLBIE(pvo->pvo_vpn); + TLBIE(pvo->pvo_vpn, pt->pte_hi & LPTE_BIG ? AP_16M : AP_4K); ptelo = be64toh(pt->pte_lo); *((volatile int32_t *)(&pt->pte_hi) + 1) = 0; /* Release lock */ critical_exit(); @@ -394,7 +410,7 @@ critical_enter(); pt->pte_hi = be64toh((pt->pte_hi & ~LPTE_VALID) | LPTE_LOCKED); PTESYNC(); - TLBIE(pvo->pvo_vpn); + TLBIE(pvo->pvo_vpn, pt->pte_hi & LPTE_BIG ? AP_16M : AP_4K); ptelo = be64toh(pt->pte_lo); EIEIO(); pt->pte_lo = htobe64(properpt.pte_lo); @@ -702,7 +718,7 @@ va |= (oldptehi & LPTE_AVPN_MASK) << (ADDR_API_SHFT64 - ADDR_PIDX_SHFT); PTESYNC(); - TLBIE(va); + TLBIE(va, oldptehi & LPTE_BIG ? AP_16M : AP_4K); STAT_MOEA64(moea64_pte_valid--); STAT_MOEA64(moea64_pte_overflow++); } Index: sys/powerpc/include/param.h =================================================================== --- sys/powerpc/include/param.h +++ sys/powerpc/include/param.h @@ -120,6 +120,15 @@ #define L3_PAGE_SIZE (1UL<pvo_vaddr & ~ADDR_POFF) #define PVO_PTEGIDX_GET(pvo) ((pvo)->pvo_vaddr & PVO_PTEGIDX_MASK) #define PVO_PTEGIDX_ISSET(pvo) ((pvo)->pvo_vaddr & PVO_PTEGIDX_VALID) Index: sys/powerpc/include/pte.h =================================================================== --- sys/powerpc/include/pte.h +++ sys/powerpc/include/pte.h @@ -120,8 +120,13 @@ #define LPTE_VALID 0x0000000000000001ULL /* Low quadword: */ +#define LP_4K_16M 0x38 /* 4KB base, 16MB actual page size */ + #define EXTEND_PTE(x) UINT64_C(x) /* make constants 64-bit */ #define LPTE_RPGN 0xfffffffffffff000ULL +#define LPTE_LP_MASK 0x00000000000ff000ULL +#define LPTE_LP_SHIFT 12 +#define LPTE_LP_4K_16M ((unsigned long long)(LP_4K_16M) << LPTE_LP_SHIFT) #define LPTE_REF EXTEND_PTE( PTE_REF ) #define LPTE_CHG EXTEND_PTE( PTE_CHG ) #define LPTE_WIMG EXTEND_PTE( PTE_WIMG ) Index: sys/powerpc/include/slb.h 
=================================================================== --- sys/powerpc/include/slb.h +++ sys/powerpc/include/slb.h @@ -64,6 +64,14 @@ #define SLBE_ESID_MASK 0xfffffffff0000000UL /* Effective segment ID mask */ #define SLBE_ESID_SHIFT 28 +/* + * SLB page sizes encoding, as present in property ibm,segment-page-sizes + * of CPU device tree node. + * + * See LoPAPR: CPU Node Properties, section C.6.1.4. + */ +#define SLB_PGSZ_4K_4K 0 + /* Virtual real-mode VSID in LPARs */ #define VSID_VRMA 0x1ffffff Index: sys/powerpc/include/vmparam.h =================================================================== --- sys/powerpc/include/vmparam.h +++ sys/powerpc/include/vmparam.h @@ -186,30 +186,41 @@ #define VM_FREELIST_DEFAULT 0 /* - * The largest allocation size is 4MB. + * By default, enable superpages for PPC64, except for BOOKE (that uses + * a different MMU). */ +#if defined(__powerpc64__) && !defined(BOOKE) +#define PPC_SUPERPAGES +#endif + #ifdef __powerpc64__ +/* The largest allocation size is 16MB. */ #define VM_NFREEORDER 13 #else +/* The largest allocation size is 4MB. */ #define VM_NFREEORDER 11 #endif #ifndef VM_NRESERVLEVEL #ifdef __powerpc64__ +/* Enable superpage reservations: 1 level. */ #define VM_NRESERVLEVEL 1 #else -/* - * Disable superpage reservations. - */ +/* Disable superpage reservations. */ #define VM_NRESERVLEVEL 0 #endif #endif -/* - * Level 0 reservations consist of 512 pages. - */ #ifndef VM_LEVEL_0_ORDER -#define VM_LEVEL_0_ORDER 9 +/* Level 0 reservations consist of 512 (RPT) or 4096 (HPT) pages. 
*/ +#define VM_LEVEL_0_ORDER vm_level_0_order +#ifndef __ASSEMBLER__ +extern int vm_level_0_order; +#endif +#endif + +#ifndef VM_LEVEL_0_ORDER_MAX +#define VM_LEVEL_0_ORDER_MAX 12 #endif #ifdef __powerpc64__ Index: sys/powerpc/powernv/platform_powernv.c =================================================================== --- sys/powerpc/powernv/platform_powernv.c +++ sys/powerpc/powernv/platform_powernv.c @@ -142,6 +142,7 @@ phandle_t opal; int res, len, idx; register_t msr; + bool has_lp; /* Ping OPAL again just to make sure */ opal_check(); @@ -225,6 +226,7 @@ sizeof(arr)); len /= 4; idx = 0; + has_lp = false; while (len > 0) { shift = arr[idx]; slb_encoding = arr[idx + 1]; @@ -235,17 +237,21 @@ lp_size = arr[idx]; lp_encoding = arr[idx+1]; if (slb_encoding == SLBV_L && lp_encoding == 0) - break; + has_lp = true; + + if (slb_encoding == SLB_PGSZ_4K_4K && + lp_encoding == LP_4K_16M) + moea64_has_lp_4k_16m = true; idx += 2; len -= 2; nptlp--; } - if (nptlp && slb_encoding == SLBV_L && lp_encoding == 0) + if (has_lp && moea64_has_lp_4k_16m) break; } - if (len == 0) + if (!has_lp) panic("Standard large pages (SLB[L] = 1, PTE[LP] = 0) " "not supported by this system."); Index: sys/powerpc/powerpc/pmap_dispatch.c =================================================================== --- sys/powerpc/powerpc/pmap_dispatch.c +++ sys/powerpc/powerpc/pmap_dispatch.c @@ -77,6 +77,8 @@ caddr_t crashdumpmap; int pmap_bootstrapped; +/* Default level 0 reservations consist of 512 pages (2MB superpage). 
*/ +int vm_level_0_order = 9; #ifdef AIM int Index: sys/powerpc/pseries/mmu_phyp.c =================================================================== --- sys/powerpc/pseries/mmu_phyp.c +++ sys/powerpc/pseries/mmu_phyp.c @@ -135,6 +135,7 @@ uint64_t vsid; phandle_t dev, node, root; int idx, len, res; + bool has_lp; rm_init(&mphyp_eviction_lock, "pte eviction"); @@ -199,6 +200,7 @@ sizeof(arr)); len /= 4; idx = 0; + has_lp = false; while (len > 0) { shift = arr[idx]; slb_encoding = arr[idx + 1]; @@ -220,18 +222,22 @@ lp_encoding); if (slb_encoding == SLBV_L && lp_encoding == 0) - break; + has_lp = true; + + if (slb_encoding == SLB_PGSZ_4K_4K && + lp_encoding == LP_4K_16M) + moea64_has_lp_4k_16m = true; idx += 2; len -= 2; nptlp--; } dprintf("\n"); - if (nptlp && slb_encoding == SLBV_L && lp_encoding == 0) + if (has_lp && moea64_has_lp_4k_16m) break; } - if (len > 0) { + if (has_lp) { moea64_large_page_shift = shift; moea64_large_page_size = 1ULL << lp_size; moea64_large_page_mask = moea64_large_page_size - 1; Index: sys/vm/vm_fault.c =================================================================== --- sys/vm/vm_fault.c +++ sys/vm/vm_fault.c @@ -497,7 +497,8 @@ pidx += npages, m = vm_page_next(&m[npages - 1])) { vaddr = fs->entry->start + IDX_TO_OFF(pidx) - fs->entry->offset; #if defined(__aarch64__) || defined(__amd64__) || (defined(__arm__) && \ - __ARM_ARCH >= 6) || defined(__i386__) || defined(__riscv) + __ARM_ARCH >= 6) || defined(__i386__) || defined(__riscv) || \ + defined(__powerpc64__) psind = m->psind; if (psind > 0 && ((vaddr & (pagesizes[psind] - 1)) != 0 || pidx + OFF_TO_IDX(pagesizes[psind]) - 1 > pager_last ||