D25237.id73228.diff

Index: sys/powerpc/aim/mmu_oea64.h
===================================================================
--- sys/powerpc/aim/mmu_oea64.h
+++ sys/powerpc/aim/mmu_oea64.h
@@ -129,6 +129,7 @@
extern u_long moea64_pteg_count;
extern u_long moea64_pteg_mask;
extern int n_slbs;
+extern bool moea64_has_lp_4k_16m;
#endif /* _POWERPC_AIM_MMU_OEA64_H */
Index: sys/powerpc/aim/mmu_oea64.c
===================================================================
--- sys/powerpc/aim/mmu_oea64.c
+++ sys/powerpc/aim/mmu_oea64.c
@@ -80,6 +80,7 @@
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/vm_pageout.h>
+#include <vm/vm_reserv.h>
#include <vm/uma.h>
#include <machine/_inttypes.h>
@@ -228,6 +229,7 @@
uint64_t moea64_large_page_mask = 0;
uint64_t moea64_large_page_size = 0;
int moea64_large_page_shift = 0;
+bool moea64_has_lp_4k_16m = false;
/*
* PVO calls.
@@ -249,6 +251,96 @@
static void moea64_syncicache(pmap_t pmap, vm_offset_t va,
vm_paddr_t pa, vm_size_t sz);
static void moea64_pmap_init_qpages(void);
+static void moea64_remove_locked(pmap_t, vm_offset_t,
+ vm_offset_t, struct pvo_dlist *);
+
+/*
+ * Superpages data and routines.
+ */
+#define SP_SHIFT (VM_LEVEL_0_ORDER + PAGE_SHIFT)
+#define SP_SIZE (1 << SP_SHIFT)
+#define SP_MASK (SP_SIZE - 1)
+#define SP_PAGES (1 << VM_LEVEL_0_ORDER)
+
+/* PVO (vaddr) bits that must match for promotion to succeed. */
+#define PVO_PROMOTE (PVO_WIRED | PVO_MANAGED | PVO_LARGE | \
+ PVO_PTEGIDX_VALID)
+
+#define PVO_IS_SP(pvo) (((pvo)->pvo_vaddr & PVO_LARGE) && \
+ (pvo)->pvo_pmap != kernel_pmap)
+
+/* Get physical address from PVO. */
+#define PVO_PADDR(pvo) moea64_pvo_paddr(pvo)
+
+/* MD page flag indicating that the page is a superpage. */
+#define MDPG_ATTR_SP 0x40000000
+
+static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0,
+ "VM/pmap parameters");
+
+static int pg_ps_enabled = 0;
+SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN,
+ &pg_ps_enabled, 0, "Enable support for transparent superpages");
+
+static SYSCTL_NODE(_vm_pmap, OID_AUTO, sp, CTLFLAG_RD, 0,
+ "SP page mapping counters");
+
+static u_long sp_demotions;
+SYSCTL_ULONG(_vm_pmap_sp, OID_AUTO, demotions, CTLFLAG_RD,
+ &sp_demotions, 0, "SP page demotions");
+
+static u_long sp_mappings;
+SYSCTL_ULONG(_vm_pmap_sp, OID_AUTO, mappings, CTLFLAG_RD,
+ &sp_mappings, 0, "SP page mappings");
+
+static u_long sp_p_failures;
+SYSCTL_ULONG(_vm_pmap_sp, OID_AUTO, p_failures, CTLFLAG_RD,
+ &sp_p_failures, 0, "SP page promotion failures");
+
+static u_long sp_p_fail_pa;
+SYSCTL_ULONG(_vm_pmap_sp, OID_AUTO, p_fail_pa, CTLFLAG_RD,
+ &sp_p_fail_pa, 0, "SP page promotion failure: PAs don't match");
+
+static u_long sp_p_fail_flags;
+SYSCTL_ULONG(_vm_pmap_sp, OID_AUTO, p_fail_flags, CTLFLAG_RD,
+ &sp_p_fail_flags, 0, "SP page promotion failure: page flags don't match");
+
+static u_long sp_p_fail_prot;
+SYSCTL_ULONG(_vm_pmap_sp, OID_AUTO, p_fail_prot, CTLFLAG_RD,
+ &sp_p_fail_prot, 0,
+ "SP page promotion failure: page protections don't match");
+
+static u_long sp_p_fail_wimg;
+SYSCTL_ULONG(_vm_pmap_sp, OID_AUTO, p_fail_wimg, CTLFLAG_RD,
+ &sp_p_fail_wimg, 0, "SP page promotion failure: WIMG bits don't match");
+
+static u_long sp_promotions;
+SYSCTL_ULONG(_vm_pmap_sp, OID_AUTO, promotions, CTLFLAG_RD,
+ &sp_promotions, 0, "SP page promotions");
+
+static bool moea64_ps_enabled(pmap_t);
+static void moea64_align_superpage(vm_object_t, vm_ooffset_t,
+ vm_offset_t *, vm_size_t);
+
+static int moea64_sp_enter(pmap_t pmap, vm_offset_t va,
+ vm_page_t m, vm_prot_t prot, u_int flags, int8_t psind);
+static struct pvo_entry *moea64_sp_remove(struct pvo_entry *sp,
+ struct pvo_dlist *tofree);
+
+static int moea64_sp_promote(pmap_t pmap, vm_offset_t va, vm_page_t m);
+static void moea64_sp_demote_aligned(struct pvo_entry *sp);
+static void moea64_sp_demote(struct pvo_entry *pvo);
+
+static struct pvo_entry *moea64_sp_unwire(struct pvo_entry *sp);
+static struct pvo_entry *moea64_sp_protect(struct pvo_entry *sp,
+ vm_prot_t prot);
+
+static int64_t moea64_sp_query(struct pvo_entry *pvo, uint64_t ptebit);
+static int64_t moea64_sp_clear(struct pvo_entry *pvo, vm_page_t m,
+ uint64_t ptebit);
+
+static __inline bool moea64_sp_pvo_in_range(struct pvo_entry *pvo,
+ vm_offset_t sva, vm_offset_t eva);
/*
* Kernel MMU interface
@@ -355,6 +447,8 @@
#ifdef __powerpc64__
.page_array_startup = moea64_page_array_startup,
#endif
+ .ps_enabled = moea64_ps_enabled,
+ .align_superpage = moea64_align_superpage,
/* Internal interfaces */
.mapdev = moea64_mapdev,
@@ -374,6 +468,26 @@
MMU_DEF(oea64_mmu, "mmu_oea64_base", moea64_methods);
+/*
+ * Get physical address from PVO.
+ *
+ * For superpages, the lower bits are not stored in pvo_pte.pa and must be
+ * obtained from VA.
+ */
+static __inline vm_paddr_t
+moea64_pvo_paddr(struct pvo_entry *pvo)
+{
+ vm_paddr_t pa;
+
+ pa = (pvo)->pvo_pte.pa & LPTE_RPGN;
+
+ if (PVO_IS_SP(pvo)) {
+ pa &= ~SP_MASK; /* This is needed to clear LPTE_LP bits. */
+ pa |= PVO_VADDR(pvo) & SP_MASK;
+ }
+ return (pa);
+}
+
static struct pvo_head *
vm_page_to_pvoh(vm_page_t m)
{
@@ -422,8 +536,10 @@
pvo->pvo_vpn = (uint64_t)((va & ADDR_PIDX) >> ADDR_PIDX_SHFT)
| (vsid << 16);
- shift = (pvo->pvo_vaddr & PVO_LARGE) ? moea64_large_page_shift :
- ADDR_PIDX_SHFT;
+ if (pmap == kernel_pmap && (pvo->pvo_vaddr & PVO_LARGE) != 0)
+ shift = moea64_large_page_shift;
+ else
+ shift = ADDR_PIDX_SHFT;
hash = (vsid & VSID_HASH_MASK) ^ (((uint64_t)va & ADDR_PIDX) >> shift);
pvo->pvo_pte.slot = (hash & moea64_pteg_mask) << 3;
}
@@ -767,6 +883,9 @@
vm_paddr_t kernelphysstart, kernelphysend;
int rm_pavail;
+ /* Level 0 reservations consist of 4096 pages (16MB superpage). */
+ vm_level_0_order = 12;
+
#ifndef __powerpc64__
/* We don't have a direct map since there is no BAT */
hw_direct_map = 0;
@@ -1198,6 +1317,17 @@
for (pvo = RB_NFIND(pvo_tree, &pm->pmap_pvo, &key);
pvo != NULL && PVO_VADDR(pvo) < eva;
pvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo)) {
+ if (PVO_IS_SP(pvo)) {
+ if (moea64_sp_pvo_in_range(pvo, sva, eva)) {
+ pvo = moea64_sp_unwire(pvo);
+ continue;
+ } else {
+ CTR1(KTR_PMAP, "%s: demote before unwire",
+ __func__);
+ moea64_sp_demote(pvo);
+ }
+ }
+
if ((pvo->pvo_vaddr & PVO_WIRED) == 0)
panic("moea64_unwire: pvo %p is missing PVO_WIRED",
pvo);
@@ -1207,7 +1337,7 @@
(pvo->pvo_pte.prot & VM_PROT_WRITE)) {
if (refchg < 0)
refchg = LPTE_CHG;
- m = PHYS_TO_VM_PAGE(pvo->pvo_pte.pa & LPTE_RPGN);
+ m = PHYS_TO_VM_PAGE(PVO_PADDR(pvo));
refchg |= atomic_readandclear_32(&m->md.mdpg_attrs);
if (refchg & LPTE_CHG)
@@ -1438,7 +1568,7 @@
moea64_enter(pmap_t pmap, vm_offset_t va, vm_page_t m,
vm_prot_t prot, u_int flags, int8_t psind)
{
- struct pvo_entry *pvo, *oldpvo;
+ struct pvo_entry *pvo, *oldpvo, *tpvo;
struct pvo_head *pvo_head;
uint64_t pte_lo;
int error;
@@ -1450,6 +1580,9 @@
VM_OBJECT_ASSERT_LOCKED(m->object);
}
+ if (psind > 0)
+ return (moea64_sp_enter(pmap, va, m, prot, flags, psind));
+
pvo = alloc_pvo_entry(0);
if (pvo == NULL)
return (KERN_RESOURCE_SHORTAGE);
@@ -1473,6 +1606,15 @@
PMAP_LOCK(pmap);
if (pvo->pvo_pmap == NULL)
init_pvo_entry(pvo, pmap, va);
+
+ tpvo = moea64_pvo_find_va(pmap, va & ~SP_MASK);
+ if (tpvo && PVO_IS_SP(tpvo)) {
+ /* Demote SP before entering a regular page */
+ CTR2(KTR_PMAP, "%s: demote before enter: va=%#jx",
+ __func__, (uintmax_t)va);
+ moea64_sp_demote_aligned(tpvo);
+ }
+
if (prot & VM_PROT_WRITE)
if (pmap_bootstrapped &&
(m->oflags & VPO_UNMANAGED) == 0)
@@ -1496,6 +1638,7 @@
PV_PAGE_UNLOCK(m);
PMAP_UNLOCK(pmap);
free_pvo_entry(pvo);
+ pvo = NULL;
goto out;
} else {
/* Otherwise, need to kill it first */
@@ -1524,6 +1667,19 @@
vm_page_aflag_set(m, PGA_EXECUTABLE);
moea64_syncicache(pmap, va, VM_PAGE_TO_PHYS(m), PAGE_SIZE);
}
+
+ /*
+ * Try to promote pages.
+ *
+ * If the VA of the entered page is not aligned with its PA,
+ * don't try page promotion as it is not possible.
+ * This reduces the number of promotion failures dramatically.
+ */
+ if (pmap != kernel_pmap && pvo != NULL &&
+ (pvo->pvo_vaddr & PVO_MANAGED) != 0 &&
+ (va & SP_MASK) == (VM_PAGE_TO_PHYS(m) & SP_MASK))
+ moea64_sp_promote(pmap, va, m);
+
return (KERN_SUCCESS);
}
@@ -1582,15 +1738,25 @@
{
vm_page_t m;
vm_pindex_t diff, psize;
+ vm_offset_t va;
+ int8_t psind;
VM_OBJECT_ASSERT_LOCKED(m_start->object);
psize = atop(end - start);
m = m_start;
while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
- moea64_enter(pm, start + ptoa(diff), m, prot &
- (VM_PROT_READ | VM_PROT_EXECUTE), PMAP_ENTER_NOSLEEP |
- PMAP_ENTER_QUICK_LOCKED, 0);
+ va = start + ptoa(diff);
+ if ((va & SP_MASK) == 0 && va + SP_SIZE <= end &&
+ m->psind == 1 && moea64_ps_enabled(pm))
+ psind = 1;
+ else
+ psind = 0;
+ moea64_enter(pm, va, m, prot &
+ (VM_PROT_READ | VM_PROT_EXECUTE),
+ PMAP_ENTER_NOSLEEP | PMAP_ENTER_QUICK_LOCKED, psind);
+ if (psind == 1)
+ m = &m[SP_SIZE / PAGE_SIZE - 1];
m = TAILQ_NEXT(m, listq);
}
}
@@ -1615,7 +1781,7 @@
if (pvo == NULL)
pa = 0;
else
- pa = (pvo->pvo_pte.pa & LPTE_RPGN) | (va - PVO_VADDR(pvo));
+ pa = PVO_PADDR(pvo) | (va - PVO_VADDR(pvo));
PMAP_UNLOCK(pm);
return (pa);
@@ -1636,7 +1802,7 @@
PMAP_LOCK(pmap);
pvo = moea64_pvo_find_va(pmap, va & ~ADDR_POFF);
if (pvo != NULL && (pvo->pvo_pte.prot & prot) == prot) {
- m = PHYS_TO_VM_PAGE(pvo->pvo_pte.pa & LPTE_RPGN);
+ m = PHYS_TO_VM_PAGE(PVO_PADDR(pvo));
if (!vm_page_wire_mapped(m))
m = NULL;
}
@@ -1704,6 +1870,27 @@
NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,
UMA_ZONE_VM | UMA_ZONE_NOFREE);
+ /*
+ * Are large page mappings enabled?
+ */
+ TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled);
+ if (pg_ps_enabled) {
+ KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0,
+ ("moea64_init: can't assign to pagesizes[1]"));
+
+ if (moea64_large_page_size == 0) {
+ printf("mmu_oea64: HW does not support large pages. "
+ "Disabling superpages...\n");
+ pg_ps_enabled = 0;
+ } else if (!moea64_has_lp_4k_16m) {
+ printf("mmu_oea64: "
+ "HW does not support mixed 4KB/16MB page sizes. "
+ "Disabling superpages...\n");
+ pg_ps_enabled = 0;
+ } else
+ pagesizes[1] = SP_SIZE;
+ }
+
if (!hw_direct_map) {
uma_zone_set_allocf(moea64_pvo_zone, moea64_uma_page_alloc);
}
@@ -1783,7 +1970,7 @@
vm_page_assert_busied(m);
if (!pmap_page_is_write_mapped(m))
- return
+ return;
powerpc_sync();
PV_PAGE_LOCK(m);
@@ -1793,6 +1980,11 @@
PMAP_LOCK(pmap);
if (!(pvo->pvo_vaddr & PVO_DEAD) &&
(pvo->pvo_pte.prot & VM_PROT_WRITE)) {
+ if (PVO_IS_SP(pvo)) {
+ CTR1(KTR_PMAP, "%s: demote before remwr",
+ __func__);
+ moea64_sp_demote(pvo);
+ }
pvo->pvo_pte.prot &= ~VM_PROT_WRITE;
ret = moea64_pte_replace(pvo, MOEA64_PTE_PROT_UPDATE);
if (ret < 0)
@@ -1841,6 +2033,9 @@
pmap_t pmap;
uint64_t lo;
+ CTR3(KTR_PMAP, "%s: pa=%#jx, ma=%#x",
+ __func__, (uintmax_t)VM_PAGE_TO_PHYS(m), ma);
+
if ((m->oflags & VPO_UNMANAGED) != 0) {
m->md.mdpg_cache_attrs = ma;
return;
@@ -1853,6 +2048,11 @@
pmap = pvo->pvo_pmap;
PMAP_LOCK(pmap);
if (!(pvo->pvo_vaddr & PVO_DEAD)) {
+ if (PVO_IS_SP(pvo)) {
+ CTR1(KTR_PMAP,
+ "%s: demote before set_memattr", __func__);
+ moea64_sp_demote(pvo);
+ }
pvo->pvo_pte.pa &= ~LPTE_WIMG;
pvo->pvo_pte.pa |= lo;
refchg = moea64_pte_replace(pvo, MOEA64_PTE_INVALIDATE);
@@ -1943,7 +2143,7 @@
pvo = moea64_pvo_find_va(kernel_pmap, va);
KASSERT(pvo != NULL, ("moea64_kextract: no addr found for %#" PRIxPTR,
va));
- pa = (pvo->pvo_pte.pa & LPTE_RPGN) | (va - PVO_VADDR(pvo));
+ pa = PVO_PADDR(pvo) | (va - PVO_VADDR(pvo));
PMAP_UNLOCK(kernel_pmap);
return (pa);
}
@@ -2269,7 +2469,7 @@
*/
oldprot = pvo->pvo_pte.prot;
pvo->pvo_pte.prot = prot;
- pg = PHYS_TO_VM_PAGE(pvo->pvo_pte.pa & LPTE_RPGN);
+ pg = PHYS_TO_VM_PAGE(PVO_PADDR(pvo));
/*
* If the PVO is in the page table, update mapping
@@ -2284,7 +2484,7 @@
if ((pg->oflags & VPO_UNMANAGED) == 0)
vm_page_aflag_set(pg, PGA_EXECUTABLE);
moea64_syncicache(pm, PVO_VADDR(pvo),
- pvo->pvo_pte.pa & LPTE_RPGN, PAGE_SIZE);
+ PVO_PADDR(pvo), PAGE_SIZE);
}
/*
@@ -2305,7 +2505,7 @@
moea64_protect(pmap_t pm, vm_offset_t sva, vm_offset_t eva,
vm_prot_t prot)
{
- struct pvo_entry *pvo, *tpvo, key;
+ struct pvo_entry *pvo, key;
CTR4(KTR_PMAP, "moea64_protect: pm=%p sva=%#x eva=%#x prot=%#x", pm,
sva, eva, prot);
@@ -2321,8 +2521,18 @@
PMAP_LOCK(pm);
key.pvo_vaddr = sva;
for (pvo = RB_NFIND(pvo_tree, &pm->pmap_pvo, &key);
- pvo != NULL && PVO_VADDR(pvo) < eva; pvo = tpvo) {
- tpvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo);
+ pvo != NULL && PVO_VADDR(pvo) < eva;
+ pvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo)) {
+ if (PVO_IS_SP(pvo)) {
+ if (moea64_sp_pvo_in_range(pvo, sva, eva)) {
+ pvo = moea64_sp_protect(pvo, prot);
+ continue;
+ } else {
+ CTR1(KTR_PMAP, "%s: demote before protect",
+ __func__);
+ moea64_sp_demote(pvo);
+ }
+ }
moea64_pvo_protect(pm, pvo, prot);
}
PMAP_UNLOCK(pm);
@@ -2423,13 +2633,46 @@
}
}
+static void
+moea64_remove_locked(pmap_t pm, vm_offset_t sva, vm_offset_t eva,
+ struct pvo_dlist *tofree)
+{
+ struct pvo_entry *pvo, *tpvo, key;
+
+ PMAP_LOCK_ASSERT(pm, MA_OWNED);
+
+ key.pvo_vaddr = sva;
+ for (pvo = RB_NFIND(pvo_tree, &pm->pmap_pvo, &key);
+ pvo != NULL && PVO_VADDR(pvo) < eva; pvo = tpvo) {
+ if (PVO_IS_SP(pvo)) {
+ if (moea64_sp_pvo_in_range(pvo, sva, eva)) {
+ tpvo = moea64_sp_remove(pvo, tofree);
+ continue;
+ } else {
+ CTR1(KTR_PMAP, "%s: demote before remove",
+ __func__);
+ moea64_sp_demote(pvo);
+ }
+ }
+ tpvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo);
+
+ /*
+ * For locking reasons, remove this from the page table and
+ * pmap, but save delinking from the vm_page for a second
+ * pass
+ */
+ moea64_pvo_remove_from_pmap(pvo);
+ SLIST_INSERT_HEAD(tofree, pvo, pvo_dlink);
+ }
+}
+
/*
* Remove the given range of addresses from the specified map.
*/
void
moea64_remove(pmap_t pm, vm_offset_t sva, vm_offset_t eva)
{
- struct pvo_entry *pvo, *tpvo, key;
+ struct pvo_entry *pvo;
struct pvo_dlist tofree;
/*
@@ -2438,23 +2681,9 @@
if (pm->pm_stats.resident_count == 0)
return;
- key.pvo_vaddr = sva;
-
SLIST_INIT(&tofree);
-
PMAP_LOCK(pm);
- for (pvo = RB_NFIND(pvo_tree, &pm->pmap_pvo, &key);
- pvo != NULL && PVO_VADDR(pvo) < eva; pvo = tpvo) {
- tpvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo);
-
- /*
- * For locking reasons, remove this from the page table and
- * pmap, but save delinking from the vm_page for a second
- * pass
- */
- moea64_pvo_remove_from_pmap(pvo);
- SLIST_INSERT_HEAD(&tofree, pvo, pvo_dlink);
- }
+ moea64_remove_locked(pm, sva, eva, &tofree);
PMAP_UNLOCK(pm);
while (!SLIST_EMPTY(&tofree)) {
@@ -2484,8 +2713,14 @@
pmap = pvo->pvo_pmap;
PMAP_LOCK(pmap);
wasdead = (pvo->pvo_vaddr & PVO_DEAD);
- if (!wasdead)
+ if (!wasdead) {
+ if (PVO_IS_SP(pvo)) {
+ CTR1(KTR_PMAP, "%s: demote before remove_all",
+ __func__);
+ moea64_sp_demote(pvo);
+ }
moea64_pvo_remove_from_pmap(pvo);
+ }
moea64_pvo_remove_from_page_locked(pvo, m);
if (!wasdead)
LIST_INSERT_HEAD(&freequeue, pvo, pvo_vlink);
@@ -2648,7 +2883,7 @@
/* Send RC bits to VM */
if ((pvo->pvo_vaddr & PVO_MANAGED) &&
(pvo->pvo_pte.prot & VM_PROT_WRITE)) {
- pg = PHYS_TO_VM_PAGE(pvo->pvo_pte.pa & LPTE_RPGN);
+ pg = PHYS_TO_VM_PAGE(PVO_PADDR(pvo));
if (pg != NULL) {
refchg |= atomic_readandclear_32(&pg->md.mdpg_attrs);
if (refchg & LPTE_CHG)
@@ -2674,7 +2909,7 @@
/*
* Update vm about page writeability/executability if managed
*/
- PV_LOCKASSERT(pvo->pvo_pte.pa & LPTE_RPGN);
+ PV_LOCKASSERT(PVO_PADDR(pvo));
if (pvo->pvo_vaddr & PVO_MANAGED) {
if (m != NULL) {
LIST_REMOVE(pvo, pvo_vlink);
@@ -2692,13 +2927,15 @@
moea64_pvo_remove_from_page(struct pvo_entry *pvo)
{
vm_page_t pg = NULL;
+ vm_paddr_t pa;
+ pa = PVO_PADDR(pvo);
if (pvo->pvo_vaddr & PVO_MANAGED)
- pg = PHYS_TO_VM_PAGE(pvo->pvo_pte.pa & LPTE_RPGN);
+ pg = PHYS_TO_VM_PAGE(pa);
- PV_LOCK(pvo->pvo_pte.pa & LPTE_RPGN);
+ PV_LOCK(pa);
moea64_pvo_remove_from_page_locked(pvo, pg);
- PV_UNLOCK(pvo->pvo_pte.pa & LPTE_RPGN);
+ PV_UNLOCK(pa);
}
static struct pvo_entry *
@@ -2718,11 +2955,18 @@
struct pvo_entry *pvo;
int64_t ret;
boolean_t rv;
+ vm_page_t sp;
/*
* See if this bit is stored in the page already.
+ *
+ * For superpages, the bit is stored in the first vm page.
*/
- if (m->md.mdpg_attrs & ptebit)
+ if ((m->md.mdpg_attrs & ptebit) != 0 ||
+ ((sp = PHYS_TO_VM_PAGE(VM_PAGE_TO_PHYS(m) & ~SP_MASK)) != NULL &&
+ (sp->md.mdpg_attrs & (ptebit | MDPG_ATTR_SP)) ==
+ (ptebit | MDPG_ATTR_SP))
+ )
return (TRUE);
/*
@@ -2733,6 +2977,21 @@
powerpc_sync();
PV_PAGE_LOCK(m);
LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) {
+ if (PVO_IS_SP(pvo)) {
+ ret = moea64_sp_query(pvo, ptebit);
+ /*
+ * If SP was not demoted, check its REF/CHG bits here.
+ */
+ if (ret != -1) {
+ if ((ret & ptebit) != 0) {
+ rv = TRUE;
+ break;
+ }
+ continue;
+ }
+ /* else, fallthrough */
+ }
+
ret = 0;
/*
@@ -2778,6 +3037,12 @@
count = 0;
PV_PAGE_LOCK(m);
LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) {
+ if (PVO_IS_SP(pvo)) {
+ if ((ret = moea64_sp_clear(pvo, m, ptebit)) != -1) {
+ count += ret;
+ continue;
+ }
+ }
ret = 0;
PMAP_LOCK(pvo->pvo_pmap);
@@ -2810,7 +3075,7 @@
for (pvo = RB_FIND(pvo_tree, &kernel_pmap->pmap_pvo, &key);
ppa < pa + size; ppa += PAGE_SIZE,
pvo = RB_NEXT(pvo_tree, &kernel_pmap->pmap_pvo, pvo)) {
- if (pvo == NULL || (pvo->pvo_pte.pa & LPTE_RPGN) != ppa) {
+ if (pvo == NULL || PVO_PADDR(pvo) != ppa) {
error = EFAULT;
break;
}
@@ -2886,7 +3151,7 @@
len = MIN(lim - va, sz);
pvo = moea64_pvo_find_va(pm, va & ~ADDR_POFF);
if (pvo != NULL && !(pvo->pvo_pte.pa & LPTE_I)) {
- pa = (pvo->pvo_pte.pa & LPTE_RPGN) | (va & ADDR_POFF);
+ pa = PVO_PADDR(pvo) | (va & ADDR_POFF);
moea64_syncicache(pm, va, pa, len);
}
va += len;
@@ -3005,7 +3270,7 @@
}
}
- pa = pvo->pvo_pte.pa & LPTE_RPGN;
+ pa = PVO_PADDR(pvo);
if (va & PVO_LARGE) {
pa_end = pa + lpsize;
@@ -3180,3 +3445,767 @@
DEFINE_OEA64_IFUNC(int64_t, pte_clear, (struct pvo_entry *, uint64_t),
moea64_null_method)
DEFINE_OEA64_IFUNC(int64_t, pte_synch, (struct pvo_entry *), moea64_null_method)
+
+/* Superpage functions */
+
+/* MMU interface */
+
+static bool
+moea64_ps_enabled(pmap_t pmap)
+{
+ return (pg_ps_enabled);
+}
+
+static void
+moea64_align_superpage(vm_object_t object, vm_ooffset_t offset,
+ vm_offset_t *addr, vm_size_t size)
+{
+ vm_offset_t sp_offset;
+
+ if (size < SP_SIZE)
+ return;
+
+ CTR4(KTR_PMAP, "%s: offs=%#jx, addr=%p, size=%#jx",
+ __func__, (uintmax_t)offset, addr, (uintmax_t)size);
+
+ if (object != NULL && (object->flags & OBJ_COLORED) != 0)
+ offset += ptoa(object->pg_color);
+ sp_offset = offset & SP_MASK;
+ if (size - ((SP_SIZE - sp_offset) & SP_MASK) < SP_SIZE ||
+ (*addr & SP_MASK) == sp_offset)
+ return;
+ if ((*addr & SP_MASK) < sp_offset)
+ *addr = (*addr & ~SP_MASK) + sp_offset;
+ else
+ *addr = ((*addr + SP_MASK) & ~SP_MASK) + sp_offset;
+}
+
+/* Helpers */
+
+static __inline void
+moea64_pvo_cleanup(struct pvo_dlist *tofree)
+{
+ struct pvo_entry *pvo;
+
+ /* clean up */
+ while (!SLIST_EMPTY(tofree)) {
+ pvo = SLIST_FIRST(tofree);
+ SLIST_REMOVE_HEAD(tofree, pvo_dlink);
+ if (pvo->pvo_vaddr & PVO_DEAD)
+ moea64_pvo_remove_from_page(pvo);
+ free_pvo_entry(pvo);
+ }
+}
+
+static __inline uint16_t
+pvo_to_vmpage_flags(struct pvo_entry *pvo)
+{
+ uint16_t flags;
+
+ flags = 0;
+ if ((pvo->pvo_pte.prot & VM_PROT_WRITE) != 0)
+ flags |= PGA_WRITEABLE;
+ if ((pvo->pvo_pte.prot & VM_PROT_EXECUTE) != 0)
+ flags |= PGA_EXECUTABLE;
+
+ return (flags);
+}
+
+/*
+ * Check if the given pvo and its superpage are in sva-eva range.
+ */
+static __inline bool
+moea64_sp_pvo_in_range(struct pvo_entry *pvo, vm_offset_t sva, vm_offset_t eva)
+{
+ vm_offset_t spva;
+
+ spva = PVO_VADDR(pvo) & ~SP_MASK;
+ if (spva >= sva && spva + SP_SIZE <= eva) {
+ /*
+ * Because this function is intended to be called from loops
+ * that iterate over ordered pvo entries, if the condition
+ * above is true then the pvo must be the first of its
+ * superpage.
+ */
+ KASSERT(PVO_VADDR(pvo) == spva,
+ ("%s: unexpected unaligned superpage pvo", __func__));
+ return (true);
+ }
+ return (false);
+}
+
+/*
+ * Update vm about the REF/CHG bits if the superpage is managed and
+ * has (or had) write access.
+ */
+static void
+moea64_sp_refchg_process(struct pvo_entry *sp, vm_page_t m,
+ int64_t sp_refchg, vm_prot_t prot)
+{
+ vm_page_t m_end;
+ int64_t refchg;
+
+ if ((sp->pvo_vaddr & PVO_MANAGED) != 0 && (prot & VM_PROT_WRITE) != 0) {
+ for (m_end = &m[SP_PAGES]; m < m_end; m++) {
+ refchg = sp_refchg |
+ atomic_readandclear_32(&m->md.mdpg_attrs);
+ if (refchg & LPTE_CHG)
+ vm_page_dirty(m);
+ if (refchg & LPTE_REF)
+ vm_page_aflag_set(m, PGA_REFERENCED);
+ }
+ }
+}
+
+/* Superpage ops */
+
+static int
+moea64_sp_enter(pmap_t pmap, vm_offset_t va, vm_page_t m,
+ vm_prot_t prot, u_int flags, int8_t psind)
+{
+ struct pvo_entry *pvo, **pvos;
+ struct pvo_head *pvo_head;
+ vm_offset_t sva;
+ vm_page_t sm;
+ vm_paddr_t pa;
+ bool sync;
+ struct pvo_dlist tofree;
+ int error, i;
+ uint16_t aflags;
+
+ KASSERT((va & SP_MASK) == 0, ("%s: va %#jx unaligned",
+ __func__, (uintmax_t)va));
+ KASSERT(psind == 1, ("%s: invalid psind: %d", __func__, psind));
+ KASSERT(m->psind == 1, ("%s: invalid m->psind: %d",
+ __func__, m->psind));
+ KASSERT(pmap != kernel_pmap,
+ ("%s: function called with kernel pmap", __func__));
+
+ CTR5(KTR_PMAP, "%s: va=%#jx, pa=%#jx, prot=%#x, flags=%#x, psind=1",
+ __func__, (uintmax_t)va, (uintmax_t)VM_PAGE_TO_PHYS(m),
+ prot, flags);
+
+ SLIST_INIT(&tofree);
+
+ sva = va;
+ sm = m;
+ pa = VM_PAGE_TO_PHYS(sm);
+
+ /* Try to allocate all PVOs first, to make failure handling easier. */
+ pvos = malloc(SP_PAGES * sizeof(struct pvo_entry *), M_TEMP, M_NOWAIT);
+ if (pvos == NULL) {
+ CTR1(KTR_PMAP, "%s: failed to alloc pvo array", __func__);
+ return (KERN_RESOURCE_SHORTAGE);
+ }
+
+ for (i = 0; i < SP_PAGES; i++) {
+ pvos[i] = alloc_pvo_entry(0);
+ if (pvos[i] == NULL) {
+ CTR1(KTR_PMAP, "%s: failed to alloc pvo", __func__);
+ for (i = i - 1; i >= 0; i--)
+ free_pvo_entry(pvos[i]);
+ free(pvos, M_TEMP);
+ return (KERN_RESOURCE_SHORTAGE);
+ }
+ }
+
+ PV_PAGE_LOCK(sm);
+ PMAP_LOCK(pmap);
+
+ /* Note: moea64_remove_locked() also clears cached REF/CHG bits. */
+ moea64_remove_locked(pmap, va, va + SP_SIZE, &tofree);
+
+ /* Enter pages */
+ for (i = 0; i < SP_PAGES;
+ i++, va += PAGE_SIZE, pa += PAGE_SIZE, m++) {
+ pvo = pvos[i];
+
+ pvo->pvo_pte.prot = prot;
+ pvo->pvo_pte.pa = (pa & ~LPTE_LP_MASK) | LPTE_LP_4K_16M |
+ moea64_calc_wimg(pa, pmap_page_get_memattr(m));
+
+ if ((flags & PMAP_ENTER_WIRED) != 0)
+ pvo->pvo_vaddr |= PVO_WIRED;
+ pvo->pvo_vaddr |= PVO_LARGE;
+
+ if ((m->oflags & VPO_UNMANAGED) != 0)
+ pvo_head = NULL;
+ else {
+ pvo_head = &m->md.mdpg_pvoh;
+ pvo->pvo_vaddr |= PVO_MANAGED;
+ }
+
+ init_pvo_entry(pvo, pmap, va);
+
+ error = moea64_pvo_enter(pvo, pvo_head, NULL);
+ /*
+ * All superpage PVOs were previously removed, so no errors
+ * should occur while inserting the new ones.
+ */
+ KASSERT(error == 0, ("%s: unexpected error "
+ "when inserting superpage PVO: %d",
+ __func__, error));
+ }
+
+ PMAP_UNLOCK(pmap);
+ PV_PAGE_UNLOCK(sm);
+
+ sync = (sm->a.flags & PGA_EXECUTABLE) == 0;
+ /* Note: moea64_pvo_cleanup() also clears page prot. flags. */
+ moea64_pvo_cleanup(&tofree);
+ pvo = pvos[0];
+
+ /* Set vm page flags */
+ aflags = pvo_to_vmpage_flags(pvo);
+ if (aflags != 0)
+ for (m = sm; m < &sm[SP_PAGES]; m++)
+ vm_page_aflag_set(m, aflags);
+
+ /*
+ * Flush the page from the instruction cache if this page is
+ * mapped executable and cacheable.
+ */
+ if (sync && (pvo->pvo_pte.pa & (LPTE_I | LPTE_G | LPTE_NOEXEC)) == 0)
+ moea64_syncicache(pmap, sva, VM_PAGE_TO_PHYS(sm), SP_SIZE);
+
+ atomic_add_long(&sp_mappings, 1);
+ CTR3(KTR_PMAP, "%s: SP success for va %#jx in pmap %p",
+ __func__, (uintmax_t)sva, pmap);
+
+ free(pvos, M_TEMP);
+ return (KERN_SUCCESS);
+}
+
+static int
+moea64_sp_promote(pmap_t pmap, vm_offset_t va, vm_page_t m)
+{
+ struct pvo_entry *first, *pvo;
+ vm_paddr_t pa, pa_end;
+ vm_offset_t sva, va_end;
+ int64_t sp_refchg;
+
+ /* Return if page promotion is not possible. */
+ if ((m->flags & PG_FICTITIOUS) != 0 ||
+ vm_reserv_level_iffullpop(m) != 0 || !moea64_ps_enabled(pmap))
+ return (1);
+
+ /* This CTR may generate a lot of output. */
+ /* CTR2(KTR_PMAP, "%s: va=%#jx", __func__, (uintmax_t)va); */
+
+ va &= ~SP_MASK;
+ sva = va;
+ /* Get superpage */
+ m = PHYS_TO_VM_PAGE(VM_PAGE_TO_PHYS(m) & ~SP_MASK);
+
+ PV_PAGE_LOCK(m);
+ PMAP_LOCK(pmap);
+
+ /*
+ * Check if all pages meet promotion criteria.
+ *
+ * XXX In some cases the loop below may be executed for each or most
+ * of the entered pages of a superpage, which can be expensive
+ * (although it was not profiled) and may need some optimization.
+ *
+ * Some cases where this seems to happen are:
+ * - When a superpage is first entered read-only and later becomes
+ * read-write.
+ * - When some of the superpage's virtual addresses map to previously
+ * wired/cached pages while others map to pages allocated from a
+ * different physical address range. A common scenario where this
+ * happens is when mmap'ing a file that is already present in FS
+ * block cache and doesn't fill a superpage.
+ */
+ first = pvo = moea64_pvo_find_va(pmap, sva);
+ for (pa = VM_PAGE_TO_PHYS(m), pa_end = pa + SP_SIZE;
+ pa < pa_end; pa += PAGE_SIZE, va += PAGE_SIZE) {
+ if (pvo == NULL || (pvo->pvo_vaddr & PVO_DEAD) != 0) {
+ CTR3(KTR_PMAP,
+ "%s: NULL or dead PVO: pmap=%p, va=%#jx",
+ __func__, pmap, (uintmax_t)va);
+ goto error;
+ }
+ if (PVO_PADDR(pvo) != pa) {
+ CTR5(KTR_PMAP, "%s: PAs don't match: "
+ "pmap=%p, va=%#jx, pvo_pa=%#jx, exp_pa=%#jx",
+ __func__, pmap, (uintmax_t)va,
+ (uintmax_t)PVO_PADDR(pvo), (uintmax_t)pa);
+ atomic_add_long(&sp_p_fail_pa, 1);
+ goto error;
+ }
+ if ((first->pvo_vaddr & PVO_PROMOTE) !=
+ (pvo->pvo_vaddr & PVO_PROMOTE)) {
+ CTR5(KTR_PMAP, "%s: PVO flags don't match: "
+ "pmap=%p, va=%#jx, pvo_flags=%#jx, exp_flags=%#jx",
+ __func__, pmap, (uintmax_t)va,
+ (uintmax_t)(pvo->pvo_vaddr & PVO_PROMOTE),
+ (uintmax_t)(first->pvo_vaddr & PVO_PROMOTE));
+ atomic_add_long(&sp_p_fail_flags, 1);
+ goto error;
+ }
+ if (first->pvo_pte.prot != pvo->pvo_pte.prot) {
+ CTR5(KTR_PMAP, "%s: PVO protections don't match: "
+ "pmap=%p, va=%#jx, pvo_prot=%#x, exp_prot=%#x",
+ __func__, pmap, (uintmax_t)va,
+ pvo->pvo_pte.prot, first->pvo_pte.prot);
+ atomic_add_long(&sp_p_fail_prot, 1);
+ goto error;
+ }
+ if ((first->pvo_pte.pa & LPTE_WIMG) !=
+ (pvo->pvo_pte.pa & LPTE_WIMG)) {
+ CTR5(KTR_PMAP, "%s: WIMG bits don't match: "
+ "pmap=%p, va=%#jx, pvo_wimg=%#jx, exp_wimg=%#jx",
+ __func__, pmap, (uintmax_t)va,
+ (uintmax_t)(pvo->pvo_pte.pa & LPTE_WIMG),
+ (uintmax_t)(first->pvo_pte.pa & LPTE_WIMG));
+ atomic_add_long(&sp_p_fail_wimg, 1);
+ goto error;
+ }
+
+ pvo = RB_NEXT(pvo_tree, &pmap->pmap_pvo, pvo);
+ }
+
+ /* All OK, promote. */
+
+ /*
+ * Handle superpage REF/CHG bits. If REF or CHG is set in
+ * any page, then it must be set in the superpage.
+ *
+ * Instead of querying each page, we take advantage of two facts:
+ * 1- If a page is being promoted, it was referenced.
+ * 2- If promoted pages are writable, they were modified.
+ */
+ sp_refchg = LPTE_REF |
+ ((first->pvo_pte.prot & VM_PROT_WRITE) != 0 ? LPTE_CHG : 0);
+
+ /* Promote pages */
+ pvo = first;
+ for (va = PVO_VADDR(pvo), va_end = va + SP_SIZE;
+ va < va_end; va += PAGE_SIZE) {
+ KASSERT(pvo && PVO_VADDR(pvo) == va,
+ ("%s: missing PVO for va %#jx", __func__, (uintmax_t)va));
+ pvo->pvo_pte.pa &= ~LPTE_LP_MASK;
+ pvo->pvo_pte.pa |= LPTE_LP_4K_16M;
+ pvo->pvo_vaddr |= PVO_LARGE;
+
+ moea64_pte_replace(pvo, MOEA64_PTE_INVALIDATE);
+
+ pvo = RB_NEXT(pvo_tree, &pmap->pmap_pvo, pvo);
+ }
+
+ /* Send REF/CHG bits to VM */
+ moea64_sp_refchg_process(first, m, sp_refchg, first->pvo_pte.prot);
+
+ /* Use first page to cache REF/CHG bits */
+ atomic_set_32(&m->md.mdpg_attrs, sp_refchg | MDPG_ATTR_SP);
+
+ PMAP_UNLOCK(pmap);
+ PV_PAGE_UNLOCK(m);
+
+ atomic_add_long(&sp_mappings, 1);
+ atomic_add_long(&sp_promotions, 1);
+ CTR3(KTR_PMAP, "%s: success for va %#jx in pmap %p",
+ __func__, (uintmax_t)sva, pmap);
+
+ return (0);
+
+error:
+ atomic_add_long(&sp_p_failures, 1);
+ PMAP_UNLOCK(pmap);
+ PV_PAGE_UNLOCK(m);
+ return (1);
+}
+
+static void
+moea64_sp_demote_aligned(struct pvo_entry *sp)
+{
+ struct pvo_entry *pvo;
+ vm_offset_t va, va_end;
+ vm_paddr_t pa;
+ vm_page_t m;
+ pmap_t pmap;
+ int64_t ret, refchg;
+
+ CTR2(KTR_PMAP, "%s: va=%#jx", __func__, (uintmax_t)PVO_VADDR(sp));
+
+ pmap = sp->pvo_pmap;
+ PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+
+ pvo = sp;
+ pa = PVO_PADDR(pvo);
+ m = PHYS_TO_VM_PAGE(pa);
+ refchg = 0;
+
+ /* Demote pages */
+ for (va = PVO_VADDR(pvo), va_end = va + SP_SIZE;
+ va < va_end; va += PAGE_SIZE, pa += PAGE_SIZE) {
+ KASSERT(pvo && PVO_VADDR(pvo) == va,
+ ("%s: missing PVO for va %#jx", __func__, (uintmax_t)va));
+ pvo->pvo_vaddr &= ~PVO_LARGE;
+ pvo->pvo_pte.pa &= ~LPTE_RPGN;
+ pvo->pvo_pte.pa |= pa;
+
+ ret = moea64_pte_replace(pvo, MOEA64_PTE_INVALIDATE);
+ if (ret < 0)
+ refchg |= LPTE_CHG;
+ else
+ refchg |= ret;
+
+ pvo = RB_NEXT(pvo_tree, &pmap->pmap_pvo, pvo);
+ }
+
+ /* Clear SP flag */
+ atomic_clear_32(&m->md.mdpg_attrs, MDPG_ATTR_SP);
+
+ /*
+ * Handle superpage REF/CHG bits. A bit set in the superpage
+ * means all pages should consider it set.
+ */
+ moea64_sp_refchg_process(sp, m, refchg, sp->pvo_pte.prot);
+
+ atomic_add_long(&sp_demotions, 1);
+ CTR3(KTR_PMAP, "%s: success for va %#jx in pmap %p",
+ __func__, (uintmax_t)PVO_VADDR(sp), pmap);
+}
+
+static void
+moea64_sp_demote(struct pvo_entry *pvo)
+{
+ PMAP_LOCK_ASSERT(pvo->pvo_pmap, MA_OWNED);
+
+ if ((PVO_VADDR(pvo) & SP_MASK) != 0) {
+ pvo = moea64_pvo_find_va(pvo->pvo_pmap,
+ PVO_VADDR(pvo) & ~SP_MASK);
+ KASSERT(pvo != NULL, ("%s: missing PVO for va %#jx",
+ __func__, (uintmax_t)(PVO_VADDR(pvo) & ~SP_MASK)));
+ }
+ moea64_sp_demote_aligned(pvo);
+}
+
+static struct pvo_entry *
+moea64_sp_unwire(struct pvo_entry *sp)
+{
+ struct pvo_entry *pvo, *prev;
+ vm_offset_t eva;
+ pmap_t pm;
+ int64_t ret, refchg;
+
+ CTR2(KTR_PMAP, "%s: va=%#jx", __func__, (uintmax_t)PVO_VADDR(sp));
+
+ pm = sp->pvo_pmap;
+ PMAP_LOCK_ASSERT(pm, MA_OWNED);
+
+ eva = PVO_VADDR(sp) + SP_SIZE;
+ refchg = 0;
+ for (pvo = sp; pvo != NULL && PVO_VADDR(pvo) < eva;
+ prev = pvo, pvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo)) {
+ if ((pvo->pvo_vaddr & PVO_WIRED) == 0)
+ panic("%s: pvo %p is missing PVO_WIRED",
+ __func__, pvo);
+ pvo->pvo_vaddr &= ~PVO_WIRED;
+
+ ret = moea64_pte_replace(pvo, 0 /* No invalidation */);
+ if (ret < 0)
+ refchg |= LPTE_CHG;
+ else
+ refchg |= ret;
+
+ pm->pm_stats.wired_count--;
+ }
+
+ /* Send REF/CHG bits to VM */
+ moea64_sp_refchg_process(sp, PHYS_TO_VM_PAGE(PVO_PADDR(sp)),
+ refchg, sp->pvo_pte.prot);
+
+ return (prev);
+}
+
+static struct pvo_entry *
+moea64_sp_protect(struct pvo_entry *sp, vm_prot_t prot)
+{
+ struct pvo_entry *pvo, *prev;
+ vm_offset_t eva;
+ pmap_t pm;
+ vm_page_t m, m_end;
+ int64_t ret, refchg;
+ vm_prot_t oldprot;
+
+ CTR3(KTR_PMAP, "%s: va=%#jx, prot=%x",
+ __func__, (uintmax_t)PVO_VADDR(sp), prot);
+
+ pm = sp->pvo_pmap;
+ PMAP_LOCK_ASSERT(pm, MA_OWNED);
+
+ oldprot = sp->pvo_pte.prot;
+ m = PHYS_TO_VM_PAGE(PVO_PADDR(sp));
+ KASSERT(m != NULL, ("%s: missing vm page for pa %#jx",
+ __func__, (uintmax_t)PVO_PADDR(sp)));
+ eva = PVO_VADDR(sp) + SP_SIZE;
+ refchg = 0;
+
+ for (pvo = sp; pvo != NULL && PVO_VADDR(pvo) < eva;
+ prev = pvo, pvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo)) {
+ pvo->pvo_pte.prot = prot;
+ /*
+ * If the PVO is in the page table, update mapping
+ */
+ ret = moea64_pte_replace(pvo, MOEA64_PTE_PROT_UPDATE);
+ if (ret < 0)
+ refchg |= LPTE_CHG;
+ else
+ refchg |= ret;
+ }
+
+ /* Send REF/CHG bits to VM */
+ moea64_sp_refchg_process(sp, m, refchg, oldprot);
+
+ /* Handle pages that became executable */
+ if ((m->a.flags & PGA_EXECUTABLE) == 0 &&
+ (sp->pvo_pte.pa & (LPTE_I | LPTE_G | LPTE_NOEXEC)) == 0) {
+ if ((m->oflags & VPO_UNMANAGED) == 0)
+ for (m_end = &m[SP_PAGES]; m < m_end; m++)
+ vm_page_aflag_set(m, PGA_EXECUTABLE);
+ moea64_syncicache(pm, PVO_VADDR(sp), PVO_PADDR(sp), SP_SIZE);
+ }
+
+ return (prev);
+}
+
+static struct pvo_entry *
+moea64_sp_remove(struct pvo_entry *sp, struct pvo_dlist *tofree)
+{
+ struct pvo_entry *pvo, *tpvo;
+ vm_offset_t eva;
+ pmap_t pm;
+
+ CTR2(KTR_PMAP, "%s: va=%#jx", __func__, (uintmax_t)PVO_VADDR(sp));
+
+ pm = sp->pvo_pmap;
+ PMAP_LOCK_ASSERT(pm, MA_OWNED);
+
+ eva = PVO_VADDR(sp) + SP_SIZE;
+ for (pvo = sp; pvo != NULL && PVO_VADDR(pvo) < eva; pvo = tpvo) {
+ tpvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo);
+
+ /*
+ * For locking reasons, remove this from the page table and
+ * pmap, but save delinking from the vm_page for a second
+ * pass
+ */
+ moea64_pvo_remove_from_pmap(pvo);
+ SLIST_INSERT_HEAD(tofree, pvo, pvo_dlink);
+ }
+
+ /* Clear SP bit */
+ atomic_clear_32(&PHYS_TO_VM_PAGE(PVO_PADDR(sp))->md.mdpg_attrs,
+ MDPG_ATTR_SP);
+
+ return (tpvo);
+}
+
+static int64_t
+moea64_sp_query_locked(struct pvo_entry *pvo, uint64_t ptebit)
+{
+ int64_t refchg, ret;
+ vm_offset_t eva;
+ vm_page_t m;
+ pmap_t pmap;
+ struct pvo_entry *sp;
+
+ pmap = pvo->pvo_pmap;
+ PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+
+ /* Get first SP PVO */
+ if ((PVO_VADDR(pvo) & SP_MASK) != 0) {
+ sp = moea64_pvo_find_va(pmap, PVO_VADDR(pvo) & ~SP_MASK);
+ KASSERT(sp != NULL, ("%s: missing PVO for va %#jx",
+ __func__, (uintmax_t)(PVO_VADDR(pvo) & ~SP_MASK)));
+ } else
+ sp = pvo;
+ eva = PVO_VADDR(sp) + SP_SIZE;
+
+ refchg = 0;
+ for (pvo = sp; pvo != NULL && PVO_VADDR(pvo) < eva;
+ pvo = RB_NEXT(pvo_tree, &pmap->pmap_pvo, pvo)) {
+ ret = moea64_pte_synch(pvo);
+ if (ret > 0) {
+ refchg |= ret & (LPTE_CHG | LPTE_REF);
+ if ((refchg & ptebit) != 0)
+ break;
+ }
+ }
+
+ /* Save results */
+ if (refchg != 0) {
+ m = PHYS_TO_VM_PAGE(PVO_PADDR(sp));
+ atomic_set_32(&m->md.mdpg_attrs, refchg | MDPG_ATTR_SP);
+ }
+
+ return (refchg);
+}
+
+static int64_t
+moea64_sp_query(struct pvo_entry *pvo, uint64_t ptebit)
+{
+ int64_t refchg;
+ pmap_t pmap;
+
+ pmap = pvo->pvo_pmap;
+ PMAP_LOCK(pmap);
+
+ /*
+ * Check if SP was demoted/removed before pmap lock was acquired.
+ */
+ if (!PVO_IS_SP(pvo) || (pvo->pvo_vaddr & PVO_DEAD) != 0) {
+ CTR2(KTR_PMAP, "%s: demoted/removed: pa=%#jx",
+ __func__, (uintmax_t)PVO_PADDR(pvo));
+ PMAP_UNLOCK(pmap);
+ return (-1);
+ }
+
+ refchg = moea64_sp_query_locked(pvo, ptebit);
+ PMAP_UNLOCK(pmap);
+
+ CTR4(KTR_PMAP, "%s: va=%#jx, pa=%#jx: refchg=%#jx",
+ __func__, (uintmax_t)PVO_VADDR(pvo),
+ (uintmax_t)PVO_PADDR(pvo), (uintmax_t)refchg);
+
+ return (refchg);
+}
+
+static int64_t
+moea64_sp_pvo_clear(struct pvo_entry *pvo, uint64_t ptebit)
+{
+ int64_t refchg, ret;
+ pmap_t pmap;
+ struct pvo_entry *sp;
+ vm_offset_t eva;
+ vm_page_t m;
+
+ pmap = pvo->pvo_pmap;
+ PMAP_LOCK(pmap);
+
+ /*
+ * Check if SP was demoted/removed before pmap lock was acquired.
+ */
+ if (!PVO_IS_SP(pvo) || (pvo->pvo_vaddr & PVO_DEAD) != 0) {
+ CTR2(KTR_PMAP, "%s: demoted/removed: pa=%#jx",
+ __func__, (uintmax_t)PVO_PADDR(pvo));
+ PMAP_UNLOCK(pmap);
+ return (-1);
+ }
+
+ /* Get first SP PVO */
+ if ((PVO_VADDR(pvo) & SP_MASK) != 0) {
+ sp = moea64_pvo_find_va(pmap, PVO_VADDR(pvo) & ~SP_MASK);
+ KASSERT(sp != NULL, ("%s: missing PVO for va %#jx",
+ __func__, (uintmax_t)(PVO_VADDR(pvo) & ~SP_MASK)));
+ } else
+ sp = pvo;
+ eva = PVO_VADDR(sp) + SP_SIZE;
+
+ refchg = 0;
+ for (pvo = sp; pvo != NULL && PVO_VADDR(pvo) < eva;
+ pvo = RB_NEXT(pvo_tree, &pmap->pmap_pvo, pvo)) {
+ ret = moea64_pte_clear(pvo, ptebit);
+ if (ret > 0)
+ refchg |= ret & (LPTE_CHG | LPTE_REF);
+ }
+
+ m = PHYS_TO_VM_PAGE(PVO_PADDR(sp));
+ atomic_clear_32(&m->md.mdpg_attrs, ptebit);
+ PMAP_UNLOCK(pmap);
+
+ CTR4(KTR_PMAP, "%s: va=%#jx, pa=%#jx: refchg=%#jx",
+ __func__, (uintmax_t)PVO_VADDR(sp),
+ (uintmax_t)PVO_PADDR(sp), (uintmax_t)refchg);
+
+ return (refchg);
+}
+
+static int64_t
+moea64_sp_clear(struct pvo_entry *pvo, vm_page_t m, uint64_t ptebit)
+{
+ int64_t count, ret;
+ pmap_t pmap;
+
+ count = 0;
+ pmap = pvo->pvo_pmap;
+
+ /*
+ * Since this reference bit is shared by 4096 4KB pages, it
+ * should not be cleared every time it is tested. Apply a
+ * simple "hash" function on the physical page number, the
+ * virtual superpage number, and the pmap address to select
+ * one 4KB page out of the 4096 on which testing the
+ * reference bit will result in clearing that reference bit.
+ * This function is designed to avoid the selection of the
+ * same 4KB page for every 16MB page mapping.
+ *
+ * Always leave the reference bit of a wired mapping set, as
+ * the current state of its reference bit won't affect page
+ * replacement.
+ */
+ if (ptebit == LPTE_REF && (((VM_PAGE_TO_PHYS(m) >> PAGE_SHIFT) ^
+ (PVO_VADDR(pvo) >> SP_SHIFT) ^ (uintptr_t)pmap) &
+ (SP_PAGES - 1)) == 0 && (pvo->pvo_vaddr & PVO_WIRED) == 0) {
+ if ((ret = moea64_sp_pvo_clear(pvo, ptebit)) == -1)
+ return (-1);
+
+ if ((ret & ptebit) != 0)
+ count++;
+
+ /*
+ * If this page was not selected by the hash function, then assume
+ * its REF bit was set.
+ */
+ } else if (ptebit == LPTE_REF) {
+ count++;
+
+ /*
+ * To clear the CHG bit of a single SP page, first it must be demoted.
+ * But if no CHG bit is set, there is nothing to clear, and thus no SP
+ * demotion is needed.
+ */
+ } else {
+ CTR4(KTR_PMAP, "%s: ptebit=%#jx, va=%#jx, pa=%#jx",
+ __func__, (uintmax_t)ptebit, (uintmax_t)PVO_VADDR(pvo),
+ (uintmax_t)PVO_PADDR(pvo));
+
+ PMAP_LOCK(pmap);
+
+ /*
+ * Make sure SP wasn't demoted/removed before pmap lock
+ * was acquired.
+ */
+ if (!PVO_IS_SP(pvo) || (pvo->pvo_vaddr & PVO_DEAD) != 0) {
+ CTR2(KTR_PMAP, "%s: demoted/removed: pa=%#jx",
+ __func__, (uintmax_t)PVO_PADDR(pvo));
+ PMAP_UNLOCK(pmap);
+ return (-1);
+ }
+
+ ret = moea64_sp_query_locked(pvo, ptebit);
+ if ((ret & ptebit) != 0)
+ count++;
+ else {
+ PMAP_UNLOCK(pmap);
+ return (0);
+ }
+
+ moea64_sp_demote(pvo);
+ moea64_pte_clear(pvo, ptebit);
+
+ /*
+ * Write protect the mapping to a single page so that a
+ * subsequent write access may repromote.
+ */
+ if ((pvo->pvo_vaddr & PVO_WIRED) == 0)
+ moea64_pvo_protect(pmap, pvo,
+ pvo->pvo_pte.prot & ~VM_PROT_WRITE);
+
+ PMAP_UNLOCK(pmap);
+ }
+
+ return (count);
+}
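
The core trick in moea64_pvo_paddr() above is that a 16 MB superpage PTE only records the superpage-aligned frame (plus the LP bits), so the low 24 bits of the physical address are recovered from the virtual address. A minimal standalone C sketch of that arithmetic, separate from the diff and using hypothetical addresses, with SP_SHIFT/SP_MASK matching the definitions added in this change:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define SP_SHIFT	24			/* 16 MB superpage: 4096 x 4 KB pages */
#define SP_SIZE		(UINT64_C(1) << SP_SHIFT)
#define SP_MASK		(SP_SIZE - 1)

/* Rebuild the full physical address the way PVO_PADDR() does for superpages. */
static uint64_t
sp_paddr(uint64_t pte_pa, uint64_t va)
{
	/* Drop the low bits kept in the PTE word (including the LP bits)... */
	pte_pa &= ~SP_MASK;
	/* ...and take the offset within the 16 MB region from the VA. */
	return (pte_pa | (va & SP_MASK));
}

int
main(void)
{
	uint64_t pte_pa = UINT64_C(0x40000000);	/* hypothetical 16 MB-aligned frame */
	uint64_t va = UINT64_C(0x10123000);	/* hypothetical VA inside the superpage */

	printf("pa = %#" PRIx64 "\n", sp_paddr(pte_pa, va));	/* prints 0x40123000 */
	return (0);
}
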
Index: sys/powerpc/aim/moea64_native.c
===================================================================
--- sys/powerpc/aim/moea64_native.c
+++ sys/powerpc/aim/moea64_native.c
@@ -117,6 +117,7 @@
#include <machine/cpu.h>
#include <machine/hid.h>
+#include <machine/ifunc.h>
#include <machine/md_var.h>
#include <machine/mmuvar.h>
@@ -132,11 +133,65 @@
/* POWER9 only permits a 64k partition table size. */
#define PART_SIZE 0x10000
+/* Actual page sizes (to be used with tlbie, when L=0) */
+#define AP_4K 0x00
+#define AP_16M 0x80
+
+#define LPTE_KERNEL_VSID_BIT (KERNEL_VSID_BIT << \
+ (16 - (ADDR_API_SHFT64 - ADDR_PIDX_SHFT)))
+
static bool moea64_crop_tlbie;
static bool moea64_need_lock;
+#ifdef __powerpc64__
+
+/*
+ * The tlbie instruction has two forms: an old one used by PowerISA
+ * 2.03 and prior, and a newer one used by PowerISA 2.06 and later.
+ * We need to support both.
+ */
+
+static void
+__tlbie_old(uint64_t vpn, uint64_t oldptehi)
+{
+ if ((oldptehi & LPTE_BIG) != 0)
+ __asm __volatile("tlbie %0, 1" :: "r"(vpn) : "memory");
+ else
+ __asm __volatile("tlbie %0, 0" :: "r"(vpn) : "memory");
+ __asm __volatile("eieio; tlbsync; ptesync" ::: "memory");
+}
+
+static void
+__tlbie_new(uint64_t vpn, uint64_t oldptehi)
+{
+ uint64_t rb;
+
+ /*
+ * If this page has LPTE_BIG set and is from userspace, then
+ * it must be a superpage with 4KB base/16MB actual page size.
+ */
+ rb = vpn;
+ if ((oldptehi & LPTE_BIG) != 0 &&
+ (oldptehi & LPTE_KERNEL_VSID_BIT) == 0)
+ rb |= AP_16M;
+
+ __asm __volatile("li 0, 0 \n tlbie %0, 0" :: "r"(rb) : "r0", "memory");
+ __asm __volatile("eieio; tlbsync; ptesync" ::: "memory");
+}
+
+DEFINE_IFUNC(, void, __tlbie, (uint64_t vpn, uint64_t oldptehi))
+{
+ if (cpu_features & PPC_FEATURE_ARCH_2_06)
+ return (__tlbie_new);
+ else
+ return (__tlbie_old);
+}
+
+#endif
+
static __inline void
-TLBIE(uint64_t vpn) {
+TLBIE(uint64_t vpn, uint64_t oldptehi)
+{
#ifndef __powerpc64__
register_t vpn_hi, vpn_lo;
register_t msr;
@@ -158,18 +213,7 @@
}
#ifdef __powerpc64__
- /*
- * Explicitly clobber r0. The tlbie instruction has two forms: an old
- * one used by PowerISA 2.03 and prior, and a newer one used by PowerISA
- * 2.06 (maybe 2.05?) and later. We need to support both, and it just
- * so happens that since we use 4k pages we can simply zero out r0, and
- * clobber it, and the assembler will interpret the single-operand form
- * of tlbie as having RB set, and everything else as 0. The RS operand
- * in the newer form is in the same position as the L(page size) bit of
- * the old form, so a slong as RS is 0, we're good on both sides.
- */
- __asm __volatile("li 0, 0 \n tlbie %0" :: "r"(vpn) : "r0", "memory");
- __asm __volatile("eieio; tlbsync; ptesync" ::: "memory");
+ __tlbie(vpn, oldptehi);
#else
vpn_hi = (uint32_t)(vpn >> 32);
vpn_lo = (uint32_t)vpn;
@@ -321,7 +365,7 @@
rw_runlock(&moea64_eviction_lock);
critical_enter();
- TLBIE(pvo->pvo_vpn);
+ TLBIE(pvo->pvo_vpn, properpt.pte_hi);
critical_exit();
} else {
rw_runlock(&moea64_eviction_lock);
@@ -356,7 +400,7 @@
critical_enter();
pt->pte_hi = be64toh((pt->pte_hi & ~LPTE_VALID) | LPTE_LOCKED);
PTESYNC();
- TLBIE(pvo->pvo_vpn);
+ TLBIE(pvo->pvo_vpn, pt->pte_hi);
ptelo = be64toh(pt->pte_lo);
*((volatile int32_t *)(&pt->pte_hi) + 1) = 0; /* Release lock */
critical_exit();
@@ -394,7 +438,7 @@
critical_enter();
pt->pte_hi = be64toh((pt->pte_hi & ~LPTE_VALID) | LPTE_LOCKED);
PTESYNC();
- TLBIE(pvo->pvo_vpn);
+ TLBIE(pvo->pvo_vpn, pt->pte_hi);
ptelo = be64toh(pt->pte_lo);
EIEIO();
pt->pte_lo = htobe64(properpt.pte_lo);
@@ -702,7 +746,7 @@
va |= (oldptehi & LPTE_AVPN_MASK) <<
(ADDR_API_SHFT64 - ADDR_PIDX_SHFT);
PTESYNC();
- TLBIE(va);
+ TLBIE(va, oldptehi);
STAT_MOEA64(moea64_pte_valid--);
STAT_MOEA64(moea64_pte_overflow++);
}
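
The DEFINE_IFUNC above resolves, once at boot, which tlbie encoding to use based on the CPU feature word (PPC_FEATURE_ARCH_2_06). The same dispatch pattern, reduced to a standalone sketch with a plain function pointer and hypothetical names:

#include <stdint.h>
#include <stdio.h>

static void
tlbie_old(uint64_t vpn)
{
	printf("pre-2.06 encoding, vpn=%#jx\n", (uintmax_t)vpn);
}

static void
tlbie_new(uint64_t vpn)
{
	printf("2.06+ encoding, vpn=%#jx\n", (uintmax_t)vpn);
}

/* Resolver: runs once and returns the implementation used from then on. */
static void
(*resolve_tlbie(int has_arch_2_06))(uint64_t)
{
	return (has_arch_2_06 ? tlbie_new : tlbie_old);
}

int
main(void)
{
	void (*tlbie)(uint64_t) = resolve_tlbie(1);	/* pretend ARCH 2.06 is present */

	tlbie(0xdeadb000);
	return (0);
}
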
Index: sys/powerpc/include/param.h
===================================================================
--- sys/powerpc/include/param.h
+++ sys/powerpc/include/param.h
@@ -120,6 +120,15 @@
#define L3_PAGE_SIZE (1UL<<L3_PAGE_SIZE_SHIFT)
#define L3_PAGE_MASK (L3_PAGE_SIZE-1)
+/*
+ * On PowerPC64, make PDRSHIFT cover a 16MB superpage (HPT).
+ * This allows a single PV lock to protect all pages of a superpage.
+ * This is not needed with Radix MMU, but should do no harm either.
+ */
+#ifdef __powerpc64__
+#define PDRSHIFT 24
+#endif
+
#define MAXPAGESIZES 3 /* maximum number of supported page sizes */
#define RELOCATABLE_KERNEL 1 /* kernel may relocate during startup */
Index: sys/powerpc/include/pmap.h
===================================================================
--- sys/powerpc/include/pmap.h
+++ sys/powerpc/include/pmap.h
@@ -149,8 +149,8 @@
#define PVO_MANAGED 0x020UL /* PVO entry is managed */
#define PVO_BOOTSTRAP 0x080UL /* PVO entry allocated during
bootstrap */
-#define PVO_DEAD 0x100UL /* waiting to be deleted */
-#define PVO_LARGE 0x200UL /* large page */
+#define PVO_DEAD 0x100UL /* waiting to be deleted */
+#define PVO_LARGE 0x200UL /* large page */
#define PVO_VADDR(pvo) ((pvo)->pvo_vaddr & ~ADDR_POFF)
#define PVO_PTEGIDX_GET(pvo) ((pvo)->pvo_vaddr & PVO_PTEGIDX_MASK)
#define PVO_PTEGIDX_ISSET(pvo) ((pvo)->pvo_vaddr & PVO_PTEGIDX_VALID)
Index: sys/powerpc/include/pte.h
===================================================================
--- sys/powerpc/include/pte.h
+++ sys/powerpc/include/pte.h
@@ -120,8 +120,13 @@
#define LPTE_VALID 0x0000000000000001ULL
/* Low quadword: */
+#define LP_4K_16M 0x38 /* 4KB base, 16MB actual page size */
+
#define EXTEND_PTE(x) UINT64_C(x) /* make constants 64-bit */
#define LPTE_RPGN 0xfffffffffffff000ULL
+#define LPTE_LP_MASK 0x00000000000ff000ULL
+#define LPTE_LP_SHIFT 12
+#define LPTE_LP_4K_16M ((unsigned long long)(LP_4K_16M) << LPTE_LP_SHIFT)
#define LPTE_REF EXTEND_PTE( PTE_REF )
#define LPTE_CHG EXTEND_PTE( PTE_CHG )
#define LPTE_WIMG EXTEND_PTE( PTE_WIMG )
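
A quick standalone sanity check of the new LP constants: LP_4K_16M (0x38) shifted by LPTE_LP_SHIFT lands entirely inside LPTE_LP_MASK and yields the 4 KB base / 16 MB actual encoding 0x38000:

#include <assert.h>

#define LP_4K_16M	0x38	/* 4KB base, 16MB actual page size */
#define LPTE_LP_MASK	0x00000000000ff000ULL
#define LPTE_LP_SHIFT	12
#define LPTE_LP_4K_16M	((unsigned long long)(LP_4K_16M) << LPTE_LP_SHIFT)

int
main(void)
{
	assert(LPTE_LP_4K_16M == 0x38000ULL);
	assert((LPTE_LP_4K_16M & ~LPTE_LP_MASK) == 0);	/* fits in the LP field */
	return (0);
}
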
Index: sys/powerpc/include/slb.h
===================================================================
--- sys/powerpc/include/slb.h
+++ sys/powerpc/include/slb.h
@@ -64,6 +64,14 @@
#define SLBE_ESID_MASK 0xfffffffff0000000UL /* Effective segment ID mask */
#define SLBE_ESID_SHIFT 28
+/*
+ * SLB page sizes encoding, as present in property ibm,segment-page-sizes
+ * of CPU device tree node.
+ *
+ * See LoPAPR: CPU Node Properties, section C.6.1.4.
+ */
+#define SLB_PGSZ_4K_4K 0
+
/* Virtual real-mode VSID in LPARs */
#define VSID_VRMA 0x1ffffff
Index: sys/powerpc/include/vmparam.h
===================================================================
--- sys/powerpc/include/vmparam.h
+++ sys/powerpc/include/vmparam.h
@@ -186,30 +186,41 @@
#define VM_FREELIST_DEFAULT 0
/*
- * The largest allocation size is 4MB.
+ * By default, enable superpages for PPC64, except for BOOKE (which uses
+ * a different MMU).
*/
+#if defined(__powerpc64__) && !defined(BOOKE)
+#define PPC_SUPERPAGES
+#endif
+
#ifdef __powerpc64__
+/* The largest allocation size is 16MB. */
#define VM_NFREEORDER 13
#else
+/* The largest allocation size is 4MB. */
#define VM_NFREEORDER 11
#endif
#ifndef VM_NRESERVLEVEL
#ifdef __powerpc64__
+/* Enable superpage reservations: 1 level. */
#define VM_NRESERVLEVEL 1
#else
-/*
- * Disable superpage reservations.
- */
+/* Disable superpage reservations. */
#define VM_NRESERVLEVEL 0
#endif
#endif
-/*
- * Level 0 reservations consist of 512 pages.
- */
#ifndef VM_LEVEL_0_ORDER
-#define VM_LEVEL_0_ORDER 9
+/* Level 0 reservations consist of 512 (RPT) or 4096 (HPT) pages. */
+#define VM_LEVEL_0_ORDER vm_level_0_order
+#ifndef __ASSEMBLER__
+extern int vm_level_0_order;
+#endif
+#endif
+
+#ifndef VM_LEVEL_0_ORDER_MAX
+#define VM_LEVEL_0_ORDER_MAX 12
#endif
#ifdef __powerpc64__
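
The relationship between the reservation order and the superpage size is simply "order + PAGE_SHIFT bits": the default order of 9 (set in pmap_dispatch.c below) gives 512 x 4 KB = 2 MB, while the HPT value of 12 used by this diff gives 4096 x 4 KB = 16 MB, matching PDRSHIFT = 24 in param.h. A standalone sketch of the arithmetic:

#include <stdio.h>

#define PAGE_SHIFT	12	/* 4 KB base pages */

int
main(void)
{
	int orders[] = { 9, 12 };	/* RPT default vs. HPT (this diff) */

	for (int i = 0; i < 2; i++)
		printf("VM_LEVEL_0_ORDER %d -> %lu MB superpage\n", orders[i],
		    (1UL << (orders[i] + PAGE_SHIFT)) >> 20);
	return (0);
}
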
Index: sys/powerpc/powernv/platform_powernv.c
===================================================================
--- sys/powerpc/powernv/platform_powernv.c
+++ sys/powerpc/powernv/platform_powernv.c
@@ -142,6 +142,7 @@
phandle_t opal;
int res, len, idx;
register_t msr;
+ bool has_lp;
/* Ping OPAL again just to make sure */
opal_check();
@@ -225,6 +226,7 @@
sizeof(arr));
len /= 4;
idx = 0;
+ has_lp = false;
while (len > 0) {
shift = arr[idx];
slb_encoding = arr[idx + 1];
@@ -235,17 +237,21 @@
lp_size = arr[idx];
lp_encoding = arr[idx+1];
if (slb_encoding == SLBV_L && lp_encoding == 0)
- break;
+ has_lp = true;
+
+ if (slb_encoding == SLB_PGSZ_4K_4K &&
+ lp_encoding == LP_4K_16M)
+ moea64_has_lp_4k_16m = true;
idx += 2;
len -= 2;
nptlp--;
}
- if (nptlp && slb_encoding == SLBV_L && lp_encoding == 0)
+ if (has_lp && moea64_has_lp_4k_16m)
break;
}
- if (len == 0)
+ if (!has_lp)
panic("Standard large pages (SLB[L] = 1, PTE[LP] = 0) "
"not supported by this system.");
Index: sys/powerpc/powerpc/pmap_dispatch.c
===================================================================
--- sys/powerpc/powerpc/pmap_dispatch.c
+++ sys/powerpc/powerpc/pmap_dispatch.c
@@ -77,6 +77,8 @@
caddr_t crashdumpmap;
int pmap_bootstrapped;
+/* Default level 0 reservations consist of 512 pages (2MB superpage). */
+int vm_level_0_order = 9;
#ifdef AIM
int
Index: sys/powerpc/pseries/mmu_phyp.c
===================================================================
--- sys/powerpc/pseries/mmu_phyp.c
+++ sys/powerpc/pseries/mmu_phyp.c
@@ -135,6 +135,7 @@
uint64_t vsid;
phandle_t dev, node, root;
int idx, len, res;
+ bool has_lp;
rm_init(&mphyp_eviction_lock, "pte eviction");
@@ -199,6 +200,7 @@
sizeof(arr));
len /= 4;
idx = 0;
+ has_lp = false;
while (len > 0) {
shift = arr[idx];
slb_encoding = arr[idx + 1];
@@ -220,18 +222,22 @@
lp_encoding);
if (slb_encoding == SLBV_L && lp_encoding == 0)
- break;
+ has_lp = true;
+
+ if (slb_encoding == SLB_PGSZ_4K_4K &&
+ lp_encoding == LP_4K_16M)
+ moea64_has_lp_4k_16m = true;
idx += 2;
len -= 2;
nptlp--;
}
dprintf("\n");
- if (nptlp && slb_encoding == SLBV_L && lp_encoding == 0)
+ if (has_lp && moea64_has_lp_4k_16m)
break;
}
- if (len > 0) {
+ if (has_lp) {
moea64_large_page_shift = shift;
moea64_large_page_size = 1ULL << lp_size;
moea64_large_page_mask = moea64_large_page_size - 1;
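
Both platform back ends (powernv above and phyp here) now scan ibm,segment-page-sizes for two things at once: the classic large-page entry and the 4 KB base / 16 MB actual encoding needed for superpages. A standalone sketch of that scan over the property's { shift, slb-encoding, nptlp, nptlp x { lp-shift, penc } } records, using a hypothetical cell array:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define SLB_PGSZ_4K_4K	0
#define LP_4K_16M	0x38

/*
 * Walk records of the form { shift, slb_encoding, nptlp,
 * nptlp x { lp_shift, lp_encoding } } and report whether a
 * 4 KB base / 16 MB actual page size encoding is present.
 */
static bool
has_lp_4k_16m(const uint32_t *arr, int len)
{
	int idx = 0, nptlp;
	uint32_t slb_encoding, lp_encoding;

	while (len > 0) {
		slb_encoding = arr[idx + 1];
		nptlp = arr[idx + 2];
		idx += 3;
		len -= 3;
		while (len > 0 && nptlp > 0) {
			lp_encoding = arr[idx + 1];
			if (slb_encoding == SLB_PGSZ_4K_4K &&
			    lp_encoding == LP_4K_16M)
				return (true);
			idx += 2;
			len -= 2;
			nptlp--;
		}
	}
	return (false);
}

int
main(void)
{
	/* Hypothetical cells: 4 KB base pages supporting penc 0 and 0x38. */
	uint32_t arr[] = { 12, SLB_PGSZ_4K_4K, 2, 12, 0, 24, LP_4K_16M };

	printf("%s\n", has_lp_4k_16m(arr, 7) ?
	    "4K/16M supported" : "not supported");
	return (0);
}
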
Index: sys/vm/vm_fault.c
===================================================================
--- sys/vm/vm_fault.c
+++ sys/vm/vm_fault.c
@@ -497,7 +497,8 @@
pidx += npages, m = vm_page_next(&m[npages - 1])) {
vaddr = fs->entry->start + IDX_TO_OFF(pidx) - fs->entry->offset;
#if defined(__aarch64__) || defined(__amd64__) || (defined(__arm__) && \
- __ARM_ARCH >= 6) || defined(__i386__) || defined(__riscv)
+ __ARM_ARCH >= 6) || defined(__i386__) || defined(__riscv) || \
+ defined(__powerpc64__)
psind = m->psind;
if (psind > 0 && ((vaddr & (pagesizes[psind] - 1)) != 0 ||
pidx + OFF_TO_IDX(pagesizes[psind]) - 1 > pager_last ||
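
The vm_fault change above only enables psind > 0 faults on powerpc64; the decision of when a 16 MB mapping may actually be used follows the test added to moea64_enter_object(). A standalone sketch of that check, with hypothetical values:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define SP_SIZE	(UINT64_C(16) * 1024 * 1024)	/* 16 MB superpage */
#define SP_MASK	(SP_SIZE - 1)

/*
 * Mirror of the moea64_enter_object() test: use psind 1 only if the VA is
 * superpage-aligned, the whole 16 MB fits before 'end', the backing page run
 * is a fully populated superpage (m->psind == 1), and superpages are enabled.
 */
static int
choose_psind(uint64_t va, uint64_t end, int m_psind, bool ps_enabled)
{
	if ((va & SP_MASK) == 0 && va + SP_SIZE <= end &&
	    m_psind == 1 && ps_enabled)
		return (1);
	return (0);
}

int
main(void)
{
	printf("%d\n", choose_psind(0x10000000, 0x12000000, 1, true));	/* 1 */
	printf("%d\n", choose_psind(0x10001000, 0x12000000, 1, true));	/* 0: unaligned VA */
	return (0);
}
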
