Index: head/sys/powerpc/aim/mmu_oea64.c =================================================================== --- head/sys/powerpc/aim/mmu_oea64.c (revision 279251) +++ head/sys/powerpc/aim/mmu_oea64.c (revision 279252) @@ -1,2695 +1,2649 @@ /*- - * Copyright (c) 2001 The NetBSD Foundation, Inc. + * Copyright (c) 2008-2015 Nathan Whitehorn * All rights reserved. * - * This code is derived from software contributed to The NetBSD Foundation - * by Matt Thomas of Allegro Networks, Inc. - * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. * - * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS - * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS - * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ -/*- - * Copyright (C) 1995, 1996 Wolfgang Solfrank. - * Copyright (C) 1995, 1996 TooLs GmbH. - * All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by TooLs GmbH. - * 4. The name of TooLs GmbH may not be used to endorse or promote products - * derived from this software without specific prior written permission. * - * THIS SOFTWARE IS PROVIDED BY TOOLS GMBH ``AS IS'' AND ANY EXPRESS OR + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. - * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; - * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, - * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR - * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF - * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- * - * $NetBSD: pmap.c,v 1.28 2000/03/26 20:42:36 kleink Exp $ + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -/*- - * Copyright (C) 2001 Benno Rice. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY Benno Rice ``AS IS'' AND ANY EXPRESS OR - * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES - * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. - * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; - * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, - * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR - * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF - * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ #include __FBSDID("$FreeBSD$"); /* * Manages physical address maps. 
* * Since the information managed by this module is also stored by the * logical address mapping module, this module may throw away valid virtual * to physical mappings at almost any time. However, invalidations of * mappings must be done as requested. * * In order to cope with hardware architectures which make virtual to * physical map invalidates expensive, this module may delay invalidate * reduced protection operations until such time as they are actually * necessary. This module is given full information as to which processors * are currently using which maps, and to when physical maps must be made * correct. */ #include "opt_compat.h" #include "opt_kstack_pages.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "mmu_oea64.h" #include "mmu_if.h" #include "moea64_if.h" void moea64_release_vsid(uint64_t vsid); uintptr_t moea64_get_unique_vsid(void); #define DISABLE_TRANS(msr) msr = mfmsr(); mtmsr(msr & ~PSL_DR) #define ENABLE_TRANS(msr) mtmsr(msr) #define VSID_MAKE(sr, hash) ((sr) | (((hash) & 0xfffff) << 4)) #define VSID_TO_HASH(vsid) (((vsid) >> 4) & 0xfffff) #define VSID_HASH_MASK 0x0000007fffffffffULL /* * Locking semantics: - * -- Read lock: if no modifications are being made to either the PVO lists - * or page table or if any modifications being made result in internal - * changes (e.g. wiring, protection) such that the existence of the PVOs - * is unchanged and they remain associated with the same pmap (in which - * case the changes should be protected by the pmap lock) - * -- Write lock: required if PTEs/PVOs are being inserted or removed. 
+ * + * There are two locks of interest: the page locks and the pmap locks, which + * protect their individual PVO lists and are locked in that order. The contents + * of all PVO entries are protected by the locks of their respective pmaps. + * The pmap of any PVO is guaranteed not to change so long as the PVO is linked + * into any list. + * */ -#define LOCK_TABLE_RD() rw_rlock(&moea64_table_lock) -#define UNLOCK_TABLE_RD() rw_runlock(&moea64_table_lock) -#define LOCK_TABLE_WR() rw_wlock(&moea64_table_lock) -#define UNLOCK_TABLE_WR() rw_wunlock(&moea64_table_lock) +/* + * Number of page (PV) locks. The expansion is parenthesized so that + * "pa_index(pa) % PV_LOCK_COUNT" in PV_LOCKPTR() takes the modulus over the + * whole pv_lock[] array instead of computing (x % PA_LOCK_COUNT) * 3. + */ +#define PV_LOCK_COUNT (PA_LOCK_COUNT * 3) +static struct mtx_padalign pv_lock[PV_LOCK_COUNT]; + +#define PV_LOCKPTR(pa) ((struct mtx *)(&pv_lock[pa_index(pa) % PV_LOCK_COUNT])) +#define PV_LOCK(pa) mtx_lock(PV_LOCKPTR(pa)) +#define PV_UNLOCK(pa) mtx_unlock(PV_LOCKPTR(pa)) +#define PV_LOCKASSERT(pa) mtx_assert(PV_LOCKPTR(pa), MA_OWNED) +#define PV_PAGE_LOCK(m) PV_LOCK(VM_PAGE_TO_PHYS(m)) +#define PV_PAGE_UNLOCK(m) PV_UNLOCK(VM_PAGE_TO_PHYS(m)) +#define PV_PAGE_LOCKASSERT(m) PV_LOCKASSERT(VM_PAGE_TO_PHYS(m)) struct ofw_map { cell_t om_va; cell_t om_len; uint64_t om_pa; cell_t om_mode; }; extern unsigned char _etext[]; extern unsigned char _end[]; extern int ofw_real_mode; /* * Map of physical memory regions. */ static struct mem_region *regions; static struct mem_region *pregions; static u_int phys_avail_count; static int regions_sz, pregions_sz; extern void bs_remap_earlyboot(void); /* - * Lock for the pteg and pvo tables. + * Lock for the SLB tables. */ -struct rwlock moea64_table_lock; struct mtx moea64_slb_mutex; /* * PTEG data. */ u_int moea64_pteg_count; u_int moea64_pteg_mask; /* * PVO data. 
*/ -struct pvo_head *moea64_pvo_table; /* pvo entries by pteg index */ -uma_zone_t moea64_upvo_zone; /* zone for pvo entries for unmanaged pages */ -uma_zone_t moea64_mpvo_zone; /* zone for pvo entries for managed pages */ +uma_zone_t moea64_pvo_zone; /* zone for pvo entries */ static struct pvo_entry *moea64_bpvo_pool; static int moea64_bpvo_pool_index = 0; static int moea64_bpvo_pool_size = 327680; TUNABLE_INT("machdep.moea64_bpvo_pool_size", &moea64_bpvo_pool_size); SYSCTL_INT(_machdep, OID_AUTO, moea64_allocated_bpvo_entries, CTLFLAG_RD, &moea64_bpvo_pool_index, 0, ""); #define VSID_NBPW (sizeof(u_int32_t) * 8) #ifdef __powerpc64__ #define NVSIDS (NPMAPS * 16) #define VSID_HASHMASK 0xffffffffUL #else #define NVSIDS NPMAPS #define VSID_HASHMASK 0xfffffUL #endif static u_int moea64_vsid_bitmap[NVSIDS / VSID_NBPW]; static boolean_t moea64_initialized = FALSE; /* * Statistics. */ u_int moea64_pte_valid = 0; u_int moea64_pte_overflow = 0; u_int moea64_pvo_entries = 0; u_int moea64_pvo_enter_calls = 0; u_int moea64_pvo_remove_calls = 0; SYSCTL_INT(_machdep, OID_AUTO, moea64_pte_valid, CTLFLAG_RD, &moea64_pte_valid, 0, ""); SYSCTL_INT(_machdep, OID_AUTO, moea64_pte_overflow, CTLFLAG_RD, &moea64_pte_overflow, 0, ""); SYSCTL_INT(_machdep, OID_AUTO, moea64_pvo_entries, CTLFLAG_RD, &moea64_pvo_entries, 0, ""); SYSCTL_INT(_machdep, OID_AUTO, moea64_pvo_enter_calls, CTLFLAG_RD, &moea64_pvo_enter_calls, 0, ""); SYSCTL_INT(_machdep, OID_AUTO, moea64_pvo_remove_calls, CTLFLAG_RD, &moea64_pvo_remove_calls, 0, ""); vm_offset_t moea64_scratchpage_va[2]; struct pvo_entry *moea64_scratchpage_pvo[2]; -uintptr_t moea64_scratchpage_pte[2]; struct mtx moea64_scratchpage_mtx; uint64_t moea64_large_page_mask = 0; uint64_t moea64_large_page_size = 0; int moea64_large_page_shift = 0; /* * PVO calls. 
*/ -static int moea64_pvo_enter(mmu_t, pmap_t, uma_zone_t, struct pvo_head *, - vm_offset_t, vm_offset_t, uint64_t, int, int8_t); -static void moea64_pvo_remove(mmu_t, struct pvo_entry *); +static int moea64_pvo_enter(mmu_t mmu, struct pvo_entry *pvo, + struct pvo_head *pvo_head); +static void moea64_pvo_remove_from_pmap(mmu_t mmu, struct pvo_entry *pvo); +static void moea64_pvo_remove_from_page(mmu_t mmu, struct pvo_entry *pvo); static struct pvo_entry *moea64_pvo_find_va(pmap_t, vm_offset_t); /* * Utility routines. */ -static boolean_t moea64_query_bit(mmu_t, vm_page_t, u_int64_t); -static u_int moea64_clear_bit(mmu_t, vm_page_t, u_int64_t); +static boolean_t moea64_query_bit(mmu_t, vm_page_t, uint64_t); +static u_int moea64_clear_bit(mmu_t, vm_page_t, uint64_t); static void moea64_kremove(mmu_t, vm_offset_t); static void moea64_syncicache(mmu_t, pmap_t pmap, vm_offset_t va, vm_offset_t pa, vm_size_t sz); /* * Kernel MMU interface */ void moea64_clear_modify(mmu_t, vm_page_t); void moea64_copy_page(mmu_t, vm_page_t, vm_page_t); void moea64_copy_pages(mmu_t mmu, vm_page_t *ma, vm_offset_t a_offset, vm_page_t *mb, vm_offset_t b_offset, int xfersize); int moea64_enter(mmu_t, pmap_t, vm_offset_t, vm_page_t, vm_prot_t, u_int flags, int8_t psind); void moea64_enter_object(mmu_t, pmap_t, vm_offset_t, vm_offset_t, vm_page_t, vm_prot_t); void moea64_enter_quick(mmu_t, pmap_t, vm_offset_t, vm_page_t, vm_prot_t); vm_paddr_t moea64_extract(mmu_t, pmap_t, vm_offset_t); vm_page_t moea64_extract_and_hold(mmu_t, pmap_t, vm_offset_t, vm_prot_t); void moea64_init(mmu_t); boolean_t moea64_is_modified(mmu_t, vm_page_t); boolean_t moea64_is_prefaultable(mmu_t, pmap_t, vm_offset_t); boolean_t moea64_is_referenced(mmu_t, vm_page_t); int moea64_ts_referenced(mmu_t, vm_page_t); vm_offset_t moea64_map(mmu_t, vm_offset_t *, vm_paddr_t, vm_paddr_t, int); boolean_t moea64_page_exists_quick(mmu_t, pmap_t, vm_page_t); int moea64_page_wired_mappings(mmu_t, vm_page_t); void moea64_pinit(mmu_t, 
pmap_t); void moea64_pinit0(mmu_t, pmap_t); void moea64_protect(mmu_t, pmap_t, vm_offset_t, vm_offset_t, vm_prot_t); void moea64_qenter(mmu_t, vm_offset_t, vm_page_t *, int); void moea64_qremove(mmu_t, vm_offset_t, int); void moea64_release(mmu_t, pmap_t); void moea64_remove(mmu_t, pmap_t, vm_offset_t, vm_offset_t); void moea64_remove_pages(mmu_t, pmap_t); void moea64_remove_all(mmu_t, vm_page_t); void moea64_remove_write(mmu_t, vm_page_t); void moea64_unwire(mmu_t, pmap_t, vm_offset_t, vm_offset_t); void moea64_zero_page(mmu_t, vm_page_t); void moea64_zero_page_area(mmu_t, vm_page_t, int, int); void moea64_zero_page_idle(mmu_t, vm_page_t); void moea64_activate(mmu_t, struct thread *); void moea64_deactivate(mmu_t, struct thread *); void *moea64_mapdev(mmu_t, vm_paddr_t, vm_size_t); void *moea64_mapdev_attr(mmu_t, vm_offset_t, vm_size_t, vm_memattr_t); void moea64_unmapdev(mmu_t, vm_offset_t, vm_size_t); vm_paddr_t moea64_kextract(mmu_t, vm_offset_t); void moea64_page_set_memattr(mmu_t, vm_page_t m, vm_memattr_t ma); void moea64_kenter_attr(mmu_t, vm_offset_t, vm_offset_t, vm_memattr_t ma); void moea64_kenter(mmu_t, vm_offset_t, vm_paddr_t); boolean_t moea64_dev_direct_mapped(mmu_t, vm_paddr_t, vm_size_t); static void moea64_sync_icache(mmu_t, pmap_t, vm_offset_t, vm_size_t); void moea64_dumpsys_map(mmu_t mmu, vm_paddr_t pa, size_t sz, void **va); void moea64_scan_init(mmu_t mmu); static mmu_method_t moea64_methods[] = { MMUMETHOD(mmu_clear_modify, moea64_clear_modify), MMUMETHOD(mmu_copy_page, moea64_copy_page), MMUMETHOD(mmu_copy_pages, moea64_copy_pages), MMUMETHOD(mmu_enter, moea64_enter), MMUMETHOD(mmu_enter_object, moea64_enter_object), MMUMETHOD(mmu_enter_quick, moea64_enter_quick), MMUMETHOD(mmu_extract, moea64_extract), MMUMETHOD(mmu_extract_and_hold, moea64_extract_and_hold), MMUMETHOD(mmu_init, moea64_init), MMUMETHOD(mmu_is_modified, moea64_is_modified), MMUMETHOD(mmu_is_prefaultable, moea64_is_prefaultable), MMUMETHOD(mmu_is_referenced, 
moea64_is_referenced), MMUMETHOD(mmu_ts_referenced, moea64_ts_referenced), MMUMETHOD(mmu_map, moea64_map), MMUMETHOD(mmu_page_exists_quick,moea64_page_exists_quick), MMUMETHOD(mmu_page_wired_mappings,moea64_page_wired_mappings), MMUMETHOD(mmu_pinit, moea64_pinit), MMUMETHOD(mmu_pinit0, moea64_pinit0), MMUMETHOD(mmu_protect, moea64_protect), MMUMETHOD(mmu_qenter, moea64_qenter), MMUMETHOD(mmu_qremove, moea64_qremove), MMUMETHOD(mmu_release, moea64_release), MMUMETHOD(mmu_remove, moea64_remove), MMUMETHOD(mmu_remove_pages, moea64_remove_pages), MMUMETHOD(mmu_remove_all, moea64_remove_all), MMUMETHOD(mmu_remove_write, moea64_remove_write), MMUMETHOD(mmu_sync_icache, moea64_sync_icache), MMUMETHOD(mmu_unwire, moea64_unwire), MMUMETHOD(mmu_zero_page, moea64_zero_page), MMUMETHOD(mmu_zero_page_area, moea64_zero_page_area), MMUMETHOD(mmu_zero_page_idle, moea64_zero_page_idle), MMUMETHOD(mmu_activate, moea64_activate), MMUMETHOD(mmu_deactivate, moea64_deactivate), MMUMETHOD(mmu_page_set_memattr, moea64_page_set_memattr), /* Internal interfaces */ MMUMETHOD(mmu_mapdev, moea64_mapdev), MMUMETHOD(mmu_mapdev_attr, moea64_mapdev_attr), MMUMETHOD(mmu_unmapdev, moea64_unmapdev), MMUMETHOD(mmu_kextract, moea64_kextract), MMUMETHOD(mmu_kenter, moea64_kenter), MMUMETHOD(mmu_kenter_attr, moea64_kenter_attr), MMUMETHOD(mmu_dev_direct_mapped,moea64_dev_direct_mapped), MMUMETHOD(mmu_scan_init, moea64_scan_init), MMUMETHOD(mmu_dumpsys_map, moea64_dumpsys_map), { 0, 0 } }; MMU_DEF(oea64_mmu, "mmu_oea64_base", moea64_methods, 0); -static __inline u_int -va_to_pteg(uint64_t vsid, vm_offset_t addr, int large) +static struct pvo_head * +vm_page_to_pvoh(vm_page_t m) { + + mtx_assert(PV_LOCKPTR(VM_PAGE_TO_PHYS(m)), MA_OWNED); + return (&m->md.mdpg_pvoh); +} + +/* + * Allocate a zeroed PVO entry. + * + * Before moea64_initialized is set, or when "bootstrap" is non-zero, the + * entry is taken from the static bootstrap pool (panicking once the pool is + * exhausted) and tagged PVO_BOOTSTRAP so that free_pvo_entry() never hands + * it to UMA. Otherwise it is allocated from moea64_pvo_zone with M_NOWAIT; + * that allocation can fail, in which case NULL is returned. + */ +static struct pvo_entry * +alloc_pvo_entry(int bootstrap) +{ + struct pvo_entry *pvo; + + if (!moea64_initialized || bootstrap) { + if (moea64_bpvo_pool_index >= moea64_bpvo_pool_size) { + panic("moea64_enter: bpvo pool exhausted, %d, %d, 
%zd", + moea64_bpvo_pool_index, moea64_bpvo_pool_size, + moea64_bpvo_pool_size * sizeof(struct pvo_entry)); + } + pvo = &moea64_bpvo_pool[ + atomic_fetchadd_int(&moea64_bpvo_pool_index, 1)]; + bzero(pvo, sizeof(*pvo)); + pvo->pvo_vaddr = PVO_BOOTSTRAP; + } else { + pvo = uma_zalloc(moea64_pvo_zone, M_NOWAIT); + /* M_NOWAIT allocations may fail; never zero through NULL. */ + if (pvo != NULL) + bzero(pvo, sizeof(*pvo)); + } + + return (pvo); +} + + +static void +init_pvo_entry(struct pvo_entry *pvo, pmap_t pmap, vm_offset_t va) +{ + uint64_t vsid; uint64_t hash; int shift; - shift = large ? moea64_large_page_shift : ADDR_PIDX_SHFT; - hash = (vsid & VSID_HASH_MASK) ^ (((uint64_t)addr & ADDR_PIDX) >> - shift); - return (hash & moea64_pteg_mask); + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + + pvo->pvo_pmap = pmap; + va &= ~ADDR_POFF; + pvo->pvo_vaddr |= va; + vsid = va_to_vsid(pmap, va); + pvo->pvo_vpn = (uint64_t)((va & ADDR_PIDX) >> ADDR_PIDX_SHFT) + | (vsid << 16); + + shift = (pvo->pvo_vaddr & PVO_LARGE) ? moea64_large_page_shift : + ADDR_PIDX_SHFT; + hash = (vsid & VSID_HASH_MASK) ^ (((uint64_t)va & ADDR_PIDX) >> shift); + pvo->pvo_pte.slot = (hash & moea64_pteg_mask) << 3; } -static __inline struct pvo_head * -vm_page_to_pvoh(vm_page_t m) +static void +free_pvo_entry(struct pvo_entry *pvo) { - return (&m->md.mdpg_pvoh); + if (!(pvo->pvo_vaddr & PVO_BOOTSTRAP)) + uma_zfree(moea64_pvo_zone, pvo); } -static __inline void -moea64_pte_create(struct lpte *pt, uint64_t vsid, vm_offset_t va, - uint64_t pte_lo, int flags) +void +moea64_pte_from_pvo(const struct pvo_entry *pvo, struct lpte *lpte) { - /* - * Construct a PTE. Default to IMB initially. Valid bit only gets - * set when the real pte is set in memory. - * - * Note: Don't set the valid bit for correct operation of tlb update. 
- */ - pt->pte_hi = (vsid << LPTE_VSID_SHIFT) | - (((uint64_t)(va & ADDR_PIDX) >> ADDR_API_SHFT64) & LPTE_API); + lpte->pte_hi = (pvo->pvo_vpn >> (ADDR_API_SHFT64 - ADDR_PIDX_SHFT)) & + LPTE_AVPN_MASK; + lpte->pte_hi |= LPTE_VALID; + + if (pvo->pvo_vaddr & PVO_LARGE) + lpte->pte_hi |= LPTE_BIG; + if (pvo->pvo_vaddr & PVO_WIRED) + lpte->pte_hi |= LPTE_WIRED; + if (pvo->pvo_vaddr & PVO_HID) + lpte->pte_hi |= LPTE_HID; - if (flags & PVO_LARGE) - pt->pte_hi |= LPTE_BIG; + lpte->pte_lo = pvo->pvo_pte.pa; /* Includes WIMG bits */ + if (pvo->pvo_pte.prot & VM_PROT_WRITE) + lpte->pte_lo |= LPTE_BW; + else + lpte->pte_lo |= LPTE_BR; - pt->pte_lo = pte_lo; + if (!(pvo->pvo_pte.prot & VM_PROT_EXECUTE)) + lpte->pte_lo |= LPTE_NOEXEC; } static __inline uint64_t moea64_calc_wimg(vm_offset_t pa, vm_memattr_t ma) { uint64_t pte_lo; int i; if (ma != VM_MEMATTR_DEFAULT) { switch (ma) { case VM_MEMATTR_UNCACHEABLE: return (LPTE_I | LPTE_G); case VM_MEMATTR_WRITE_COMBINING: case VM_MEMATTR_WRITE_BACK: case VM_MEMATTR_PREFETCHABLE: return (LPTE_I); case VM_MEMATTR_WRITE_THROUGH: return (LPTE_W | LPTE_M); } } /* * Assume the page is cache inhibited and access is guarded unless * it's in our available memory array. */ pte_lo = LPTE_I | LPTE_G; for (i = 0; i < pregions_sz; i++) { if ((pa >= pregions[i].mr_start) && (pa < (pregions[i].mr_start + pregions[i].mr_size))) { pte_lo &= ~(LPTE_I | LPTE_G); pte_lo |= LPTE_M; break; } } return pte_lo; } /* * Quick sort callout for comparing memory regions. 
*/ static int om_cmp(const void *a, const void *b); static int om_cmp(const void *a, const void *b) { const struct ofw_map *mapa; const struct ofw_map *mapb; mapa = a; mapb = b; if (mapa->om_pa < mapb->om_pa) return (-1); else if (mapa->om_pa > mapb->om_pa) return (1); else return (0); } static void moea64_add_ofw_mappings(mmu_t mmup, phandle_t mmu, size_t sz) { struct ofw_map translations[sz/(4*sizeof(cell_t))]; /*>= 4 cells per */ pcell_t acells, trans_cells[sz/sizeof(cell_t)]; + struct pvo_entry *pvo; register_t msr; vm_offset_t off; vm_paddr_t pa_base; int i, j; bzero(translations, sz); OF_getprop(OF_finddevice("/"), "#address-cells", &acells, sizeof(acells)); if (OF_getprop(mmu, "translations", trans_cells, sz) == -1) panic("moea64_bootstrap: can't get ofw translations"); CTR0(KTR_PMAP, "moea64_add_ofw_mappings: translations"); sz /= sizeof(cell_t); for (i = 0, j = 0; i < sz; j++) { translations[j].om_va = trans_cells[i++]; translations[j].om_len = trans_cells[i++]; translations[j].om_pa = trans_cells[i++]; if (acells == 2) { translations[j].om_pa <<= 32; translations[j].om_pa |= trans_cells[i++]; } translations[j].om_mode = trans_cells[i++]; } KASSERT(i == sz, ("Translations map has incorrect cell count (%d/%zd)", i, sz)); sz = j; qsort(translations, sz, sizeof (*translations), om_cmp); for (i = 0; i < sz; i++) { pa_base = translations[i].om_pa; #ifndef __powerpc64__ if ((translations[i].om_pa >> 32) != 0) panic("OFW translations above 32-bit boundary!"); #endif if (pa_base % PAGE_SIZE) panic("OFW translation not page-aligned (phys)!"); if (translations[i].om_va % PAGE_SIZE) panic("OFW translation not page-aligned (virt)!"); CTR3(KTR_PMAP, "translation: pa=%#zx va=%#x len=%#x", pa_base, translations[i].om_va, translations[i].om_len); /* Now enter the pages for this mapping */ DISABLE_TRANS(msr); for (off = 0; off < translations[i].om_len; off += PAGE_SIZE) { /* If this address is direct-mapped, skip remapping */ if (hw_direct_map && translations[i].om_va == 
pa_base && moea64_calc_wimg(pa_base + off, VM_MEMATTR_DEFAULT) == LPTE_M) continue; - if (moea64_pvo_find_va(kernel_pmap, - translations[i].om_va + off) != NULL) + PMAP_LOCK(kernel_pmap); + pvo = moea64_pvo_find_va(kernel_pmap, + translations[i].om_va + off); + PMAP_UNLOCK(kernel_pmap); + if (pvo != NULL) continue; moea64_kenter(mmup, translations[i].om_va + off, pa_base + off); } ENABLE_TRANS(msr); } } #ifdef __powerpc64__ static void moea64_probe_large_page(void) { uint16_t pvr = mfpvr() >> 16; switch (pvr) { case IBM970: case IBM970FX: case IBM970MP: powerpc_sync(); isync(); mtspr(SPR_HID4, mfspr(SPR_HID4) & ~HID4_970_DISABLE_LG_PG); powerpc_sync(); isync(); /* FALLTHROUGH */ default: moea64_large_page_size = 0x1000000; /* 16 MB */ moea64_large_page_shift = 24; } moea64_large_page_mask = moea64_large_page_size - 1; } static void moea64_bootstrap_slb_prefault(vm_offset_t va, int large) { struct slb *cache; struct slb entry; uint64_t esid, slbe; uint64_t i; cache = PCPU_GET(slb); esid = va >> ADDR_SR_SHFT; slbe = (esid << SLBE_ESID_SHIFT) | SLBE_VALID; for (i = 0; i < 64; i++) { if (cache[i].slbe == (slbe | i)) return; } entry.slbe = slbe; entry.slbv = KERNEL_VSID(esid) << SLBV_VSID_SHIFT; if (large) entry.slbv |= SLBV_L; slb_insert_kernel(entry.slbe, entry.slbv); } #endif static void moea64_setup_direct_map(mmu_t mmup, vm_offset_t kernelstart, vm_offset_t kernelend) { + struct pvo_entry *pvo; register_t msr; vm_paddr_t pa; vm_offset_t size, off; uint64_t pte_lo; int i; if (moea64_large_page_size == 0) hw_direct_map = 0; DISABLE_TRANS(msr); if (hw_direct_map) { - LOCK_TABLE_WR(); PMAP_LOCK(kernel_pmap); for (i = 0; i < pregions_sz; i++) { for (pa = pregions[i].mr_start; pa < pregions[i].mr_start + pregions[i].mr_size; pa += moea64_large_page_size) { pte_lo = LPTE_M; + pvo = alloc_pvo_entry(1 /* bootstrap */); + pvo->pvo_vaddr |= PVO_WIRED | PVO_LARGE; + init_pvo_entry(pvo, kernel_pmap, pa); + /* * Set memory access as guarded if prefetch within * the page could 
exit the available physmem area. */ if (pa & moea64_large_page_mask) { pa &= moea64_large_page_mask; pte_lo |= LPTE_G; } if (pa + moea64_large_page_size > pregions[i].mr_start + pregions[i].mr_size) pte_lo |= LPTE_G; - moea64_pvo_enter(mmup, kernel_pmap, moea64_upvo_zone, - NULL, pa, pa, pte_lo, - PVO_WIRED | PVO_LARGE, 0); + pvo->pvo_pte.prot = VM_PROT_READ | VM_PROT_WRITE | + VM_PROT_EXECUTE; + pvo->pvo_pte.pa = pa | pte_lo; + moea64_pvo_enter(mmup, pvo, NULL); } } PMAP_UNLOCK(kernel_pmap); - UNLOCK_TABLE_WR(); } else { - size = sizeof(struct pvo_head) * moea64_pteg_count; - off = (vm_offset_t)(moea64_pvo_table); - for (pa = off; pa < off + size; pa += PAGE_SIZE) - moea64_kenter(mmup, pa, pa); size = moea64_bpvo_pool_size*sizeof(struct pvo_entry); off = (vm_offset_t)(moea64_bpvo_pool); for (pa = off; pa < off + size; pa += PAGE_SIZE) moea64_kenter(mmup, pa, pa); /* * Map certain important things, like ourselves. * * NOTE: We do not map the exception vector space. That code is * used only in real mode, and leaving it unmapped allows us to * catch NULL pointer deferences, instead of making NULL a valid * address. */ for (pa = kernelstart & ~PAGE_MASK; pa < kernelend; pa += PAGE_SIZE) moea64_kenter(mmup, pa, pa); } ENABLE_TRANS(msr); /* * Allow user to override unmapped_buf_allowed for testing. * XXXKIB Only direct map implementation was tested. 
*/ if (!TUNABLE_INT_FETCH("vfs.unmapped_buf_allowed", &unmapped_buf_allowed)) unmapped_buf_allowed = hw_direct_map; } /* * First bootstrap stage: decide whether a direct map is used, fetch the * physical memory regions from firmware, build phys_avail[] (honouring the * hw.physmem tunable and carving out the exception vectors and the kernel * image), then derive physmem and the PTEG count from the memory size. */ void moea64_early_bootstrap(mmu_t mmup, vm_offset_t kernelstart, vm_offset_t kernelend) { int i, j; vm_size_t physsz, hwphyssz; #ifndef __powerpc64__ /* We don't have a direct map since there is no BAT */ hw_direct_map = 0; /* Make sure battable is zero, since we have no BAT */ for (i = 0; i < 16; i++) { battable[i].batu = 0; battable[i].batl = 0; } #else moea64_probe_large_page(); /* Use a direct map if we have large page support */ if (moea64_large_page_size > 0) hw_direct_map = 1; else hw_direct_map = 0; #endif /* Get physical memory regions from firmware */ mem_regions(&pregions, &pregions_sz, &regions, &regions_sz); CTR0(KTR_PMAP, "moea64_bootstrap: physical memory"); if (sizeof(phys_avail)/sizeof(phys_avail[0]) < regions_sz) panic("moea64_bootstrap: phys_avail too small"); phys_avail_count = 0; physsz = 0; hwphyssz = 0; TUNABLE_ULONG_FETCH("hw.physmem", (u_long *) &hwphyssz); for (i = 0, j = 0; i < regions_sz; i++, j += 2) { CTR3(KTR_PMAP, "region: %#zx - %#zx (%#zx)", regions[i].mr_start, regions[i].mr_start + regions[i].mr_size, regions[i].mr_size); if (hwphyssz != 0 && (physsz + regions[i].mr_size) >= hwphyssz) { if (physsz < hwphyssz) { phys_avail[j] = regions[i].mr_start; phys_avail[j + 1] = regions[i].mr_start + hwphyssz - physsz; physsz = hwphyssz; phys_avail_count++; } break; } phys_avail[j] = regions[i].mr_start; phys_avail[j + 1] = regions[i].mr_start + regions[i].mr_size; phys_avail_count++; physsz += regions[i].mr_size; } /* Check for overlap with the kernel and exception vectors */ for (j = 0; j < 2*phys_avail_count; j+=2) { if (phys_avail[j] < EXC_LAST) phys_avail[j] += EXC_LAST; if (kernelstart >= phys_avail[j] && kernelstart < phys_avail[j+1]) { if (kernelend < phys_avail[j+1]) { phys_avail[2*phys_avail_count] = (kernelend & ~PAGE_MASK) + PAGE_SIZE; phys_avail[2*phys_avail_count + 1] = phys_avail[j+1]; phys_avail_count++; } 
phys_avail[j+1] = kernelstart & ~PAGE_MASK; } if (kernelend >= phys_avail[j] && kernelend < phys_avail[j+1]) { if (kernelstart > phys_avail[j]) { phys_avail[2*phys_avail_count] = phys_avail[j]; phys_avail[2*phys_avail_count + 1] = kernelstart & ~PAGE_MASK; phys_avail_count++; } phys_avail[j] = (kernelend & ~PAGE_MASK) + PAGE_SIZE; } } physmem = btoc(physsz); #ifdef PTEGCOUNT moea64_pteg_count = PTEGCOUNT; #else moea64_pteg_count = 0x1000; while (moea64_pteg_count < physmem) moea64_pteg_count <<= 1; moea64_pteg_count >>= 1; #endif /* PTEGCOUNT */ } void moea64_mid_bootstrap(mmu_t mmup, vm_offset_t kernelstart, vm_offset_t kernelend) { - vm_size_t size; - register_t msr; int i; /* * Set PTEG mask */ moea64_pteg_mask = moea64_pteg_count - 1; /* - * Allocate pv/overflow lists. + * Initialize SLB table lock and page locks */ - size = sizeof(struct pvo_head) * moea64_pteg_count; - - moea64_pvo_table = (struct pvo_head *)moea64_bootstrap_alloc(size, - PAGE_SIZE); - CTR1(KTR_PMAP, "moea64_bootstrap: PVO table at %p", moea64_pvo_table); - - DISABLE_TRANS(msr); - for (i = 0; i < moea64_pteg_count; i++) - LIST_INIT(&moea64_pvo_table[i]); - ENABLE_TRANS(msr); - - /* - * Initialize the lock that synchronizes access to the pteg and pvo - * tables. - */ - rw_init_flags(&moea64_table_lock, "pmap tables", RW_RECURSE); mtx_init(&moea64_slb_mutex, "SLB table", NULL, MTX_DEF); + for (i = 0; i < PV_LOCK_COUNT; i++) + mtx_init(&pv_lock[i], "page pv", NULL, MTX_DEF); /* - * Initialise the unmanaged pvo pool. + * Initialise the bootstrap pvo pool. */ moea64_bpvo_pool = (struct pvo_entry *)moea64_bootstrap_alloc( moea64_bpvo_pool_size*sizeof(struct pvo_entry), 0); moea64_bpvo_pool_index = 0; /* * Make sure kernel vsid is allocated as well as VSID 0. */ #ifndef __powerpc64__ moea64_vsid_bitmap[(KERNEL_VSIDBITS & (NVSIDS - 1)) / VSID_NBPW] |= 1 << (KERNEL_VSIDBITS % VSID_NBPW); moea64_vsid_bitmap[0] |= 1; #endif /* * Initialize the kernel pmap (which is statically allocated). 
*/ #ifdef __powerpc64__ for (i = 0; i < 64; i++) { pcpup->pc_slb[i].slbv = 0; pcpup->pc_slb[i].slbe = 0; } #else for (i = 0; i < 16; i++) kernel_pmap->pm_sr[i] = EMPTY_SEGMENT + i; #endif kernel_pmap->pmap_phys = kernel_pmap; CPU_FILL(&kernel_pmap->pm_active); RB_INIT(&kernel_pmap->pmap_pvo); PMAP_LOCK_INIT(kernel_pmap); /* * Now map in all the other buffers we allocated earlier */ moea64_setup_direct_map(mmup, kernelstart, kernelend); } void moea64_late_bootstrap(mmu_t mmup, vm_offset_t kernelstart, vm_offset_t kernelend) { ihandle_t mmui; phandle_t chosen; phandle_t mmu; ssize_t sz; int i; vm_offset_t pa, va; void *dpcpu; /* * Set up the Open Firmware pmap and add its mappings if not in real * mode. */ chosen = OF_finddevice("/chosen"); if (!ofw_real_mode && chosen != -1 && OF_getprop(chosen, "mmu", &mmui, 4) != -1) { mmu = OF_instance_to_package(mmui); if (mmu == -1 || (sz = OF_getproplen(mmu, "translations")) == -1) sz = 0; if (sz > 6144 /* tmpstksz - 2 KB headroom */) panic("moea64_bootstrap: too many ofw translations"); if (sz > 0) moea64_add_ofw_mappings(mmup, mmu, sz); } /* * Calculate the last available physical address. */ for (i = 0; phys_avail[i + 2] != 0; i += 2) ; Maxmem = powerpc_btop(phys_avail[i + 1]); /* * Initialize MMU and remap early physical mappings */ MMU_CPU_BOOTSTRAP(mmup,0); mtmsr(mfmsr() | PSL_DR | PSL_IR); pmap_bootstrapped++; bs_remap_earlyboot(); /* * Set the start and end of kva. */ virtual_avail = VM_MIN_KERNEL_ADDRESS; virtual_end = VM_MAX_SAFE_KERNEL_ADDRESS; /* * Map the entire KVA range into the SLB. We must not fault there. */ #ifdef __powerpc64__ for (va = virtual_avail; va < virtual_end; va += SEGMENT_LENGTH) moea64_bootstrap_slb_prefault(va, 0); #endif /* * Figure out how far we can extend virtual_end into segment 16 * without running into existing mappings. Segment 16 is guaranteed * to contain neither RAM nor devices (at least on Apple hardware), * but will generally contain some OFW mappings we should not * step on. 
*/ #ifndef __powerpc64__ /* KVA is in high memory on PPC64 */ PMAP_LOCK(kernel_pmap); while (virtual_end < VM_MAX_KERNEL_ADDRESS && moea64_pvo_find_va(kernel_pmap, virtual_end+1) == NULL) virtual_end += PAGE_SIZE; PMAP_UNLOCK(kernel_pmap); #endif /* * Allocate a kernel stack with a guard page for thread0 and map it * into the kernel page map. */ pa = moea64_bootstrap_alloc(KSTACK_PAGES * PAGE_SIZE, PAGE_SIZE); va = virtual_avail + KSTACK_GUARD_PAGES * PAGE_SIZE; virtual_avail = va + KSTACK_PAGES * PAGE_SIZE; CTR2(KTR_PMAP, "moea64_bootstrap: kstack0 at %#x (%#x)", pa, va); thread0.td_kstack = va; thread0.td_kstack_pages = KSTACK_PAGES; for (i = 0; i < KSTACK_PAGES; i++) { moea64_kenter(mmup, va, pa); pa += PAGE_SIZE; va += PAGE_SIZE; } /* * Allocate virtual address space for the message buffer. */ pa = msgbuf_phys = moea64_bootstrap_alloc(msgbufsize, PAGE_SIZE); msgbufp = (struct msgbuf *)virtual_avail; va = virtual_avail; virtual_avail += round_page(msgbufsize); while (va < virtual_avail) { moea64_kenter(mmup, va, pa); pa += PAGE_SIZE; va += PAGE_SIZE; } /* * Allocate virtual address space for the dynamic percpu area. */ pa = moea64_bootstrap_alloc(DPCPU_SIZE, PAGE_SIZE); dpcpu = (void *)virtual_avail; va = virtual_avail; virtual_avail += DPCPU_SIZE; while (va < virtual_avail) { moea64_kenter(mmup, va, pa); pa += PAGE_SIZE; va += PAGE_SIZE; } dpcpu_init(dpcpu, 0); /* * Allocate some things for page zeroing. We put this directly - * in the page table, marked with LPTE_LOCKED, to avoid any + * in the page table and use MOEA64_PTE_REPLACE to avoid any * of the PVO book-keeping or other parts of the VM system * from even knowing that this hack exists. 
*/ if (!hw_direct_map) { mtx_init(&moea64_scratchpage_mtx, "pvo zero page", NULL, MTX_DEF); for (i = 0; i < 2; i++) { moea64_scratchpage_va[i] = (virtual_end+1) - PAGE_SIZE; virtual_end -= PAGE_SIZE; moea64_kenter(mmup, moea64_scratchpage_va[i], 0); + PMAP_LOCK(kernel_pmap); moea64_scratchpage_pvo[i] = moea64_pvo_find_va( kernel_pmap, (vm_offset_t)moea64_scratchpage_va[i]); - LOCK_TABLE_RD(); - moea64_scratchpage_pte[i] = MOEA64_PVO_TO_PTE( - mmup, moea64_scratchpage_pvo[i]); - moea64_scratchpage_pvo[i]->pvo_pte.lpte.pte_hi - |= LPTE_LOCKED; - MOEA64_PTE_CHANGE(mmup, moea64_scratchpage_pte[i], - &moea64_scratchpage_pvo[i]->pvo_pte.lpte, - moea64_scratchpage_pvo[i]->pvo_vpn); - UNLOCK_TABLE_RD(); + PMAP_UNLOCK(kernel_pmap); } } } /* - * Activate a user pmap. The pmap must be activated before its address - * space can be accessed in any way. + * Activate a user pmap. This mostly involves setting some non-CPU + * state. */ void moea64_activate(mmu_t mmu, struct thread *td) { pmap_t pm; pm = &td->td_proc->p_vmspace->vm_pmap; CPU_SET(PCPU_GET(cpuid), &pm->pm_active); #ifdef __powerpc64__ PCPU_SET(userslb, pm->pm_slb); #else PCPU_SET(curpmap, pm->pmap_phys); #endif } void moea64_deactivate(mmu_t mmu, struct thread *td) { pmap_t pm; pm = &td->td_proc->p_vmspace->vm_pmap; CPU_CLR(PCPU_GET(cpuid), &pm->pm_active); #ifdef __powerpc64__ PCPU_SET(userslb, NULL); #else PCPU_SET(curpmap, NULL); #endif } void moea64_unwire(mmu_t mmu, pmap_t pm, vm_offset_t sva, vm_offset_t eva) { struct pvo_entry key, *pvo; - uintptr_t pt; + vm_page_t m; + int64_t refchg; - LOCK_TABLE_RD(); - PMAP_LOCK(pm); key.pvo_vaddr = sva; + PMAP_LOCK(pm); for (pvo = RB_NFIND(pvo_tree, &pm->pmap_pvo, &key); pvo != NULL && PVO_VADDR(pvo) < eva; pvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo)) { - pt = MOEA64_PVO_TO_PTE(mmu, pvo); if ((pvo->pvo_vaddr & PVO_WIRED) == 0) panic("moea64_unwire: pvo %p is missing PVO_WIRED", pvo); pvo->pvo_vaddr &= ~PVO_WIRED; - if ((pvo->pvo_pte.lpte.pte_hi & LPTE_WIRED) == 0) - 
panic("moea64_unwire: pte %p is missing LPTE_WIRED", - &pvo->pvo_pte.lpte); - pvo->pvo_pte.lpte.pte_hi &= ~LPTE_WIRED; - if (pt != -1) { - /* - * The PTE's wired attribute is not a hardware - * feature, so there is no need to invalidate any TLB - * entries. - */ - MOEA64_PTE_CHANGE(mmu, pt, &pvo->pvo_pte.lpte, - pvo->pvo_vpn); + refchg = MOEA64_PTE_REPLACE(mmu, pvo, 0 /* No invalidation */); + if ((pvo->pvo_vaddr & PVO_MANAGED) && + (pvo->pvo_pte.prot & VM_PROT_WRITE)) { + if (refchg < 0) + refchg = LPTE_CHG; + m = PHYS_TO_VM_PAGE(pvo->pvo_pte.pa & LPTE_RPGN); + + refchg |= atomic_readandclear_32(&m->md.mdpg_attrs); + if (refchg & LPTE_CHG) + vm_page_dirty(m); + if (refchg & LPTE_REF) + vm_page_aflag_set(m, PGA_REFERENCED); } pm->pm_stats.wired_count--; } - UNLOCK_TABLE_RD(); PMAP_UNLOCK(pm); } /* * This goes through and sets the physical address of our * special scratch PTE to the PA we want to zero or copy. Because * of locking issues (this can get called in pvo_enter() by * the UMA allocator), we can't use most other utility functions here */ static __inline void moea64_set_scratchpage_pa(mmu_t mmup, int which, vm_offset_t pa) { KASSERT(!hw_direct_map, ("Using OEA64 scratchpage with a direct map!")); mtx_assert(&moea64_scratchpage_mtx, MA_OWNED); - moea64_scratchpage_pvo[which]->pvo_pte.lpte.pte_lo &= - ~(LPTE_WIMG | LPTE_RPGN); - moea64_scratchpage_pvo[which]->pvo_pte.lpte.pte_lo |= + moea64_scratchpage_pvo[which]->pvo_pte.pa = moea64_calc_wimg(pa, VM_MEMATTR_DEFAULT) | (uint64_t)pa; - MOEA64_PTE_CHANGE(mmup, moea64_scratchpage_pte[which], - &moea64_scratchpage_pvo[which]->pvo_pte.lpte, - moea64_scratchpage_pvo[which]->pvo_vpn); + MOEA64_PTE_REPLACE(mmup, moea64_scratchpage_pvo[which], + MOEA64_PTE_INVALIDATE); isync(); } void moea64_copy_page(mmu_t mmu, vm_page_t msrc, vm_page_t mdst) { vm_offset_t dst; vm_offset_t src; dst = VM_PAGE_TO_PHYS(mdst); src = VM_PAGE_TO_PHYS(msrc); if (hw_direct_map) { bcopy((void *)src, (void *)dst, PAGE_SIZE); } else { 
mtx_lock(&moea64_scratchpage_mtx); moea64_set_scratchpage_pa(mmu, 0, src); moea64_set_scratchpage_pa(mmu, 1, dst); bcopy((void *)moea64_scratchpage_va[0], (void *)moea64_scratchpage_va[1], PAGE_SIZE); mtx_unlock(&moea64_scratchpage_mtx); } } static inline void moea64_copy_pages_dmap(mmu_t mmu, vm_page_t *ma, vm_offset_t a_offset, vm_page_t *mb, vm_offset_t b_offset, int xfersize) { void *a_cp, *b_cp; vm_offset_t a_pg_offset, b_pg_offset; int cnt; while (xfersize > 0) { a_pg_offset = a_offset & PAGE_MASK; cnt = min(xfersize, PAGE_SIZE - a_pg_offset); a_cp = (char *)VM_PAGE_TO_PHYS(ma[a_offset >> PAGE_SHIFT]) + a_pg_offset; b_pg_offset = b_offset & PAGE_MASK; cnt = min(cnt, PAGE_SIZE - b_pg_offset); b_cp = (char *)VM_PAGE_TO_PHYS(mb[b_offset >> PAGE_SHIFT]) + b_pg_offset; bcopy(a_cp, b_cp, cnt); a_offset += cnt; b_offset += cnt; xfersize -= cnt; } } static inline void moea64_copy_pages_nodmap(mmu_t mmu, vm_page_t *ma, vm_offset_t a_offset, vm_page_t *mb, vm_offset_t b_offset, int xfersize) { void *a_cp, *b_cp; vm_offset_t a_pg_offset, b_pg_offset; int cnt; mtx_lock(&moea64_scratchpage_mtx); while (xfersize > 0) { a_pg_offset = a_offset & PAGE_MASK; cnt = min(xfersize, PAGE_SIZE - a_pg_offset); moea64_set_scratchpage_pa(mmu, 0, VM_PAGE_TO_PHYS(ma[a_offset >> PAGE_SHIFT])); a_cp = (char *)moea64_scratchpage_va[0] + a_pg_offset; b_pg_offset = b_offset & PAGE_MASK; cnt = min(cnt, PAGE_SIZE - b_pg_offset); moea64_set_scratchpage_pa(mmu, 1, VM_PAGE_TO_PHYS(mb[b_offset >> PAGE_SHIFT])); b_cp = (char *)moea64_scratchpage_va[1] + b_pg_offset; bcopy(a_cp, b_cp, cnt); a_offset += cnt; b_offset += cnt; xfersize -= cnt; } mtx_unlock(&moea64_scratchpage_mtx); } void moea64_copy_pages(mmu_t mmu, vm_page_t *ma, vm_offset_t a_offset, vm_page_t *mb, vm_offset_t b_offset, int xfersize) { if (hw_direct_map) { moea64_copy_pages_dmap(mmu, ma, a_offset, mb, b_offset, xfersize); } else { moea64_copy_pages_nodmap(mmu, ma, a_offset, mb, b_offset, xfersize); } } void 
moea64_zero_page_area(mmu_t mmu, vm_page_t m, int off, int size) { vm_offset_t pa = VM_PAGE_TO_PHYS(m); if (size + off > PAGE_SIZE) panic("moea64_zero_page: size + off > PAGE_SIZE"); if (hw_direct_map) { bzero((caddr_t)pa + off, size); } else { mtx_lock(&moea64_scratchpage_mtx); moea64_set_scratchpage_pa(mmu, 0, pa); bzero((caddr_t)moea64_scratchpage_va[0] + off, size); mtx_unlock(&moea64_scratchpage_mtx); } } /* * Zero a page of physical memory by temporarily mapping it */ void moea64_zero_page(mmu_t mmu, vm_page_t m) { vm_offset_t pa = VM_PAGE_TO_PHYS(m); vm_offset_t va, off; if (!hw_direct_map) { mtx_lock(&moea64_scratchpage_mtx); moea64_set_scratchpage_pa(mmu, 0, pa); va = moea64_scratchpage_va[0]; } else { va = pa; } for (off = 0; off < PAGE_SIZE; off += cacheline_size) __asm __volatile("dcbz 0,%0" :: "r"(va + off)); if (!hw_direct_map) mtx_unlock(&moea64_scratchpage_mtx); } void moea64_zero_page_idle(mmu_t mmu, vm_page_t m) { moea64_zero_page(mmu, m); } /* * Map the given physical page at the specified virtual address in the * target pmap with the protection requested. If specified the page * will be wired down. 
*/ int moea64_enter(mmu_t mmu, pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, u_int flags, int8_t psind) { + struct pvo_entry *pvo, *oldpvo; struct pvo_head *pvo_head; - uma_zone_t zone; uint64_t pte_lo; - u_int pvo_flags; int error; if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m)) VM_OBJECT_ASSERT_LOCKED(m->object); + pvo = alloc_pvo_entry(0); + pvo->pvo_pmap = NULL; /* to be filled in later */ + pvo->pvo_pte.prot = prot; + + pte_lo = moea64_calc_wimg(VM_PAGE_TO_PHYS(m), pmap_page_get_memattr(m)); + pvo->pvo_pte.pa = VM_PAGE_TO_PHYS(m) | pte_lo; + + if ((flags & PMAP_ENTER_WIRED) != 0) + pvo->pvo_vaddr |= PVO_WIRED; + if ((m->oflags & VPO_UNMANAGED) != 0 || !moea64_initialized) { pvo_head = NULL; - zone = moea64_upvo_zone; - pvo_flags = 0; } else { - pvo_head = vm_page_to_pvoh(m); - zone = moea64_mpvo_zone; - pvo_flags = PVO_MANAGED; + pvo_head = &m->md.mdpg_pvoh; + pvo->pvo_vaddr |= PVO_MANAGED; } + + for (;;) { + PV_PAGE_LOCK(m); + PMAP_LOCK(pmap); + if (pvo->pvo_pmap == NULL) + init_pvo_entry(pvo, pmap, va); + if (prot & VM_PROT_WRITE) + if (pmap_bootstrapped && + (m->oflags & VPO_UNMANAGED) == 0) + vm_page_aflag_set(m, PGA_WRITEABLE); - pte_lo = moea64_calc_wimg(VM_PAGE_TO_PHYS(m), pmap_page_get_memattr(m)); + oldpvo = moea64_pvo_find_va(pmap, va); + if (oldpvo != NULL) { + if (oldpvo->pvo_vaddr == pvo->pvo_vaddr && + oldpvo->pvo_pte.pa == pvo->pvo_pte.pa && + oldpvo->pvo_pte.prot == prot) { + /* Identical mapping already exists */ + error = 0; - if (prot & VM_PROT_WRITE) { - pte_lo |= LPTE_BW; - if (pmap_bootstrapped && - (m->oflags & VPO_UNMANAGED) == 0) - vm_page_aflag_set(m, PGA_WRITEABLE); - } else - pte_lo |= LPTE_BR; + /* If not in page table, reinsert it */ + if (MOEA64_PTE_SYNCH(mmu, oldpvo) < 0) { + moea64_pte_overflow--; + MOEA64_PTE_INSERT(mmu, oldpvo); + } - if ((prot & VM_PROT_EXECUTE) == 0) - pte_lo |= LPTE_NOEXEC; + /* Then just clean up and go home */ + PV_PAGE_UNLOCK(m); + PMAP_UNLOCK(pmap); + free_pvo_entry(pvo); + 
break; + } - if ((flags & PMAP_ENTER_WIRED) != 0) - pvo_flags |= PVO_WIRED; - - for (;;) { - LOCK_TABLE_WR(); - PMAP_LOCK(pmap); - error = moea64_pvo_enter(mmu, pmap, zone, pvo_head, va, - VM_PAGE_TO_PHYS(m), pte_lo, pvo_flags, psind); + /* Otherwise, need to kill it first */ + KASSERT(oldpvo->pvo_pmap == pmap, ("pmap of old " + "mapping does not match new mapping")); + moea64_pvo_remove_from_pmap(mmu, oldpvo); + } + error = moea64_pvo_enter(mmu, pvo, pvo_head); + PV_PAGE_UNLOCK(m); PMAP_UNLOCK(pmap); - UNLOCK_TABLE_WR(); + + /* Free any dead pages */ + if (oldpvo != NULL) { + PV_LOCK(oldpvo->pvo_pte.pa & LPTE_RPGN); + moea64_pvo_remove_from_page(mmu, oldpvo); + PV_UNLOCK(oldpvo->pvo_pte.pa & LPTE_RPGN); + free_pvo_entry(oldpvo); + } + if (error != ENOMEM) break; if ((flags & PMAP_ENTER_NOSLEEP) != 0) return (KERN_RESOURCE_SHORTAGE); VM_OBJECT_ASSERT_UNLOCKED(m->object); VM_WAIT; } /* * Flush the page from the instruction cache if this page is * mapped executable and cacheable. */ if (pmap != kernel_pmap && !(m->aflags & PGA_EXECUTABLE) && (pte_lo & (LPTE_I | LPTE_G | LPTE_NOEXEC)) == 0) { vm_page_aflag_set(m, PGA_EXECUTABLE); moea64_syncicache(mmu, pmap, va, VM_PAGE_TO_PHYS(m), PAGE_SIZE); } return (KERN_SUCCESS); } static void moea64_syncicache(mmu_t mmu, pmap_t pmap, vm_offset_t va, vm_offset_t pa, vm_size_t sz) { /* * This is much trickier than on older systems because * we can't sync the icache on physical addresses directly * without a direct map. Instead we check a couple of cases * where the memory is already mapped in and, failing that, * use the same trick we use for page zeroing to create * a temporary mapping for this physical address. */ if (!pmap_bootstrapped) { /* * If PMAP is not bootstrapped, we are likely to be * in real mode. 
*/ __syncicache((void *)pa, sz); } else if (pmap == kernel_pmap) { __syncicache((void *)va, sz); } else if (hw_direct_map) { __syncicache((void *)pa, sz); } else { /* Use the scratch page to set up a temp mapping */ mtx_lock(&moea64_scratchpage_mtx); moea64_set_scratchpage_pa(mmu, 1, pa & ~ADDR_POFF); __syncicache((void *)(moea64_scratchpage_va[1] + (va & ADDR_POFF)), sz); mtx_unlock(&moea64_scratchpage_mtx); } } /* * Maps a sequence of resident pages belonging to the same object. * The sequence begins with the given page m_start. This page is * mapped at the given virtual address start. Each subsequent page is * mapped at a virtual address that is offset from start by the same * amount as the page is offset from m_start within the object. The * last page in the sequence is the page with the largest offset from * m_start that can be mapped at a virtual address less than the given * virtual address end. Not every virtual page between start and end * is mapped; only those for which a resident page exists with the * corresponding offset from m_start are mapped. 
*/ void moea64_enter_object(mmu_t mmu, pmap_t pm, vm_offset_t start, vm_offset_t end, vm_page_t m_start, vm_prot_t prot) { vm_page_t m; vm_pindex_t diff, psize; VM_OBJECT_ASSERT_LOCKED(m_start->object); psize = atop(end - start); m = m_start; while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { moea64_enter(mmu, pm, start + ptoa(diff), m, prot & (VM_PROT_READ | VM_PROT_EXECUTE), PMAP_ENTER_NOSLEEP, 0); m = TAILQ_NEXT(m, listq); } } void moea64_enter_quick(mmu_t mmu, pmap_t pm, vm_offset_t va, vm_page_t m, vm_prot_t prot) { moea64_enter(mmu, pm, va, m, prot & (VM_PROT_READ | VM_PROT_EXECUTE), PMAP_ENTER_NOSLEEP, 0); } vm_paddr_t moea64_extract(mmu_t mmu, pmap_t pm, vm_offset_t va) { struct pvo_entry *pvo; vm_paddr_t pa; PMAP_LOCK(pm); pvo = moea64_pvo_find_va(pm, va); if (pvo == NULL) pa = 0; else - pa = (pvo->pvo_pte.lpte.pte_lo & LPTE_RPGN) | - (va - PVO_VADDR(pvo)); + pa = (pvo->pvo_pte.pa & LPTE_RPGN) | (va - PVO_VADDR(pvo)); PMAP_UNLOCK(pm); + return (pa); } /* * Atomically extract and hold the physical page with the given * pmap and virtual address pair if that mapping permits the given * protection. 
*/ vm_page_t moea64_extract_and_hold(mmu_t mmu, pmap_t pmap, vm_offset_t va, vm_prot_t prot) { struct pvo_entry *pvo; vm_page_t m; vm_paddr_t pa; m = NULL; pa = 0; PMAP_LOCK(pmap); retry: pvo = moea64_pvo_find_va(pmap, va & ~ADDR_POFF); - if (pvo != NULL && (pvo->pvo_pte.lpte.pte_hi & LPTE_VALID) && - ((pvo->pvo_pte.lpte.pte_lo & LPTE_PP) == LPTE_RW || - (prot & VM_PROT_WRITE) == 0)) { + if (pvo != NULL && (pvo->pvo_pte.prot & prot) == prot) { if (vm_page_pa_tryrelock(pmap, - pvo->pvo_pte.lpte.pte_lo & LPTE_RPGN, &pa)) + pvo->pvo_pte.pa & LPTE_RPGN, &pa)) goto retry; - m = PHYS_TO_VM_PAGE(pvo->pvo_pte.lpte.pte_lo & LPTE_RPGN); + m = PHYS_TO_VM_PAGE(pvo->pvo_pte.pa & LPTE_RPGN); vm_page_hold(m); } PA_UNLOCK_COND(pa); PMAP_UNLOCK(pmap); return (m); } static mmu_t installed_mmu; static void * moea64_uma_page_alloc(uma_zone_t zone, int bytes, u_int8_t *flags, int wait) { + struct pvo_entry *pvo; + vm_offset_t va; + vm_page_t m; + int pflags, needed_lock; + /* * This entire routine is a horrible hack to avoid bothering kmem * for new KVA addresses. Because this can get called from inside * kmem allocation routines, calling kmem for a new address here * can lead to multiply locking non-recursive mutexes. 
*/ - vm_offset_t va; - vm_page_t m; - int pflags, needed_lock; - *flags = UMA_SLAB_PRIV; needed_lock = !PMAP_LOCKED(kernel_pmap); pflags = malloc2vm_flags(wait) | VM_ALLOC_WIRED; for (;;) { m = vm_page_alloc(NULL, 0, pflags | VM_ALLOC_NOOBJ); if (m == NULL) { if (wait & M_NOWAIT) return (NULL); VM_WAIT; } else break; } va = VM_PAGE_TO_PHYS(m); - LOCK_TABLE_WR(); + pvo = alloc_pvo_entry(1 /* bootstrap */); + + pvo->pvo_pte.prot = VM_PROT_READ | VM_PROT_WRITE; + pvo->pvo_pte.pa = VM_PAGE_TO_PHYS(m) | LPTE_M; + if (needed_lock) PMAP_LOCK(kernel_pmap); - moea64_pvo_enter(installed_mmu, kernel_pmap, moea64_upvo_zone, - NULL, va, VM_PAGE_TO_PHYS(m), LPTE_M, PVO_WIRED | PVO_BOOTSTRAP, - 0); + init_pvo_entry(pvo, kernel_pmap, va); + pvo->pvo_vaddr |= PVO_WIRED; + moea64_pvo_enter(installed_mmu, pvo, NULL); + if (needed_lock) PMAP_UNLOCK(kernel_pmap); - UNLOCK_TABLE_WR(); if ((wait & M_ZERO) && (m->flags & PG_ZERO) == 0) bzero((void *)va, PAGE_SIZE); return (void *)va; } extern int elf32_nxstack; void moea64_init(mmu_t mmu) { CTR0(KTR_PMAP, "moea64_init"); - moea64_upvo_zone = uma_zcreate("UPVO entry", sizeof (struct pvo_entry), + moea64_pvo_zone = uma_zcreate("UPVO entry", sizeof (struct pvo_entry), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM | UMA_ZONE_NOFREE); - moea64_mpvo_zone = uma_zcreate("MPVO entry", sizeof(struct pvo_entry), - NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, - UMA_ZONE_VM | UMA_ZONE_NOFREE); if (!hw_direct_map) { installed_mmu = mmu; - uma_zone_set_allocf(moea64_upvo_zone,moea64_uma_page_alloc); - uma_zone_set_allocf(moea64_mpvo_zone,moea64_uma_page_alloc); + uma_zone_set_allocf(moea64_pvo_zone,moea64_uma_page_alloc); } #ifdef COMPAT_FREEBSD32 elf32_nxstack = 1; #endif moea64_initialized = TRUE; } boolean_t moea64_is_referenced(mmu_t mmu, vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("moea64_is_referenced: page %p is not managed", m)); - return (moea64_query_bit(mmu, m, PTE_REF)); + + return (moea64_query_bit(mmu, m, LPTE_REF)); } 
boolean_t moea64_is_modified(mmu_t mmu, vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("moea64_is_modified: page %p is not managed", m)); /* * If the page is not exclusive busied, then PGA_WRITEABLE cannot be * concurrently set while the object is locked. Thus, if PGA_WRITEABLE * is clear, no PTEs can have LPTE_CHG set. */ VM_OBJECT_ASSERT_LOCKED(m->object); if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) return (FALSE); return (moea64_query_bit(mmu, m, LPTE_CHG)); } boolean_t moea64_is_prefaultable(mmu_t mmu, pmap_t pmap, vm_offset_t va) { struct pvo_entry *pvo; - boolean_t rv; + boolean_t rv = TRUE; PMAP_LOCK(pmap); pvo = moea64_pvo_find_va(pmap, va & ~ADDR_POFF); - rv = pvo == NULL || (pvo->pvo_pte.lpte.pte_hi & LPTE_VALID) == 0; + if (pvo != NULL) + rv = FALSE; PMAP_UNLOCK(pmap); return (rv); } void moea64_clear_modify(mmu_t mmu, vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("moea64_clear_modify: page %p is not managed", m)); VM_OBJECT_ASSERT_WLOCKED(m->object); KASSERT(!vm_page_xbusied(m), ("moea64_clear_modify: page %p is exclusive busied", m)); /* * If the page is not PGA_WRITEABLE, then no PTEs can have LPTE_CHG * set. If the object containing the page is locked and the page is * not exclusive busied, then PGA_WRITEABLE cannot be concurrently set. */ if ((m->aflags & PGA_WRITEABLE) == 0) return; moea64_clear_bit(mmu, m, LPTE_CHG); } /* * Clear the write and modified bits in each of the given page's mappings. */ void moea64_remove_write(mmu_t mmu, vm_page_t m) { struct pvo_entry *pvo; - uintptr_t pt; + int64_t refchg, ret; pmap_t pmap; - uint64_t lo = 0; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("moea64_remove_write: page %p is not managed", m)); /* * If the page is not exclusive busied, then PGA_WRITEABLE cannot be * set by another thread while the object is locked. Thus, * if PGA_WRITEABLE is clear, no page table entries need updating. 
*/ VM_OBJECT_ASSERT_WLOCKED(m->object); if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) return; powerpc_sync(); - LOCK_TABLE_RD(); + PV_PAGE_LOCK(m); + refchg = 0; LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) { pmap = pvo->pvo_pmap; PMAP_LOCK(pmap); - if ((pvo->pvo_pte.lpte.pte_lo & LPTE_PP) != LPTE_BR) { - pt = MOEA64_PVO_TO_PTE(mmu, pvo); - pvo->pvo_pte.lpte.pte_lo &= ~LPTE_PP; - pvo->pvo_pte.lpte.pte_lo |= LPTE_BR; - if (pt != -1) { - MOEA64_PTE_SYNCH(mmu, pt, &pvo->pvo_pte.lpte); - lo |= pvo->pvo_pte.lpte.pte_lo; - pvo->pvo_pte.lpte.pte_lo &= ~LPTE_CHG; - MOEA64_PTE_CHANGE(mmu, pt, - &pvo->pvo_pte.lpte, pvo->pvo_vpn); - if (pvo->pvo_pmap == kernel_pmap) - isync(); - } + if (!(pvo->pvo_vaddr & PVO_DEAD) && + (pvo->pvo_pte.prot & VM_PROT_WRITE)) { + pvo->pvo_pte.prot &= ~VM_PROT_WRITE; + ret = MOEA64_PTE_REPLACE(mmu, pvo, + MOEA64_PTE_PROT_UPDATE); + if (ret < 0) + ret = LPTE_CHG; + refchg |= ret; + if (pvo->pvo_pmap == kernel_pmap) + isync(); } - if ((lo & LPTE_CHG) != 0) - vm_page_dirty(m); PMAP_UNLOCK(pmap); } - UNLOCK_TABLE_RD(); + if ((refchg | atomic_readandclear_32(&m->md.mdpg_attrs)) & LPTE_CHG) + vm_page_dirty(m); vm_page_aflag_clear(m, PGA_WRITEABLE); + PV_PAGE_UNLOCK(m); } /* * moea64_ts_referenced: * * Return a count of reference bits for a page, clearing those bits. * It is not necessary for every reference bit to be cleared, but it * is necessary that 0 only be returned when there are truly no * reference bits set. * * XXX: The exact number of bits to check and clear is a matter that * should be tested and standardized at some point in the future for * optimal aging of shared pages. */ int moea64_ts_referenced(mmu_t mmu, vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("moea64_ts_referenced: page %p is not managed", m)); return (moea64_clear_bit(mmu, m, LPTE_REF)); } /* * Modify the WIMG settings of all mappings for a page. 
*/ void moea64_page_set_memattr(mmu_t mmu, vm_page_t m, vm_memattr_t ma) { struct pvo_entry *pvo; - struct pvo_head *pvo_head; - uintptr_t pt; + int64_t refchg; pmap_t pmap; uint64_t lo; if ((m->oflags & VPO_UNMANAGED) != 0) { m->md.mdpg_cache_attrs = ma; return; } - pvo_head = vm_page_to_pvoh(m); lo = moea64_calc_wimg(VM_PAGE_TO_PHYS(m), ma); - LOCK_TABLE_RD(); - LIST_FOREACH(pvo, pvo_head, pvo_vlink) { + + PV_PAGE_LOCK(m); + LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) { pmap = pvo->pvo_pmap; PMAP_LOCK(pmap); - pt = MOEA64_PVO_TO_PTE(mmu, pvo); - pvo->pvo_pte.lpte.pte_lo &= ~LPTE_WIMG; - pvo->pvo_pte.lpte.pte_lo |= lo; - if (pt != -1) { - MOEA64_PTE_CHANGE(mmu, pt, &pvo->pvo_pte.lpte, - pvo->pvo_vpn); + if (!(pvo->pvo_vaddr & PVO_DEAD)) { + pvo->pvo_pte.pa &= ~LPTE_WIMG; + pvo->pvo_pte.pa |= lo; + refchg = MOEA64_PTE_REPLACE(mmu, pvo, + MOEA64_PTE_INVALIDATE); + if (refchg < 0) + refchg = (pvo->pvo_pte.prot & VM_PROT_WRITE) ? + LPTE_CHG : 0; + if ((pvo->pvo_vaddr & PVO_MANAGED) && + (pvo->pvo_pte.prot & VM_PROT_WRITE)) { + refchg |= + atomic_readandclear_32(&m->md.mdpg_attrs); + if (refchg & LPTE_CHG) + vm_page_dirty(m); + if (refchg & LPTE_REF) + vm_page_aflag_set(m, PGA_REFERENCED); + } if (pvo->pvo_pmap == kernel_pmap) isync(); } PMAP_UNLOCK(pmap); } - UNLOCK_TABLE_RD(); m->md.mdpg_cache_attrs = ma; + PV_PAGE_UNLOCK(m); } /* * Map a wired page into kernel virtual address space. 
*/ void moea64_kenter_attr(mmu_t mmu, vm_offset_t va, vm_offset_t pa, vm_memattr_t ma) { - uint64_t pte_lo; int error; + struct pvo_entry *pvo, *oldpvo; - pte_lo = moea64_calc_wimg(pa, ma); + pvo = alloc_pvo_entry(0); + pvo->pvo_pte.prot = VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE; + pvo->pvo_pte.pa = (pa & ~ADDR_POFF) | moea64_calc_wimg(pa, ma); + pvo->pvo_vaddr |= PVO_WIRED; - LOCK_TABLE_WR(); PMAP_LOCK(kernel_pmap); - error = moea64_pvo_enter(mmu, kernel_pmap, moea64_upvo_zone, - NULL, va, pa, pte_lo, PVO_WIRED, 0); + oldpvo = moea64_pvo_find_va(kernel_pmap, va); + if (oldpvo != NULL) + moea64_pvo_remove_from_pmap(mmu, oldpvo); + init_pvo_entry(pvo, kernel_pmap, va); + error = moea64_pvo_enter(mmu, pvo, NULL); PMAP_UNLOCK(kernel_pmap); - UNLOCK_TABLE_WR(); + /* Free any dead pages */ + if (oldpvo != NULL) { + PV_LOCK(oldpvo->pvo_pte.pa & LPTE_RPGN); + moea64_pvo_remove_from_page(mmu, oldpvo); + PV_UNLOCK(oldpvo->pvo_pte.pa & LPTE_RPGN); + free_pvo_entry(oldpvo); + } + if (error != 0 && error != ENOENT) panic("moea64_kenter: failed to enter va %#zx pa %#zx: %d", va, pa, error); } void moea64_kenter(mmu_t mmu, vm_offset_t va, vm_paddr_t pa) { moea64_kenter_attr(mmu, va, pa, VM_MEMATTR_DEFAULT); } /* * Extract the physical page address associated with the given kernel virtual * address. */ vm_paddr_t moea64_kextract(mmu_t mmu, vm_offset_t va) { struct pvo_entry *pvo; vm_paddr_t pa; /* * Shortcut the direct-mapped case when applicable. We never put * anything but 1:1 mappings below VM_MIN_KERNEL_ADDRESS. */ if (va < VM_MIN_KERNEL_ADDRESS) return (va); PMAP_LOCK(kernel_pmap); pvo = moea64_pvo_find_va(kernel_pmap, va); KASSERT(pvo != NULL, ("moea64_kextract: no addr found for %#" PRIxPTR, va)); - pa = (pvo->pvo_pte.lpte.pte_lo & LPTE_RPGN) | (va - PVO_VADDR(pvo)); + pa = (pvo->pvo_pte.pa & LPTE_RPGN) | (va - PVO_VADDR(pvo)); PMAP_UNLOCK(kernel_pmap); return (pa); } /* * Remove a wired page from kernel virtual address space. 
*/ void moea64_kremove(mmu_t mmu, vm_offset_t va) { moea64_remove(mmu, kernel_pmap, va, va + PAGE_SIZE); } /* * Map a range of physical addresses into kernel virtual address space. * * The value passed in *virt is a suggested virtual address for the mapping. * Architectures which can support a direct-mapped physical to virtual region * can return the appropriate address within that region, leaving '*virt' - * unchanged. We cannot and therefore do not; *virt is updated with the - * first usable address after the mapped region. + * unchanged. Other architectures should map the pages starting at '*virt' and + * update '*virt' with the first usable address after the mapped region. */ vm_offset_t moea64_map(mmu_t mmu, vm_offset_t *virt, vm_paddr_t pa_start, vm_paddr_t pa_end, int prot) { vm_offset_t sva, va; + if (hw_direct_map) { + /* + * Check if every page in the region is covered by the direct + * map. The direct map covers all of physical memory. Use + * moea64_calc_wimg() as a shortcut to see if the page is in + * physical memory as a way to see if the direct map covers it. + */ + for (va = pa_start; va < pa_end; va += PAGE_SIZE) + if (moea64_calc_wimg(va, VM_MEMATTR_DEFAULT) != LPTE_M) + break; + if (va == pa_end) + return (pa_start); + } sva = *virt; va = sva; + /* XXX respect prot argument */ for (; pa_start < pa_end; pa_start += PAGE_SIZE, va += PAGE_SIZE) moea64_kenter(mmu, va, pa_start); *virt = va; return (sva); } /* * Returns true if the pmap's pv is one of the first * 16 pvs linked to from this page. This count may * be changed upwards or downwards in the future; it * is only necessary that true be returned for a small * subset of pmaps for proper page aging. 
*/ boolean_t moea64_page_exists_quick(mmu_t mmu, pmap_t pmap, vm_page_t m) { int loops; struct pvo_entry *pvo; boolean_t rv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("moea64_page_exists_quick: page %p is not managed", m)); loops = 0; rv = FALSE; - LOCK_TABLE_RD(); + PV_PAGE_LOCK(m); LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) { - if (pvo->pvo_pmap == pmap) { + if (!(pvo->pvo_vaddr & PVO_DEAD) && pvo->pvo_pmap == pmap) { rv = TRUE; break; } if (++loops >= 16) break; } - UNLOCK_TABLE_RD(); + PV_PAGE_UNLOCK(m); return (rv); } /* * Return the number of managed mappings to the given physical page * that are wired. */ int moea64_page_wired_mappings(mmu_t mmu, vm_page_t m) { struct pvo_entry *pvo; int count; count = 0; if ((m->oflags & VPO_UNMANAGED) != 0) return (count); - LOCK_TABLE_RD(); + PV_PAGE_LOCK(m); LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) - if ((pvo->pvo_vaddr & PVO_WIRED) != 0) + if ((pvo->pvo_vaddr & (PVO_DEAD | PVO_WIRED)) == PVO_WIRED) count++; - UNLOCK_TABLE_RD(); + PV_PAGE_UNLOCK(m); return (count); } static uintptr_t moea64_vsidcontext; uintptr_t moea64_get_unique_vsid(void) { u_int entropy; register_t hash; uint32_t mask; int i; entropy = 0; __asm __volatile("mftb %0" : "=r"(entropy)); mtx_lock(&moea64_slb_mutex); for (i = 0; i < NVSIDS; i += VSID_NBPW) { u_int n; /* * Create a new value by multiplying by a prime and adding in * entropy from the timebase register. This is to make the * VSID more random so that the PT hash function collides * less often. (Note that the prime causes gcc to do shifts * instead of a multiply.) */ moea64_vsidcontext = (moea64_vsidcontext * 0x1105) + entropy; hash = moea64_vsidcontext & (NVSIDS - 1); if (hash == 0) /* 0 is special, avoid it */ continue; n = hash >> 5; mask = 1 << (hash & (VSID_NBPW - 1)); hash = (moea64_vsidcontext & VSID_HASHMASK); if (moea64_vsid_bitmap[n] & mask) { /* collision? */ /* anything free in this bucket?
*/ if (moea64_vsid_bitmap[n] == 0xffffffff) { entropy = (moea64_vsidcontext >> 20); continue; } i = ffs(~moea64_vsid_bitmap[n]) - 1; mask = 1 << i; hash &= VSID_HASHMASK & ~(VSID_NBPW - 1); hash |= i; } KASSERT(!(moea64_vsid_bitmap[n] & mask), ("Allocating in-use VSID %#zx\n", hash)); moea64_vsid_bitmap[n] |= mask; mtx_unlock(&moea64_slb_mutex); return (hash); } mtx_unlock(&moea64_slb_mutex); panic("%s: out of segments",__func__); } #ifdef __powerpc64__ void moea64_pinit(mmu_t mmu, pmap_t pmap) { RB_INIT(&pmap->pmap_pvo); pmap->pm_slb_tree_root = slb_alloc_tree(); pmap->pm_slb = slb_alloc_user_cache(); pmap->pm_slb_len = 0; } #else void moea64_pinit(mmu_t mmu, pmap_t pmap) { int i; uint32_t hash; RB_INIT(&pmap->pmap_pvo); if (pmap_bootstrapped) pmap->pmap_phys = (pmap_t)moea64_kextract(mmu, (vm_offset_t)pmap); else pmap->pmap_phys = pmap; /* * Allocate some segment registers for this pmap. */ hash = moea64_get_unique_vsid(); for (i = 0; i < 16; i++) pmap->pm_sr[i] = VSID_MAKE(i, hash); KASSERT(pmap->pm_sr[0] != 0, ("moea64_pinit: pm_sr[0] = 0")); } #endif /* * Initialize the pmap associated with process 0. */ void moea64_pinit0(mmu_t mmu, pmap_t pm) { PMAP_LOCK_INIT(pm); moea64_pinit(mmu, pm); bzero(&pm->pm_stats, sizeof(pm->pm_stats)); } /* * Set the physical protection on the specified range of this map as requested. */ static void moea64_pvo_protect(mmu_t mmu, pmap_t pm, struct pvo_entry *pvo, vm_prot_t prot) { - uintptr_t pt; - struct vm_page *pg; - uint64_t oldlo; + struct vm_page *pg; + vm_prot_t oldprot; + int32_t refchg; PMAP_LOCK_ASSERT(pm, MA_OWNED); /* - * Grab the PTE pointer before we diddle with the cached PTE - * copy. - */ - pt = MOEA64_PVO_TO_PTE(mmu, pvo); - - /* * Change the protection of the page. 
*/ - oldlo = pvo->pvo_pte.lpte.pte_lo; - pvo->pvo_pte.lpte.pte_lo &= ~LPTE_PP; - pvo->pvo_pte.lpte.pte_lo &= ~LPTE_NOEXEC; - if ((prot & VM_PROT_EXECUTE) == 0) - pvo->pvo_pte.lpte.pte_lo |= LPTE_NOEXEC; - if (prot & VM_PROT_WRITE) - pvo->pvo_pte.lpte.pte_lo |= LPTE_BW; - else - pvo->pvo_pte.lpte.pte_lo |= LPTE_BR; + oldprot = pvo->pvo_pte.prot; + pvo->pvo_pte.prot = prot; + pg = PHYS_TO_VM_PAGE(pvo->pvo_pte.pa & LPTE_RPGN); - pg = PHYS_TO_VM_PAGE(pvo->pvo_pte.lpte.pte_lo & LPTE_RPGN); - /* - * If the PVO is in the page table, update that pte as well. + * If the PVO is in the page table, update mapping */ - if (pt != -1) - MOEA64_PTE_CHANGE(mmu, pt, &pvo->pvo_pte.lpte, - pvo->pvo_vpn); + refchg = MOEA64_PTE_REPLACE(mmu, pvo, MOEA64_PTE_PROT_UPDATE); + if (refchg < 0) + refchg = (oldprot & VM_PROT_WRITE) ? LPTE_CHG : 0; + if (pm != kernel_pmap && pg != NULL && !(pg->aflags & PGA_EXECUTABLE) && - (pvo->pvo_pte.lpte.pte_lo & (LPTE_I | LPTE_G | LPTE_NOEXEC)) == 0) { + (pvo->pvo_pte.pa & (LPTE_I | LPTE_G | LPTE_NOEXEC)) == 0) { if ((pg->oflags & VPO_UNMANAGED) == 0) vm_page_aflag_set(pg, PGA_EXECUTABLE); moea64_syncicache(mmu, pm, PVO_VADDR(pvo), - pvo->pvo_pte.lpte.pte_lo & LPTE_RPGN, PAGE_SIZE); + pvo->pvo_pte.pa & LPTE_RPGN, PAGE_SIZE); } /* * Update vm about the REF/CHG bits if the page is managed and we have * removed write access. 
*/ - if ((pvo->pvo_vaddr & PVO_MANAGED) == PVO_MANAGED && - (oldlo & LPTE_PP) != LPTE_BR && !(prot & VM_PROT_WRITE)) { - if (pg != NULL) { - if (pvo->pvo_pte.lpte.pte_lo & LPTE_CHG) - vm_page_dirty(pg); - if (pvo->pvo_pte.lpte.pte_lo & LPTE_REF) - vm_page_aflag_set(pg, PGA_REFERENCED); - } + if (pg != NULL && (pvo->pvo_vaddr & PVO_MANAGED) && + (oldprot & VM_PROT_WRITE)) { + refchg |= atomic_readandclear_32(&pg->md.mdpg_attrs); + if (refchg & LPTE_CHG) + vm_page_dirty(pg); + if (refchg & LPTE_REF) + vm_page_aflag_set(pg, PGA_REFERENCED); } } void moea64_protect(mmu_t mmu, pmap_t pm, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) { struct pvo_entry *pvo, *tpvo, key; CTR4(KTR_PMAP, "moea64_protect: pm=%p sva=%#x eva=%#x prot=%#x", pm, sva, eva, prot); KASSERT(pm == &curproc->p_vmspace->vm_pmap || pm == kernel_pmap, ("moea64_protect: non current pmap")); if ((prot & VM_PROT_READ) == VM_PROT_NONE) { moea64_remove(mmu, pm, sva, eva); return; } - LOCK_TABLE_RD(); PMAP_LOCK(pm); key.pvo_vaddr = sva; for (pvo = RB_NFIND(pvo_tree, &pm->pmap_pvo, &key); pvo != NULL && PVO_VADDR(pvo) < eva; pvo = tpvo) { tpvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo); moea64_pvo_protect(mmu, pm, pvo, prot); } - UNLOCK_TABLE_RD(); PMAP_UNLOCK(pm); } /* * Map a list of wired pages into kernel virtual address space. This is * intended for temporary mappings which do not need page modification or * references recorded. Existing mappings in the region are overwritten. */ void moea64_qenter(mmu_t mmu, vm_offset_t va, vm_page_t *m, int count) { while (count-- > 0) { moea64_kenter(mmu, va, VM_PAGE_TO_PHYS(*m)); va += PAGE_SIZE; m++; } } /* * Remove page mappings from kernel virtual address space. Intended for * temporary mappings entered by moea64_qenter. 
*/ void moea64_qremove(mmu_t mmu, vm_offset_t va, int count) { while (count-- > 0) { moea64_kremove(mmu, va); va += PAGE_SIZE; } } void moea64_release_vsid(uint64_t vsid) { int idx, mask; mtx_lock(&moea64_slb_mutex); idx = vsid & (NVSIDS-1); mask = 1 << (idx % VSID_NBPW); idx /= VSID_NBPW; KASSERT(moea64_vsid_bitmap[idx] & mask, ("Freeing unallocated VSID %#jx", vsid)); moea64_vsid_bitmap[idx] &= ~mask; mtx_unlock(&moea64_slb_mutex); } void moea64_release(mmu_t mmu, pmap_t pmap) { /* * Free segment registers' VSIDs */ #ifdef __powerpc64__ slb_free_tree(pmap); slb_free_user_cache(pmap->pm_slb); #else KASSERT(pmap->pm_sr[0] != 0, ("moea64_release: pm_sr[0] = 0")); moea64_release_vsid(VSID_TO_HASH(pmap->pm_sr[0])); #endif } /* * Remove all pages mapped by the specified pmap */ void moea64_remove_pages(mmu_t mmu, pmap_t pm) { - struct pvo_entry *pvo, *tpvo; + struct pvo_entry *pvo, *tpvo; + struct pvo_tree tofree; - LOCK_TABLE_WR(); + RB_INIT(&tofree); + PMAP_LOCK(pm); RB_FOREACH_SAFE(pvo, pvo_tree, &pm->pmap_pvo, tpvo) { - if (!(pvo->pvo_vaddr & PVO_WIRED)) - moea64_pvo_remove(mmu, pvo); + if (pvo->pvo_vaddr & PVO_WIRED) + continue; + + /* + * For locking reasons, remove this from the page table and + * pmap, but save delinking from the vm_page for a second + * pass + */ + moea64_pvo_remove_from_pmap(mmu, pvo); + RB_INSERT(pvo_tree, &tofree, pvo); } - UNLOCK_TABLE_WR(); PMAP_UNLOCK(pm); + + RB_FOREACH_SAFE(pvo, pvo_tree, &tofree, tpvo) { + PV_LOCK(pvo->pvo_pte.pa & LPTE_RPGN); + moea64_pvo_remove_from_page(mmu, pvo); + PV_UNLOCK(pvo->pvo_pte.pa & LPTE_RPGN); + RB_REMOVE(pvo_tree, &tofree, pvo); + free_pvo_entry(pvo); + } } /* * Remove the given range of addresses from the specified map. */ void moea64_remove(mmu_t mmu, pmap_t pm, vm_offset_t sva, vm_offset_t eva) { - struct pvo_entry *pvo, *tpvo, key; + struct pvo_entry *pvo, *tpvo, key; + struct pvo_tree tofree; /* * Perform an unsynchronized read. This is, however, safe. 
*/ if (pm->pm_stats.resident_count == 0) return; - LOCK_TABLE_WR(); - PMAP_LOCK(pm); key.pvo_vaddr = sva; + + RB_INIT(&tofree); + + PMAP_LOCK(pm); for (pvo = RB_NFIND(pvo_tree, &pm->pmap_pvo, &key); pvo != NULL && PVO_VADDR(pvo) < eva; pvo = tpvo) { tpvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo); - moea64_pvo_remove(mmu, pvo); + + /* + * For locking reasons, remove this from the page table and + * pmap, but save delinking from the vm_page for a second + * pass + */ + moea64_pvo_remove_from_pmap(mmu, pvo); + RB_INSERT(pvo_tree, &tofree, pvo); } - UNLOCK_TABLE_WR(); PMAP_UNLOCK(pm); + + RB_FOREACH_SAFE(pvo, pvo_tree, &tofree, tpvo) { + PV_LOCK(pvo->pvo_pte.pa & LPTE_RPGN); + moea64_pvo_remove_from_page(mmu, pvo); + PV_UNLOCK(pvo->pvo_pte.pa & LPTE_RPGN); + RB_REMOVE(pvo_tree, &tofree, pvo); + free_pvo_entry(pvo); + } } /* * Remove physical page from all pmaps in which it resides. moea64_pvo_remove() * will reflect changes in pte's back to the vm_page. */ void moea64_remove_all(mmu_t mmu, vm_page_t m) { struct pvo_entry *pvo, *next_pvo; + struct pvo_head freequeue; + int wasdead; pmap_t pmap; - LOCK_TABLE_WR(); + LIST_INIT(&freequeue); + + PV_PAGE_LOCK(m); LIST_FOREACH_SAFE(pvo, vm_page_to_pvoh(m), pvo_vlink, next_pvo) { pmap = pvo->pvo_pmap; PMAP_LOCK(pmap); - moea64_pvo_remove(mmu, pvo); + wasdead = (pvo->pvo_vaddr & PVO_DEAD); + if (!wasdead) + moea64_pvo_remove_from_pmap(mmu, pvo); + moea64_pvo_remove_from_page(mmu, pvo); + if (!wasdead) + LIST_INSERT_HEAD(&freequeue, pvo, pvo_vlink); PMAP_UNLOCK(pmap); + } - UNLOCK_TABLE_WR(); - if ((m->aflags & PGA_WRITEABLE) && moea64_is_modified(mmu, m)) - vm_page_dirty(m); - vm_page_aflag_clear(m, PGA_WRITEABLE); - vm_page_aflag_clear(m, PGA_EXECUTABLE); + KASSERT(!pmap_page_is_mapped(m), ("Page still has mappings")); + KASSERT(!(m->aflags & PGA_WRITEABLE), ("Page still writable")); + PV_PAGE_UNLOCK(m); + + /* Clean up UMA allocations */ + LIST_FOREACH_SAFE(pvo, &freequeue, pvo_vlink, next_pvo) + free_pvo_entry(pvo); } /* * 
Allocate a physical page of memory directly from the phys_avail map. * Can only be called from moea64_bootstrap before avail start and end are * calculated. */ vm_offset_t moea64_bootstrap_alloc(vm_size_t size, u_int align) { vm_offset_t s, e; int i, j; size = round_page(size); for (i = 0; phys_avail[i + 1] != 0; i += 2) { if (align != 0) s = (phys_avail[i] + align - 1) & ~(align - 1); else s = phys_avail[i]; e = s + size; if (s < phys_avail[i] || e > phys_avail[i + 1]) continue; if (s + size > platform_real_maxaddr()) continue; if (s == phys_avail[i]) { phys_avail[i] += size; } else if (e == phys_avail[i + 1]) { phys_avail[i + 1] -= size; } else { for (j = phys_avail_count * 2; j > i; j -= 2) { phys_avail[j] = phys_avail[j - 2]; phys_avail[j + 1] = phys_avail[j - 1]; } phys_avail[i + 3] = phys_avail[i + 1]; phys_avail[i + 1] = s; phys_avail[i + 2] = e; phys_avail_count++; } return (s); } panic("moea64_bootstrap_alloc: could not allocate memory"); } static int -moea64_pvo_enter(mmu_t mmu, pmap_t pm, uma_zone_t zone, - struct pvo_head *pvo_head, vm_offset_t va, vm_offset_t pa, - uint64_t pte_lo, int flags, int8_t psind __unused) +moea64_pvo_enter(mmu_t mmu, struct pvo_entry *pvo, struct pvo_head *pvo_head) { - struct pvo_entry *pvo; - uintptr_t pt; - uint64_t vsid; - int first; - u_int ptegidx; - int i; - int bootstrap; + int first, err; + + /* Assume this is not the first mapping on pvo_head until the list is found empty; the return value reads first on every path. */ + first = 0; - /* - * One nasty thing that can happen here is that the UMA calls to - * allocate new PVOs need to map more memory, which calls pvo_enter(), - * which calls UMA... - * - * We break the loop by detecting recursion and allocating out of - * the bootstrap pool. 
- */ + PMAP_LOCK_ASSERT(pvo->pvo_pmap, MA_OWNED); + KASSERT(moea64_pvo_find_va(pvo->pvo_pmap, PVO_VADDR(pvo)) == NULL, + ("Existing mapping for VA %#jx", (uintmax_t)PVO_VADDR(pvo))); - first = 0; - bootstrap = (flags & PVO_BOOTSTRAP); - - if (!moea64_initialized) - bootstrap = 1; - - PMAP_LOCK_ASSERT(pm, MA_OWNED); - rw_assert(&moea64_table_lock, RA_WLOCKED); - - /* - * Compute the PTE Group index. - */ - va &= ~ADDR_POFF; - vsid = va_to_vsid(pm, va); - ptegidx = va_to_pteg(vsid, va, flags & PVO_LARGE); - - /* - * Remove any existing mapping for this page. Reuse the pvo entry if - * there is a mapping. - */ moea64_pvo_enter_calls++; - LIST_FOREACH(pvo, &moea64_pvo_table[ptegidx], pvo_olink) { - if (pvo->pvo_pmap == pm && PVO_VADDR(pvo) == va) { - if ((pvo->pvo_pte.lpte.pte_lo & LPTE_RPGN) == pa && - (pvo->pvo_pte.lpte.pte_lo & (LPTE_NOEXEC | LPTE_PP)) - == (pte_lo & (LPTE_NOEXEC | LPTE_PP))) { - /* - * The physical page and protection are not - * changing. Instead, this may be a request - * to change the mapping's wired attribute. - */ - pt = -1; - if ((flags & PVO_WIRED) != 0 && - (pvo->pvo_vaddr & PVO_WIRED) == 0) { - pt = MOEA64_PVO_TO_PTE(mmu, pvo); - pvo->pvo_vaddr |= PVO_WIRED; - pvo->pvo_pte.lpte.pte_hi |= LPTE_WIRED; - pm->pm_stats.wired_count++; - } else if ((flags & PVO_WIRED) == 0 && - (pvo->pvo_vaddr & PVO_WIRED) != 0) { - pt = MOEA64_PVO_TO_PTE(mmu, pvo); - pvo->pvo_vaddr &= ~PVO_WIRED; - pvo->pvo_pte.lpte.pte_hi &= ~LPTE_WIRED; - pm->pm_stats.wired_count--; - } - if (!(pvo->pvo_pte.lpte.pte_hi & LPTE_VALID)) { - KASSERT(pt == -1, - ("moea64_pvo_enter: valid pt")); - /* Re-insert if spilled */ - i = MOEA64_PTE_INSERT(mmu, ptegidx, - &pvo->pvo_pte.lpte); - if (i >= 0) - PVO_PTEGIDX_SET(pvo, i); - moea64_pte_overflow--; - } else if (pt != -1) { - /* - * The PTE's wired attribute is not a - * hardware feature, so there is no - * need to invalidate any TLB entries. 
- */ - MOEA64_PTE_CHANGE(mmu, pt, - &pvo->pvo_pte.lpte, pvo->pvo_vpn); - } - return (0); - } - moea64_pvo_remove(mmu, pvo); - break; - } - } - /* - * If we aren't overwriting a mapping, try to allocate. - */ - if (bootstrap) { - if (moea64_bpvo_pool_index >= moea64_bpvo_pool_size) { - panic("moea64_enter: bpvo pool exhausted, %d, %d, %zd", - moea64_bpvo_pool_index, moea64_bpvo_pool_size, - moea64_bpvo_pool_size * sizeof(struct pvo_entry)); - } - pvo = &moea64_bpvo_pool[moea64_bpvo_pool_index]; - moea64_bpvo_pool_index++; - bootstrap = 1; - } else { - pvo = uma_zalloc(zone, M_NOWAIT); - } - - if (pvo == NULL) - return (ENOMEM); - - moea64_pvo_entries++; - pvo->pvo_vaddr = va; - pvo->pvo_vpn = (uint64_t)((va & ADDR_PIDX) >> ADDR_PIDX_SHFT) - | (vsid << 16); - pvo->pvo_pmap = pm; - LIST_INSERT_HEAD(&moea64_pvo_table[ptegidx], pvo, pvo_olink); - pvo->pvo_vaddr &= ~ADDR_POFF; - - if (flags & PVO_WIRED) - pvo->pvo_vaddr |= PVO_WIRED; - if (pvo_head != NULL) - pvo->pvo_vaddr |= PVO_MANAGED; - if (bootstrap) - pvo->pvo_vaddr |= PVO_BOOTSTRAP; - if (flags & PVO_LARGE) - pvo->pvo_vaddr |= PVO_LARGE; - - moea64_pte_create(&pvo->pvo_pte.lpte, vsid, va, - (uint64_t)(pa) | pte_lo, flags); - - /* * Add to pmap list */ - RB_INSERT(pvo_tree, &pm->pmap_pvo, pvo); + RB_INSERT(pvo_tree, &pvo->pvo_pmap->pmap_pvo, pvo); /* * Remember if the list was empty and therefore will be the first * item. */ if (pvo_head != NULL) { if (LIST_FIRST(pvo_head) == NULL) first = 1; LIST_INSERT_HEAD(pvo_head, pvo, pvo_vlink); } - if (pvo->pvo_vaddr & PVO_WIRED) { - pvo->pvo_pte.lpte.pte_hi |= LPTE_WIRED; - pm->pm_stats.wired_count++; - } - pm->pm_stats.resident_count++; + if (pvo->pvo_vaddr & PVO_WIRED) + pvo->pvo_pmap->pm_stats.wired_count++; + pvo->pvo_pmap->pm_stats.resident_count++; /* - * We hope this succeeds but it isn't required. 
+ * Insert it into the hardware page table */ - i = MOEA64_PTE_INSERT(mmu, ptegidx, &pvo->pvo_pte.lpte); - if (i >= 0) { - PVO_PTEGIDX_SET(pvo, i); - } else { + err = MOEA64_PTE_INSERT(mmu, pvo); + if (err != 0) { panic("moea64_pvo_enter: overflow"); - moea64_pte_overflow++; } - if (pm == kernel_pmap) + moea64_pvo_entries++; + + if (pvo->pvo_pmap == kernel_pmap) isync(); #ifdef __powerpc64__ /* * Make sure all our bootstrap mappings are in the SLB as soon * as virtual memory is switched on. */ if (!pmap_bootstrapped) - moea64_bootstrap_slb_prefault(va, flags & PVO_LARGE); + moea64_bootstrap_slb_prefault(PVO_VADDR(pvo), + pvo->pvo_vaddr & PVO_LARGE); #endif return (first ? ENOENT : 0); } static void -moea64_pvo_remove(mmu_t mmu, struct pvo_entry *pvo) +moea64_pvo_remove_from_pmap(mmu_t mmu, struct pvo_entry *pvo) { struct vm_page *pg; - uintptr_t pt; + int32_t refchg; + KASSERT(pvo->pvo_pmap != NULL, ("Trying to remove PVO with no pmap")); PMAP_LOCK_ASSERT(pvo->pvo_pmap, MA_OWNED); - rw_assert(&moea64_table_lock, RA_WLOCKED); + KASSERT(!(pvo->pvo_vaddr & PVO_DEAD), ("Trying to remove dead PVO")); /* - * If there is an active pte entry, we need to deactivate it (and - * save the ref & cfg bits). + * If there is an active pte entry, we need to deactivate it */ - pt = MOEA64_PVO_TO_PTE(mmu, pvo); - if (pt != -1) { - MOEA64_PTE_UNSET(mmu, pt, &pvo->pvo_pte.lpte, pvo->pvo_vpn); - PVO_PTEGIDX_CLR(pvo); - } else { - moea64_pte_overflow--; + refchg = MOEA64_PTE_UNSET(mmu, pvo); + if (refchg < 0) { + /* + * If it was evicted from the page table, be pessimistic and + * dirty the page. + */ + if (pvo->pvo_pte.prot & VM_PROT_WRITE) + refchg = LPTE_CHG; + else + refchg = 0; } /* * Update our statistics. */ pvo->pvo_pmap->pm_stats.resident_count--; if (pvo->pvo_vaddr & PVO_WIRED) pvo->pvo_pmap->pm_stats.wired_count--; /* * Remove this PVO from the pmap list. 
*/ RB_REMOVE(pvo_tree, &pvo->pvo_pmap->pmap_pvo, pvo); /* - * Remove this from the overflow list and return it to the pool - * if we aren't going to reuse it. + * Mark this for the next sweep */ - LIST_REMOVE(pvo, pvo_olink); + pvo->pvo_vaddr |= PVO_DEAD; + /* Send RC bits to VM */ + if ((pvo->pvo_vaddr & PVO_MANAGED) && + (pvo->pvo_pte.prot & VM_PROT_WRITE)) { + pg = PHYS_TO_VM_PAGE(pvo->pvo_pte.pa & LPTE_RPGN); + if (pg != NULL) { + refchg |= atomic_readandclear_32(&pg->md.mdpg_attrs); + if (refchg & LPTE_CHG) + vm_page_dirty(pg); + if (refchg & LPTE_REF) + vm_page_aflag_set(pg, PGA_REFERENCED); + } + } +} + +static void +moea64_pvo_remove_from_page(mmu_t mmu, struct pvo_entry *pvo) +{ + struct vm_page *pg; + + KASSERT(pvo->pvo_vaddr & PVO_DEAD, ("Trying to delink live page")); + + /* Use NULL pmaps as a sentinel for races in page deletion */ + if (pvo->pvo_pmap == NULL) + return; + pvo->pvo_pmap = NULL; + /* - * Update vm about the REF/CHG bits if the page is managed. + * Update vm about page writeability/executability if managed */ - pg = PHYS_TO_VM_PAGE(pvo->pvo_pte.lpte.pte_lo & LPTE_RPGN); + PV_LOCKASSERT(pvo->pvo_pte.pa & LPTE_RPGN); + pg = PHYS_TO_VM_PAGE(pvo->pvo_pte.pa & LPTE_RPGN); - if ((pvo->pvo_vaddr & PVO_MANAGED) == PVO_MANAGED && pg != NULL) { + if ((pvo->pvo_vaddr & PVO_MANAGED) && pg != NULL) { LIST_REMOVE(pvo, pvo_vlink); - if ((pvo->pvo_pte.lpte.pte_lo & LPTE_PP) != LPTE_BR) { - if (pvo->pvo_pte.lpte.pte_lo & LPTE_CHG) - vm_page_dirty(pg); - if (pvo->pvo_pte.lpte.pte_lo & LPTE_REF) - vm_page_aflag_set(pg, PGA_REFERENCED); - if (LIST_EMPTY(vm_page_to_pvoh(pg))) - vm_page_aflag_clear(pg, PGA_WRITEABLE); - } if (LIST_EMPTY(vm_page_to_pvoh(pg))) - vm_page_aflag_clear(pg, PGA_EXECUTABLE); + vm_page_aflag_clear(pg, PGA_WRITEABLE | PGA_EXECUTABLE); } moea64_pvo_entries--; moea64_pvo_remove_calls++; - - if (!(pvo->pvo_vaddr & PVO_BOOTSTRAP)) - uma_zfree((pvo->pvo_vaddr & PVO_MANAGED) ? 
moea64_mpvo_zone : - moea64_upvo_zone, pvo); } static struct pvo_entry * moea64_pvo_find_va(pmap_t pm, vm_offset_t va) { struct pvo_entry key; + PMAP_LOCK_ASSERT(pm, MA_OWNED); + key.pvo_vaddr = va & ~ADDR_POFF; return (RB_FIND(pvo_tree, &pm->pmap_pvo, &key)); } static boolean_t -moea64_query_bit(mmu_t mmu, vm_page_t m, u_int64_t ptebit) +moea64_query_bit(mmu_t mmu, vm_page_t m, uint64_t ptebit) { struct pvo_entry *pvo; - uintptr_t pt; + int64_t ret; + boolean_t rv; - LOCK_TABLE_RD(); - LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) { - /* - * See if we saved the bit off. If so, return success. - */ - if (pvo->pvo_pte.lpte.pte_lo & ptebit) { - UNLOCK_TABLE_RD(); - return (TRUE); - } - } + /* + * See if this bit is stored in the page already. + */ + if (m->md.mdpg_attrs & ptebit) + return (TRUE); /* - * No luck, now go through the hard part of looking at the PTEs - * themselves. Sync so that any pending REF/CHG bits are flushed to - * the PTEs. + * Examine each PTE. Sync so that any pending REF/CHG bits are + * flushed to the PTEs. */ + rv = FALSE; powerpc_sync(); + PV_PAGE_LOCK(m); LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) { + ret = 0; /* * See if this pvo has a valid PTE. if so, fetch the * REF/CHG bits from the valid PTE. If the appropriate * ptebit is set, return success. 
*/ PMAP_LOCK(pvo->pvo_pmap); - pt = MOEA64_PVO_TO_PTE(mmu, pvo); - if (pt != -1) { - MOEA64_PTE_SYNCH(mmu, pt, &pvo->pvo_pte.lpte); - if (pvo->pvo_pte.lpte.pte_lo & ptebit) { - PMAP_UNLOCK(pvo->pvo_pmap); - UNLOCK_TABLE_RD(); - return (TRUE); + if (!(pvo->pvo_vaddr & PVO_DEAD)) + ret = MOEA64_PTE_SYNCH(mmu, pvo); + PMAP_UNLOCK(pvo->pvo_pmap); + + if (ret > 0) { + atomic_set_32(&m->md.mdpg_attrs, + ret & (LPTE_CHG | LPTE_REF)); + if (ret & ptebit) { + rv = TRUE; + break; } } - PMAP_UNLOCK(pvo->pvo_pmap); } + PV_PAGE_UNLOCK(m); - UNLOCK_TABLE_RD(); - return (FALSE); + return (rv); } static u_int moea64_clear_bit(mmu_t mmu, vm_page_t m, u_int64_t ptebit) { u_int count; struct pvo_entry *pvo; - uintptr_t pt; + int64_t ret; /* * Sync so that any pending REF/CHG bits are flushed to the PTEs (so - * we can reset the right ones). note that since the pvo entries and - * list heads are accessed via BAT0 and are never placed in the page - * table, we don't have to worry about further accesses setting the - * REF/CHG bits. + * we can reset the right ones). */ powerpc_sync(); /* - * For each pvo entry, clear the pvo's ptebit. If this pvo has a - * valid pte clear the ptebit from the valid pte. + * For each pvo entry, clear the pte's ptebit. 
*/ count = 0; - LOCK_TABLE_RD(); + PV_PAGE_LOCK(m); LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) { + ret = 0; + PMAP_LOCK(pvo->pvo_pmap); - pt = MOEA64_PVO_TO_PTE(mmu, pvo); - if (pt != -1) { - MOEA64_PTE_SYNCH(mmu, pt, &pvo->pvo_pte.lpte); - if (pvo->pvo_pte.lpte.pte_lo & ptebit) { - count++; - MOEA64_PTE_CLEAR(mmu, pt, &pvo->pvo_pte.lpte, - pvo->pvo_vpn, ptebit); - } - } - pvo->pvo_pte.lpte.pte_lo &= ~ptebit; + if (!(pvo->pvo_vaddr & PVO_DEAD)) + ret = MOEA64_PTE_CLEAR(mmu, pvo, ptebit); PMAP_UNLOCK(pvo->pvo_pmap); + + if (ret > 0 && (ret & ptebit)) + count++; } + atomic_clear_32(&m->md.mdpg_attrs, ptebit); + PV_PAGE_UNLOCK(m); - UNLOCK_TABLE_RD(); return (count); } boolean_t moea64_dev_direct_mapped(mmu_t mmu, vm_paddr_t pa, vm_size_t size) { struct pvo_entry *pvo, key; vm_offset_t ppa; int error = 0; PMAP_LOCK(kernel_pmap); key.pvo_vaddr = ppa = pa & ~ADDR_POFF; for (pvo = RB_FIND(pvo_tree, &kernel_pmap->pmap_pvo, &key); ppa < pa + size; ppa += PAGE_SIZE, pvo = RB_NEXT(pvo_tree, &kernel_pmap->pmap_pvo, pvo)) { - if (pvo == NULL || - (pvo->pvo_pte.lpte.pte_lo & LPTE_RPGN) != ppa) { + if (pvo == NULL || (pvo->pvo_pte.pa & LPTE_RPGN) != ppa) { error = EFAULT; break; } } PMAP_UNLOCK(kernel_pmap); return (error); } /* * Map a set of physical memory pages into the kernel virtual * address space. Return a pointer to where it is mapped. This * routine is intended to be used for mapping device memory, * NOT real memory. 
*/ void * moea64_mapdev_attr(mmu_t mmu, vm_offset_t pa, vm_size_t size, vm_memattr_t ma) { vm_offset_t va, tmpva, ppa, offset; ppa = trunc_page(pa); offset = pa & PAGE_MASK; size = roundup2(offset + size, PAGE_SIZE); va = kva_alloc(size); if (!va) panic("moea64_mapdev: Couldn't alloc kernel virtual memory"); for (tmpva = va; size > 0;) { moea64_kenter_attr(mmu, tmpva, ppa, ma); size -= PAGE_SIZE; tmpva += PAGE_SIZE; ppa += PAGE_SIZE; } return ((void *)(va + offset)); } void * moea64_mapdev(mmu_t mmu, vm_paddr_t pa, vm_size_t size) { return moea64_mapdev_attr(mmu, pa, size, VM_MEMATTR_DEFAULT); } void moea64_unmapdev(mmu_t mmu, vm_offset_t va, vm_size_t size) { vm_offset_t base, offset; base = trunc_page(va); offset = va & PAGE_MASK; size = roundup2(offset + size, PAGE_SIZE); kva_free(base, size); } void moea64_sync_icache(mmu_t mmu, pmap_t pm, vm_offset_t va, vm_size_t sz) { struct pvo_entry *pvo; vm_offset_t lim; vm_paddr_t pa; vm_size_t len; PMAP_LOCK(pm); while (sz > 0) { - lim = round_page(va); + /* + * Sync at most up to the next page boundary per iteration. + * Rounding va + 1 keeps len non-zero when va is already page + * aligned, so the loop always makes progress. 
+ */ + lim = round_page(va + 1); len = MIN(lim - va, sz); pvo = moea64_pvo_find_va(pm, va & ~ADDR_POFF); - if (pvo != NULL && !(pvo->pvo_pte.lpte.pte_lo & LPTE_I)) { - pa = (pvo->pvo_pte.lpte.pte_lo & LPTE_RPGN) | - (va & ADDR_POFF); + if (pvo != NULL && !(pvo->pvo_pte.pa & LPTE_I)) { + pa = (pvo->pvo_pte.pa & LPTE_RPGN) | (va & ADDR_POFF); moea64_syncicache(mmu, pm, va, pa, len); } va += len; sz -= len; } PMAP_UNLOCK(pm); } void moea64_dumpsys_map(mmu_t mmu, vm_paddr_t pa, size_t sz, void **va) { *va = (void *)pa; } extern struct dump_pa dump_map[PHYS_AVAIL_SZ + 1]; void moea64_scan_init(mmu_t mmu) { struct pvo_entry *pvo; vm_offset_t va; int i; if (!do_minidump) { /* Initialize phys. segments for dumpsys(). 
*/ memset(&dump_map, 0, sizeof(dump_map)); mem_regions(&pregions, &pregions_sz, &regions, &regions_sz); for (i = 0; i < pregions_sz; i++) { dump_map[i].pa_start = pregions[i].mr_start; dump_map[i].pa_size = pregions[i].mr_size; } return; } /* Virtual segments for minidumps: */ memset(&dump_map, 0, sizeof(dump_map)); /* 1st: kernel .data and .bss. */ dump_map[0].pa_start = trunc_page((uintptr_t)_etext); - dump_map[0].pa_size = round_page((uintptr_t)_end) - dump_map[0].pa_start; + dump_map[0].pa_size = round_page((uintptr_t)_end) - + dump_map[0].pa_start; /* 2nd: msgbuf and tables (see pmap_bootstrap()). */ dump_map[1].pa_start = (vm_paddr_t)msgbufp->msg_ptr; dump_map[1].pa_size = round_page(msgbufp->msg_size); /* 3rd: kernel VM. */ va = dump_map[1].pa_start + dump_map[1].pa_size; /* Find start of next chunk (from va). */ while (va < virtual_end) { /* Don't dump the buffer cache. */ if (va >= kmi.buffer_sva && va < kmi.buffer_eva) { va = kmi.buffer_eva; continue; } pvo = moea64_pvo_find_va(kernel_pmap, va & ~ADDR_POFF); - if (pvo != NULL && (pvo->pvo_pte.lpte.pte_hi & LPTE_VALID)) + if (pvo != NULL && !(pvo->pvo_vaddr & PVO_DEAD)) break; va += PAGE_SIZE; } if (va < virtual_end) { dump_map[2].pa_start = va; va += PAGE_SIZE; /* Find last page in chunk. */ while (va < virtual_end) { /* Don't run into the buffer cache. */ if (va == kmi.buffer_sva) break; pvo = moea64_pvo_find_va(kernel_pmap, va & ~ADDR_POFF); - if (pvo == NULL || - !(pvo->pvo_pte.lpte.pte_hi & LPTE_VALID)) + /* The chunk ends at the first unmapped or dead page. */ + if (pvo == NULL || (pvo->pvo_vaddr & PVO_DEAD)) break; va += PAGE_SIZE; } dump_map[2].pa_size = va - dump_map[2].pa_start; } } + Index: head/sys/powerpc/aim/mmu_oea64.h =================================================================== --- head/sys/powerpc/aim/mmu_oea64.h (revision 279251) +++ head/sys/powerpc/aim/mmu_oea64.h (revision 279252) @@ -1,78 +1,86 @@ /*- * Copyright (C) 2010 Nathan Whitehorn * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _POWERPC_AIM_MMU_OEA64_H #define _POWERPC_AIM_MMU_OEA64_H #include extern mmu_def_t oea64_mmu; /* * Helper routines */ /* Allocate physical memory for use in moea64_bootstrap. 
*/ vm_offset_t moea64_bootstrap_alloc(vm_size_t, u_int); +/* Set an LPTE structure to match the contents of a PVO */ +void moea64_pte_from_pvo(const struct pvo_entry *pvo, struct lpte *lpte); /* + * Flags + */ + +#define MOEA64_PTE_PROT_UPDATE 1 +#define MOEA64_PTE_INVALIDATE 2 + +/* * Bootstrap subroutines * * An MMU_BOOTSTRAP() implementation looks like this: * moea64_early_bootstrap(); * Allocate Page Table * moea64_mid_bootstrap(); * Add mappings for MMU resources * moea64_late_bootstrap(); */ void moea64_early_bootstrap(mmu_t mmup, vm_offset_t kernelstart, vm_offset_t kernelend); void moea64_mid_bootstrap(mmu_t mmup, vm_offset_t kernelstart, vm_offset_t kernelend); void moea64_late_bootstrap(mmu_t mmup, vm_offset_t kernelstart, vm_offset_t kernelend); /* * Statistics */ extern u_int moea64_pte_valid; extern u_int moea64_pte_overflow; /* * State variables */ -extern struct pvo_head *moea64_pvo_table; extern int moea64_large_page_shift; extern uint64_t moea64_large_page_size; extern u_int moea64_pteg_count; extern u_int moea64_pteg_mask; #endif /* _POWERPC_AIM_MMU_OEA64_H */ Index: head/sys/powerpc/aim/moea64_if.m =================================================================== --- head/sys/powerpc/aim/moea64_if.m (revision 279251) +++ head/sys/powerpc/aim/moea64_if.m (revision 279252) @@ -1,115 +1,121 @@ #- -# Copyright (c) 2010 Nathan Whitehorn +# Copyright (c) 2010,2015 Nathan Whitehorn # All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # 1. Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. 
# # THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE # ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS # OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY # OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF # SUCH DAMAGE. # # $FreeBSD$ # #include #include #include #include #include #include #include /** * MOEA64 kobj methods for 64-bit Book-S page table * manipulation routines used, for example, by hypervisors. */ INTERFACE moea64; +CODE { + static moea64_pte_replace_t moea64_pte_replace_default; + static int64_t moea64_pte_replace_default(mmu_t mmu, + struct pvo_entry *pvo, int flags) + { + int64_t refchg; + + refchg = MOEA64_PTE_UNSET(mmu, pvo); + MOEA64_PTE_INSERT(mmu, pvo); + + return (refchg); + } +} + /** - * Copy ref/changed bits from PTE referenced by _pt_cookie to _pvo_pt. + * Return ref/changed bits from PTE referenced by _pvo if _pvo is currently in + * the page table. Returns -1 if _pvo not currently present in the page table. */ -METHOD void pte_synch { +METHOD int64_t pte_synch { mmu_t _mmu; - uintptr_t _pt_cookie; - struct lpte *_pvo_pt; + struct pvo_entry *_pvo; }; /** * Clear bits ptebit (a mask) from the low word of the PTE referenced by - * _pt_cookie. Note that _pvo_pt is for reference use only -- the bit should - * NOT be cleared there. + * _pvo. Return previous values of ref/changed bits or -1 if _pvo is not + * currently in the page table. 
*/ -METHOD void pte_clear { +METHOD int64_t pte_clear { mmu_t _mmu; - uintptr_t _pt_cookie; - struct lpte *_pvo_pt; - uint64_t _vpn; + struct pvo_entry *_pvo; uint64_t _ptebit; }; /** - * Invalidate the PTE referenced by _pt_cookie, synchronizing its validity - * and ref/changed bits after completion. + * Invalidate the PTE referenced by _pvo, returning its ref/changed bits. + * Returns -1 if PTE not currently present in page table. */ -METHOD void pte_unset { +METHOD int64_t pte_unset { mmu_t _mmu; - uintptr_t _pt_cookie; - struct lpte *_pvo_pt; - uint64_t _vpn; + struct pvo_entry *_pvo; }; /** - * Update the PTE referenced by _pt_cookie with the values in _pvo_pt, - * making sure that the values of ref/changed bits are preserved and - * synchronized back to _pvo_pt. + * Update the reference PTE to correspond to the contents of _pvo. Has the + * same ref/changed semantics as pte_unset() (and should clear R/C bits). May + * change the PVO's location in the page table or return with it unmapped if + * PVO_WIRED is not set. By default, does unset() followed by insert(). + * + * _flags is a bitmask describing what level of page invalidation should occur: + * 0 means no invalidation is required + * MOEA64_PTE_PROT_UPDATE signifies that the page protection bits are changing + * MOEA64_PTE_INVALIDATE requires an invalidation of the same strength as + * pte_unset() followed by pte_insert() */ -METHOD void pte_change { +METHOD int64_t pte_replace { mmu_t _mmu; - uintptr_t _pt_cookie; - struct lpte *_pvo_pt; - uint64_t _vpn; -}; - + struct pvo_entry *_pvo; + int _flags; +} DEFAULT moea64_pte_replace_default; /** - * Insert the PTE _pvo_pt into the PTEG group _ptegidx, returning the index - * of the PTE in its group at completion, or -1 if no slots were free. Must - * not replace PTEs marked LPTE_WIRED or LPTE_LOCKED, and must set LPTE_HID - * and LPTE_VALID appropriately in _pvo_pt. 
+ * Insert a PTE corresponding to _pvo into the page table, returning any errors + * encountered and (optionally) setting the PVO slot value to some + * representation of where the entry was placed. + * + * Must not replace PTEs marked LPTE_WIRED. If an existing valid PTE is spilled, + * must synchronize ref/changed bits as in pte_unset(). */ METHOD int pte_insert { mmu_t _mmu; - u_int _ptegidx; - struct lpte *_pvo_pt; + struct pvo_entry *_pvo; }; - -/** - * Return the page table reference cookie corresponding to _pvo, or -1 if - * the _pvo is not currently in the page table. - */ -METHOD uintptr_t pvo_to_pte { - mmu_t _mmu; - const struct pvo_entry *_pvo; -}; - Index: head/sys/powerpc/aim/moea64_native.c =================================================================== --- head/sys/powerpc/aim/moea64_native.c (revision 279251) +++ head/sys/powerpc/aim/moea64_native.c (revision 279252) @@ -1,626 +1,658 @@ /*- * Copyright (c) 2001 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Matt Thomas of Allegro Networks, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /*- * Copyright (C) 1995, 1996 Wolfgang Solfrank. * Copyright (C) 1995, 1996 TooLs GmbH. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by TooLs GmbH. * 4. The name of TooLs GmbH may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY TOOLS GMBH ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
* IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $NetBSD: pmap.c,v 1.28 2000/03/26 20:42:36 kleink Exp $ */ /*- * Copyright (C) 2001 Benno Rice. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY Benno Rice ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * Native 64-bit page table operations for running without a hypervisor. 
*/ #include #include #include #include #include #include #include #include #include +#include +#include #include #include #include #include #include #include #include #include #include #include #include #include "mmu_oea64.h" #include "mmu_if.h" #include "moea64_if.h" #define PTESYNC() __asm __volatile("ptesync"); #define TLBSYNC() __asm __volatile("tlbsync; ptesync"); #define SYNC() __asm __volatile("sync"); #define EIEIO() __asm __volatile("eieio"); #define VSID_HASH_MASK 0x0000007fffffffffULL static __inline void TLBIE(uint64_t vpn) { #ifndef __powerpc64__ register_t vpn_hi, vpn_lo; register_t msr; register_t scratch, intr; #endif static volatile u_int tlbie_lock = 0; vpn <<= ADDR_PIDX_SHFT; vpn &= ~(0xffffULL << 48); /* Hobo spinlock: we need stronger guarantees than mutexes provide */ while (!atomic_cmpset_int(&tlbie_lock, 0, 1)); isync(); /* Flush instruction queue once lock acquired */ #ifdef __powerpc64__ __asm __volatile("tlbie %0" :: "r"(vpn) : "memory"); __asm __volatile("eieio; tlbsync; ptesync" ::: "memory"); #else vpn_hi = (uint32_t)(vpn >> 32); vpn_lo = (uint32_t)vpn; intr = intr_disable(); __asm __volatile("\ mfmsr %0; \ mr %1, %0; \ insrdi %1,%5,1,0; \ mtmsrd %1; isync; \ \ sld %1,%2,%4; \ or %1,%1,%3; \ tlbie %1; \ \ mtmsrd %0; isync; \ eieio; \ tlbsync; \ ptesync;" : "=r"(msr), "=r"(scratch) : "r"(vpn_hi), "r"(vpn_lo), "r"(32), "r"(1) : "memory"); intr_restore(intr); #endif /* No barriers or special ops -- taken care of by ptesync above */ tlbie_lock = 0; } #define DISABLE_TRANS(msr) msr = mfmsr(); mtmsr(msr & ~PSL_DR) #define ENABLE_TRANS(msr) mtmsr(msr) /* * PTEG data. */ -static struct lpteg *moea64_pteg_table; +static volatile struct lpte *moea64_pteg_table; +static struct rwlock moea64_eviction_lock; /* * PTE calls. 
*/ -static int moea64_pte_insert_native(mmu_t, u_int, struct lpte *); -static uintptr_t moea64_pvo_to_pte_native(mmu_t, const struct pvo_entry *); -static void moea64_pte_synch_native(mmu_t, uintptr_t pt, - struct lpte *pvo_pt); -static void moea64_pte_clear_native(mmu_t, uintptr_t pt, - struct lpte *pvo_pt, uint64_t vpn, uint64_t ptebit); -static void moea64_pte_change_native(mmu_t, uintptr_t pt, - struct lpte *pvo_pt, uint64_t vpn); -static void moea64_pte_unset_native(mmu_t mmu, uintptr_t pt, - struct lpte *pvo_pt, uint64_t vpn); +static int moea64_pte_insert_native(mmu_t, struct pvo_entry *); +static int64_t moea64_pte_synch_native(mmu_t, struct pvo_entry *); +static int64_t moea64_pte_clear_native(mmu_t, struct pvo_entry *, uint64_t); +static int64_t moea64_pte_replace_native(mmu_t, struct pvo_entry *, int); +static int64_t moea64_pte_unset_native(mmu_t mmu, struct pvo_entry *); /* * Utility routines. */ -static void moea64_bootstrap_native(mmu_t mmup, - vm_offset_t kernelstart, vm_offset_t kernelend); -static void moea64_cpu_bootstrap_native(mmu_t, int ap); -static void tlbia(void); +static void moea64_bootstrap_native(mmu_t mmup, + vm_offset_t kernelstart, vm_offset_t kernelend); +static void moea64_cpu_bootstrap_native(mmu_t, int ap); +static void tlbia(void); static mmu_method_t moea64_native_methods[] = { /* Internal interfaces */ MMUMETHOD(mmu_bootstrap, moea64_bootstrap_native), MMUMETHOD(mmu_cpu_bootstrap, moea64_cpu_bootstrap_native), MMUMETHOD(moea64_pte_synch, moea64_pte_synch_native), MMUMETHOD(moea64_pte_clear, moea64_pte_clear_native), MMUMETHOD(moea64_pte_unset, moea64_pte_unset_native), - MMUMETHOD(moea64_pte_change, moea64_pte_change_native), + MMUMETHOD(moea64_pte_replace, moea64_pte_replace_native), MMUMETHOD(moea64_pte_insert, moea64_pte_insert_native), - MMUMETHOD(moea64_pvo_to_pte, moea64_pvo_to_pte_native), { 0, 0 } }; MMU_DEF_INHERIT(oea64_mmu_native, MMU_TYPE_G5, moea64_native_methods, 0, oea64_mmu); -static __inline u_int 
-va_to_pteg(uint64_t vsid, vm_offset_t addr, int large) +static int64_t +moea64_pte_synch_native(mmu_t mmu, struct pvo_entry *pvo) { - uint64_t hash; - int shift; + volatile struct lpte *pt = moea64_pteg_table + pvo->pvo_pte.slot; + struct lpte properpt; + uint64_t ptelo; - shift = large ? moea64_large_page_shift : ADDR_PIDX_SHFT; - hash = (vsid & VSID_HASH_MASK) ^ (((uint64_t)addr & ADDR_PIDX) >> - shift); - return (hash & moea64_pteg_mask); -} + PMAP_LOCK_ASSERT(pvo->pvo_pmap, MA_OWNED); -static void -moea64_pte_synch_native(mmu_t mmu, uintptr_t pt_cookie, struct lpte *pvo_pt) -{ - struct lpte *pt = (struct lpte *)pt_cookie; + moea64_pte_from_pvo(pvo, &properpt); - pvo_pt->pte_lo |= pt->pte_lo & (LPTE_REF | LPTE_CHG); + rw_rlock(&moea64_eviction_lock); + if ((pt->pte_hi & LPTE_AVPN_MASK) != + (properpt.pte_hi & LPTE_AVPN_MASK)) { + /* Evicted */ + rw_runlock(&moea64_eviction_lock); + return (-1); + } + + PTESYNC(); + ptelo = be64toh(pt->pte_lo); + + rw_runlock(&moea64_eviction_lock); + + return (ptelo & (LPTE_REF | LPTE_CHG)); } -static void -moea64_pte_clear_native(mmu_t mmu, uintptr_t pt_cookie, struct lpte *pvo_pt, - uint64_t vpn, uint64_t ptebit) +static int64_t +moea64_pte_clear_native(mmu_t mmu, struct pvo_entry *pvo, uint64_t ptebit) { - struct lpte *pt = (struct lpte *)pt_cookie; + volatile struct lpte *pt = moea64_pteg_table + pvo->pvo_pte.slot; + struct lpte properpt; + uint64_t ptelo; - /* - * As shown in Section 7.6.3.2.3 - */ - pt->pte_lo &= ~ptebit; - critical_enter(); - TLBIE(vpn); - critical_exit(); -} + PMAP_LOCK_ASSERT(pvo->pvo_pmap, MA_OWNED); -static void -moea64_pte_set_native(struct lpte *pt, struct lpte *pvo_pt) -{ + moea64_pte_from_pvo(pvo, &properpt); - pvo_pt->pte_hi |= LPTE_VALID; + rw_rlock(&moea64_eviction_lock); + if ((pt->pte_hi & LPTE_AVPN_MASK) != + (properpt.pte_hi & LPTE_AVPN_MASK)) { + /* Evicted */ + rw_runlock(&moea64_eviction_lock); + return (-1); + } - /* - * Update the PTE as defined in section 7.6.3.1. 
- * Note that the REF/CHG bits are from pvo_pt and thus should have - * been saved so this routine can restore them (if desired). - */ - pt->pte_lo = pvo_pt->pte_lo; - EIEIO(); - pt->pte_hi = pvo_pt->pte_hi; - PTESYNC(); + if (ptebit == LPTE_REF) { + /* See "Resetting the Reference Bit" in arch manual */ + PTESYNC(); + /* 2-step here safe: precision is not guaranteed */ + ptelo = be64toh(pt->pte_lo); - /* Keep statistics for unlocked pages */ - if (!(pvo_pt->pte_hi & LPTE_LOCKED)) - moea64_pte_valid++; + /* One-byte store to avoid touching the C bit */ + ((volatile uint8_t *)(&pt->pte_lo))[6] = + ((uint8_t *)(&properpt.pte_lo))[6]; + rw_runlock(&moea64_eviction_lock); + + critical_enter(); + TLBIE(pvo->pvo_vpn); + critical_exit(); + } else { + rw_runlock(&moea64_eviction_lock); + ptelo = moea64_pte_unset_native(mmu, pvo); + moea64_pte_insert_native(mmu, pvo); + } + + return (ptelo & (LPTE_REF | LPTE_CHG)); } -static void -moea64_pte_unset_native(mmu_t mmu, uintptr_t pt_cookie, struct lpte *pvo_pt, - uint64_t vpn) +static int64_t +moea64_pte_unset_native(mmu_t mmu, struct pvo_entry *pvo) { - struct lpte *pt = (struct lpte *)pt_cookie; + volatile struct lpte *pt = moea64_pteg_table + pvo->pvo_pte.slot; + struct lpte properpt; + uint64_t ptelo; + moea64_pte_from_pvo(pvo, &properpt); + + rw_rlock(&moea64_eviction_lock); + if ((pt->pte_hi & LPTE_AVPN_MASK) != + (properpt.pte_hi & LPTE_AVPN_MASK)) { + /* Evicted */ + moea64_pte_overflow--; + rw_runlock(&moea64_eviction_lock); + return (-1); + } + /* - * Invalidate the pte. + * Invalidate the pte, briefly locking it to collect RC bits. No + * atomics needed since this is protected against eviction by the lock. 
*/ isync(); critical_enter(); - pvo_pt->pte_hi &= ~LPTE_VALID; - pt->pte_hi &= ~LPTE_VALID; + pt->pte_hi = (pt->pte_hi & ~LPTE_VALID) | LPTE_LOCKED; PTESYNC(); - TLBIE(vpn); + TLBIE(pvo->pvo_vpn); + ptelo = be64toh(pt->pte_lo); + *((volatile int32_t *)(&pt->pte_hi) + 1) = 0; /* Release lock */ critical_exit(); + rw_runlock(&moea64_eviction_lock); - /* - * Save the reg & chg bits. - */ - moea64_pte_synch_native(mmu, pt_cookie, pvo_pt); + /* Keep statistics */ + moea64_pte_valid--; - /* Keep statistics for unlocked pages */ - if (!(pvo_pt->pte_hi & LPTE_LOCKED)) - moea64_pte_valid--; + return (ptelo & (LPTE_CHG | LPTE_REF)); } -static void -moea64_pte_change_native(mmu_t mmu, uintptr_t pt, struct lpte *pvo_pt, - uint64_t vpn) +static int64_t +moea64_pte_replace_native(mmu_t mmu, struct pvo_entry *pvo, int flags) { + volatile struct lpte *pt = moea64_pteg_table + pvo->pvo_pte.slot; + struct lpte properpt; + int64_t ptelo; - /* - * Invalidate the PTE - */ - moea64_pte_unset_native(mmu, pt, pvo_pt, vpn); - moea64_pte_set_native((struct lpte *)pt, pvo_pt); + if (flags == 0) { + /* Just some software bits changing. 
*/ + moea64_pte_from_pvo(pvo, &properpt); + + rw_rlock(&moea64_eviction_lock); + if ((pt->pte_hi & LPTE_AVPN_MASK) != + (properpt.pte_hi & LPTE_AVPN_MASK)) { + rw_runlock(&moea64_eviction_lock); + return (-1); + } + pt->pte_hi = properpt.pte_hi; + ptelo = pt->pte_lo; + rw_runlock(&moea64_eviction_lock); + } else { + /* Otherwise, need reinsertion and deletion */ + ptelo = moea64_pte_unset_native(mmu, pvo); + moea64_pte_insert_native(mmu, pvo); + } + + return (ptelo); } static void moea64_cpu_bootstrap_native(mmu_t mmup, int ap) { int i = 0; #ifdef __powerpc64__ struct slb *slb = PCPU_GET(slb); register_t seg0; #endif /* * Initialize segment registers and MMU */ mtmsr(mfmsr() & ~PSL_DR & ~PSL_IR); /* * Install kernel SLB entries */ #ifdef __powerpc64__ __asm __volatile ("slbia"); __asm __volatile ("slbmfee %0,%1; slbie %0;" : "=r"(seg0) : "r"(0)); for (i = 0; i < 64; i++) { if (!(slb[i].slbe & SLBE_VALID)) continue; __asm __volatile ("slbmte %0, %1" :: "r"(slb[i].slbv), "r"(slb[i].slbe)); } #else for (i = 0; i < 16; i++) mtsrin(i << ADDR_SR_SHFT, kernel_pmap->pm_sr[i]); #endif /* * Install page table */ __asm __volatile ("ptesync; mtsdr1 %0; isync" :: "r"((uintptr_t)moea64_pteg_table | (uintptr_t)(flsl(moea64_pteg_mask >> 11)))); tlbia(); } static void moea64_bootstrap_native(mmu_t mmup, vm_offset_t kernelstart, vm_offset_t kernelend) { vm_size_t size; vm_offset_t off; vm_paddr_t pa; register_t msr; moea64_early_bootstrap(mmup, kernelstart, kernelend); /* * Allocate PTEG table. */ size = moea64_pteg_count * sizeof(struct lpteg); CTR2(KTR_PMAP, "moea64_bootstrap: %d PTEGs, %d bytes", moea64_pteg_count, size); + rw_init(&moea64_eviction_lock, "pte eviction"); /* * We now need to allocate memory. This memory, to be allocated, * has to reside in a page table. The page table we are about to * allocate. We don't have BAT. So drop to data real mode for a minute * as a measure of last resort. We do this a couple times. 
*/ - moea64_pteg_table = (struct lpteg *)moea64_bootstrap_alloc(size, size); + moea64_pteg_table = (struct lpte *)moea64_bootstrap_alloc(size, size); DISABLE_TRANS(msr); - bzero((void *)moea64_pteg_table, moea64_pteg_count * sizeof(struct lpteg)); + bzero(__DEVOLATILE(void *, moea64_pteg_table), moea64_pteg_count * + sizeof(struct lpteg)); ENABLE_TRANS(msr); CTR1(KTR_PMAP, "moea64_bootstrap: PTEG table at %p", moea64_pteg_table); moea64_mid_bootstrap(mmup, kernelstart, kernelend); /* * Add a mapping for the page table itself if there is no direct map. */ if (!hw_direct_map) { size = moea64_pteg_count * sizeof(struct lpteg); off = (vm_offset_t)(moea64_pteg_table); DISABLE_TRANS(msr); for (pa = off; pa < off + size; pa += PAGE_SIZE) pmap_kenter(pa, pa); ENABLE_TRANS(msr); } /* Bring up virtual memory */ moea64_late_bootstrap(mmup, kernelstart, kernelend); } static void tlbia(void) { vm_offset_t i; #ifndef __powerpc64__ register_t msr, scratch; #endif TLBSYNC(); for (i = 0; i < 0xFF000; i += 0x00001000) { #ifdef __powerpc64__ __asm __volatile("tlbiel %0" :: "r"(i)); #else __asm __volatile("\ mfmsr %0; \ mr %1, %0; \ insrdi %1,%3,1,0; \ mtmsrd %1; \ isync; \ \ tlbiel %2; \ \ mtmsrd %0; \ isync;" : "=r"(msr), "=r"(scratch) : "r"(i), "r"(1)); #endif } EIEIO(); TLBSYNC(); } -static uintptr_t -moea64_pvo_to_pte_native(mmu_t mmu, const struct pvo_entry *pvo) +static int +atomic_pte_lock(volatile struct lpte *pte, uint64_t bitmask, uint64_t *oldhi) { - struct lpte *pt; - int pteidx, ptegidx; - uint64_t vsid; + int ret; + uint32_t oldhihalf; - /* If the PTEG index is not set, then there is no page table entry */ - if (!PVO_PTEGIDX_ISSET(pvo)) - return (-1); - /* - * Calculate the ptegidx + * Note: in principle, if just the locked bit were set here, we + * could avoid needing the eviction lock. However, eviction occurs + * so rarely that it isn't worth bothering about in practice. 
*/ - vsid = PVO_VSID(pvo); - ptegidx = va_to_pteg(vsid, PVO_VADDR(pvo), - pvo->pvo_vaddr & PVO_LARGE); - /* - * We can find the actual pte entry without searching by grabbing - * the PTEG index from 3 unused bits in pvo_vaddr and by - * noticing the HID bit. - */ - if (pvo->pvo_pte.lpte.pte_hi & LPTE_HID) - ptegidx ^= moea64_pteg_mask; + __asm __volatile ( + "1:\tlwarx %1, 0, %3\n\t" /* load old value */ + "and. %0,%1,%4\n\t" /* check if any bits set */ + "bne 2f\n\t" /* exit if any set */ + "stwcx. %5, 0, %3\n\t" /* attempt to store */ + "bne- 1b\n\t" /* spin if failed */ + "li %0, 1\n\t" /* success - retval = 1 */ + "b 3f\n\t" /* we've succeeded */ + "2:\n\t" + "stwcx. %1, 0, %3\n\t" /* clear reservation (74xx) */ + "li %0, 0\n\t" /* failure - retval = 0 */ + "3:\n\t" + : "=&r" (ret), "=&r"(oldhihalf), "=m" (pte->pte_hi) + : "r" ((volatile char *)&pte->pte_hi + 4), + "r" ((uint32_t)bitmask), "r" ((uint32_t)LPTE_LOCKED), + "m" (pte->pte_hi) + : "cr0", "cr1", "cr2", "memory"); - pteidx = (ptegidx << 3) | PVO_PTEGIDX_GET(pvo); + *oldhi = (pte->pte_hi & 0xffffffff00000000ULL) | oldhihalf; - if ((pvo->pvo_pte.lpte.pte_hi & LPTE_VALID) && - !PVO_PTEGIDX_ISSET(pvo)) { - panic("moea64_pvo_to_pte: pvo %p has valid pte in pvo but no " - "valid pte index", pvo); - } - - if ((pvo->pvo_pte.lpte.pte_hi & LPTE_VALID) == 0 && - PVO_PTEGIDX_ISSET(pvo)) { - panic("moea64_pvo_to_pte: pvo %p has valid pte index in pvo " - "pvo but no valid pte", pvo); - } - - pt = &moea64_pteg_table[pteidx >> 3].pt[pteidx & 7]; - if ((pt->pte_hi ^ (pvo->pvo_pte.lpte.pte_hi & ~LPTE_VALID)) == - LPTE_VALID) { - if ((pvo->pvo_pte.lpte.pte_hi & LPTE_VALID) == 0) { - panic("moea64_pvo_to_pte: pvo %p has valid pte in " - "moea64_pteg_table %p but invalid in pvo", pvo, pt); - } - - if (((pt->pte_lo ^ pvo->pvo_pte.lpte.pte_lo) & - ~(LPTE_M|LPTE_CHG|LPTE_REF)) != 0) { - panic("moea64_pvo_to_pte: pvo %p pte does not match " - "pte %p in moea64_pteg_table difference is %#x", - pvo, pt, - (uint32_t)(pt->pte_lo 
^ pvo->pvo_pte.lpte.pte_lo)); - } - - return ((uintptr_t)pt); - } - - if (pvo->pvo_pte.lpte.pte_hi & LPTE_VALID) { - panic("moea64_pvo_to_pte: pvo %p has invalid pte %p in " - "moea64_pteg_table but valid in pvo", pvo, pt); - } - - return (-1); + return (ret); } -static __inline int -moea64_pte_spillable_ident(u_int ptegidx) +static uintptr_t +moea64_insert_to_pteg_native(struct lpte *pvo_pt, uintptr_t slotbase, + uint64_t mask) { - struct lpte *pt; - int i, j, k; + volatile struct lpte *pt; + uint64_t oldptehi, va; + uintptr_t k; + int i, j; /* Start at a random slot */ i = mftb() % 8; - k = -1; for (j = 0; j < 8; j++) { - pt = &moea64_pteg_table[ptegidx].pt[(i + j) % 8]; - if (pt->pte_hi & (LPTE_LOCKED | LPTE_WIRED)) - continue; + k = slotbase + (i + j) % 8; + pt = &moea64_pteg_table[k]; + /* Invalidate and seize lock only if no bits in mask set */ + if (atomic_pte_lock(pt, mask, &oldptehi)) /* Lock obtained */ + break; + } - /* This is a candidate, so remember it */ - k = (i + j) % 8; + if (j == 8) + return (-1); - /* Try to get a page that has not been used lately */ - if (!(pt->pte_lo & LPTE_REF)) - return (k); + if (oldptehi & LPTE_VALID) { + KASSERT(!(oldptehi & LPTE_WIRED), ("Unmapped wired entry")); + /* + * Need to invalidate old entry completely: see + * "Modifying a Page Table Entry". Need to reconstruct + * the virtual address for the outgoing entry to do that. + */ + if (oldptehi & LPTE_BIG) + va = oldptehi >> moea64_large_page_shift; + else + va = oldptehi >> ADDR_PIDX_SHFT; + if (oldptehi & LPTE_HID) + va = (((k >> 3) ^ moea64_pteg_mask) ^ va) & + VSID_HASH_MASK; + else + va = ((k >> 3) ^ va) & VSID_HASH_MASK; + va |= (oldptehi & LPTE_AVPN_MASK) << + (ADDR_API_SHFT64 - ADDR_PIDX_SHFT); + PTESYNC(); + TLBIE(va); + moea64_pte_valid--; + moea64_pte_overflow++; } - + + /* + * Update the PTE as per "Adding a Page Table Entry". Lock is released + * by setting the high doubleword. 
+ */ + pt->pte_lo = pvo_pt->pte_lo; + EIEIO(); + pt->pte_hi = pvo_pt->pte_hi; + PTESYNC(); + + /* Keep statistics */ + moea64_pte_valid++; + return (k); } static int -moea64_pte_insert_native(mmu_t mmu, u_int ptegidx, struct lpte *pvo_pt) +moea64_pte_insert_native(mmu_t mmu, struct pvo_entry *pvo) { - struct lpte *pt; - struct pvo_entry *pvo; - u_int pteg_bktidx; - int i; + struct lpte insertpt; + uintptr_t slot; + /* Initialize PTE */ + moea64_pte_from_pvo(pvo, &insertpt); + + /* Make sure further insertion is locked out during evictions */ + rw_rlock(&moea64_eviction_lock); + /* * First try primary hash. */ - pteg_bktidx = ptegidx; - for (pt = moea64_pteg_table[pteg_bktidx].pt, i = 0; i < 8; i++, pt++) { - if ((pt->pte_hi & (LPTE_VALID | LPTE_LOCKED)) == 0) { - pvo_pt->pte_hi &= ~LPTE_HID; - moea64_pte_set_native(pt, pvo_pt); - return (i); - } + pvo->pvo_pte.slot &= ~7ULL; /* Base slot address */ + slot = moea64_insert_to_pteg_native(&insertpt, pvo->pvo_pte.slot, + LPTE_VALID | LPTE_WIRED | LPTE_LOCKED); + if (slot != -1) { + rw_runlock(&moea64_eviction_lock); + pvo->pvo_pte.slot = slot; + return (0); } /* * Now try secondary hash. */ - pteg_bktidx ^= moea64_pteg_mask; - for (pt = moea64_pteg_table[pteg_bktidx].pt, i = 0; i < 8; i++, pt++) { - if ((pt->pte_hi & (LPTE_VALID | LPTE_LOCKED)) == 0) { - pvo_pt->pte_hi |= LPTE_HID; - moea64_pte_set_native(pt, pvo_pt); - return (i); - } + pvo->pvo_vaddr ^= PVO_HID; + insertpt.pte_hi ^= LPTE_HID; + pvo->pvo_pte.slot ^= (moea64_pteg_mask << 3); + slot = moea64_insert_to_pteg_native(&insertpt, pvo->pvo_pte.slot, + LPTE_VALID | LPTE_WIRED | LPTE_LOCKED); + if (slot != -1) { + rw_runlock(&moea64_eviction_lock); + pvo->pvo_pte.slot = slot; + return (0); } /* * Out of luck. Find a PTE to sacrifice. 
*/ - pteg_bktidx = ptegidx; - i = moea64_pte_spillable_ident(pteg_bktidx); - if (i < 0) { - pteg_bktidx ^= moea64_pteg_mask; - i = moea64_pte_spillable_ident(pteg_bktidx); + + /* Lock out all insertions for a bit */ + if (!rw_try_upgrade(&moea64_eviction_lock)) { + rw_runlock(&moea64_eviction_lock); + rw_wlock(&moea64_eviction_lock); } - if (i < 0) { - /* No freeable slots in either PTEG? We're hosed. */ - panic("moea64_pte_insert: overflow"); - return (-1); + slot = moea64_insert_to_pteg_native(&insertpt, pvo->pvo_pte.slot, + LPTE_WIRED | LPTE_LOCKED); + if (slot != -1) { + rw_wunlock(&moea64_eviction_lock); + pvo->pvo_pte.slot = slot; + return (0); } - if (pteg_bktidx == ptegidx) - pvo_pt->pte_hi &= ~LPTE_HID; - else - pvo_pt->pte_hi |= LPTE_HID; - - /* - * Synchronize the sacrifice PTE with its PVO, then mark both - * invalid. The PVO will be reused when/if the VM system comes - * here after a fault. - */ - pt = &moea64_pteg_table[pteg_bktidx].pt[i]; - - if (pt->pte_hi & LPTE_HID) - pteg_bktidx ^= moea64_pteg_mask; /* PTEs indexed by primary */ - - LIST_FOREACH(pvo, &moea64_pvo_table[pteg_bktidx], pvo_olink) { - if (pvo->pvo_pte.lpte.pte_hi == pt->pte_hi) { - KASSERT(pvo->pvo_pte.lpte.pte_hi & LPTE_VALID, - ("Invalid PVO for valid PTE!")); - moea64_pte_unset_native(mmu, (uintptr_t)pt, - &pvo->pvo_pte.lpte, pvo->pvo_vpn); - PVO_PTEGIDX_CLR(pvo); - moea64_pte_overflow++; - break; - } + /* Try other hash table. Now we're getting desperate... */ + pvo->pvo_vaddr ^= PVO_HID; + insertpt.pte_hi ^= LPTE_HID; + pvo->pvo_pte.slot ^= (moea64_pteg_mask << 3); + slot = moea64_insert_to_pteg_native(&insertpt, pvo->pvo_pte.slot, + LPTE_WIRED | LPTE_LOCKED); + if (slot != -1) { + rw_wunlock(&moea64_eviction_lock); + pvo->pvo_pte.slot = slot; + return (0); } - KASSERT(pvo->pvo_pte.lpte.pte_hi == pt->pte_hi, - ("Unable to find PVO for spilled PTE")); - - /* - * Set the new PTE. - */ - moea64_pte_set_native(pt, pvo_pt); - - return (i); + /* No freeable slots in either PTEG? 
We're hosed. */ + rw_wunlock(&moea64_eviction_lock); + panic("moea64_pte_insert: overflow"); + return (-1); } Index: head/sys/powerpc/include/pmap.h =================================================================== --- head/sys/powerpc/include/pmap.h (revision 279251) +++ head/sys/powerpc/include/pmap.h (revision 279252) @@ -1,252 +1,265 @@ /*- * Copyright (C) 2006 Semihalf, Marian Balakowicz * All rights reserved. * * Adapted for Freescale's e500 core CPUs. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN * NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ /*- * Copyright (C) 1995, 1996 Wolfgang Solfrank. * Copyright (C) 1995, 1996 TooLs GmbH. * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by TooLs GmbH. * 4. The name of TooLs GmbH may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY TOOLS GMBH ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* * from: $NetBSD: pmap.h,v 1.17 2000/03/30 16:18:24 jdolecek Exp $ */ #ifndef _MACHINE_PMAP_H_ #define _MACHINE_PMAP_H_ #include #include #include #include #include #include #include #include #include #if defined(AIM) #if !defined(NPMAPS) #define NPMAPS 32768 #endif /* !defined(NPMAPS) */ struct slbtnode; struct pmap; typedef struct pmap *pmap_t; struct pvo_entry { LIST_ENTRY(pvo_entry) pvo_vlink; /* Link to common virt page */ +#ifndef __powerpc64__ LIST_ENTRY(pvo_entry) pvo_olink; /* Link to overflow entry */ +#endif RB_ENTRY(pvo_entry) pvo_plink; /* Link to pmap entries */ - union { - struct pte pte; /* 32 bit PTE */ - struct lpte lpte; /* 64 bit PTE */ + struct { +#ifndef __powerpc64__ + /* 32-bit fields */ + struct pte pte; +#endif + /* 64-bit fields */ + uintptr_t slot; + vm_paddr_t pa; + vm_prot_t prot; } pvo_pte; pmap_t pvo_pmap; /* Owning pmap */ vm_offset_t pvo_vaddr; /* VA of entry */ uint64_t pvo_vpn; /* Virtual page number */ }; LIST_HEAD(pvo_head, pvo_entry); RB_HEAD(pvo_tree, pvo_entry); int pvo_vaddr_compare(struct pvo_entry *, struct pvo_entry *); RB_PROTOTYPE(pvo_tree, pvo_entry, pvo_plink, pvo_vaddr_compare); +/* Used by 32-bit PMAP */ #define PVO_PTEGIDX_MASK 0x007UL /* which PTEG slot */ #define PVO_PTEGIDX_VALID 0x008UL /* slot is valid */ +/* Used by 64-bit PMAP */ +#define PVO_HID 0x008UL /* PVO entry in alternate hash*/ +/* Used by both */ #define PVO_WIRED 0x010UL /* PVO entry is wired */ #define PVO_MANAGED 0x020UL /* PVO entry is managed */ #define PVO_BOOTSTRAP 0x080UL /* PVO entry allocated during bootstrap */ +#define PVO_DEAD 0x100UL /* waiting to be deleted */ #define PVO_LARGE 0x200UL /* large page */ #define PVO_VADDR(pvo) ((pvo)->pvo_vaddr & ~ADDR_POFF) #define PVO_PTEGIDX_GET(pvo) ((pvo)->pvo_vaddr & PVO_PTEGIDX_MASK) #define PVO_PTEGIDX_ISSET(pvo) ((pvo)->pvo_vaddr & PVO_PTEGIDX_VALID) #define PVO_PTEGIDX_CLR(pvo) \ ((void)((pvo)->pvo_vaddr &= ~(PVO_PTEGIDX_VALID|PVO_PTEGIDX_MASK))) #define PVO_PTEGIDX_SET(pvo, i) \ 
((void)((pvo)->pvo_vaddr |= (i)|PVO_PTEGIDX_VALID)) #define PVO_VSID(pvo) ((pvo)->pvo_vpn >> 16) struct pmap { struct mtx pm_mtx; #ifdef __powerpc64__ struct slbtnode *pm_slb_tree_root; struct slb **pm_slb; int pm_slb_len; #else register_t pm_sr[16]; #endif cpuset_t pm_active; struct pmap *pmap_phys; struct pmap_statistics pm_stats; struct pvo_tree pmap_pvo; }; struct md_page { - u_int64_t mdpg_attrs; + volatile int32_t mdpg_attrs; vm_memattr_t mdpg_cache_attrs; struct pvo_head mdpg_pvoh; }; #define pmap_page_get_memattr(m) ((m)->md.mdpg_cache_attrs) #define pmap_page_is_mapped(m) (!LIST_EMPTY(&(m)->md.mdpg_pvoh)) /* * Return the VSID corresponding to a given virtual address. * If no VSID is currently defined, it will allocate one, and add * it to a free slot if available. * * NB: The PMAP MUST be locked already. */ uint64_t va_to_vsid(pmap_t pm, vm_offset_t va); /* Lock-free, non-allocating lookup routines */ uint64_t kernel_va_to_slbv(vm_offset_t va); struct slb *user_va_to_slb_entry(pmap_t pm, vm_offset_t va); uint64_t allocate_user_vsid(pmap_t pm, uint64_t esid, int large); void free_vsid(pmap_t pm, uint64_t esid, int large); void slb_insert_user(pmap_t pm, struct slb *slb); void slb_insert_kernel(uint64_t slbe, uint64_t slbv); struct slbtnode *slb_alloc_tree(void); void slb_free_tree(pmap_t pm); struct slb **slb_alloc_user_cache(void); void slb_free_user_cache(struct slb **); #else struct pmap { struct mtx pm_mtx; /* pmap mutex */ tlbtid_t pm_tid[MAXCPU]; /* TID to identify this pmap entries in TLB */ cpuset_t pm_active; /* active on cpus */ struct pmap_statistics pm_stats; /* pmap statistics */ /* Page table directory, array of pointers to page tables. */ pte_t *pm_pdir[PDIR_NENTRIES]; /* List of allocated ptbl bufs (ptbl kva regions). 
*/ TAILQ_HEAD(, ptbl_buf) pm_ptbl_list; }; typedef struct pmap *pmap_t; struct pv_entry { pmap_t pv_pmap; vm_offset_t pv_va; TAILQ_ENTRY(pv_entry) pv_link; }; typedef struct pv_entry *pv_entry_t; struct md_page { TAILQ_HEAD(, pv_entry) pv_list; }; #define pmap_page_get_memattr(m) VM_MEMATTR_DEFAULT #define pmap_page_is_mapped(m) (!TAILQ_EMPTY(&(m)->md.pv_list)) #endif /* AIM */ extern struct pmap kernel_pmap_store; #define kernel_pmap (&kernel_pmap_store) #ifdef _KERNEL #define PMAP_LOCK(pmap) mtx_lock(&(pmap)->pm_mtx) #define PMAP_LOCK_ASSERT(pmap, type) \ mtx_assert(&(pmap)->pm_mtx, (type)) #define PMAP_LOCK_DESTROY(pmap) mtx_destroy(&(pmap)->pm_mtx) #define PMAP_LOCK_INIT(pmap) mtx_init(&(pmap)->pm_mtx, \ (pmap == kernel_pmap) ? "kernelpmap" : \ "pmap", NULL, MTX_DEF) #define PMAP_LOCKED(pmap) mtx_owned(&(pmap)->pm_mtx) #define PMAP_MTX(pmap) (&(pmap)->pm_mtx) #define PMAP_TRYLOCK(pmap) mtx_trylock(&(pmap)->pm_mtx) #define PMAP_UNLOCK(pmap) mtx_unlock(&(pmap)->pm_mtx) #define pmap_page_is_write_mapped(m) (((m)->aflags & PGA_WRITEABLE) != 0) void pmap_bootstrap(vm_offset_t, vm_offset_t); void pmap_kenter(vm_offset_t va, vm_paddr_t pa); void pmap_kenter_attr(vm_offset_t va, vm_offset_t pa, vm_memattr_t); void pmap_kremove(vm_offset_t); void *pmap_mapdev(vm_paddr_t, vm_size_t); void *pmap_mapdev_attr(vm_offset_t, vm_size_t, vm_memattr_t); void pmap_unmapdev(vm_offset_t, vm_size_t); void pmap_page_set_memattr(vm_page_t, vm_memattr_t); void pmap_deactivate(struct thread *); vm_paddr_t pmap_kextract(vm_offset_t); int pmap_dev_direct_mapped(vm_paddr_t, vm_size_t); boolean_t pmap_mmu_install(char *name, int prio); #define vtophys(va) pmap_kextract((vm_offset_t)(va)) #define PHYS_AVAIL_SZ 256 /* Allows up to 16GB Ram on pSeries with * logical memory block size of 64MB. * For more Ram increase the lmb or this value. 
*/ extern vm_offset_t phys_avail[PHYS_AVAIL_SZ]; extern vm_offset_t virtual_avail; extern vm_offset_t virtual_end; extern vm_offset_t msgbuf_phys; extern int pmap_bootstrapped; vm_offset_t pmap_early_io_map(vm_paddr_t pa, vm_size_t size); #endif #endif /* !_MACHINE_PMAP_H_ */ Index: head/sys/powerpc/ps3/mmu_ps3.c =================================================================== --- head/sys/powerpc/ps3/mmu_ps3.c (revision 279251) +++ head/sys/powerpc/ps3/mmu_ps3.c (revision 279252) @@ -1,310 +1,283 @@ /*- * Copyright (C) 2010 Nathan Whitehorn * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "mmu_if.h" #include "moea64_if.h" #include "ps3-hvcall.h" #define VSID_HASH_MASK 0x0000007fffffffffUL #define PTESYNC() __asm __volatile("ptesync") extern int ps3fb_remap(void); static uint64_t mps3_vas_id; /* * Kernel MMU interface */ static void mps3_bootstrap(mmu_t mmup, vm_offset_t kernelstart, vm_offset_t kernelend); static void mps3_cpu_bootstrap(mmu_t mmup, int ap); -static void mps3_pte_synch(mmu_t, uintptr_t pt, struct lpte *pvo_pt); -static void mps3_pte_clear(mmu_t, uintptr_t pt, struct lpte *pvo_pt, - uint64_t vpn, uint64_t ptebit); -static void mps3_pte_unset(mmu_t, uintptr_t pt, struct lpte *pvo_pt, - uint64_t vpn); -static void mps3_pte_change(mmu_t, uintptr_t pt, struct lpte *pvo_pt, - uint64_t vpn); -static int mps3_pte_insert(mmu_t, u_int ptegidx, struct lpte *pvo_pt); -static uintptr_t mps3_pvo_to_pte(mmu_t, const struct pvo_entry *pvo); +static int64_t mps3_pte_synch(mmu_t, struct pvo_entry *); +static int64_t mps3_pte_clear(mmu_t, struct pvo_entry *, uint64_t ptebit); +static int64_t mps3_pte_unset(mmu_t, struct pvo_entry *); +static int mps3_pte_insert(mmu_t, struct pvo_entry *); static mmu_method_t mps3_methods[] = { MMUMETHOD(mmu_bootstrap, mps3_bootstrap), MMUMETHOD(mmu_cpu_bootstrap, mps3_cpu_bootstrap), MMUMETHOD(moea64_pte_synch, mps3_pte_synch), MMUMETHOD(moea64_pte_clear, mps3_pte_clear), MMUMETHOD(moea64_pte_unset, mps3_pte_unset), - MMUMETHOD(moea64_pte_change, mps3_pte_change), MMUMETHOD(moea64_pte_insert, mps3_pte_insert), - MMUMETHOD(moea64_pvo_to_pte, mps3_pvo_to_pte), { 0, 0 } }; MMU_DEF_INHERIT(ps3_mmu, "mmu_ps3", mps3_methods, 0, oea64_mmu); +static struct mtx mps3_table_lock; + static void mps3_bootstrap(mmu_t mmup, vm_offset_t kernelstart, vm_offset_t kernelend) { uint64_t final_pteg_count; + 
mtx_init(&mps3_table_lock, "page table", NULL, MTX_DEF); + moea64_early_bootstrap(mmup, kernelstart, kernelend); lv1_construct_virtual_address_space( 20 /* log_2(moea64_pteg_count) */, 2 /* n page sizes */, (24UL << 56) | (16UL << 48) /* page sizes 16 MB + 64 KB */, &mps3_vas_id, &final_pteg_count ); moea64_pteg_count = final_pteg_count / sizeof(struct lpteg); moea64_mid_bootstrap(mmup, kernelstart, kernelend); moea64_late_bootstrap(mmup, kernelstart, kernelend); } static void mps3_cpu_bootstrap(mmu_t mmup, int ap) { struct slb *slb = PCPU_GET(slb); register_t seg0; int i; mtmsr(mfmsr() & ~PSL_DR & ~PSL_IR); /* * Destroy the loader's address space if we are coming up for * the first time, and redo the FB mapping so we can continue * having a console. */ if (!ap) lv1_destruct_virtual_address_space(0); lv1_select_virtual_address_space(mps3_vas_id); if (!ap) ps3fb_remap(); /* * Install kernel SLB entries */ __asm __volatile ("slbia"); __asm __volatile ("slbmfee %0,%1; slbie %0;" : "=r"(seg0) : "r"(0)); for (i = 0; i < 64; i++) { if (!(slb[i].slbe & SLBE_VALID)) continue; __asm __volatile ("slbmte %0, %1" :: "r"(slb[i].slbv), "r"(slb[i].slbe)); } } -static void -mps3_pte_synch(mmu_t mmu, uintptr_t slot, struct lpte *pvo_pt) +static int64_t +mps3_pte_synch_locked(struct pvo_entry *pvo) { uint64_t halfbucket[4], rcbits; PTESYNC(); - lv1_read_htab_entries(mps3_vas_id, slot & ~0x3UL, &halfbucket[0], - &halfbucket[1], &halfbucket[2], &halfbucket[3], &rcbits); + lv1_read_htab_entries(mps3_vas_id, pvo->pvo_pte.slot & ~0x3UL, + &halfbucket[0], &halfbucket[1], &halfbucket[2], &halfbucket[3], + &rcbits); + /* Check if present in page table */ + if ((halfbucket[pvo->pvo_pte.slot & 0x3] & LPTE_AVPN_MASK) != + ((pvo->pvo_vpn >> (ADDR_API_SHFT64 - ADDR_PIDX_SHFT)) & + LPTE_AVPN_MASK)) + return (-1); + if (!(halfbucket[pvo->pvo_pte.slot & 0x3] & LPTE_VALID)) + return (-1); + /* - * rcbits contains the low 12 bits of each PTEs 2nd part, + * rcbits contains the low 12 bits of each 
PTE's 2nd part, * spaced at 16-bit intervals */ - KASSERT((halfbucket[slot & 0x3] & LPTE_AVPN_MASK) == - (pvo_pt->pte_hi & LPTE_AVPN_MASK), - ("PTE upper word %#lx != %#lx\n", - halfbucket[slot & 0x3], pvo_pt->pte_hi)); - - pvo_pt->pte_lo |= (rcbits >> ((3 - (slot & 0x3))*16)) & - (LPTE_CHG | LPTE_REF); + return ((rcbits >> ((3 - (pvo->pvo_pte.slot & 0x3))*16)) & + (LPTE_CHG | LPTE_REF)); } -static void -mps3_pte_clear(mmu_t mmu, uintptr_t slot, struct lpte *pvo_pt, uint64_t vpn, - u_int64_t ptebit) +static int64_t +mps3_pte_synch(mmu_t mmu, struct pvo_entry *pvo) { + int64_t retval; - lv1_write_htab_entry(mps3_vas_id, slot, pvo_pt->pte_hi, - pvo_pt->pte_lo & ~ptebit); + mtx_lock(&mps3_table_lock); + retval = mps3_pte_synch_locked(pvo); + mtx_unlock(&mps3_table_lock); + + return (retval); } -static void -mps3_pte_unset(mmu_t mmu, uintptr_t slot, struct lpte *pvo_pt, uint64_t vpn) +static int64_t +mps3_pte_clear(mmu_t mmu, struct pvo_entry *pvo, uint64_t ptebit) { + int64_t refchg; + struct lpte pte; - mps3_pte_synch(mmu, slot, pvo_pt); - pvo_pt->pte_hi &= ~LPTE_VALID; - lv1_write_htab_entry(mps3_vas_id, slot, 0, 0); - moea64_pte_valid--; + mtx_lock(&mps3_table_lock); + + refchg = mps3_pte_synch_locked(pvo); + if (refchg < 0) { + mtx_unlock(&mps3_table_lock); + return (refchg); + } + + moea64_pte_from_pvo(pvo, &pte); + + pte.pte_lo |= refchg; + pte.pte_lo &= ~ptebit; + /* XXX: race on RC bits between write and sync. Anything to do? 
*/ + lv1_write_htab_entry(mps3_vas_id, pvo->pvo_pte.slot, pte.pte_hi, + pte.pte_lo); + mtx_unlock(&mps3_table_lock); + + return (refchg); } -static void -mps3_pte_change(mmu_t mmu, uintptr_t slot, struct lpte *pvo_pt, uint64_t vpn) +static int64_t +mps3_pte_unset(mmu_t mmu, struct pvo_entry *pvo) { - - mps3_pte_synch(mmu, slot, pvo_pt); - lv1_write_htab_entry(mps3_vas_id, slot, pvo_pt->pte_hi, - pvo_pt->pte_lo); + int64_t refchg; + + mtx_lock(&mps3_table_lock); + refchg = mps3_pte_synch_locked(pvo); + if (refchg < 0) { + moea64_pte_overflow--; + mtx_unlock(&mps3_table_lock); + return (-1); + } + /* XXX: race on RC bits between unset and sync. Anything to do? */ + lv1_write_htab_entry(mps3_vas_id, pvo->pvo_pte.slot, 0, 0); + mtx_unlock(&mps3_table_lock); + moea64_pte_valid--; + + return (refchg & (LPTE_REF | LPTE_CHG)); } static int -mps3_pte_insert(mmu_t mmu, u_int ptegidx, struct lpte *pvo_pt) +mps3_pte_insert(mmu_t mmu, struct pvo_entry *pvo) { int result; - struct lpte evicted; - struct pvo_entry *pvo; + struct lpte pte, evicted; uint64_t index; - pvo_pt->pte_hi |= LPTE_VALID; - pvo_pt->pte_hi &= ~LPTE_HID; + if (pvo->pvo_vaddr & PVO_HID) { + /* Hypercall needs primary PTEG */ + pvo->pvo_vaddr &= ~PVO_HID; + pvo->pvo_pte.slot ^= (moea64_pteg_mask << 3); + } + + pvo->pvo_pte.slot &= ~7UL; + moea64_pte_from_pvo(pvo, &pte); evicted.pte_hi = 0; PTESYNC(); - result = lv1_insert_htab_entry(mps3_vas_id, ptegidx << 3, - pvo_pt->pte_hi, pvo_pt->pte_lo, LPTE_LOCKED | LPTE_WIRED, 0, + mtx_lock(&mps3_table_lock); + result = lv1_insert_htab_entry(mps3_vas_id, pvo->pvo_pte.slot, + pte.pte_hi, pte.pte_lo, LPTE_LOCKED | LPTE_WIRED, 0, &index, &evicted.pte_hi, &evicted.pte_lo); + mtx_unlock(&mps3_table_lock); if (result != 0) { /* No freeable slots in either PTEG? We're hosed. */ panic("mps3_pte_insert: overflow (%d)", result); return (-1); } /* * See where we ended up. 
*/ - if (index >> 3 != ptegidx) - pvo_pt->pte_hi |= LPTE_HID; + if ((index & ~7UL) != pvo->pvo_pte.slot) + pvo->pvo_vaddr |= PVO_HID; + pvo->pvo_pte.slot = index; moea64_pte_valid++; - if (!evicted.pte_hi) - return (index & 0x7); - - /* - * Synchronize the sacrifice PTE with its PVO, then mark both - * invalid. The PVO will be reused when/if the VM system comes - * here after a fault. - */ - - ptegidx = index >> 3; /* Where the sacrifice PTE was found */ - if (evicted.pte_hi & LPTE_HID) - ptegidx ^= moea64_pteg_mask; /* PTEs indexed by primary */ - - KASSERT((evicted.pte_hi & (LPTE_WIRED | LPTE_LOCKED)) == 0, - ("Evicted a wired PTE")); - - result = 0; - LIST_FOREACH(pvo, &moea64_pvo_table[ptegidx], pvo_olink) { - if (!PVO_PTEGIDX_ISSET(pvo)) - continue; - - if (pvo->pvo_pte.lpte.pte_hi == (evicted.pte_hi | LPTE_VALID)) { - KASSERT(pvo->pvo_pte.lpte.pte_hi & LPTE_VALID, - ("Invalid PVO for valid PTE!")); - pvo->pvo_pte.lpte.pte_hi &= ~LPTE_VALID; - pvo->pvo_pte.lpte.pte_lo |= - evicted.pte_lo & (LPTE_REF | LPTE_CHG); - PVO_PTEGIDX_CLR(pvo); - moea64_pte_valid--; - moea64_pte_overflow++; - result = 1; - break; - } + if (evicted.pte_hi) { + KASSERT((evicted.pte_hi & (LPTE_WIRED | LPTE_LOCKED)) == 0, + ("Evicted a wired PTE")); + moea64_pte_valid--; + moea64_pte_overflow++; } - KASSERT(result == 1, ("PVO for sacrifice PTE not found")); - - return (index & 0x7); -} - -static __inline u_int -va_to_pteg(uint64_t vsid, vm_offset_t addr, int large) -{ - uint64_t hash; - int shift; - - shift = large ? 
moea64_large_page_shift : ADDR_PIDX_SHFT; - hash = (vsid & VSID_HASH_MASK) ^ (((uint64_t)addr & ADDR_PIDX) >> - shift); - return (hash & moea64_pteg_mask); -} - -uintptr_t -mps3_pvo_to_pte(mmu_t mmu, const struct pvo_entry *pvo) -{ - uint64_t vsid; - u_int ptegidx; - - /* If the PTEG index is not set, then there is no page table entry */ - if (!PVO_PTEGIDX_ISSET(pvo)) - return (-1); - - vsid = PVO_VSID(pvo); - ptegidx = va_to_pteg(vsid, PVO_VADDR(pvo), pvo->pvo_vaddr & PVO_LARGE); - - /* - * We can find the actual pte entry without searching by grabbing - * the PTEG index from 3 unused bits in pvo_vaddr and by - * noticing the HID bit. - */ - if (pvo->pvo_pte.lpte.pte_hi & LPTE_HID) - ptegidx ^= moea64_pteg_mask; - - return ((ptegidx << 3) | PVO_PTEGIDX_GET(pvo)); + return (0); } Index: head/sys/powerpc/pseries/mmu_phyp.c =================================================================== --- head/sys/powerpc/pseries/mmu_phyp.c (revision 279251) +++ head/sys/powerpc/pseries/mmu_phyp.c (revision 279252) @@ -1,446 +1,460 @@ /* * Copyright (C) 2010 Andreas Tobler * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
* IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include -#include +#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "mmu_if.h" #include "moea64_if.h" #include "phyp-hvcall.h" extern int n_slbs; +static struct rwlock mphyp_eviction_lock; + /* * Kernel MMU interface */ static void mphyp_bootstrap(mmu_t mmup, vm_offset_t kernelstart, vm_offset_t kernelend); static void mphyp_cpu_bootstrap(mmu_t mmup, int ap); -static void mphyp_pte_synch(mmu_t, uintptr_t pt, struct lpte *pvo_pt); -static void mphyp_pte_clear(mmu_t, uintptr_t pt, struct lpte *pvo_pt, - uint64_t vpn, u_int64_t ptebit); -static void mphyp_pte_unset(mmu_t, uintptr_t pt, struct lpte *pvo_pt, - uint64_t vpn); -static void mphyp_pte_change(mmu_t, uintptr_t pt, struct lpte *pvo_pt, - uint64_t vpn); -static int mphyp_pte_insert(mmu_t, u_int ptegidx, struct lpte *pvo_pt); -static uintptr_t mphyp_pvo_to_pte(mmu_t, const struct pvo_entry *pvo); +static int64_t mphyp_pte_synch(mmu_t, struct pvo_entry *pvo); +static int64_t mphyp_pte_clear(mmu_t, struct pvo_entry *pvo, uint64_t ptebit); +static int64_t mphyp_pte_unset(mmu_t, struct pvo_entry *pvo); +static int mphyp_pte_insert(mmu_t, struct pvo_entry *pvo); -#define VSID_HASH_MASK 0x0000007fffffffffULL - - static mmu_method_t mphyp_methods[] = { MMUMETHOD(mmu_bootstrap, mphyp_bootstrap), MMUMETHOD(mmu_cpu_bootstrap, mphyp_cpu_bootstrap), 
MMUMETHOD(moea64_pte_synch, mphyp_pte_synch), MMUMETHOD(moea64_pte_clear, mphyp_pte_clear), MMUMETHOD(moea64_pte_unset, mphyp_pte_unset), - MMUMETHOD(moea64_pte_change, mphyp_pte_change), MMUMETHOD(moea64_pte_insert, mphyp_pte_insert), - MMUMETHOD(moea64_pvo_to_pte, mphyp_pvo_to_pte), + /* XXX: pmap_copy_page, pmap_init_page with H_PAGE_INIT */ + { 0, 0 } }; MMU_DEF_INHERIT(pseries_mmu, "mmu_phyp", mphyp_methods, 0, oea64_mmu); +static int brokenkvm = 0; + static void +print_kvm_bug_warning(void *data) +{ + + if (brokenkvm) + printf("WARNING: Running on a broken hypervisor that does " + "not support mandatory H_CLEAR_MOD and H_CLEAR_REF " + "hypercalls. Performance will be suboptimal.\n"); +} + +SYSINIT(kvmbugwarn1, SI_SUB_COPYRIGHT, SI_ORDER_THIRD + 1, + print_kvm_bug_warning, NULL); +SYSINIT(kvmbugwarn2, SI_SUB_LAST, SI_ORDER_THIRD + 1, print_kvm_bug_warning, + NULL); + +static void mphyp_bootstrap(mmu_t mmup, vm_offset_t kernelstart, vm_offset_t kernelend) { uint64_t final_pteg_count = 0; char buf[8]; uint32_t prop[2]; uint32_t nptlp, shift = 0, slb_encoding = 0; uint32_t lp_size, lp_encoding; phandle_t dev, node, root; int idx, len, res; + rw_init(&mphyp_eviction_lock, "pte eviction"); + moea64_early_bootstrap(mmup, kernelstart, kernelend); root = OF_peer(0); dev = OF_child(root); while (dev != 0) { res = OF_getprop(dev, "name", buf, sizeof(buf)); if (res > 0 && strcmp(buf, "cpus") == 0) break; dev = OF_peer(dev); } node = OF_child(dev); while (node != 0) { res = OF_getprop(node, "device_type", buf, sizeof(buf)); if (res > 0 && strcmp(buf, "cpu") == 0) break; node = OF_peer(node); } res = OF_getprop(node, "ibm,pft-size", prop, sizeof(prop)); if (res <= 0) panic("mmu_phyp: unknown PFT size"); final_pteg_count = 1 << prop[1]; res = OF_getprop(node, "ibm,slb-size", prop, sizeof(prop[0])); if (res > 0) n_slbs = prop[0]; moea64_pteg_count = final_pteg_count / sizeof(struct lpteg); /* * Scan the large page size property for PAPR compatible machines. 
* See PAPR D.5 Changes to Section 5.1.4, 'CPU Node Properties' * for the encoding of the property. */ len = OF_getproplen(node, "ibm,segment-page-sizes"); if (len > 0) { /* * We have to use a variable length array on the stack * since we have very limited stack space. */ pcell_t arr[len/sizeof(cell_t)]; res = OF_getencprop(node, "ibm,segment-page-sizes", arr, sizeof(arr)); len /= 4; idx = 0; while (len > 0) { shift = arr[idx]; slb_encoding = arr[idx + 1]; nptlp = arr[idx + 2]; idx += 3; len -= 3; while (len > 0 && nptlp) { lp_size = arr[idx]; lp_encoding = arr[idx+1]; if (slb_encoding == SLBV_L && lp_encoding == 0) break; idx += 2; len -= 2; nptlp--; } if (nptlp && slb_encoding == SLBV_L && lp_encoding == 0) break; } if (len == 0) panic("Standard large pages (SLB[L] = 1, PTE[LP] = 0) " "not supported by this system. Please enable huge " "page backing if running under PowerKVM."); moea64_large_page_shift = shift; moea64_large_page_size = 1ULL << lp_size; } moea64_mid_bootstrap(mmup, kernelstart, kernelend); moea64_late_bootstrap(mmup, kernelstart, kernelend); + + /* Test for broken versions of KVM that don't conform to the spec */ + if (phyp_hcall(H_CLEAR_MOD, 0, 0) == H_FUNCTION) + brokenkvm = 1; } static void mphyp_cpu_bootstrap(mmu_t mmup, int ap) { struct slb *slb = PCPU_GET(slb); register_t seg0; int i; /* * Install kernel SLB entries */ __asm __volatile ("slbia"); __asm __volatile ("slbmfee %0,%1; slbie %0;" : "=r"(seg0) : "r"(0)); for (i = 0; i < 64; i++) { if (!(slb[i].slbe & SLBE_VALID)) continue; __asm __volatile ("slbmte %0, %1" :: "r"(slb[i].slbv), "r"(slb[i].slbe)); } } -static void -mphyp_pte_synch(mmu_t mmu, uintptr_t slot, struct lpte *pvo_pt) +static int64_t +mphyp_pte_synch(mmu_t mmu, struct pvo_entry *pvo) { struct lpte pte; uint64_t junk; __asm __volatile("ptesync"); - phyp_pft_hcall(H_READ, 0, slot, 0, 0, &pte.pte_hi, &pte.pte_lo, - &junk); + phyp_pft_hcall(H_READ, 0, pvo->pvo_pte.slot, 0, 0, &pte.pte_hi, + &pte.pte_lo, &junk); + if ((pte.pte_hi 
& LPTE_AVPN_MASK) != + ((pvo->pvo_vpn >> (ADDR_API_SHFT64 - ADDR_PIDX_SHFT)) & + LPTE_AVPN_MASK)) + return (-1); + if (!(pte.pte_hi & LPTE_VALID)) + return (-1); - pvo_pt->pte_lo |= pte.pte_lo & (LPTE_CHG | LPTE_REF); + return (pte.pte_lo & (LPTE_CHG | LPTE_REF)); } -static void -mphyp_pte_clear(mmu_t mmu, uintptr_t slot, struct lpte *pvo_pt, uint64_t vpn, - u_int64_t ptebit) +static int64_t +mphyp_pte_clear(mmu_t mmu, struct pvo_entry *pvo, uint64_t ptebit) { + int64_t refchg; + uint64_t ptelo, junk; + int err; - if (ptebit & LPTE_CHG) - phyp_hcall(H_CLEAR_MOD, 0, slot); - if (ptebit & LPTE_REF) - phyp_hcall(H_CLEAR_REF, 0, slot); + /* + * This involves two steps (synch and clear) so we need the entry + * not to change in the middle. We are protected against deliberate + * unset by virtue of holding the pmap lock. Protection against + * incidental unset (page table eviction) comes from holding the + * shared eviction lock. + */ + PMAP_LOCK_ASSERT(pvo->pvo_pmap, MA_OWNED); + rw_rlock(&mphyp_eviction_lock); + + refchg = mphyp_pte_synch(mmu, pvo); + if (refchg < 0) { + rw_runlock(&mphyp_eviction_lock); + return (refchg); + } + + if (brokenkvm) { + /* + * No way to clear either bit, which is total madness. + * Pessimistically claim that, once modified, it stays so + * forever and that it is never referenced. 
+ */ + rw_runlock(&mphyp_eviction_lock); + return (refchg & ~LPTE_REF); + } + + if (ptebit & LPTE_CHG) { + err = phyp_pft_hcall(H_CLEAR_MOD, 0, pvo->pvo_pte.slot, 0, 0, + &ptelo, &junk, &junk); + KASSERT(err == H_SUCCESS, + ("Error clearing page change bit: %d", err)); + refchg |= (ptelo & LPTE_CHG); + } + if (ptebit & LPTE_REF) { + err = phyp_pft_hcall(H_CLEAR_REF, 0, pvo->pvo_pte.slot, 0, 0, + &ptelo, &junk, &junk); + KASSERT(err == H_SUCCESS, + ("Error clearing page reference bit: %d", err)); + refchg |= (ptelo & LPTE_REF); + } + + rw_runlock(&mphyp_eviction_lock); + + return (refchg); } -static void -mphyp_pte_unset(mmu_t mmu, uintptr_t slot, struct lpte *pvo_pt, uint64_t vpn) +static int64_t +mphyp_pte_unset(mmu_t mmu, struct pvo_entry *pvo) { struct lpte pte; uint64_t junk; int err; - pvo_pt->pte_hi &= ~LPTE_VALID; - err = phyp_pft_hcall(H_REMOVE, 1UL << 31, slot, - pvo_pt->pte_hi & LPTE_AVPN_MASK, 0, &pte.pte_hi, &pte.pte_lo, - &junk); - KASSERT(err == H_SUCCESS, ("Error removing page: %d", err)); + PMAP_LOCK_ASSERT(pvo->pvo_pmap, MA_OWNED); - pvo_pt->pte_lo |= pte.pte_lo & (LPTE_CHG | LPTE_REF); -} + moea64_pte_from_pvo(pvo, &pte); -static void -mphyp_pte_change(mmu_t mmu, uintptr_t slot, struct lpte *pvo_pt, uint64_t vpn) -{ - struct lpte evicted; - uint64_t index, junk; - int64_t result; + err = phyp_pft_hcall(H_REMOVE, H_AVPN, pvo->pvo_pte.slot, + pte.pte_hi & LPTE_AVPN_MASK, 0, &pte.pte_hi, &pte.pte_lo, + &junk); + KASSERT(err == H_SUCCESS || err == H_NOT_FOUND, + ("Error removing page: %d", err)); - /* - * NB: this is protected by the global table lock, so this two-step - * is safe, except for the scratch-page case. No CPUs on which we run - * this code should be using scratch pages. - */ - KASSERT(!(pvo_pt->pte_hi & LPTE_LOCKED), - ("Locked pages not supported on PHYP")); + if (err == H_NOT_FOUND) { + moea64_pte_overflow--; + return (-1); + } - /* XXX: optimization using H_PROTECT for common case? 
*/ - mphyp_pte_unset(mmu, slot, pvo_pt, vpn); - pvo_pt->pte_hi |= LPTE_VALID; - result = phyp_pft_hcall(H_ENTER, H_EXACT, slot, pvo_pt->pte_hi, - pvo_pt->pte_lo, &index, &evicted.pte_lo, &junk); - if (result != H_SUCCESS) - panic("mphyp_pte_change() insertion failure: %ld\n", result); + return (pte.pte_lo & (LPTE_REF | LPTE_CHG)); } -static __inline int -mphyp_pte_spillable_ident(u_int ptegidx, struct lpte *to_evict) +static uintptr_t +mphyp_pte_spillable_ident(uintptr_t ptegbase, struct lpte *to_evict) { uint64_t slot, junk, k; struct lpte pt; int i, j; /* Start at a random slot */ i = mftb() % 8; k = -1; for (j = 0; j < 8; j++) { - slot = (ptegidx << 3) + (i + j) % 8; - phyp_pft_hcall(H_READ, 0, slot, 0, 0, &pt.pte_hi, &pt.pte_lo, - &junk); + slot = ptegbase + (i + j) % 8; + phyp_pft_hcall(H_READ, 0, slot, 0, 0, &pt.pte_hi, + &pt.pte_lo, &junk); if (pt.pte_hi & LPTE_WIRED) continue; /* This is a candidate, so remember it */ k = slot; /* Try to get a page that has not been used lately */ - if (!(pt.pte_lo & LPTE_REF)) { + if (!(pt.pte_hi & LPTE_VALID) || !(pt.pte_lo & LPTE_REF)) { memcpy(to_evict, &pt, sizeof(struct lpte)); return (k); } } if (k == -1) return (k); phyp_pft_hcall(H_READ, 0, k, 0, 0, &to_evict->pte_hi, &to_evict->pte_lo, &junk); return (k); } static int -mphyp_pte_insert(mmu_t mmu, u_int ptegidx, struct lpte *pvo_pt) +mphyp_pte_insert(mmu_t mmu, struct pvo_entry *pvo) { int64_t result; - struct lpte evicted; - struct pvo_entry *pvo; - uint64_t index, junk; - u_int pteg_bktidx; + struct lpte evicted, pte; + uint64_t index, junk, lastptelo; - /* Check for locked pages, which we can't support on this system */ - KASSERT(!(pvo_pt->pte_hi & LPTE_LOCKED), - ("Locked pages not supported on PHYP")); + PMAP_LOCK_ASSERT(pvo->pvo_pmap, MA_OWNED); /* Initialize PTE */ - pvo_pt->pte_hi |= LPTE_VALID; - pvo_pt->pte_hi &= ~LPTE_HID; + moea64_pte_from_pvo(pvo, &pte); evicted.pte_hi = 0; + /* Make sure further insertion is locked out during evictions */ + 
rw_rlock(&mphyp_eviction_lock); + /* * First try primary hash. */ - pteg_bktidx = ptegidx; - result = phyp_pft_hcall(H_ENTER, 0, pteg_bktidx << 3, pvo_pt->pte_hi, - pvo_pt->pte_lo, &index, &evicted.pte_lo, &junk); - if (result == H_SUCCESS) - return (index & 0x07); + pvo->pvo_pte.slot &= ~7UL; /* Base slot address */ + result = phyp_pft_hcall(H_ENTER, 0, pvo->pvo_pte.slot, pte.pte_hi, + pte.pte_lo, &index, &evicted.pte_lo, &junk); + if (result == H_SUCCESS) { + rw_runlock(&mphyp_eviction_lock); + pvo->pvo_pte.slot = index; + return (0); + } KASSERT(result == H_PTEG_FULL, ("Page insertion error: %ld " - "(ptegidx: %#x/%#x, PTE %#lx/%#lx", result, ptegidx, - moea64_pteg_count, pvo_pt->pte_hi, pvo_pt->pte_lo)); + "(ptegidx: %#zx/%#x, PTE %#lx/%#lx", result, pvo->pvo_pte.slot, + moea64_pteg_count, pte.pte_hi, pte.pte_lo)); /* * Next try secondary hash. */ - pteg_bktidx ^= moea64_pteg_mask; - pvo_pt->pte_hi |= LPTE_HID; - result = phyp_pft_hcall(H_ENTER, 0, pteg_bktidx << 3, - pvo_pt->pte_hi, pvo_pt->pte_lo, &index, &evicted.pte_lo, &junk); - if (result == H_SUCCESS) - return (index & 0x07); + pvo->pvo_vaddr ^= PVO_HID; + pte.pte_hi ^= LPTE_HID; + pvo->pvo_pte.slot ^= (moea64_pteg_mask << 3); + + result = phyp_pft_hcall(H_ENTER, 0, pvo->pvo_pte.slot, + pte.pte_hi, pte.pte_lo, &index, &evicted.pte_lo, &junk); + if (result == H_SUCCESS) { + rw_runlock(&mphyp_eviction_lock); + pvo->pvo_pte.slot = index; + return (0); + } KASSERT(result == H_PTEG_FULL, ("Secondary page insertion error: %ld", result)); /* * Out of luck. Find a PTE to sacrifice. 
*/ - pteg_bktidx = ptegidx; - index = mphyp_pte_spillable_ident(pteg_bktidx, &evicted); + + /* Lock out all insertions for a bit */ + if (!rw_try_upgrade(&mphyp_eviction_lock)) { + rw_runlock(&mphyp_eviction_lock); + rw_wlock(&mphyp_eviction_lock); + } + + index = mphyp_pte_spillable_ident(pvo->pvo_pte.slot, &evicted); if (index == -1L) { - pteg_bktidx ^= moea64_pteg_mask; - index = mphyp_pte_spillable_ident(pteg_bktidx, &evicted); + /* Try other hash table? */ + pvo->pvo_vaddr ^= PVO_HID; + pte.pte_hi ^= LPTE_HID; + pvo->pvo_pte.slot ^= (moea64_pteg_mask << 3); + index = mphyp_pte_spillable_ident(pvo->pvo_pte.slot, &evicted); } if (index == -1L) { /* No freeable slots in either PTEG? We're hosed. */ + rw_wunlock(&mphyp_eviction_lock); panic("mphyp_pte_insert: overflow"); return (-1); } - if (pteg_bktidx == ptegidx) - pvo_pt->pte_hi &= ~LPTE_HID; - else - pvo_pt->pte_hi |= LPTE_HID; - - /* - * Synchronize the sacrifice PTE with its PVO, then mark both - * invalid. The PVO will be reused when/if the VM system comes - * here after a fault. - */ - - if (evicted.pte_hi & LPTE_HID) - pteg_bktidx ^= moea64_pteg_mask; /* PTEs indexed by primary */ - - LIST_FOREACH(pvo, &moea64_pvo_table[pteg_bktidx], pvo_olink) { - if (pvo->pvo_pte.lpte.pte_hi == evicted.pte_hi) { - KASSERT(pvo->pvo_pte.lpte.pte_hi & LPTE_VALID, - ("Invalid PVO for valid PTE!")); - mphyp_pte_unset(mmu, index, &pvo->pvo_pte.lpte, - pvo->pvo_vpn); - PVO_PTEGIDX_CLR(pvo); - moea64_pte_overflow++; - break; - } + /* Victim acquired: update page before waving goodbye */ + if (evicted.pte_hi & LPTE_VALID) { + result = phyp_pft_hcall(H_REMOVE, H_AVPN, index, + evicted.pte_hi & LPTE_AVPN_MASK, 0, &junk, &lastptelo, + &junk); + moea64_pte_overflow++; + KASSERT(result == H_SUCCESS, + ("Error evicting page: %d", (int)result)); } - KASSERT((pvo->pvo_pte.lpte.pte_hi | LPTE_VALID) == evicted.pte_hi, - ("Unable to find PVO for spilled PTE")); - /* * Set the new PTE. 
*/ - result = phyp_pft_hcall(H_ENTER, H_EXACT, index, pvo_pt->pte_hi, - pvo_pt->pte_lo, &index, &evicted.pte_lo, &junk); + result = phyp_pft_hcall(H_ENTER, H_EXACT, index, pte.pte_hi, + pte.pte_lo, &index, &evicted.pte_lo, &junk); + rw_wunlock(&mphyp_eviction_lock); /* All clear */ + + pvo->pvo_pte.slot = index; if (result == H_SUCCESS) - return (index & 0x07); + return (0); panic("Page replacement error: %ld", result); - return (-1); -} - -static __inline u_int -va_to_pteg(uint64_t vsid, vm_offset_t addr, int large) -{ - uint64_t hash; - int shift; - - shift = large ? moea64_large_page_shift : ADDR_PIDX_SHFT; - hash = (vsid & VSID_HASH_MASK) ^ (((uint64_t)addr & ADDR_PIDX) >> - shift); - return (hash & moea64_pteg_mask); -} - -static uintptr_t -mphyp_pvo_to_pte(mmu_t mmu, const struct pvo_entry *pvo) -{ - uint64_t vsid; - u_int ptegidx; - - /* If the PTEG index is not set, then there is no page table entry */ - if (!PVO_PTEGIDX_ISSET(pvo)) - return (-1); - - vsid = PVO_VSID(pvo); - ptegidx = va_to_pteg(vsid, PVO_VADDR(pvo), pvo->pvo_vaddr & PVO_LARGE); - - /* - * We can find the actual pte entry without searching by grabbing - * the PTEG index from 3 unused bits in pvo_vaddr and by - * noticing the HID bit. - */ - if (pvo->pvo_pte.lpte.pte_hi & LPTE_HID) - ptegidx ^= moea64_pteg_mask; - - return ((ptegidx << 3) | PVO_PTEGIDX_GET(pvo)); + return (result); }