Index: head/sys/conf/files.powerpc =================================================================== --- head/sys/conf/files.powerpc (revision 360886) +++ head/sys/conf/files.powerpc (revision 360887) @@ -1,313 +1,314 @@ # This file tells config what files go into building a kernel, # files marked standard are always included. # # $FreeBSD$ # # The long compile-with and dependency lines are required because of # limitations in config: backslash-newline doesn't work in strings, and # dependency lines other than the first are silently ignored. # # # There is only an asm version on ppc64. cddl/compat/opensolaris/kern/opensolaris_atomic.c optional zfs powerpc | dtrace powerpc | zfs powerpcspe | dtrace powerpcspe compile-with "${ZFS_C}" cddl/dev/dtrace/powerpc/dtrace_asm.S optional dtrace compile-with "${DTRACE_S}" cddl/dev/dtrace/powerpc/dtrace_subr.c optional dtrace compile-with "${DTRACE_C}" cddl/dev/fbt/powerpc/fbt_isa.c optional dtrace_fbt | dtraceall compile-with "${FBT_C}" crypto/blowfish/bf_enc.c optional crypto | ipsec | ipsec_support crypto/des/des_enc.c optional crypto | ipsec | ipsec_support | netsmb dev/aacraid/aacraid_endian.c optional aacraid dev/adb/adb_bus.c optional adb dev/adb/adb_kbd.c optional adb dev/adb/adb_mouse.c optional adb dev/adb/adb_hb_if.m optional adb dev/adb/adb_if.m optional adb dev/adb/adb_buttons.c optional adb dev/agp/agp_apple.c optional agp powermac dev/fb/fb.c optional sc dev/hwpmc/hwpmc_e500.c optional hwpmc dev/hwpmc/hwpmc_mpc7xxx.c optional hwpmc dev/hwpmc/hwpmc_powerpc.c optional hwpmc dev/hwpmc/hwpmc_ppc970.c optional hwpmc dev/iicbus/ad7417.c optional ad7417 powermac dev/iicbus/adm1030.c optional powermac windtunnel | adm1030 powermac dev/iicbus/adt746x.c optional adt746x powermac dev/iicbus/ds1631.c optional ds1631 powermac dev/iicbus/ds1775.c optional ds1775 powermac dev/iicbus/max6690.c optional max6690 powermac dev/iicbus/ofw_iicbus.c optional iicbus aim dev/ipmi/ipmi.c optional ipmi dev/ipmi/ipmi_opal.c optional powernv ipmi dev/ixl/if_ixl.c optional ixl pci powerpc64 \ compile-with "${NORMAL_C} -I$S/dev/ixl" dev/ixl/ixl_pf_main.c optional ixl pci powerpc64 \ compile-with "${NORMAL_C} -I$S/dev/ixl" dev/ixl/ixl_pf_qmgr.c optional ixl pci powerpc64 \ compile-with "${NORMAL_C} -I$S/dev/ixl" dev/ixl/ixl_pf_iov.c optional ixl pci pci_iov powerpc64 \ compile-with "${NORMAL_C} -I$S/dev/ixl" dev/ixl/ixl_pf_i2c.c optional ixl pci powerpc64 \ compile-with "${NORMAL_C} -I$S/dev/ixl" dev/ixl/if_iavf.c optional iavf pci powerpc64 \ compile-with "${NORMAL_C} -I$S/dev/ixl" dev/ixl/iavf_vc.c optional iavf pci powerpc64 \ compile-with "${NORMAL_C} -I$S/dev/ixl" dev/ixl/ixl_txrx.c optional ixl pci powerpc64 | \ iavf pci powerpc64 \ compile-with "${NORMAL_C} -I$S/dev/ixl" dev/ixl/i40e_osdep.c optional ixl pci powerpc64 | \ iavf pci powerpc64 \ compile-with "${NORMAL_C} -I$S/dev/ixl" dev/ixl/i40e_lan_hmc.c optional ixl pci powerpc64 | \ iavf pci powerpc64 \ compile-with "${NORMAL_C} -I$S/dev/ixl" dev/ixl/i40e_hmc.c optional ixl pci powerpc64 | \ iavf pci powerpc64 \ compile-with "${NORMAL_C} -I$S/dev/ixl" dev/ixl/i40e_common.c optional ixl pci powerpc64 | \ iavf pci powerpc64 \ compile-with "${NORMAL_C} -I$S/dev/ixl" dev/ixl/i40e_nvm.c optional ixl pci powerpc64 | \ iavf pci powerpc64 \ compile-with "${NORMAL_C} -I$S/dev/ixl" dev/ixl/i40e_adminq.c optional ixl pci powerpc64 | \ iavf pci powerpc64 \ compile-with "${NORMAL_C} -I$S/dev/ixl" dev/ixl/i40e_dcb.c optional ixl pci powerpc64 \ compile-with "${NORMAL_C} -I$S/dev/ixl" # Most ofw stuff below is brought in by conf/files for options FDT, but # we always want it, even on non-FDT platforms. dev/fdt/simplebus.c standard dev/ofw/openfirm.c standard dev/ofw/openfirmio.c standard dev/ofw/ofw_bus_if.m standard dev/ofw/ofw_cpu.c standard dev/ofw/ofw_if.m standard dev/ofw/ofw_bus_subr.c standard dev/ofw/ofw_console.c optional aim dev/ofw/ofw_disk.c optional ofwd aim dev/ofw/ofwbus.c standard dev/ofw/ofwpci.c optional pci dev/ofw/ofw_standard.c optional aim powerpc dev/ofw/ofw_subr.c standard dev/powermac_nvram/powermac_nvram.c optional powermac_nvram powermac dev/quicc/quicc_bfe_fdt.c optional quicc mpc85xx dev/random/darn.c optional powerpc64 !random_loadable dev/scc/scc_bfe_macio.c optional scc powermac dev/sdhci/fsl_sdhci.c optional mpc85xx sdhci dev/sec/sec.c optional sec mpc85xx dev/sound/macio/aoa.c optional snd_davbus | snd_ai2s powermac dev/sound/macio/davbus.c optional snd_davbus powermac dev/sound/macio/i2s.c optional snd_ai2s powermac dev/sound/macio/onyx.c optional snd_ai2s iicbus powermac dev/sound/macio/snapper.c optional snd_ai2s iicbus powermac dev/sound/macio/tumbler.c optional snd_ai2s iicbus powermac dev/syscons/scgfbrndr.c optional sc dev/tsec/if_tsec.c optional tsec dev/tsec/if_tsec_fdt.c optional tsec dev/uart/uart_cpu_powerpc.c optional uart dev/usb/controller/ehci_fsl.c optional ehci mpc85xx dev/vt/hw/ofwfb/ofwfb.c optional vt aim kern/kern_clocksource.c standard kern/subr_atomic64.c optional powerpc | powerpcspe kern/subr_dummy_vdso_tc.c standard kern/syscalls.c optional ktr kern/subr_sfbuf.c standard libkern/ashldi3.c optional powerpc | powerpcspe libkern/ashrdi3.c optional powerpc | powerpcspe libkern/bcmp.c standard libkern/bcopy.c standard libkern/cmpdi2.c optional powerpc | powerpcspe libkern/divdi3.c optional powerpc | powerpcspe libkern/ffs.c standard libkern/ffsl.c standard libkern/ffsll.c standard libkern/flsll.c standard libkern/lshrdi3.c optional powerpc | powerpcspe libkern/memcmp.c standard libkern/memset.c standard libkern/moddi3.c optional powerpc | powerpcspe libkern/qdivrem.c optional powerpc | powerpcspe libkern/ucmpdi2.c optional powerpc | powerpcspe libkern/udivdi3.c optional powerpc | powerpcspe libkern/umoddi3.c optional powerpc | powerpcspe powerpc/aim/locore.S optional aim no-obj powerpc/aim/aim_machdep.c optional aim powerpc/aim/mmu_oea.c optional aim powerpc powerpc/aim/mmu_oea64.c optional aim +powerpc/aim/mmu_radix.c optional aim powerpc64 powerpc/aim/moea64_if.m optional aim powerpc/aim/moea64_native.c optional aim powerpc/aim/mp_cpudep.c optional aim powerpc/aim/slb.c optional aim powerpc64 powerpc/amigaone/platform_amigaone.c optional amigaone powerpc/amigaone/cpld_a1222.c optional powerpc amigaone | powerpcspe amigaone powerpc/amigaone/cpld_x5000.c optional powerpc amigaone | powerpc64 amigaone powerpc/booke/locore.S optional booke no-obj powerpc/booke/booke_machdep.c optional booke powerpc/booke/machdep_e500.c optional booke_e500 powerpc/booke/mp_cpudep.c optional booke smp powerpc/booke/platform_bare.c optional booke powerpc/booke/pmap.c optional booke powerpc/booke/spe.c optional powerpcspe powerpc/cpufreq/dfs.c optional cpufreq powerpc/cpufreq/mpc85xx_jog.c optional cpufreq mpc85xx powerpc/cpufreq/pcr.c optional cpufreq aim powerpc/cpufreq/pmcr.c optional cpufreq aim powerpc64 powerpc/cpufreq/pmufreq.c optional cpufreq aim pmu powerpc/fpu/fpu_add.c optional fpu_emu | powerpcspe powerpc/fpu/fpu_compare.c optional fpu_emu | powerpcspe powerpc/fpu/fpu_div.c optional fpu_emu | powerpcspe powerpc/fpu/fpu_emu.c optional fpu_emu powerpc/fpu/fpu_explode.c optional fpu_emu | powerpcspe powerpc/fpu/fpu_implode.c optional fpu_emu | powerpcspe powerpc/fpu/fpu_mul.c optional fpu_emu | powerpcspe powerpc/fpu/fpu_sqrt.c optional fpu_emu powerpc/fpu/fpu_subr.c optional fpu_emu | powerpcspe powerpc/mambo/mambocall.S optional mambo powerpc/mambo/mambo.c optional mambo powerpc/mambo/mambo_console.c optional mambo powerpc/mambo/mambo_disk.c optional mambo powerpc/mikrotik/platform_rb.c optional mikrotik powerpc/mikrotik/rb_led.c optional mikrotik powerpc/mpc85xx/atpic.c optional mpc85xx isa powerpc/mpc85xx/ds1553_bus_fdt.c optional ds1553 powerpc/mpc85xx/ds1553_core.c optional ds1553 powerpc/mpc85xx/fsl_diu.c optional mpc85xx diu powerpc/mpc85xx/fsl_espi.c optional mpc85xx spibus powerpc/mpc85xx/fsl_sata.c optional mpc85xx ata powerpc/mpc85xx/i2c.c optional mpc85xx iicbus powerpc/mpc85xx/isa.c optional mpc85xx isa powerpc/mpc85xx/lbc.c optional mpc85xx powerpc/mpc85xx/mpc85xx.c optional mpc85xx powerpc/mpc85xx/mpc85xx_cache.c optional mpc85xx powerpc/mpc85xx/mpc85xx_gpio.c optional mpc85xx gpio powerpc/mpc85xx/platform_mpc85xx.c optional mpc85xx powerpc/mpc85xx/pci_mpc85xx.c optional pci mpc85xx powerpc/mpc85xx/pci_mpc85xx_pcib.c optional pci mpc85xx powerpc/mpc85xx/qoriq_gpio.c optional mpc85xx gpio powerpc/ofw/ofw_machdep.c standard powerpc/ofw/ofw_pcibus.c optional pci powerpc/ofw/ofw_pcib_pci.c optional pci powerpc/ofw/ofw_real.c optional aim powerpc/ofw/ofw_syscons.c optional sc aim powerpc/ofw/ofwcall32.S optional aim powerpc powerpc/ofw/ofwcall64.S optional aim powerpc64 powerpc/ofw/openpic_ofw.c standard powerpc/ofw/rtas.c optional aim powerpc/ofw/ofw_initrd.c optional md_root_mem powerpc64 powerpc/powermac/ata_kauai.c optional powermac ata | powermac atamacio powerpc/powermac/ata_macio.c optional powermac ata | powermac atamacio powerpc/powermac/ata_dbdma.c optional powermac ata | powermac atamacio powerpc/powermac/atibl.c optional powermac atibl powerpc/powermac/cuda.c optional powermac cuda powerpc/powermac/cpcht.c optional powermac pci powerpc/powermac/dbdma.c optional powermac pci powerpc/powermac/fcu.c optional powermac fcu powerpc/powermac/grackle.c optional powermac pci powerpc/powermac/hrowpic.c optional powermac pci powerpc/powermac/kiic.c optional powermac kiic powerpc/powermac/macgpio.c optional powermac pci powerpc/powermac/macio.c optional powermac pci powerpc/powermac/nvbl.c optional powermac nvbl powerpc/powermac/platform_powermac.c optional powermac powerpc/powermac/powermac_thermal.c optional powermac powerpc/powermac/pswitch.c optional powermac pswitch powerpc/powermac/pmu.c optional powermac pmu powerpc/powermac/smu.c optional powermac smu powerpc/powermac/smusat.c optional powermac smu powerpc/powermac/uninorth.c optional powermac powerpc/powermac/uninorthpci.c optional powermac pci powerpc/powermac/vcoregpio.c optional powermac powerpc/powernv/opal.c optional powernv powerpc/powernv/opal_async.c optional powernv powerpc/powernv/opal_console.c optional powernv powerpc/powernv/opal_dbg.c optional powernv gdb powerpc/powernv/opal_dev.c optional powernv powerpc/powernv/opal_flash.c optional powernv opalflash powerpc/powernv/opal_hmi.c optional powernv powerpc/powernv/opal_i2c.c optional iicbus fdt powernv powerpc/powernv/opal_i2cm.c optional iicbus fdt powernv powerpc/powernv/opal_nvram.c optional powernv nvram powerpc/powernv/opal_pci.c optional powernv pci powerpc/powernv/opal_sensor.c optional powernv powerpc/powernv/opalcall.S optional powernv powerpc/powernv/platform_powernv.c optional powernv powerpc/powernv/powernv_centaur.c optional powernv powerpc/powernv/powernv_xscom.c optional powernv powerpc/powernv/xive.c optional powernv powerpc/powerpc/altivec.c optional powerpc | powerpc64 powerpc/powerpc/autoconf.c standard powerpc/powerpc/bus_machdep.c standard powerpc/powerpc/busdma_machdep.c standard powerpc/powerpc/clock.c standard powerpc/powerpc/copyinout.c standard powerpc/powerpc/copystr.c standard powerpc/powerpc/cpu.c standard powerpc/powerpc/cpu_subr64.S optional powerpc64 powerpc/powerpc/db_disasm.c optional ddb powerpc/powerpc/db_hwwatch.c optional ddb powerpc/powerpc/db_interface.c optional ddb powerpc/powerpc/db_trace.c optional ddb powerpc/powerpc/dump_machdep.c standard powerpc/powerpc/elf32_machdep.c optional powerpc | powerpcspe | compat_freebsd32 powerpc/powerpc/elf64_machdep.c optional powerpc64 powerpc/powerpc/exec_machdep.c standard powerpc/powerpc/fpu.c standard powerpc/powerpc/gdb_machdep.c optional gdb powerpc/powerpc/in_cksum.c optional inet | inet6 powerpc/powerpc/interrupt.c standard powerpc/powerpc/intr_machdep.c standard powerpc/powerpc/iommu_if.m standard powerpc/powerpc/machdep.c standard powerpc/powerpc/mem.c optional mem powerpc/powerpc/minidump_machdep.c optional powerpc64 powerpc/powerpc/mmu_if.m standard powerpc/powerpc/mp_machdep.c optional smp powerpc/powerpc/nexus.c standard powerpc/powerpc/openpic.c standard powerpc/powerpc/pic_if.m standard powerpc/powerpc/pmap_dispatch.c standard powerpc/powerpc/platform.c standard powerpc/powerpc/platform_if.m standard powerpc/powerpc/ptrace_machdep.c standard powerpc/powerpc/sc_machdep.c optional sc powerpc/powerpc/setjmp.S standard powerpc/powerpc/sigcode32.S optional powerpc | powerpcspe | compat_freebsd32 powerpc/powerpc/sigcode64.S optional powerpc64 powerpc/powerpc/swtch32.S optional powerpc | powerpcspe powerpc/powerpc/swtch64.S optional powerpc64 powerpc/powerpc/stack_machdep.c optional ddb | stack powerpc/powerpc/syncicache.c standard powerpc/powerpc/sys_machdep.c standard powerpc/powerpc/trap.c standard powerpc/powerpc/uio_machdep.c standard powerpc/powerpc/uma_machdep.c standard powerpc/powerpc/vm_machdep.c standard powerpc/ps3/ehci_ps3.c optional ps3 ehci powerpc/ps3/ohci_ps3.c optional ps3 ohci powerpc/ps3/if_glc.c optional ps3 glc powerpc/ps3/mmu_ps3.c optional ps3 powerpc/ps3/platform_ps3.c optional ps3 powerpc/ps3/ps3bus.c optional ps3 powerpc/ps3/ps3cdrom.c optional ps3 scbus powerpc/ps3/ps3disk.c optional ps3 powerpc/ps3/ps3pic.c optional ps3 powerpc/ps3/ps3_syscons.c optional ps3 vt powerpc/ps3/ps3-hvcall.S optional ps3 powerpc/pseries/phyp-hvcall.S optional pseries powerpc64 powerpc/pseries/mmu_phyp.c optional pseries powerpc64 powerpc/pseries/phyp_console.c optional pseries powerpc64 uart powerpc/pseries/phyp_dbg.c optional pseries powerpc64 gdb powerpc/pseries/phyp_llan.c optional llan powerpc/pseries/phyp_vscsi.c optional pseries powerpc64 scbus powerpc/pseries/platform_chrp.c optional pseries powerpc/pseries/plpar_iommu.c optional pseries powerpc64 powerpc/pseries/plpar_pcibus.c optional pseries powerpc64 pci powerpc/pseries/rtas_dev.c optional pseries powerpc/pseries/rtas_pci.c optional pseries pci powerpc/pseries/vdevice.c optional pseries powerpc64 powerpc/pseries/xics.c optional pseries powerpc64 powerpc/psim/iobus.c optional psim powerpc/psim/ata_iobus.c optional ata psim powerpc/psim/openpic_iobus.c optional psim powerpc/psim/uart_iobus.c optional uart psim Index: head/sys/powerpc/aim/aim_machdep.c =================================================================== --- head/sys/powerpc/aim/aim_machdep.c (revision 360886) +++ head/sys/powerpc/aim/aim_machdep.c (revision 360887) @@ -1,750 +1,759 @@ /*- * Copyright (C) 1995, 1996 Wolfgang Solfrank. * Copyright (C) 1995, 1996 TooLs GmbH. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by TooLs GmbH. * 4. The name of TooLs GmbH may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY TOOLS GMBH ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /*- * Copyright (C) 2001 Benno Rice * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY Benno Rice ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * $NetBSD: machdep.c,v 1.74.2.1 2000/11/01 16:13:48 tv Exp $ */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_kstack_pages.h" #include "opt_platform.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifndef __powerpc64__ #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef __powerpc64__ #include "mmu_oea64.h" #endif #ifndef __powerpc64__ struct bat battable[16]; #endif +int radix_mmu = 0; + #ifndef __powerpc64__ /* Bits for running on 64-bit systems in 32-bit mode. */ extern void *testppc64, *testppc64size; extern void *restorebridge, *restorebridgesize; extern void *rfid_patch, *rfi_patch1, *rfi_patch2; extern void *trapcode64; extern Elf_Addr _GLOBAL_OFFSET_TABLE_[]; #endif extern void *rstcode, *rstcodeend; extern void *trapcode, *trapcodeend; extern void *hypertrapcode, *hypertrapcodeend; extern void *generictrap, *generictrap64; extern void *alitrap, *aliend; extern void *dsitrap, *dsiend; extern void *decrint, *decrsize; extern void *extint, *extsize; extern void *dblow, *dbend; extern void *imisstrap, *imisssize; extern void *dlmisstrap, *dlmisssize; extern void *dsmisstrap, *dsmisssize; extern void *ap_pcpu; extern void __restartkernel(vm_offset_t, vm_offset_t, vm_offset_t, void *, uint32_t, register_t offset, register_t msr); extern void __restartkernel_virtual(vm_offset_t, vm_offset_t, vm_offset_t, void *, uint32_t, register_t offset, register_t msr); void aim_early_init(vm_offset_t fdt, vm_offset_t toc, vm_offset_t ofentry, void *mdp, uint32_t mdp_cookie); void aim_cpu_init(vm_offset_t toc); void aim_early_init(vm_offset_t fdt, vm_offset_t toc, vm_offset_t ofentry, void *mdp, uint32_t mdp_cookie) { register_t scratch; /* * If running from an FDT, make sure we are in real mode to avoid * tromping on firmware page tables. Everything in the kernel assumes * 1:1 mappings out of firmware, so this won't break anything not * already broken. This doesn't work if there is live OF, since OF * may internally use non-1:1 mappings. */ if (ofentry == 0) mtmsr(mfmsr() & ~(PSL_IR | PSL_DR)); #ifdef __powerpc64__ /* * Relocate to high memory so that the kernel * can execute from the direct map. * * If we are in virtual mode already, use a special entry point * that sets up a temporary DMAP to execute from until we can * properly set up the MMU. */ if ((vm_offset_t)&aim_early_init < DMAP_BASE_ADDRESS) { if (mfmsr() & PSL_DR) { __restartkernel_virtual(fdt, 0, ofentry, mdp, mdp_cookie, DMAP_BASE_ADDRESS, mfmsr()); } else { __restartkernel(fdt, 0, ofentry, mdp, mdp_cookie, DMAP_BASE_ADDRESS, mfmsr()); } } #endif /* Various very early CPU fix ups */ switch (mfpvr() >> 16) { /* * PowerPC 970 CPUs have a misfeature requested by Apple that * makes them pretend they have a 32-byte cacheline. Turn this * off before we measure the cacheline size. */ case IBM970: case IBM970FX: case IBM970MP: case IBM970GX: scratch = mfspr(SPR_HID5); scratch &= ~HID5_970_DCBZ_SIZE_HI; mtspr(SPR_HID5, scratch); break; #ifdef __powerpc64__ case IBMPOWER7: case IBMPOWER7PLUS: case IBMPOWER8: case IBMPOWER8E: case IBMPOWER8NVL: case IBMPOWER9: /* XXX: get from ibm,slb-size in device tree */ n_slbs = 32; break; #endif } } void aim_cpu_init(vm_offset_t toc) { size_t trap_offset, trapsize; vm_offset_t trap; register_t msr; uint8_t *cache_check; int cacheline_warn; #ifndef __powerpc64__ register_t scratch; int ppc64; #endif trap_offset = 0; cacheline_warn = 0; /* General setup for AIM CPUs */ psl_kernset = PSL_EE | PSL_ME | PSL_IR | PSL_DR | PSL_RI; #ifdef __powerpc64__ psl_kernset |= PSL_SF; if (mfmsr() & PSL_HV) psl_kernset |= PSL_HV; #endif psl_userset = psl_kernset | PSL_PR; #ifdef __powerpc64__ psl_userset32 = psl_userset & ~PSL_SF; #endif /* * Zeroed bits in this variable signify that the value of the bit * in its position is allowed to vary between userspace contexts. * * All other bits are required to be identical for every userspace * context. The actual *value* of the bit is determined by * psl_userset and/or psl_userset32, and is not allowed to change. * * Remember to update this set when implementing support for * *conditionally* enabling a processor facility. Failing to do * this will cause swapcontext() in userspace to break when a * process uses a conditionally-enabled facility. * * When *unconditionally* implementing support for a processor * facility, update psl_userset / psl_userset32 instead. * * See the access control check in set_mcontext(). */ psl_userstatic = ~(PSL_VSX | PSL_VEC | PSL_FP | PSL_FE0 | PSL_FE1); /* * Mask bits from the SRR1 that aren't really the MSR: * Bits 1-4, 10-15 (ppc32), 33-36, 42-47 (ppc64) */ psl_userstatic &= ~0x783f0000UL; /* * Initialize the interrupt tables and figure out our cache line * size and whether or not we need the 64-bit bridge code. */ /* * Disable translation in case the vector area hasn't been * mapped (G5). Note that no OFW calls can be made until * translation is re-enabled. */ msr = mfmsr(); mtmsr((msr & ~(PSL_IR | PSL_DR)) | PSL_RI); /* * Measure the cacheline size using dcbz * * Use EXC_PGM as a playground. We are about to overwrite it * anyway, we know it exists, and we know it is cache-aligned. */ cache_check = (void *)EXC_PGM; for (cacheline_size = 0; cacheline_size < 0x100; cacheline_size++) cache_check[cacheline_size] = 0xff; __asm __volatile("dcbz 0,%0":: "r" (cache_check) : "memory"); /* Find the first byte dcbz did not zero to get the cache line size */ for (cacheline_size = 0; cacheline_size < 0x100 && cache_check[cacheline_size] == 0; cacheline_size++); /* Work around psim bug */ if (cacheline_size == 0) { cacheline_warn = 1; cacheline_size = 32; } #ifndef __powerpc64__ /* * Figure out whether we need to use the 64 bit PMAP. This works by * executing an instruction that is only legal on 64-bit PPC (mtmsrd), * and setting ppc64 = 0 if that causes a trap. */ ppc64 = 1; bcopy(&testppc64, (void *)EXC_PGM, (size_t)&testppc64size); __syncicache((void *)EXC_PGM, (size_t)&testppc64size); __asm __volatile("\ mfmsr %0; \ mtsprg2 %1; \ \ mtmsrd %0; \ mfsprg2 %1;" : "=r"(scratch), "=r"(ppc64)); if (ppc64) cpu_features |= PPC_FEATURE_64; /* * Now copy restorebridge into all the handlers, if necessary, * and set up the trap tables. */ if (cpu_features & PPC_FEATURE_64) { /* Patch the two instances of rfi -> rfid */ bcopy(&rfid_patch,&rfi_patch1,4); #ifdef KDB /* rfi_patch2 is at the end of dbleave */ bcopy(&rfid_patch,&rfi_patch2,4); #endif } #else /* powerpc64 */ cpu_features |= PPC_FEATURE_64; #endif trapsize = (size_t)&trapcodeend - (size_t)&trapcode; /* * Copy generic handler into every possible trap. Special cases will get * different ones in a minute. */ for (trap = EXC_RST; trap < EXC_LAST; trap += 0x20) bcopy(&trapcode, (void *)trap, trapsize); #ifndef __powerpc64__ if (cpu_features & PPC_FEATURE_64) { /* * Copy a code snippet to restore 32-bit bridge mode * to the top of every non-generic trap handler */ trap_offset += (size_t)&restorebridgesize; bcopy(&restorebridge, (void *)EXC_RST, trap_offset); bcopy(&restorebridge, (void *)EXC_DSI, trap_offset); bcopy(&restorebridge, (void *)EXC_ALI, trap_offset); bcopy(&restorebridge, (void *)EXC_PGM, trap_offset); bcopy(&restorebridge, (void *)EXC_MCHK, trap_offset); bcopy(&restorebridge, (void *)EXC_TRC, trap_offset); bcopy(&restorebridge, (void *)EXC_BPT, trap_offset); } #else trapsize = (size_t)&hypertrapcodeend - (size_t)&hypertrapcode; bcopy(&hypertrapcode, (void *)(EXC_HEA + trap_offset), trapsize); bcopy(&hypertrapcode, (void *)(EXC_HMI + trap_offset), trapsize); bcopy(&hypertrapcode, (void *)(EXC_HVI + trap_offset), trapsize); bcopy(&hypertrapcode, (void *)(EXC_SOFT_PATCH + trap_offset), trapsize); #endif bcopy(&rstcode, (void *)(EXC_RST + trap_offset), (size_t)&rstcodeend - (size_t)&rstcode); #ifdef KDB bcopy(&dblow, (void *)(EXC_MCHK + trap_offset), (size_t)&dbend - (size_t)&dblow); bcopy(&dblow, (void *)(EXC_PGM + trap_offset), (size_t)&dbend - (size_t)&dblow); bcopy(&dblow, (void *)(EXC_TRC + trap_offset), (size_t)&dbend - (size_t)&dblow); bcopy(&dblow, (void *)(EXC_BPT + trap_offset), (size_t)&dbend - (size_t)&dblow); #endif bcopy(&alitrap, (void *)(EXC_ALI + trap_offset), (size_t)&aliend - (size_t)&alitrap); bcopy(&dsitrap, (void *)(EXC_DSI + trap_offset), (size_t)&dsiend - (size_t)&dsitrap); /* Set address of generictrap for self-reloc calculations */ *((void **)TRAP_GENTRAP) = &generictrap; #ifdef __powerpc64__ /* Set TOC base so that the interrupt code can get at it */ *((void **)TRAP_ENTRY) = &generictrap; *((register_t *)TRAP_TOCBASE) = toc; #else /* Set branch address for trap code */ if (cpu_features & PPC_FEATURE_64) *((void **)TRAP_ENTRY) = &generictrap64; else *((void **)TRAP_ENTRY) = &generictrap; *((void **)TRAP_TOCBASE) = _GLOBAL_OFFSET_TABLE_; /* G2-specific TLB miss helper handlers */ bcopy(&imisstrap, (void *)EXC_IMISS, (size_t)&imisssize); bcopy(&dlmisstrap, (void *)EXC_DLMISS, (size_t)&dlmisssize); bcopy(&dsmisstrap, (void *)EXC_DSMISS, (size_t)&dsmisssize); #endif __syncicache(EXC_RSVD, EXC_LAST - EXC_RSVD); /* * Restore MSR */ mtmsr(msr); /* Warn if cachline size was not determined */ if (cacheline_warn == 1) { printf("WARNING: cacheline size undetermined, setting to 32\n"); } /* * Initialise virtual memory. Use BUS_PROBE_GENERIC priority * in case the platform module had a better idea of what we * should do. */ - if (cpu_features & PPC_FEATURE_64) + if (cpu_features2 & PPC_FEATURE2_ARCH_3_00) { + radix_mmu = 0; + TUNABLE_INT_FETCH("radix_mmu", &radix_mmu); + if (radix_mmu) + pmap_mmu_install(MMU_TYPE_RADIX, BUS_PROBE_GENERIC); + else + pmap_mmu_install(MMU_TYPE_G5, BUS_PROBE_GENERIC); + } else if (cpu_features & PPC_FEATURE_64) pmap_mmu_install(MMU_TYPE_G5, BUS_PROBE_GENERIC); else pmap_mmu_install(MMU_TYPE_OEA, BUS_PROBE_GENERIC); } /* * Shutdown the CPU as much as possible. */ void cpu_halt(void) { OF_exit(); } int ptrace_single_step(struct thread *td) { struct trapframe *tf; tf = td->td_frame; tf->srr1 |= PSL_SE; return (0); } int ptrace_clear_single_step(struct thread *td) { struct trapframe *tf; tf = td->td_frame; tf->srr1 &= ~PSL_SE; return (0); } void kdb_cpu_clear_singlestep(void) { kdb_frame->srr1 &= ~PSL_SE; } void kdb_cpu_set_singlestep(void) { kdb_frame->srr1 |= PSL_SE; } /* * Initialise a struct pcpu. */ void cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t sz) { #ifdef __powerpc64__ /* Copy the SLB contents from the current CPU */ memcpy(pcpu->pc_aim.slb, PCPU_GET(aim.slb), sizeof(pcpu->pc_aim.slb)); #endif } /* Return 0 on handled success, otherwise signal number. */ int cpu_machine_check(struct thread *td, struct trapframe *frame, int *ucode) { #ifdef __powerpc64__ /* * This block is 64-bit CPU specific currently. Punt running in 32-bit * mode on 64-bit CPUs. */ /* Check if the important information is in DSISR */ if ((frame->srr1 & SRR1_MCHK_DATA) != 0) { printf("Machine check, DSISR: %016lx\n", frame->cpu.aim.dsisr); /* SLB multi-hit is recoverable. */ if ((frame->cpu.aim.dsisr & DSISR_MC_SLB_MULTIHIT) != 0) return (0); /* TODO: Add other machine check recovery procedures. */ } else { if ((frame->srr1 & SRR1_MCHK_IFETCH_M) == SRR1_MCHK_IFETCH_SLBMH) return (0); } #endif *ucode = BUS_OBJERR; return (SIGBUS); } #ifndef __powerpc64__ uint64_t va_to_vsid(pmap_t pm, vm_offset_t va) { return ((pm->pm_sr[(uintptr_t)va >> ADDR_SR_SHFT]) & SR_VSID_MASK); } #endif /* * These functions need to provide addresses that both (a) work in real mode * (or whatever mode/circumstances the kernel is in in early boot (now)) and * (b) can still, in principle, work once the kernel is going. Because these * rely on existing mappings/real mode, unmap is a no-op. */ vm_offset_t pmap_early_io_map(vm_paddr_t pa, vm_size_t size) { KASSERT(!pmap_bootstrapped, ("Not available after PMAP started!")); /* * If we have the MMU up in early boot, assume it is 1:1. Otherwise, * try to get the address in a memory region compatible with the * direct map for efficiency later. */ if (mfmsr() & PSL_DR) return (pa); else return (DMAP_BASE_ADDRESS + pa); } void pmap_early_io_unmap(vm_offset_t va, vm_size_t size) { KASSERT(!pmap_bootstrapped, ("Not available after PMAP started!")); } /* From p3-53 of the MPC7450 RISC Microprocessor Family Reference Manual */ void flush_disable_caches(void) { register_t msr; register_t msscr0; register_t cache_reg; volatile uint32_t *memp; uint32_t temp; int i; int x; msr = mfmsr(); powerpc_sync(); mtmsr(msr & ~(PSL_EE | PSL_DR)); msscr0 = mfspr(SPR_MSSCR0); msscr0 &= ~MSSCR0_L2PFE; mtspr(SPR_MSSCR0, msscr0); powerpc_sync(); isync(); __asm__ __volatile__("dssall; sync"); powerpc_sync(); isync(); __asm__ __volatile__("dcbf 0,%0" :: "r"(0)); __asm__ __volatile__("dcbf 0,%0" :: "r"(0)); __asm__ __volatile__("dcbf 0,%0" :: "r"(0)); /* Lock the L1 Data cache. */ mtspr(SPR_LDSTCR, mfspr(SPR_LDSTCR) | 0xFF); powerpc_sync(); isync(); mtspr(SPR_LDSTCR, 0); /* * Perform this in two stages: Flush the cache starting in RAM, then do it * from ROM. */ memp = (volatile uint32_t *)0x00000000; for (i = 0; i < 128 * 1024; i++) { temp = *memp; __asm__ __volatile__("dcbf 0,%0" :: "r"(memp)); memp += 32/sizeof(*memp); } memp = (volatile uint32_t *)0xfff00000; x = 0xfe; for (; x != 0xff;) { mtspr(SPR_LDSTCR, x); for (i = 0; i < 128; i++) { temp = *memp; __asm__ __volatile__("dcbf 0,%0" :: "r"(memp)); memp += 32/sizeof(*memp); } x = ((x << 1) | 1) & 0xff; } mtspr(SPR_LDSTCR, 0); cache_reg = mfspr(SPR_L2CR); if (cache_reg & L2CR_L2E) { cache_reg &= ~(L2CR_L2IO_7450 | L2CR_L2DO_7450); mtspr(SPR_L2CR, cache_reg); powerpc_sync(); mtspr(SPR_L2CR, cache_reg | L2CR_L2HWF); while (mfspr(SPR_L2CR) & L2CR_L2HWF) ; /* Busy wait for cache to flush */ powerpc_sync(); cache_reg &= ~L2CR_L2E; mtspr(SPR_L2CR, cache_reg); powerpc_sync(); mtspr(SPR_L2CR, cache_reg | L2CR_L2I); powerpc_sync(); while (mfspr(SPR_L2CR) & L2CR_L2I) ; /* Busy wait for L2 cache invalidate */ powerpc_sync(); } cache_reg = mfspr(SPR_L3CR); if (cache_reg & L3CR_L3E) { cache_reg &= ~(L3CR_L3IO | L3CR_L3DO); mtspr(SPR_L3CR, cache_reg); powerpc_sync(); mtspr(SPR_L3CR, cache_reg | L3CR_L3HWF); while (mfspr(SPR_L3CR) & L3CR_L3HWF) ; /* Busy wait for cache to flush */ powerpc_sync(); cache_reg &= ~L3CR_L3E; mtspr(SPR_L3CR, cache_reg); powerpc_sync(); mtspr(SPR_L3CR, cache_reg | L3CR_L3I); powerpc_sync(); while (mfspr(SPR_L3CR) & L3CR_L3I) ; /* Busy wait for L3 cache invalidate */ powerpc_sync(); } mtspr(SPR_HID0, mfspr(SPR_HID0) & ~HID0_DCE); powerpc_sync(); isync(); mtmsr(msr); } void cpu_sleep() { static u_quad_t timebase = 0; static register_t sprgs[4]; static register_t srrs[2]; jmp_buf resetjb; struct thread *fputd; struct thread *vectd; register_t hid0; register_t msr; register_t saved_msr; ap_pcpu = pcpup; PCPU_SET(restore, &resetjb); saved_msr = mfmsr(); fputd = PCPU_GET(fputhread); vectd = PCPU_GET(vecthread); if (fputd != NULL) save_fpu(fputd); if (vectd != NULL) save_vec(vectd); if (setjmp(resetjb) == 0) { sprgs[0] = mfspr(SPR_SPRG0); sprgs[1] = mfspr(SPR_SPRG1); sprgs[2] = mfspr(SPR_SPRG2); sprgs[3] = mfspr(SPR_SPRG3); srrs[0] = mfspr(SPR_SRR0); srrs[1] = mfspr(SPR_SRR1); timebase = mftb(); powerpc_sync(); flush_disable_caches(); hid0 = mfspr(SPR_HID0); hid0 = (hid0 & ~(HID0_DOZE | HID0_NAP)) | HID0_SLEEP; powerpc_sync(); isync(); msr = mfmsr() | PSL_POW; mtspr(SPR_HID0, hid0); powerpc_sync(); while (1) mtmsr(msr); } platform_smp_timebase_sync(timebase, 0); PCPU_SET(curthread, curthread); PCPU_SET(curpcb, curthread->td_pcb); pmap_activate(curthread); powerpc_sync(); mtspr(SPR_SPRG0, sprgs[0]); mtspr(SPR_SPRG1, sprgs[1]); mtspr(SPR_SPRG2, sprgs[2]); mtspr(SPR_SPRG3, sprgs[3]); mtspr(SPR_SRR0, srrs[0]); mtspr(SPR_SRR1, srrs[1]); mtmsr(saved_msr); if (fputd == curthread) enable_fpu(curthread); if (vectd == curthread) enable_vec(curthread); powerpc_sync(); } Index: head/sys/powerpc/aim/mmu_oea.c =================================================================== --- head/sys/powerpc/aim/mmu_oea.c (revision 360886) +++ head/sys/powerpc/aim/mmu_oea.c (revision 360887) @@ -1,2771 +1,2779 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD AND BSD-4-Clause * * Copyright (c) 2001 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Matt Thomas of Allegro Networks, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /*- * Copyright (C) 1995, 1996 Wolfgang Solfrank. * Copyright (C) 1995, 1996 TooLs GmbH. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by TooLs GmbH. * 4. The name of TooLs GmbH may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY TOOLS GMBH ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $NetBSD: pmap.c,v 1.28 2000/03/26 20:42:36 kleink Exp $ */ /*- * Copyright (C) 2001 Benno Rice. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY Benno Rice ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * Manages physical address maps. * * Since the information managed by this module is also stored by the * logical address mapping module, this module may throw away valid virtual * to physical mappings at almost any time. However, invalidations of * mappings must be done as requested. * * In order to cope with hardware architectures which make virtual to * physical map invalidates expensive, this module may delay invalidate * reduced protection operations until such time as they are actually * necessary. This module is given full information as to which processors * are currently using which maps, and to when physical maps must be made * correct. */ #include "opt_kstack_pages.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "mmu_if.h" #define MOEA_DEBUG #define TODO panic("%s: not implemented", __func__); #define VSID_MAKE(sr, hash) ((sr) | (((hash) & 0xfffff) << 4)) #define VSID_TO_SR(vsid) ((vsid) & 0xf) #define VSID_TO_HASH(vsid) (((vsid) >> 4) & 0xfffff) struct ofw_map { vm_offset_t om_va; vm_size_t om_len; vm_offset_t om_pa; u_int om_mode; }; extern unsigned char _etext[]; extern unsigned char _end[]; /* * Map of physical memory regions. */ static struct mem_region *regions; static struct mem_region *pregions; static u_int phys_avail_count; static int regions_sz, pregions_sz; static struct ofw_map *translations; /* * Lock for the pteg and pvo tables. */ struct mtx moea_table_mutex; struct mtx moea_vsid_mutex; /* tlbie instruction synchronization */ static struct mtx tlbie_mtx; /* * PTEG data. */ static struct pteg *moea_pteg_table; u_int moea_pteg_count; u_int moea_pteg_mask; /* * PVO data. */ struct pvo_head *moea_pvo_table; /* pvo entries by pteg index */ struct pvo_head moea_pvo_kunmanaged = LIST_HEAD_INITIALIZER(moea_pvo_kunmanaged); /* list of unmanaged pages */ static struct rwlock_padalign pvh_global_lock; uma_zone_t moea_upvo_zone; /* zone for pvo entries for unmanaged pages */ uma_zone_t moea_mpvo_zone; /* zone for pvo entries for managed pages */ #define BPVO_POOL_SIZE 32768 static struct pvo_entry *moea_bpvo_pool; static int moea_bpvo_pool_index = 0; #define VSID_NBPW (sizeof(u_int32_t) * 8) static u_int moea_vsid_bitmap[NPMAPS / VSID_NBPW]; static boolean_t moea_initialized = FALSE; /* * Statistics. */ u_int moea_pte_valid = 0; u_int moea_pte_overflow = 0; u_int moea_pte_replacements = 0; u_int moea_pvo_entries = 0; u_int moea_pvo_enter_calls = 0; u_int moea_pvo_remove_calls = 0; u_int moea_pte_spills = 0; SYSCTL_INT(_machdep, OID_AUTO, moea_pte_valid, CTLFLAG_RD, &moea_pte_valid, 0, ""); SYSCTL_INT(_machdep, OID_AUTO, moea_pte_overflow, CTLFLAG_RD, &moea_pte_overflow, 0, ""); SYSCTL_INT(_machdep, OID_AUTO, moea_pte_replacements, CTLFLAG_RD, &moea_pte_replacements, 0, ""); SYSCTL_INT(_machdep, OID_AUTO, moea_pvo_entries, CTLFLAG_RD, &moea_pvo_entries, 0, ""); SYSCTL_INT(_machdep, OID_AUTO, moea_pvo_enter_calls, CTLFLAG_RD, &moea_pvo_enter_calls, 0, ""); SYSCTL_INT(_machdep, OID_AUTO, moea_pvo_remove_calls, CTLFLAG_RD, &moea_pvo_remove_calls, 0, ""); SYSCTL_INT(_machdep, OID_AUTO, moea_pte_spills, CTLFLAG_RD, &moea_pte_spills, 0, ""); /* * Allocate physical memory for use in moea_bootstrap. */ static vm_offset_t moea_bootstrap_alloc(vm_size_t, u_int); /* * PTE calls. */ static int moea_pte_insert(u_int, struct pte *); /* * PVO calls. */ static int moea_pvo_enter(pmap_t, uma_zone_t, struct pvo_head *, vm_offset_t, vm_paddr_t, u_int, int); static void moea_pvo_remove(struct pvo_entry *, int); static struct pvo_entry *moea_pvo_find_va(pmap_t, vm_offset_t, int *); static struct pte *moea_pvo_to_pte(const struct pvo_entry *, int); /* * Utility routines. */ static int moea_enter_locked(pmap_t, vm_offset_t, vm_page_t, vm_prot_t, u_int, int8_t); static void moea_syncicache(vm_paddr_t, vm_size_t); static boolean_t moea_query_bit(vm_page_t, int); static u_int moea_clear_bit(vm_page_t, int); static void moea_kremove(mmu_t, vm_offset_t); int moea_pte_spill(vm_offset_t); /* * Kernel MMU interface */ void moea_clear_modify(mmu_t, vm_page_t); void moea_copy_page(mmu_t, vm_page_t, vm_page_t); void moea_copy_pages(mmu_t mmu, vm_page_t *ma, vm_offset_t a_offset, vm_page_t *mb, vm_offset_t b_offset, int xfersize); int moea_enter(mmu_t, pmap_t, vm_offset_t, vm_page_t, vm_prot_t, u_int, int8_t); void moea_enter_object(mmu_t, pmap_t, vm_offset_t, vm_offset_t, vm_page_t, vm_prot_t); void moea_enter_quick(mmu_t, pmap_t, vm_offset_t, vm_page_t, vm_prot_t); vm_paddr_t moea_extract(mmu_t, pmap_t, vm_offset_t); vm_page_t moea_extract_and_hold(mmu_t, pmap_t, vm_offset_t, vm_prot_t); void moea_init(mmu_t); boolean_t moea_is_modified(mmu_t, vm_page_t); boolean_t moea_is_prefaultable(mmu_t, pmap_t, vm_offset_t); boolean_t moea_is_referenced(mmu_t, vm_page_t); int moea_ts_referenced(mmu_t, vm_page_t); vm_offset_t moea_map(mmu_t, vm_offset_t *, vm_paddr_t, vm_paddr_t, int); boolean_t moea_page_exists_quick(mmu_t, pmap_t, vm_page_t); void moea_page_init(mmu_t, vm_page_t); int moea_page_wired_mappings(mmu_t, vm_page_t); void moea_pinit(mmu_t, pmap_t); void moea_pinit0(mmu_t, pmap_t); void moea_protect(mmu_t, pmap_t, vm_offset_t, vm_offset_t, vm_prot_t); void moea_qenter(mmu_t, vm_offset_t, vm_page_t *, int); void moea_qremove(mmu_t, vm_offset_t, int); void moea_release(mmu_t, pmap_t); void moea_remove(mmu_t, pmap_t, vm_offset_t, vm_offset_t); void moea_remove_all(mmu_t, vm_page_t); void moea_remove_write(mmu_t, vm_page_t); void moea_unwire(mmu_t, pmap_t, vm_offset_t, vm_offset_t); void moea_zero_page(mmu_t, vm_page_t); void moea_zero_page_area(mmu_t, vm_page_t, int, int); void moea_activate(mmu_t, struct thread *); void moea_deactivate(mmu_t, struct thread *); void moea_cpu_bootstrap(mmu_t, int); void moea_bootstrap(mmu_t, vm_offset_t, vm_offset_t); void *moea_mapdev(mmu_t, vm_paddr_t, vm_size_t); void *moea_mapdev_attr(mmu_t, vm_paddr_t, vm_size_t, vm_memattr_t); void moea_unmapdev(mmu_t, vm_offset_t, vm_size_t); vm_paddr_t moea_kextract(mmu_t, vm_offset_t); void moea_kenter_attr(mmu_t, vm_offset_t, vm_paddr_t, vm_memattr_t); void moea_kenter(mmu_t, vm_offset_t, vm_paddr_t); void moea_page_set_memattr(mmu_t mmu, vm_page_t m, vm_memattr_t ma); boolean_t moea_dev_direct_mapped(mmu_t, vm_paddr_t, vm_size_t); static void moea_sync_icache(mmu_t, pmap_t, vm_offset_t, vm_size_t); void moea_dumpsys_map(mmu_t mmu, vm_paddr_t pa, size_t sz, void **va); void moea_scan_init(mmu_t mmu); vm_offset_t moea_quick_enter_page(mmu_t mmu, vm_page_t m); void moea_quick_remove_page(mmu_t mmu, vm_offset_t addr); +boolean_t moea_page_is_mapped(mmu_t mmu, vm_page_t m); static int moea_map_user_ptr(mmu_t mmu, pmap_t pm, volatile const void *uaddr, void **kaddr, size_t ulen, size_t *klen); static int moea_decode_kernel_ptr(mmu_t mmu, vm_offset_t addr, int *is_user, vm_offset_t *decoded_addr); static mmu_method_t moea_methods[] = { MMUMETHOD(mmu_clear_modify, moea_clear_modify), MMUMETHOD(mmu_copy_page, moea_copy_page), MMUMETHOD(mmu_copy_pages, moea_copy_pages), MMUMETHOD(mmu_enter, moea_enter), MMUMETHOD(mmu_enter_object, moea_enter_object), MMUMETHOD(mmu_enter_quick, moea_enter_quick), MMUMETHOD(mmu_extract, moea_extract), MMUMETHOD(mmu_extract_and_hold, moea_extract_and_hold), MMUMETHOD(mmu_init, moea_init), MMUMETHOD(mmu_is_modified, moea_is_modified), MMUMETHOD(mmu_is_prefaultable, moea_is_prefaultable), MMUMETHOD(mmu_is_referenced, moea_is_referenced), MMUMETHOD(mmu_ts_referenced, moea_ts_referenced), MMUMETHOD(mmu_map, moea_map), MMUMETHOD(mmu_page_exists_quick,moea_page_exists_quick), MMUMETHOD(mmu_page_init, moea_page_init), MMUMETHOD(mmu_page_wired_mappings,moea_page_wired_mappings), MMUMETHOD(mmu_pinit, moea_pinit), MMUMETHOD(mmu_pinit0, moea_pinit0), MMUMETHOD(mmu_protect, moea_protect), MMUMETHOD(mmu_qenter, moea_qenter), MMUMETHOD(mmu_qremove, moea_qremove), MMUMETHOD(mmu_release, moea_release), MMUMETHOD(mmu_remove, moea_remove), MMUMETHOD(mmu_remove_all, moea_remove_all), MMUMETHOD(mmu_remove_write, moea_remove_write), MMUMETHOD(mmu_sync_icache, moea_sync_icache), MMUMETHOD(mmu_unwire, moea_unwire), MMUMETHOD(mmu_zero_page, moea_zero_page), MMUMETHOD(mmu_zero_page_area, moea_zero_page_area), MMUMETHOD(mmu_activate, moea_activate), MMUMETHOD(mmu_deactivate, moea_deactivate), MMUMETHOD(mmu_page_set_memattr, moea_page_set_memattr), MMUMETHOD(mmu_quick_enter_page, moea_quick_enter_page), MMUMETHOD(mmu_quick_remove_page, moea_quick_remove_page), + MMUMETHOD(mmu_page_is_mapped, moea_page_is_mapped), /* Internal interfaces */ MMUMETHOD(mmu_bootstrap, moea_bootstrap), MMUMETHOD(mmu_cpu_bootstrap, moea_cpu_bootstrap), MMUMETHOD(mmu_mapdev_attr, moea_mapdev_attr), MMUMETHOD(mmu_mapdev, moea_mapdev), MMUMETHOD(mmu_unmapdev, moea_unmapdev), MMUMETHOD(mmu_kextract, moea_kextract), MMUMETHOD(mmu_kenter, moea_kenter), MMUMETHOD(mmu_kenter_attr, moea_kenter_attr), MMUMETHOD(mmu_dev_direct_mapped,moea_dev_direct_mapped), MMUMETHOD(mmu_scan_init, moea_scan_init), MMUMETHOD(mmu_dumpsys_map, moea_dumpsys_map), MMUMETHOD(mmu_map_user_ptr, moea_map_user_ptr), MMUMETHOD(mmu_decode_kernel_ptr, moea_decode_kernel_ptr), { 0, 0 } }; MMU_DEF(oea_mmu, MMU_TYPE_OEA, moea_methods, 0); static __inline uint32_t moea_calc_wimg(vm_paddr_t pa, vm_memattr_t ma) { uint32_t pte_lo; int i; if (ma != VM_MEMATTR_DEFAULT) { switch (ma) { case VM_MEMATTR_UNCACHEABLE: return (PTE_I | PTE_G); case VM_MEMATTR_CACHEABLE: return (PTE_M); case VM_MEMATTR_WRITE_COMBINING: case VM_MEMATTR_WRITE_BACK: case VM_MEMATTR_PREFETCHABLE: return (PTE_I); case VM_MEMATTR_WRITE_THROUGH: return (PTE_W | PTE_M); } } /* * Assume the page is cache inhibited and access is guarded unless * it's in our available memory array. */ pte_lo = PTE_I | PTE_G; for (i = 0; i < pregions_sz; i++) { if ((pa >= pregions[i].mr_start) && (pa < (pregions[i].mr_start + pregions[i].mr_size))) { pte_lo = PTE_M; break; } } return pte_lo; } static void tlbie(vm_offset_t va) { mtx_lock_spin(&tlbie_mtx); __asm __volatile("ptesync"); __asm __volatile("tlbie %0" :: "r"(va)); __asm __volatile("eieio; tlbsync; ptesync"); mtx_unlock_spin(&tlbie_mtx); } static void tlbia(void) { vm_offset_t va; for (va = 0; va < 0x00040000; va += 0x00001000) { __asm __volatile("tlbie %0" :: "r"(va)); powerpc_sync(); } __asm __volatile("tlbsync"); powerpc_sync(); } static __inline int va_to_sr(u_int *sr, vm_offset_t va) { return (sr[(uintptr_t)va >> ADDR_SR_SHFT]); } static __inline u_int va_to_pteg(u_int sr, vm_offset_t addr) { u_int hash; hash = (sr & SR_VSID_MASK) ^ (((u_int)addr & ADDR_PIDX) >> ADDR_PIDX_SHFT); return (hash & moea_pteg_mask); } static __inline struct pvo_head * vm_page_to_pvoh(vm_page_t m) { return (&m->md.mdpg_pvoh); } static __inline void moea_attr_clear(vm_page_t m, int ptebit) { rw_assert(&pvh_global_lock, RA_WLOCKED); m->md.mdpg_attrs &= ~ptebit; } static __inline int moea_attr_fetch(vm_page_t m) { return (m->md.mdpg_attrs); } static __inline void moea_attr_save(vm_page_t m, int ptebit) { rw_assert(&pvh_global_lock, RA_WLOCKED); m->md.mdpg_attrs |= ptebit; } static __inline int moea_pte_compare(const struct pte *pt, const struct pte *pvo_pt) { if (pt->pte_hi == pvo_pt->pte_hi) return (1); return (0); } static __inline int moea_pte_match(struct pte *pt, u_int sr, vm_offset_t va, int which) { return (pt->pte_hi & ~PTE_VALID) == (((sr & SR_VSID_MASK) << PTE_VSID_SHFT) | ((va >> ADDR_API_SHFT) & PTE_API) | which); } static __inline void moea_pte_create(struct pte *pt, u_int sr, vm_offset_t va, u_int pte_lo) { mtx_assert(&moea_table_mutex, MA_OWNED); /* * Construct a PTE. Default to IMB initially. Valid bit only gets * set when the real pte is set in memory. * * Note: Don't set the valid bit for correct operation of tlb update. */ pt->pte_hi = ((sr & SR_VSID_MASK) << PTE_VSID_SHFT) | (((va & ADDR_PIDX) >> ADDR_API_SHFT) & PTE_API); pt->pte_lo = pte_lo; } static __inline void moea_pte_synch(struct pte *pt, struct pte *pvo_pt) { mtx_assert(&moea_table_mutex, MA_OWNED); pvo_pt->pte_lo |= pt->pte_lo & (PTE_REF | PTE_CHG); } static __inline void moea_pte_clear(struct pte *pt, vm_offset_t va, int ptebit) { mtx_assert(&moea_table_mutex, MA_OWNED); /* * As shown in Section 7.6.3.2.3 */ pt->pte_lo &= ~ptebit; tlbie(va); } static __inline void moea_pte_set(struct pte *pt, struct pte *pvo_pt) { mtx_assert(&moea_table_mutex, MA_OWNED); pvo_pt->pte_hi |= PTE_VALID; /* * Update the PTE as defined in section 7.6.3.1. * Note that the REF/CHG bits are from pvo_pt and thus should have * been saved so this routine can restore them (if desired). */ pt->pte_lo = pvo_pt->pte_lo; powerpc_sync(); pt->pte_hi = pvo_pt->pte_hi; powerpc_sync(); moea_pte_valid++; } static __inline void moea_pte_unset(struct pte *pt, struct pte *pvo_pt, vm_offset_t va) { mtx_assert(&moea_table_mutex, MA_OWNED); pvo_pt->pte_hi &= ~PTE_VALID; /* * Force the reg & chg bits back into the PTEs. */ powerpc_sync(); /* * Invalidate the pte. */ pt->pte_hi &= ~PTE_VALID; tlbie(va); /* * Save the reg & chg bits. */ moea_pte_synch(pt, pvo_pt); moea_pte_valid--; } static __inline void moea_pte_change(struct pte *pt, struct pte *pvo_pt, vm_offset_t va) { /* * Invalidate the PTE */ moea_pte_unset(pt, pvo_pt, va); moea_pte_set(pt, pvo_pt); } /* * Quick sort callout for comparing memory regions. */ static int om_cmp(const void *a, const void *b); static int om_cmp(const void *a, const void *b) { const struct ofw_map *mapa; const struct ofw_map *mapb; mapa = a; mapb = b; if (mapa->om_pa < mapb->om_pa) return (-1); else if (mapa->om_pa > mapb->om_pa) return (1); else return (0); } void moea_cpu_bootstrap(mmu_t mmup, int ap) { u_int sdr; int i; if (ap) { powerpc_sync(); __asm __volatile("mtdbatu 0,%0" :: "r"(battable[0].batu)); __asm __volatile("mtdbatl 0,%0" :: "r"(battable[0].batl)); isync(); __asm __volatile("mtibatu 0,%0" :: "r"(battable[0].batu)); __asm __volatile("mtibatl 0,%0" :: "r"(battable[0].batl)); isync(); } __asm __volatile("mtdbatu 1,%0" :: "r"(battable[8].batu)); __asm __volatile("mtdbatl 1,%0" :: "r"(battable[8].batl)); isync(); __asm __volatile("mtibatu 1,%0" :: "r"(0)); __asm __volatile("mtdbatu 2,%0" :: "r"(0)); __asm __volatile("mtibatu 2,%0" :: "r"(0)); __asm __volatile("mtdbatu 3,%0" :: "r"(0)); __asm __volatile("mtibatu 3,%0" :: "r"(0)); isync(); for (i = 0; i < 16; i++) mtsrin(i << ADDR_SR_SHFT, kernel_pmap->pm_sr[i]); powerpc_sync(); sdr = (u_int)moea_pteg_table | (moea_pteg_mask >> 10); __asm __volatile("mtsdr1 %0" :: "r"(sdr)); isync(); tlbia(); } void moea_bootstrap(mmu_t mmup, vm_offset_t kernelstart, vm_offset_t kernelend) { ihandle_t mmui; phandle_t chosen, mmu; int sz; int i, j; vm_size_t size, physsz, hwphyssz; vm_offset_t pa, va, off; void *dpcpu; register_t msr; /* * Set up BAT0 to map the lowest 256 MB area */ battable[0x0].batl = BATL(0x00000000, BAT_M, BAT_PP_RW); battable[0x0].batu = BATU(0x00000000, BAT_BL_256M, BAT_Vs); /* * Map PCI memory space. */ battable[0x8].batl = BATL(0x80000000, BAT_I|BAT_G, BAT_PP_RW); battable[0x8].batu = BATU(0x80000000, BAT_BL_256M, BAT_Vs); battable[0x9].batl = BATL(0x90000000, BAT_I|BAT_G, BAT_PP_RW); battable[0x9].batu = BATU(0x90000000, BAT_BL_256M, BAT_Vs); battable[0xa].batl = BATL(0xa0000000, BAT_I|BAT_G, BAT_PP_RW); battable[0xa].batu = BATU(0xa0000000, BAT_BL_256M, BAT_Vs); battable[0xb].batl = BATL(0xb0000000, BAT_I|BAT_G, BAT_PP_RW); battable[0xb].batu = BATU(0xb0000000, BAT_BL_256M, BAT_Vs); /* * Map obio devices. */ battable[0xf].batl = BATL(0xf0000000, BAT_I|BAT_G, BAT_PP_RW); battable[0xf].batu = BATU(0xf0000000, BAT_BL_256M, BAT_Vs); /* * Use an IBAT and a DBAT to map the bottom segment of memory * where we are. Turn off instruction relocation temporarily * to prevent faults while reprogramming the IBAT. */ msr = mfmsr(); mtmsr(msr & ~PSL_IR); __asm (".balign 32; \n" "mtibatu 0,%0; mtibatl 0,%1; isync; \n" "mtdbatu 0,%0; mtdbatl 0,%1; isync" :: "r"(battable[0].batu), "r"(battable[0].batl)); mtmsr(msr); /* map pci space */ __asm __volatile("mtdbatu 1,%0" :: "r"(battable[8].batu)); __asm __volatile("mtdbatl 1,%0" :: "r"(battable[8].batl)); isync(); /* set global direct map flag */ hw_direct_map = 1; mem_regions(&pregions, &pregions_sz, ®ions, ®ions_sz); CTR0(KTR_PMAP, "moea_bootstrap: physical memory"); for (i = 0; i < pregions_sz; i++) { vm_offset_t pa; vm_offset_t end; CTR3(KTR_PMAP, "physregion: %#x - %#x (%#x)", pregions[i].mr_start, pregions[i].mr_start + pregions[i].mr_size, pregions[i].mr_size); /* * Install entries into the BAT table to allow all * of physmem to be convered by on-demand BAT entries. * The loop will sometimes set the same battable element * twice, but that's fine since they won't be used for * a while yet. */ pa = pregions[i].mr_start & 0xf0000000; end = pregions[i].mr_start + pregions[i].mr_size; do { u_int n = pa >> ADDR_SR_SHFT; battable[n].batl = BATL(pa, BAT_M, BAT_PP_RW); battable[n].batu = BATU(pa, BAT_BL_256M, BAT_Vs); pa += SEGMENT_LENGTH; } while (pa < end); } if (PHYS_AVAIL_ENTRIES < regions_sz) panic("moea_bootstrap: phys_avail too small"); phys_avail_count = 0; physsz = 0; hwphyssz = 0; TUNABLE_ULONG_FETCH("hw.physmem", (u_long *) &hwphyssz); for (i = 0, j = 0; i < regions_sz; i++, j += 2) { CTR3(KTR_PMAP, "region: %#x - %#x (%#x)", regions[i].mr_start, regions[i].mr_start + regions[i].mr_size, regions[i].mr_size); if (hwphyssz != 0 && (physsz + regions[i].mr_size) >= hwphyssz) { if (physsz < hwphyssz) { phys_avail[j] = regions[i].mr_start; phys_avail[j + 1] = regions[i].mr_start + hwphyssz - physsz; physsz = hwphyssz; phys_avail_count++; } break; } phys_avail[j] = regions[i].mr_start; phys_avail[j + 1] = regions[i].mr_start + regions[i].mr_size; phys_avail_count++; physsz += regions[i].mr_size; } /* Check for overlap with the kernel and exception vectors */ for (j = 0; j < 2*phys_avail_count; j+=2) { if (phys_avail[j] < EXC_LAST) phys_avail[j] += EXC_LAST; if (kernelstart >= phys_avail[j] && kernelstart < phys_avail[j+1]) { if (kernelend < phys_avail[j+1]) { phys_avail[2*phys_avail_count] = (kernelend & ~PAGE_MASK) + PAGE_SIZE; phys_avail[2*phys_avail_count + 1] = phys_avail[j+1]; phys_avail_count++; } phys_avail[j+1] = kernelstart & ~PAGE_MASK; } if (kernelend >= phys_avail[j] && kernelend < phys_avail[j+1]) { if (kernelstart > phys_avail[j]) { phys_avail[2*phys_avail_count] = phys_avail[j]; phys_avail[2*phys_avail_count + 1] = kernelstart & ~PAGE_MASK; phys_avail_count++; } phys_avail[j] = (kernelend & ~PAGE_MASK) + PAGE_SIZE; } } physmem = btoc(physsz); /* * Allocate PTEG table. */ #ifdef PTEGCOUNT moea_pteg_count = PTEGCOUNT; #else moea_pteg_count = 0x1000; while (moea_pteg_count < physmem) moea_pteg_count <<= 1; moea_pteg_count >>= 1; #endif /* PTEGCOUNT */ size = moea_pteg_count * sizeof(struct pteg); CTR2(KTR_PMAP, "moea_bootstrap: %d PTEGs, %d bytes", moea_pteg_count, size); moea_pteg_table = (struct pteg *)moea_bootstrap_alloc(size, size); CTR1(KTR_PMAP, "moea_bootstrap: PTEG table at %p", moea_pteg_table); bzero((void *)moea_pteg_table, moea_pteg_count * sizeof(struct pteg)); moea_pteg_mask = moea_pteg_count - 1; /* * Allocate pv/overflow lists. */ size = sizeof(struct pvo_head) * moea_pteg_count; moea_pvo_table = (struct pvo_head *)moea_bootstrap_alloc(size, PAGE_SIZE); CTR1(KTR_PMAP, "moea_bootstrap: PVO table at %p", moea_pvo_table); for (i = 0; i < moea_pteg_count; i++) LIST_INIT(&moea_pvo_table[i]); /* * Initialize the lock that synchronizes access to the pteg and pvo * tables. */ mtx_init(&moea_table_mutex, "pmap table", NULL, MTX_DEF | MTX_RECURSE); mtx_init(&moea_vsid_mutex, "VSID table", NULL, MTX_DEF); mtx_init(&tlbie_mtx, "tlbie", NULL, MTX_SPIN); /* * Initialise the unmanaged pvo pool. */ moea_bpvo_pool = (struct pvo_entry *)moea_bootstrap_alloc( BPVO_POOL_SIZE*sizeof(struct pvo_entry), 0); moea_bpvo_pool_index = 0; /* * Make sure kernel vsid is allocated as well as VSID 0. */ moea_vsid_bitmap[(KERNEL_VSIDBITS & (NPMAPS - 1)) / VSID_NBPW] |= 1 << (KERNEL_VSIDBITS % VSID_NBPW); moea_vsid_bitmap[0] |= 1; /* * Initialize the kernel pmap (which is statically allocated). */ PMAP_LOCK_INIT(kernel_pmap); for (i = 0; i < 16; i++) kernel_pmap->pm_sr[i] = EMPTY_SEGMENT + i; CPU_FILL(&kernel_pmap->pm_active); RB_INIT(&kernel_pmap->pmap_pvo); /* * Initialize the global pv list lock. */ rw_init(&pvh_global_lock, "pmap pv global"); /* * Set up the Open Firmware mappings */ chosen = OF_finddevice("/chosen"); if (chosen != -1 && OF_getprop(chosen, "mmu", &mmui, 4) != -1 && (mmu = OF_instance_to_package(mmui)) != -1 && (sz = OF_getproplen(mmu, "translations")) != -1) { translations = NULL; for (i = 0; phys_avail[i] != 0; i += 2) { if (phys_avail[i + 1] >= sz) { translations = (struct ofw_map *)phys_avail[i]; break; } } if (translations == NULL) panic("moea_bootstrap: no space to copy translations"); bzero(translations, sz); if (OF_getprop(mmu, "translations", translations, sz) == -1) panic("moea_bootstrap: can't get ofw translations"); CTR0(KTR_PMAP, "moea_bootstrap: translations"); sz /= sizeof(*translations); qsort(translations, sz, sizeof (*translations), om_cmp); for (i = 0; i < sz; i++) { CTR3(KTR_PMAP, "translation: pa=%#x va=%#x len=%#x", translations[i].om_pa, translations[i].om_va, translations[i].om_len); /* * If the mapping is 1:1, let the RAM and device * on-demand BAT tables take care of the translation. */ if (translations[i].om_va == translations[i].om_pa) continue; /* Enter the pages */ for (off = 0; off < translations[i].om_len; off += PAGE_SIZE) moea_kenter(mmup, translations[i].om_va + off, translations[i].om_pa + off); } } /* * Calculate the last available physical address. */ for (i = 0; phys_avail[i + 2] != 0; i += 2) ; Maxmem = powerpc_btop(phys_avail[i + 1]); moea_cpu_bootstrap(mmup,0); mtmsr(mfmsr() | PSL_DR | PSL_IR); pmap_bootstrapped++; /* * Set the start and end of kva. */ virtual_avail = VM_MIN_KERNEL_ADDRESS; virtual_end = VM_MAX_SAFE_KERNEL_ADDRESS; /* * Allocate a kernel stack with a guard page for thread0 and map it * into the kernel page map. */ pa = moea_bootstrap_alloc(kstack_pages * PAGE_SIZE, PAGE_SIZE); va = virtual_avail + KSTACK_GUARD_PAGES * PAGE_SIZE; virtual_avail = va + kstack_pages * PAGE_SIZE; CTR2(KTR_PMAP, "moea_bootstrap: kstack0 at %#x (%#x)", pa, va); thread0.td_kstack = va; thread0.td_kstack_pages = kstack_pages; for (i = 0; i < kstack_pages; i++) { moea_kenter(mmup, va, pa); pa += PAGE_SIZE; va += PAGE_SIZE; } /* * Allocate virtual address space for the message buffer. */ pa = msgbuf_phys = moea_bootstrap_alloc(msgbufsize, PAGE_SIZE); msgbufp = (struct msgbuf *)virtual_avail; va = virtual_avail; virtual_avail += round_page(msgbufsize); while (va < virtual_avail) { moea_kenter(mmup, va, pa); pa += PAGE_SIZE; va += PAGE_SIZE; } /* * Allocate virtual address space for the dynamic percpu area. */ pa = moea_bootstrap_alloc(DPCPU_SIZE, PAGE_SIZE); dpcpu = (void *)virtual_avail; va = virtual_avail; virtual_avail += DPCPU_SIZE; while (va < virtual_avail) { moea_kenter(mmup, va, pa); pa += PAGE_SIZE; va += PAGE_SIZE; } dpcpu_init(dpcpu, 0); } /* * Activate a user pmap. The pmap must be activated before it's address * space can be accessed in any way. */ void moea_activate(mmu_t mmu, struct thread *td) { pmap_t pm, pmr; /* * Load all the data we need up front to encourage the compiler to * not issue any loads while we have interrupts disabled below. */ pm = &td->td_proc->p_vmspace->vm_pmap; pmr = pm->pmap_phys; CPU_SET(PCPU_GET(cpuid), &pm->pm_active); PCPU_SET(curpmap, pmr); mtsrin(USER_SR << ADDR_SR_SHFT, td->td_pcb->pcb_cpu.aim.usr_vsid); } void moea_deactivate(mmu_t mmu, struct thread *td) { pmap_t pm; pm = &td->td_proc->p_vmspace->vm_pmap; CPU_CLR(PCPU_GET(cpuid), &pm->pm_active); PCPU_SET(curpmap, NULL); } void moea_unwire(mmu_t mmu, pmap_t pm, vm_offset_t sva, vm_offset_t eva) { struct pvo_entry key, *pvo; PMAP_LOCK(pm); key.pvo_vaddr = sva; for (pvo = RB_NFIND(pvo_tree, &pm->pmap_pvo, &key); pvo != NULL && PVO_VADDR(pvo) < eva; pvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo)) { if ((pvo->pvo_vaddr & PVO_WIRED) == 0) panic("moea_unwire: pvo %p is missing PVO_WIRED", pvo); pvo->pvo_vaddr &= ~PVO_WIRED; pm->pm_stats.wired_count--; } PMAP_UNLOCK(pm); } void moea_copy_page(mmu_t mmu, vm_page_t msrc, vm_page_t mdst) { vm_offset_t dst; vm_offset_t src; dst = VM_PAGE_TO_PHYS(mdst); src = VM_PAGE_TO_PHYS(msrc); bcopy((void *)src, (void *)dst, PAGE_SIZE); } void moea_copy_pages(mmu_t mmu, vm_page_t *ma, vm_offset_t a_offset, vm_page_t *mb, vm_offset_t b_offset, int xfersize) { void *a_cp, *b_cp; vm_offset_t a_pg_offset, b_pg_offset; int cnt; while (xfersize > 0) { a_pg_offset = a_offset & PAGE_MASK; cnt = min(xfersize, PAGE_SIZE - a_pg_offset); a_cp = (char *)VM_PAGE_TO_PHYS(ma[a_offset >> PAGE_SHIFT]) + a_pg_offset; b_pg_offset = b_offset & PAGE_MASK; cnt = min(cnt, PAGE_SIZE - b_pg_offset); b_cp = (char *)VM_PAGE_TO_PHYS(mb[b_offset >> PAGE_SHIFT]) + b_pg_offset; bcopy(a_cp, b_cp, cnt); a_offset += cnt; b_offset += cnt; xfersize -= cnt; } } /* * Zero a page of physical memory by temporarily mapping it into the tlb. */ void moea_zero_page(mmu_t mmu, vm_page_t m) { vm_offset_t off, pa = VM_PAGE_TO_PHYS(m); for (off = 0; off < PAGE_SIZE; off += cacheline_size) __asm __volatile("dcbz 0,%0" :: "r"(pa + off)); } void moea_zero_page_area(mmu_t mmu, vm_page_t m, int off, int size) { vm_offset_t pa = VM_PAGE_TO_PHYS(m); void *va = (void *)(pa + off); bzero(va, size); } vm_offset_t moea_quick_enter_page(mmu_t mmu, vm_page_t m) { return (VM_PAGE_TO_PHYS(m)); } void moea_quick_remove_page(mmu_t mmu, vm_offset_t addr) { +} + +boolean_t +moea_page_is_mapped(mmu_t mmu, vm_page_t m) +{ + return (!LIST_EMPTY(&(m)->md.mdpg_pvoh)); } /* * Map the given physical page at the specified virtual address in the * target pmap with the protection requested. If specified the page * will be wired down. */ int moea_enter(mmu_t mmu, pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, u_int flags, int8_t psind) { int error; for (;;) { rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); error = moea_enter_locked(pmap, va, m, prot, flags, psind); rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); if (error != ENOMEM) return (KERN_SUCCESS); if ((flags & PMAP_ENTER_NOSLEEP) != 0) return (KERN_RESOURCE_SHORTAGE); VM_OBJECT_ASSERT_UNLOCKED(m->object); vm_wait(NULL); } } /* * Map the given physical page at the specified virtual address in the * target pmap with the protection requested. If specified the page * will be wired down. * * The global pvh and pmap must be locked. */ static int moea_enter_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, u_int flags, int8_t psind __unused) { struct pvo_head *pvo_head; uma_zone_t zone; u_int pte_lo, pvo_flags; int error; if (pmap_bootstrapped) rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); if ((m->oflags & VPO_UNMANAGED) == 0) { if ((flags & PMAP_ENTER_QUICK_LOCKED) == 0) VM_PAGE_OBJECT_BUSY_ASSERT(m); else VM_OBJECT_ASSERT_LOCKED(m->object); } if ((m->oflags & VPO_UNMANAGED) != 0 || !moea_initialized) { pvo_head = &moea_pvo_kunmanaged; zone = moea_upvo_zone; pvo_flags = 0; } else { pvo_head = vm_page_to_pvoh(m); zone = moea_mpvo_zone; pvo_flags = PVO_MANAGED; } pte_lo = moea_calc_wimg(VM_PAGE_TO_PHYS(m), pmap_page_get_memattr(m)); if (prot & VM_PROT_WRITE) { pte_lo |= PTE_BW; if (pmap_bootstrapped && (m->oflags & VPO_UNMANAGED) == 0) vm_page_aflag_set(m, PGA_WRITEABLE); } else pte_lo |= PTE_BR; if ((flags & PMAP_ENTER_WIRED) != 0) pvo_flags |= PVO_WIRED; error = moea_pvo_enter(pmap, zone, pvo_head, va, VM_PAGE_TO_PHYS(m), pte_lo, pvo_flags); /* * Flush the real page from the instruction cache. This has be done * for all user mappings to prevent information leakage via the * instruction cache. moea_pvo_enter() returns ENOENT for the first * mapping for a page. */ if (pmap != kernel_pmap && error == ENOENT && (pte_lo & (PTE_I | PTE_G)) == 0) moea_syncicache(VM_PAGE_TO_PHYS(m), PAGE_SIZE); return (error); } /* * Maps a sequence of resident pages belonging to the same object. * The sequence begins with the given page m_start. This page is * mapped at the given virtual address start. Each subsequent page is * mapped at a virtual address that is offset from start by the same * amount as the page is offset from m_start within the object. The * last page in the sequence is the page with the largest offset from * m_start that can be mapped at a virtual address less than the given * virtual address end. Not every virtual page between start and end * is mapped; only those for which a resident page exists with the * corresponding offset from m_start are mapped. */ void moea_enter_object(mmu_t mmu, pmap_t pm, vm_offset_t start, vm_offset_t end, vm_page_t m_start, vm_prot_t prot) { vm_page_t m; vm_pindex_t diff, psize; VM_OBJECT_ASSERT_LOCKED(m_start->object); psize = atop(end - start); m = m_start; rw_wlock(&pvh_global_lock); PMAP_LOCK(pm); while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { moea_enter_locked(pm, start + ptoa(diff), m, prot & (VM_PROT_READ | VM_PROT_EXECUTE), PMAP_ENTER_QUICK_LOCKED, 0); m = TAILQ_NEXT(m, listq); } rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pm); } void moea_enter_quick(mmu_t mmu, pmap_t pm, vm_offset_t va, vm_page_t m, vm_prot_t prot) { rw_wlock(&pvh_global_lock); PMAP_LOCK(pm); moea_enter_locked(pm, va, m, prot & (VM_PROT_READ | VM_PROT_EXECUTE), PMAP_ENTER_QUICK_LOCKED, 0); rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pm); } vm_paddr_t moea_extract(mmu_t mmu, pmap_t pm, vm_offset_t va) { struct pvo_entry *pvo; vm_paddr_t pa; PMAP_LOCK(pm); pvo = moea_pvo_find_va(pm, va & ~ADDR_POFF, NULL); if (pvo == NULL) pa = 0; else pa = (pvo->pvo_pte.pte.pte_lo & PTE_RPGN) | (va & ADDR_POFF); PMAP_UNLOCK(pm); return (pa); } /* * Atomically extract and hold the physical page with the given * pmap and virtual address pair if that mapping permits the given * protection. */ vm_page_t moea_extract_and_hold(mmu_t mmu, pmap_t pmap, vm_offset_t va, vm_prot_t prot) { struct pvo_entry *pvo; vm_page_t m; m = NULL; PMAP_LOCK(pmap); pvo = moea_pvo_find_va(pmap, va & ~ADDR_POFF, NULL); if (pvo != NULL && (pvo->pvo_pte.pte.pte_hi & PTE_VALID) && ((pvo->pvo_pte.pte.pte_lo & PTE_PP) == PTE_RW || (prot & VM_PROT_WRITE) == 0)) { m = PHYS_TO_VM_PAGE(pvo->pvo_pte.pte.pte_lo & PTE_RPGN); if (!vm_page_wire_mapped(m)) m = NULL; } PMAP_UNLOCK(pmap); return (m); } void moea_init(mmu_t mmu) { moea_upvo_zone = uma_zcreate("UPVO entry", sizeof (struct pvo_entry), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM | UMA_ZONE_NOFREE); moea_mpvo_zone = uma_zcreate("MPVO entry", sizeof(struct pvo_entry), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM | UMA_ZONE_NOFREE); moea_initialized = TRUE; } boolean_t moea_is_referenced(mmu_t mmu, vm_page_t m) { boolean_t rv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("moea_is_referenced: page %p is not managed", m)); rw_wlock(&pvh_global_lock); rv = moea_query_bit(m, PTE_REF); rw_wunlock(&pvh_global_lock); return (rv); } boolean_t moea_is_modified(mmu_t mmu, vm_page_t m) { boolean_t rv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("moea_is_modified: page %p is not managed", m)); /* * If the page is not busied then this check is racy. */ if (!pmap_page_is_write_mapped(m)) return (FALSE); rw_wlock(&pvh_global_lock); rv = moea_query_bit(m, PTE_CHG); rw_wunlock(&pvh_global_lock); return (rv); } boolean_t moea_is_prefaultable(mmu_t mmu, pmap_t pmap, vm_offset_t va) { struct pvo_entry *pvo; boolean_t rv; PMAP_LOCK(pmap); pvo = moea_pvo_find_va(pmap, va & ~ADDR_POFF, NULL); rv = pvo == NULL || (pvo->pvo_pte.pte.pte_hi & PTE_VALID) == 0; PMAP_UNLOCK(pmap); return (rv); } void moea_clear_modify(mmu_t mmu, vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("moea_clear_modify: page %p is not managed", m)); vm_page_assert_busied(m); if (!pmap_page_is_write_mapped(m)) return; rw_wlock(&pvh_global_lock); moea_clear_bit(m, PTE_CHG); rw_wunlock(&pvh_global_lock); } /* * Clear the write and modified bits in each of the given page's mappings. */ void moea_remove_write(mmu_t mmu, vm_page_t m) { struct pvo_entry *pvo; struct pte *pt; pmap_t pmap; u_int lo; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("moea_remove_write: page %p is not managed", m)); vm_page_assert_busied(m); if (!pmap_page_is_write_mapped(m)) return; rw_wlock(&pvh_global_lock); lo = moea_attr_fetch(m); powerpc_sync(); LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) { pmap = pvo->pvo_pmap; PMAP_LOCK(pmap); if ((pvo->pvo_pte.pte.pte_lo & PTE_PP) != PTE_BR) { pt = moea_pvo_to_pte(pvo, -1); pvo->pvo_pte.pte.pte_lo &= ~PTE_PP; pvo->pvo_pte.pte.pte_lo |= PTE_BR; if (pt != NULL) { moea_pte_synch(pt, &pvo->pvo_pte.pte); lo |= pvo->pvo_pte.pte.pte_lo; pvo->pvo_pte.pte.pte_lo &= ~PTE_CHG; moea_pte_change(pt, &pvo->pvo_pte.pte, pvo->pvo_vaddr); mtx_unlock(&moea_table_mutex); } } PMAP_UNLOCK(pmap); } if ((lo & PTE_CHG) != 0) { moea_attr_clear(m, PTE_CHG); vm_page_dirty(m); } vm_page_aflag_clear(m, PGA_WRITEABLE); rw_wunlock(&pvh_global_lock); } /* * moea_ts_referenced: * * Return a count of reference bits for a page, clearing those bits. * It is not necessary for every reference bit to be cleared, but it * is necessary that 0 only be returned when there are truly no * reference bits set. * * XXX: The exact number of bits to check and clear is a matter that * should be tested and standardized at some point in the future for * optimal aging of shared pages. */ int moea_ts_referenced(mmu_t mmu, vm_page_t m) { int count; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("moea_ts_referenced: page %p is not managed", m)); rw_wlock(&pvh_global_lock); count = moea_clear_bit(m, PTE_REF); rw_wunlock(&pvh_global_lock); return (count); } /* * Modify the WIMG settings of all mappings for a page. */ void moea_page_set_memattr(mmu_t mmu, vm_page_t m, vm_memattr_t ma) { struct pvo_entry *pvo; struct pvo_head *pvo_head; struct pte *pt; pmap_t pmap; u_int lo; if ((m->oflags & VPO_UNMANAGED) != 0) { m->md.mdpg_cache_attrs = ma; return; } rw_wlock(&pvh_global_lock); pvo_head = vm_page_to_pvoh(m); lo = moea_calc_wimg(VM_PAGE_TO_PHYS(m), ma); LIST_FOREACH(pvo, pvo_head, pvo_vlink) { pmap = pvo->pvo_pmap; PMAP_LOCK(pmap); pt = moea_pvo_to_pte(pvo, -1); pvo->pvo_pte.pte.pte_lo &= ~PTE_WIMG; pvo->pvo_pte.pte.pte_lo |= lo; if (pt != NULL) { moea_pte_change(pt, &pvo->pvo_pte.pte, pvo->pvo_vaddr); if (pvo->pvo_pmap == kernel_pmap) isync(); } mtx_unlock(&moea_table_mutex); PMAP_UNLOCK(pmap); } m->md.mdpg_cache_attrs = ma; rw_wunlock(&pvh_global_lock); } /* * Map a wired page into kernel virtual address space. */ void moea_kenter(mmu_t mmu, vm_offset_t va, vm_paddr_t pa) { moea_kenter_attr(mmu, va, pa, VM_MEMATTR_DEFAULT); } void moea_kenter_attr(mmu_t mmu, vm_offset_t va, vm_paddr_t pa, vm_memattr_t ma) { u_int pte_lo; int error; #if 0 if (va < VM_MIN_KERNEL_ADDRESS) panic("moea_kenter: attempt to enter non-kernel address %#x", va); #endif pte_lo = moea_calc_wimg(pa, ma); PMAP_LOCK(kernel_pmap); error = moea_pvo_enter(kernel_pmap, moea_upvo_zone, &moea_pvo_kunmanaged, va, pa, pte_lo, PVO_WIRED); if (error != 0 && error != ENOENT) panic("moea_kenter: failed to enter va %#x pa %#x: %d", va, pa, error); PMAP_UNLOCK(kernel_pmap); } /* * Extract the physical page address associated with the given kernel virtual * address. */ vm_paddr_t moea_kextract(mmu_t mmu, vm_offset_t va) { struct pvo_entry *pvo; vm_paddr_t pa; /* * Allow direct mappings on 32-bit OEA */ if (va < VM_MIN_KERNEL_ADDRESS) { return (va); } PMAP_LOCK(kernel_pmap); pvo = moea_pvo_find_va(kernel_pmap, va & ~ADDR_POFF, NULL); KASSERT(pvo != NULL, ("moea_kextract: no addr found")); pa = (pvo->pvo_pte.pte.pte_lo & PTE_RPGN) | (va & ADDR_POFF); PMAP_UNLOCK(kernel_pmap); return (pa); } /* * Remove a wired page from kernel virtual address space. */ void moea_kremove(mmu_t mmu, vm_offset_t va) { moea_remove(mmu, kernel_pmap, va, va + PAGE_SIZE); } /* * Provide a kernel pointer corresponding to a given userland pointer. * The returned pointer is valid until the next time this function is * called in this thread. This is used internally in copyin/copyout. */ int moea_map_user_ptr(mmu_t mmu, pmap_t pm, volatile const void *uaddr, void **kaddr, size_t ulen, size_t *klen) { size_t l; register_t vsid; *kaddr = (char *)USER_ADDR + ((uintptr_t)uaddr & ~SEGMENT_MASK); l = ((char *)USER_ADDR + SEGMENT_LENGTH) - (char *)(*kaddr); if (l > ulen) l = ulen; if (klen) *klen = l; else if (l != ulen) return (EFAULT); vsid = va_to_vsid(pm, (vm_offset_t)uaddr); /* Mark segment no-execute */ vsid |= SR_N; /* If we have already set this VSID, we can just return */ if (curthread->td_pcb->pcb_cpu.aim.usr_vsid == vsid) return (0); __asm __volatile("isync"); curthread->td_pcb->pcb_cpu.aim.usr_segm = (uintptr_t)uaddr >> ADDR_SR_SHFT; curthread->td_pcb->pcb_cpu.aim.usr_vsid = vsid; __asm __volatile("mtsr %0,%1; isync" :: "n"(USER_SR), "r"(vsid)); return (0); } /* * Figure out where a given kernel pointer (usually in a fault) points * to from the VM's perspective, potentially remapping into userland's * address space. */ static int moea_decode_kernel_ptr(mmu_t mmu, vm_offset_t addr, int *is_user, vm_offset_t *decoded_addr) { vm_offset_t user_sr; if ((addr >> ADDR_SR_SHFT) == (USER_ADDR >> ADDR_SR_SHFT)) { user_sr = curthread->td_pcb->pcb_cpu.aim.usr_segm; addr &= ADDR_PIDX | ADDR_POFF; addr |= user_sr << ADDR_SR_SHFT; *decoded_addr = addr; *is_user = 1; } else { *decoded_addr = addr; *is_user = 0; } return (0); } /* * Map a range of physical addresses into kernel virtual address space. * * The value passed in *virt is a suggested virtual address for the mapping. * Architectures which can support a direct-mapped physical to virtual region * can return the appropriate address within that region, leaving '*virt' * unchanged. We cannot and therefore do not; *virt is updated with the * first usable address after the mapped region. */ vm_offset_t moea_map(mmu_t mmu, vm_offset_t *virt, vm_paddr_t pa_start, vm_paddr_t pa_end, int prot) { vm_offset_t sva, va; sva = *virt; va = sva; for (; pa_start < pa_end; pa_start += PAGE_SIZE, va += PAGE_SIZE) moea_kenter(mmu, va, pa_start); *virt = va; return (sva); } /* * Returns true if the pmap's pv is one of the first * 16 pvs linked to from this page. This count may * be changed upwards or downwards in the future; it * is only necessary that true be returned for a small * subset of pmaps for proper page aging. */ boolean_t moea_page_exists_quick(mmu_t mmu, pmap_t pmap, vm_page_t m) { int loops; struct pvo_entry *pvo; boolean_t rv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("moea_page_exists_quick: page %p is not managed", m)); loops = 0; rv = FALSE; rw_wlock(&pvh_global_lock); LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) { if (pvo->pvo_pmap == pmap) { rv = TRUE; break; } if (++loops >= 16) break; } rw_wunlock(&pvh_global_lock); return (rv); } void moea_page_init(mmu_t mmu __unused, vm_page_t m) { m->md.mdpg_attrs = 0; m->md.mdpg_cache_attrs = VM_MEMATTR_DEFAULT; LIST_INIT(&m->md.mdpg_pvoh); } /* * Return the number of managed mappings to the given physical page * that are wired. */ int moea_page_wired_mappings(mmu_t mmu, vm_page_t m) { struct pvo_entry *pvo; int count; count = 0; if ((m->oflags & VPO_UNMANAGED) != 0) return (count); rw_wlock(&pvh_global_lock); LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) if ((pvo->pvo_vaddr & PVO_WIRED) != 0) count++; rw_wunlock(&pvh_global_lock); return (count); } static u_int moea_vsidcontext; void moea_pinit(mmu_t mmu, pmap_t pmap) { int i, mask; u_int entropy; KASSERT((int)pmap < VM_MIN_KERNEL_ADDRESS, ("moea_pinit: virt pmap")); RB_INIT(&pmap->pmap_pvo); entropy = 0; __asm __volatile("mftb %0" : "=r"(entropy)); if ((pmap->pmap_phys = (pmap_t)moea_kextract(mmu, (vm_offset_t)pmap)) == NULL) { pmap->pmap_phys = pmap; } mtx_lock(&moea_vsid_mutex); /* * Allocate some segment registers for this pmap. */ for (i = 0; i < NPMAPS; i += VSID_NBPW) { u_int hash, n; /* * Create a new value by mutiplying by a prime and adding in * entropy from the timebase register. This is to make the * VSID more random so that the PT hash function collides * less often. (Note that the prime casues gcc to do shifts * instead of a multiply.) */ moea_vsidcontext = (moea_vsidcontext * 0x1105) + entropy; hash = moea_vsidcontext & (NPMAPS - 1); if (hash == 0) /* 0 is special, avoid it */ continue; n = hash >> 5; mask = 1 << (hash & (VSID_NBPW - 1)); hash = (moea_vsidcontext & 0xfffff); if (moea_vsid_bitmap[n] & mask) { /* collision? */ /* anything free in this bucket? */ if (moea_vsid_bitmap[n] == 0xffffffff) { entropy = (moea_vsidcontext >> 20); continue; } i = ffs(~moea_vsid_bitmap[n]) - 1; mask = 1 << i; hash &= rounddown2(0xfffff, VSID_NBPW); hash |= i; } KASSERT(!(moea_vsid_bitmap[n] & mask), ("Allocating in-use VSID group %#x\n", hash)); moea_vsid_bitmap[n] |= mask; for (i = 0; i < 16; i++) pmap->pm_sr[i] = VSID_MAKE(i, hash); mtx_unlock(&moea_vsid_mutex); return; } mtx_unlock(&moea_vsid_mutex); panic("moea_pinit: out of segments"); } /* * Initialize the pmap associated with process 0. */ void moea_pinit0(mmu_t mmu, pmap_t pm) { PMAP_LOCK_INIT(pm); moea_pinit(mmu, pm); bzero(&pm->pm_stats, sizeof(pm->pm_stats)); } /* * Set the physical protection on the specified range of this map as requested. */ void moea_protect(mmu_t mmu, pmap_t pm, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) { struct pvo_entry *pvo, *tpvo, key; struct pte *pt; KASSERT(pm == &curproc->p_vmspace->vm_pmap || pm == kernel_pmap, ("moea_protect: non current pmap")); if ((prot & VM_PROT_READ) == VM_PROT_NONE) { moea_remove(mmu, pm, sva, eva); return; } rw_wlock(&pvh_global_lock); PMAP_LOCK(pm); key.pvo_vaddr = sva; for (pvo = RB_NFIND(pvo_tree, &pm->pmap_pvo, &key); pvo != NULL && PVO_VADDR(pvo) < eva; pvo = tpvo) { tpvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo); /* * Grab the PTE pointer before we diddle with the cached PTE * copy. */ pt = moea_pvo_to_pte(pvo, -1); /* * Change the protection of the page. */ pvo->pvo_pte.pte.pte_lo &= ~PTE_PP; pvo->pvo_pte.pte.pte_lo |= PTE_BR; /* * If the PVO is in the page table, update that pte as well. */ if (pt != NULL) { moea_pte_change(pt, &pvo->pvo_pte.pte, pvo->pvo_vaddr); mtx_unlock(&moea_table_mutex); } } rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pm); } /* * Map a list of wired pages into kernel virtual address space. This is * intended for temporary mappings which do not need page modification or * references recorded. Existing mappings in the region are overwritten. */ void moea_qenter(mmu_t mmu, vm_offset_t sva, vm_page_t *m, int count) { vm_offset_t va; va = sva; while (count-- > 0) { moea_kenter(mmu, va, VM_PAGE_TO_PHYS(*m)); va += PAGE_SIZE; m++; } } /* * Remove page mappings from kernel virtual address space. Intended for * temporary mappings entered by moea_qenter. */ void moea_qremove(mmu_t mmu, vm_offset_t sva, int count) { vm_offset_t va; va = sva; while (count-- > 0) { moea_kremove(mmu, va); va += PAGE_SIZE; } } void moea_release(mmu_t mmu, pmap_t pmap) { int idx, mask; /* * Free segment register's VSID */ if (pmap->pm_sr[0] == 0) panic("moea_release"); mtx_lock(&moea_vsid_mutex); idx = VSID_TO_HASH(pmap->pm_sr[0]) & (NPMAPS-1); mask = 1 << (idx % VSID_NBPW); idx /= VSID_NBPW; moea_vsid_bitmap[idx] &= ~mask; mtx_unlock(&moea_vsid_mutex); } /* * Remove the given range of addresses from the specified map. */ void moea_remove(mmu_t mmu, pmap_t pm, vm_offset_t sva, vm_offset_t eva) { struct pvo_entry *pvo, *tpvo, key; rw_wlock(&pvh_global_lock); PMAP_LOCK(pm); key.pvo_vaddr = sva; for (pvo = RB_NFIND(pvo_tree, &pm->pmap_pvo, &key); pvo != NULL && PVO_VADDR(pvo) < eva; pvo = tpvo) { tpvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo); moea_pvo_remove(pvo, -1); } PMAP_UNLOCK(pm); rw_wunlock(&pvh_global_lock); } /* * Remove physical page from all pmaps in which it resides. moea_pvo_remove() * will reflect changes in pte's back to the vm_page. */ void moea_remove_all(mmu_t mmu, vm_page_t m) { struct pvo_head *pvo_head; struct pvo_entry *pvo, *next_pvo; pmap_t pmap; rw_wlock(&pvh_global_lock); pvo_head = vm_page_to_pvoh(m); for (pvo = LIST_FIRST(pvo_head); pvo != NULL; pvo = next_pvo) { next_pvo = LIST_NEXT(pvo, pvo_vlink); pmap = pvo->pvo_pmap; PMAP_LOCK(pmap); moea_pvo_remove(pvo, -1); PMAP_UNLOCK(pmap); } if ((m->a.flags & PGA_WRITEABLE) && moea_query_bit(m, PTE_CHG)) { moea_attr_clear(m, PTE_CHG); vm_page_dirty(m); } vm_page_aflag_clear(m, PGA_WRITEABLE); rw_wunlock(&pvh_global_lock); } /* * Allocate a physical page of memory directly from the phys_avail map. * Can only be called from moea_bootstrap before avail start and end are * calculated. */ static vm_offset_t moea_bootstrap_alloc(vm_size_t size, u_int align) { vm_offset_t s, e; int i, j; size = round_page(size); for (i = 0; phys_avail[i + 1] != 0; i += 2) { if (align != 0) s = roundup2(phys_avail[i], align); else s = phys_avail[i]; e = s + size; if (s < phys_avail[i] || e > phys_avail[i + 1]) continue; if (s == phys_avail[i]) { phys_avail[i] += size; } else if (e == phys_avail[i + 1]) { phys_avail[i + 1] -= size; } else { for (j = phys_avail_count * 2; j > i; j -= 2) { phys_avail[j] = phys_avail[j - 2]; phys_avail[j + 1] = phys_avail[j - 1]; } phys_avail[i + 3] = phys_avail[i + 1]; phys_avail[i + 1] = s; phys_avail[i + 2] = e; phys_avail_count++; } return (s); } panic("moea_bootstrap_alloc: could not allocate memory"); } static void moea_syncicache(vm_paddr_t pa, vm_size_t len) { __syncicache((void *)pa, len); } static int moea_pvo_enter(pmap_t pm, uma_zone_t zone, struct pvo_head *pvo_head, vm_offset_t va, vm_paddr_t pa, u_int pte_lo, int flags) { struct pvo_entry *pvo; u_int sr; int first; u_int ptegidx; int i; int bootstrap; moea_pvo_enter_calls++; first = 0; bootstrap = 0; /* * Compute the PTE Group index. */ va &= ~ADDR_POFF; sr = va_to_sr(pm->pm_sr, va); ptegidx = va_to_pteg(sr, va); /* * Remove any existing mapping for this page. Reuse the pvo entry if * there is a mapping. */ mtx_lock(&moea_table_mutex); LIST_FOREACH(pvo, &moea_pvo_table[ptegidx], pvo_olink) { if (pvo->pvo_pmap == pm && PVO_VADDR(pvo) == va) { if ((pvo->pvo_pte.pte.pte_lo & PTE_RPGN) == pa && (pvo->pvo_pte.pte.pte_lo & PTE_PP) == (pte_lo & PTE_PP)) { /* * The PTE is not changing. Instead, this may * be a request to change the mapping's wired * attribute. */ mtx_unlock(&moea_table_mutex); if ((flags & PVO_WIRED) != 0 && (pvo->pvo_vaddr & PVO_WIRED) == 0) { pvo->pvo_vaddr |= PVO_WIRED; pm->pm_stats.wired_count++; } else if ((flags & PVO_WIRED) == 0 && (pvo->pvo_vaddr & PVO_WIRED) != 0) { pvo->pvo_vaddr &= ~PVO_WIRED; pm->pm_stats.wired_count--; } return (0); } moea_pvo_remove(pvo, -1); break; } } /* * If we aren't overwriting a mapping, try to allocate. */ if (moea_initialized) { pvo = uma_zalloc(zone, M_NOWAIT); } else { if (moea_bpvo_pool_index >= BPVO_POOL_SIZE) { panic("moea_enter: bpvo pool exhausted, %d, %d, %d", moea_bpvo_pool_index, BPVO_POOL_SIZE, BPVO_POOL_SIZE * sizeof(struct pvo_entry)); } pvo = &moea_bpvo_pool[moea_bpvo_pool_index]; moea_bpvo_pool_index++; bootstrap = 1; } if (pvo == NULL) { mtx_unlock(&moea_table_mutex); return (ENOMEM); } moea_pvo_entries++; pvo->pvo_vaddr = va; pvo->pvo_pmap = pm; LIST_INSERT_HEAD(&moea_pvo_table[ptegidx], pvo, pvo_olink); pvo->pvo_vaddr &= ~ADDR_POFF; if (flags & PVO_WIRED) pvo->pvo_vaddr |= PVO_WIRED; if (pvo_head != &moea_pvo_kunmanaged) pvo->pvo_vaddr |= PVO_MANAGED; if (bootstrap) pvo->pvo_vaddr |= PVO_BOOTSTRAP; moea_pte_create(&pvo->pvo_pte.pte, sr, va, pa | pte_lo); /* * Add to pmap list */ RB_INSERT(pvo_tree, &pm->pmap_pvo, pvo); /* * Remember if the list was empty and therefore will be the first * item. */ if (LIST_FIRST(pvo_head) == NULL) first = 1; LIST_INSERT_HEAD(pvo_head, pvo, pvo_vlink); if (pvo->pvo_vaddr & PVO_WIRED) pm->pm_stats.wired_count++; pm->pm_stats.resident_count++; i = moea_pte_insert(ptegidx, &pvo->pvo_pte.pte); KASSERT(i < 8, ("Invalid PTE index")); if (i >= 0) { PVO_PTEGIDX_SET(pvo, i); } else { panic("moea_pvo_enter: overflow"); moea_pte_overflow++; } mtx_unlock(&moea_table_mutex); return (first ? ENOENT : 0); } static void moea_pvo_remove(struct pvo_entry *pvo, int pteidx) { struct pte *pt; /* * If there is an active pte entry, we need to deactivate it (and * save the ref & cfg bits). */ pt = moea_pvo_to_pte(pvo, pteidx); if (pt != NULL) { moea_pte_unset(pt, &pvo->pvo_pte.pte, pvo->pvo_vaddr); mtx_unlock(&moea_table_mutex); PVO_PTEGIDX_CLR(pvo); } else { moea_pte_overflow--; } /* * Update our statistics. */ pvo->pvo_pmap->pm_stats.resident_count--; if (pvo->pvo_vaddr & PVO_WIRED) pvo->pvo_pmap->pm_stats.wired_count--; /* * Remove this PVO from the PV and pmap lists. */ LIST_REMOVE(pvo, pvo_vlink); RB_REMOVE(pvo_tree, &pvo->pvo_pmap->pmap_pvo, pvo); /* * Save the REF/CHG bits into their cache if the page is managed. * Clear PGA_WRITEABLE if all mappings of the page have been removed. */ if ((pvo->pvo_vaddr & PVO_MANAGED) == PVO_MANAGED) { struct vm_page *pg; pg = PHYS_TO_VM_PAGE(pvo->pvo_pte.pte.pte_lo & PTE_RPGN); if (pg != NULL) { moea_attr_save(pg, pvo->pvo_pte.pte.pte_lo & (PTE_REF | PTE_CHG)); if (LIST_EMPTY(&pg->md.mdpg_pvoh)) vm_page_aflag_clear(pg, PGA_WRITEABLE); } } /* * Remove this from the overflow list and return it to the pool * if we aren't going to reuse it. */ LIST_REMOVE(pvo, pvo_olink); if (!(pvo->pvo_vaddr & PVO_BOOTSTRAP)) uma_zfree(pvo->pvo_vaddr & PVO_MANAGED ? moea_mpvo_zone : moea_upvo_zone, pvo); moea_pvo_entries--; moea_pvo_remove_calls++; } static __inline int moea_pvo_pte_index(const struct pvo_entry *pvo, int ptegidx) { int pteidx; /* * We can find the actual pte entry without searching by grabbing * the PTEG index from 3 unused bits in pte_lo[11:9] and by * noticing the HID bit. */ pteidx = ptegidx * 8 + PVO_PTEGIDX_GET(pvo); if (pvo->pvo_pte.pte.pte_hi & PTE_HID) pteidx ^= moea_pteg_mask * 8; return (pteidx); } static struct pvo_entry * moea_pvo_find_va(pmap_t pm, vm_offset_t va, int *pteidx_p) { struct pvo_entry *pvo; int ptegidx; u_int sr; va &= ~ADDR_POFF; sr = va_to_sr(pm->pm_sr, va); ptegidx = va_to_pteg(sr, va); mtx_lock(&moea_table_mutex); LIST_FOREACH(pvo, &moea_pvo_table[ptegidx], pvo_olink) { if (pvo->pvo_pmap == pm && PVO_VADDR(pvo) == va) { if (pteidx_p) *pteidx_p = moea_pvo_pte_index(pvo, ptegidx); break; } } mtx_unlock(&moea_table_mutex); return (pvo); } static struct pte * moea_pvo_to_pte(const struct pvo_entry *pvo, int pteidx) { struct pte *pt; /* * If we haven't been supplied the ptegidx, calculate it. */ if (pteidx == -1) { int ptegidx; u_int sr; sr = va_to_sr(pvo->pvo_pmap->pm_sr, pvo->pvo_vaddr); ptegidx = va_to_pteg(sr, pvo->pvo_vaddr); pteidx = moea_pvo_pte_index(pvo, ptegidx); } pt = &moea_pteg_table[pteidx >> 3].pt[pteidx & 7]; mtx_lock(&moea_table_mutex); if ((pvo->pvo_pte.pte.pte_hi & PTE_VALID) && !PVO_PTEGIDX_ISSET(pvo)) { panic("moea_pvo_to_pte: pvo %p has valid pte in pvo but no " "valid pte index", pvo); } if ((pvo->pvo_pte.pte.pte_hi & PTE_VALID) == 0 && PVO_PTEGIDX_ISSET(pvo)) { panic("moea_pvo_to_pte: pvo %p has valid pte index in pvo " "pvo but no valid pte", pvo); } if ((pt->pte_hi ^ (pvo->pvo_pte.pte.pte_hi & ~PTE_VALID)) == PTE_VALID) { if ((pvo->pvo_pte.pte.pte_hi & PTE_VALID) == 0) { panic("moea_pvo_to_pte: pvo %p has valid pte in " "moea_pteg_table %p but invalid in pvo", pvo, pt); } if (((pt->pte_lo ^ pvo->pvo_pte.pte.pte_lo) & ~(PTE_CHG|PTE_REF)) != 0) { panic("moea_pvo_to_pte: pvo %p pte does not match " "pte %p in moea_pteg_table", pvo, pt); } mtx_assert(&moea_table_mutex, MA_OWNED); return (pt); } if (pvo->pvo_pte.pte.pte_hi & PTE_VALID) { panic("moea_pvo_to_pte: pvo %p has invalid pte %p in " "moea_pteg_table but valid in pvo: %8x, %8x", pvo, pt, pvo->pvo_pte.pte.pte_hi, pt->pte_hi); } mtx_unlock(&moea_table_mutex); return (NULL); } /* * XXX: THIS STUFF SHOULD BE IN pte.c? */ int moea_pte_spill(vm_offset_t addr) { struct pvo_entry *source_pvo, *victim_pvo; struct pvo_entry *pvo; int ptegidx, i, j; u_int sr; struct pteg *pteg; struct pte *pt; moea_pte_spills++; sr = mfsrin(addr); ptegidx = va_to_pteg(sr, addr); /* * Have to substitute some entry. Use the primary hash for this. * Use low bits of timebase as random generator. */ pteg = &moea_pteg_table[ptegidx]; mtx_lock(&moea_table_mutex); __asm __volatile("mftb %0" : "=r"(i)); i &= 7; pt = &pteg->pt[i]; source_pvo = NULL; victim_pvo = NULL; LIST_FOREACH(pvo, &moea_pvo_table[ptegidx], pvo_olink) { /* * We need to find a pvo entry for this address. */ if (source_pvo == NULL && moea_pte_match(&pvo->pvo_pte.pte, sr, addr, pvo->pvo_pte.pte.pte_hi & PTE_HID)) { /* * Now found an entry to be spilled into the pteg. * The PTE is now valid, so we know it's active. */ j = moea_pte_insert(ptegidx, &pvo->pvo_pte.pte); if (j >= 0) { PVO_PTEGIDX_SET(pvo, j); moea_pte_overflow--; mtx_unlock(&moea_table_mutex); return (1); } source_pvo = pvo; if (victim_pvo != NULL) break; } /* * We also need the pvo entry of the victim we are replacing * so save the R & C bits of the PTE. */ if ((pt->pte_hi & PTE_HID) == 0 && victim_pvo == NULL && moea_pte_compare(pt, &pvo->pvo_pte.pte)) { victim_pvo = pvo; if (source_pvo != NULL) break; } } if (source_pvo == NULL) { mtx_unlock(&moea_table_mutex); return (0); } if (victim_pvo == NULL) { if ((pt->pte_hi & PTE_HID) == 0) panic("moea_pte_spill: victim p-pte (%p) has no pvo" "entry", pt); /* * If this is a secondary PTE, we need to search it's primary * pvo bucket for the matching PVO. */ LIST_FOREACH(pvo, &moea_pvo_table[ptegidx ^ moea_pteg_mask], pvo_olink) { /* * We also need the pvo entry of the victim we are * replacing so save the R & C bits of the PTE. */ if (moea_pte_compare(pt, &pvo->pvo_pte.pte)) { victim_pvo = pvo; break; } } if (victim_pvo == NULL) panic("moea_pte_spill: victim s-pte (%p) has no pvo" "entry", pt); } /* * We are invalidating the TLB entry for the EA we are replacing even * though it's valid. If we don't, we lose any ref/chg bit changes * contained in the TLB entry. */ source_pvo->pvo_pte.pte.pte_hi &= ~PTE_HID; moea_pte_unset(pt, &victim_pvo->pvo_pte.pte, victim_pvo->pvo_vaddr); moea_pte_set(pt, &source_pvo->pvo_pte.pte); PVO_PTEGIDX_CLR(victim_pvo); PVO_PTEGIDX_SET(source_pvo, i); moea_pte_replacements++; mtx_unlock(&moea_table_mutex); return (1); } static __inline struct pvo_entry * moea_pte_spillable_ident(u_int ptegidx) { struct pte *pt; struct pvo_entry *pvo_walk, *pvo = NULL; LIST_FOREACH(pvo_walk, &moea_pvo_table[ptegidx], pvo_olink) { if (pvo_walk->pvo_vaddr & PVO_WIRED) continue; if (!(pvo_walk->pvo_pte.pte.pte_hi & PTE_VALID)) continue; pt = moea_pvo_to_pte(pvo_walk, -1); if (pt == NULL) continue; pvo = pvo_walk; mtx_unlock(&moea_table_mutex); if (!(pt->pte_lo & PTE_REF)) return (pvo_walk); } return (pvo); } static int moea_pte_insert(u_int ptegidx, struct pte *pvo_pt) { struct pte *pt; struct pvo_entry *victim_pvo; int i; int victim_idx; u_int pteg_bkpidx = ptegidx; mtx_assert(&moea_table_mutex, MA_OWNED); /* * First try primary hash. */ for (pt = moea_pteg_table[ptegidx].pt, i = 0; i < 8; i++, pt++) { if ((pt->pte_hi & PTE_VALID) == 0) { pvo_pt->pte_hi &= ~PTE_HID; moea_pte_set(pt, pvo_pt); return (i); } } /* * Now try secondary hash. */ ptegidx ^= moea_pteg_mask; for (pt = moea_pteg_table[ptegidx].pt, i = 0; i < 8; i++, pt++) { if ((pt->pte_hi & PTE_VALID) == 0) { pvo_pt->pte_hi |= PTE_HID; moea_pte_set(pt, pvo_pt); return (i); } } /* Try again, but this time try to force a PTE out. */ ptegidx = pteg_bkpidx; victim_pvo = moea_pte_spillable_ident(ptegidx); if (victim_pvo == NULL) { ptegidx ^= moea_pteg_mask; victim_pvo = moea_pte_spillable_ident(ptegidx); } if (victim_pvo == NULL) { panic("moea_pte_insert: overflow"); return (-1); } victim_idx = moea_pvo_pte_index(victim_pvo, ptegidx); if (pteg_bkpidx == ptegidx) pvo_pt->pte_hi &= ~PTE_HID; else pvo_pt->pte_hi |= PTE_HID; /* * Synchronize the sacrifice PTE with its PVO, then mark both * invalid. The PVO will be reused when/if the VM system comes * here after a fault. */ pt = &moea_pteg_table[victim_idx >> 3].pt[victim_idx & 7]; if (pt->pte_hi != victim_pvo->pvo_pte.pte.pte_hi) panic("Victim PVO doesn't match PTE! PVO: %8x, PTE: %8x", victim_pvo->pvo_pte.pte.pte_hi, pt->pte_hi); /* * Set the new PTE. */ moea_pte_unset(pt, &victim_pvo->pvo_pte.pte, victim_pvo->pvo_vaddr); PVO_PTEGIDX_CLR(victim_pvo); moea_pte_overflow++; moea_pte_set(pt, pvo_pt); return (victim_idx & 7); } static boolean_t moea_query_bit(vm_page_t m, int ptebit) { struct pvo_entry *pvo; struct pte *pt; rw_assert(&pvh_global_lock, RA_WLOCKED); if (moea_attr_fetch(m) & ptebit) return (TRUE); LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) { /* * See if we saved the bit off. If so, cache it and return * success. */ if (pvo->pvo_pte.pte.pte_lo & ptebit) { moea_attr_save(m, ptebit); return (TRUE); } } /* * No luck, now go through the hard part of looking at the PTEs * themselves. Sync so that any pending REF/CHG bits are flushed to * the PTEs. */ powerpc_sync(); LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) { /* * See if this pvo has a valid PTE. if so, fetch the * REF/CHG bits from the valid PTE. If the appropriate * ptebit is set, cache it and return success. */ pt = moea_pvo_to_pte(pvo, -1); if (pt != NULL) { moea_pte_synch(pt, &pvo->pvo_pte.pte); mtx_unlock(&moea_table_mutex); if (pvo->pvo_pte.pte.pte_lo & ptebit) { moea_attr_save(m, ptebit); return (TRUE); } } } return (FALSE); } static u_int moea_clear_bit(vm_page_t m, int ptebit) { u_int count; struct pvo_entry *pvo; struct pte *pt; rw_assert(&pvh_global_lock, RA_WLOCKED); /* * Clear the cached value. */ moea_attr_clear(m, ptebit); /* * Sync so that any pending REF/CHG bits are flushed to the PTEs (so * we can reset the right ones). note that since the pvo entries and * list heads are accessed via BAT0 and are never placed in the page * table, we don't have to worry about further accesses setting the * REF/CHG bits. */ powerpc_sync(); /* * For each pvo entry, clear the pvo's ptebit. If this pvo has a * valid pte clear the ptebit from the valid pte. */ count = 0; LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) { pt = moea_pvo_to_pte(pvo, -1); if (pt != NULL) { moea_pte_synch(pt, &pvo->pvo_pte.pte); if (pvo->pvo_pte.pte.pte_lo & ptebit) { count++; moea_pte_clear(pt, PVO_VADDR(pvo), ptebit); } mtx_unlock(&moea_table_mutex); } pvo->pvo_pte.pte.pte_lo &= ~ptebit; } return (count); } /* * Return true if the physical range is encompassed by the battable[idx] */ static int moea_bat_mapped(int idx, vm_paddr_t pa, vm_size_t size) { u_int prot; u_int32_t start; u_int32_t end; u_int32_t bat_ble; /* * Return immediately if not a valid mapping */ if (!(battable[idx].batu & BAT_Vs)) return (EINVAL); /* * The BAT entry must be cache-inhibited, guarded, and r/w * so it can function as an i/o page */ prot = battable[idx].batl & (BAT_I|BAT_G|BAT_PP_RW); if (prot != (BAT_I|BAT_G|BAT_PP_RW)) return (EPERM); /* * The address should be within the BAT range. Assume that the * start address in the BAT has the correct alignment (thus * not requiring masking) */ start = battable[idx].batl & BAT_PBS; bat_ble = (battable[idx].batu & ~(BAT_EBS)) | 0x03; end = start | (bat_ble << 15) | 0x7fff; if ((pa < start) || ((pa + size) > end)) return (ERANGE); return (0); } boolean_t moea_dev_direct_mapped(mmu_t mmu, vm_paddr_t pa, vm_size_t size) { int i; /* * This currently does not work for entries that * overlap 256M BAT segments. */ for(i = 0; i < 16; i++) if (moea_bat_mapped(i, pa, size) == 0) return (0); return (EFAULT); } /* * Map a set of physical memory pages into the kernel virtual * address space. Return a pointer to where it is mapped. This * routine is intended to be used for mapping device memory, * NOT real memory. */ void * moea_mapdev(mmu_t mmu, vm_paddr_t pa, vm_size_t size) { return (moea_mapdev_attr(mmu, pa, size, VM_MEMATTR_DEFAULT)); } void * moea_mapdev_attr(mmu_t mmu, vm_paddr_t pa, vm_size_t size, vm_memattr_t ma) { vm_offset_t va, tmpva, ppa, offset; int i; ppa = trunc_page(pa); offset = pa & PAGE_MASK; size = roundup(offset + size, PAGE_SIZE); /* * If the physical address lies within a valid BAT table entry, * return the 1:1 mapping. This currently doesn't work * for regions that overlap 256M BAT segments. */ for (i = 0; i < 16; i++) { if (moea_bat_mapped(i, pa, size) == 0) return ((void *) pa); } va = kva_alloc(size); if (!va) panic("moea_mapdev: Couldn't alloc kernel virtual memory"); for (tmpva = va; size > 0;) { moea_kenter_attr(mmu, tmpva, ppa, ma); tlbie(tmpva); size -= PAGE_SIZE; tmpva += PAGE_SIZE; ppa += PAGE_SIZE; } return ((void *)(va + offset)); } void moea_unmapdev(mmu_t mmu, vm_offset_t va, vm_size_t size) { vm_offset_t base, offset; /* * If this is outside kernel virtual space, then it's a * battable entry and doesn't require unmapping */ if ((va >= VM_MIN_KERNEL_ADDRESS) && (va <= virtual_end)) { base = trunc_page(va); offset = va & PAGE_MASK; size = roundup(offset + size, PAGE_SIZE); kva_free(base, size); } } static void moea_sync_icache(mmu_t mmu, pmap_t pm, vm_offset_t va, vm_size_t sz) { struct pvo_entry *pvo; vm_offset_t lim; vm_paddr_t pa; vm_size_t len; PMAP_LOCK(pm); while (sz > 0) { lim = round_page(va + 1); len = MIN(lim - va, sz); pvo = moea_pvo_find_va(pm, va & ~ADDR_POFF, NULL); if (pvo != NULL) { pa = (pvo->pvo_pte.pte.pte_lo & PTE_RPGN) | (va & ADDR_POFF); moea_syncicache(pa, len); } va += len; sz -= len; } PMAP_UNLOCK(pm); } void moea_dumpsys_map(mmu_t mmu, vm_paddr_t pa, size_t sz, void **va) { *va = (void *)pa; } extern struct dump_pa dump_map[PHYS_AVAIL_SZ + 1]; void moea_scan_init(mmu_t mmu) { struct pvo_entry *pvo; vm_offset_t va; int i; if (!do_minidump) { /* Initialize phys. segments for dumpsys(). */ memset(&dump_map, 0, sizeof(dump_map)); mem_regions(&pregions, &pregions_sz, ®ions, ®ions_sz); for (i = 0; i < pregions_sz; i++) { dump_map[i].pa_start = pregions[i].mr_start; dump_map[i].pa_size = pregions[i].mr_size; } return; } /* Virtual segments for minidumps: */ memset(&dump_map, 0, sizeof(dump_map)); /* 1st: kernel .data and .bss. */ dump_map[0].pa_start = trunc_page((uintptr_t)_etext); dump_map[0].pa_size = round_page((uintptr_t)_end) - dump_map[0].pa_start; /* 2nd: msgbuf and tables (see pmap_bootstrap()). */ dump_map[1].pa_start = (vm_paddr_t)msgbufp->msg_ptr; dump_map[1].pa_size = round_page(msgbufp->msg_size); /* 3rd: kernel VM. */ va = dump_map[1].pa_start + dump_map[1].pa_size; /* Find start of next chunk (from va). */ while (va < virtual_end) { /* Don't dump the buffer cache. */ if (va >= kmi.buffer_sva && va < kmi.buffer_eva) { va = kmi.buffer_eva; continue; } pvo = moea_pvo_find_va(kernel_pmap, va & ~ADDR_POFF, NULL); if (pvo != NULL && (pvo->pvo_pte.pte.pte_hi & PTE_VALID)) break; va += PAGE_SIZE; } if (va < virtual_end) { dump_map[2].pa_start = va; va += PAGE_SIZE; /* Find last page in chunk. */ while (va < virtual_end) { /* Don't run into the buffer cache. */ if (va == kmi.buffer_sva) break; pvo = moea_pvo_find_va(kernel_pmap, va & ~ADDR_POFF, NULL); if (pvo == NULL || !(pvo->pvo_pte.pte.pte_hi & PTE_VALID)) break; va += PAGE_SIZE; } dump_map[2].pa_size = va - dump_map[2].pa_start; } } Index: head/sys/powerpc/aim/mmu_oea64.c =================================================================== --- head/sys/powerpc/aim/mmu_oea64.c (revision 360886) +++ head/sys/powerpc/aim/mmu_oea64.c (revision 360887) @@ -1,3146 +1,3152 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2008-2015 Nathan Whitehorn * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * Manages physical address maps. * * Since the information managed by this module is also stored by the * logical address mapping module, this module may throw away valid virtual * to physical mappings at almost any time. However, invalidations of * mappings must be done as requested. * * In order to cope with hardware architectures which make virtual to * physical map invalidates expensive, this module may delay invalidate * reduced protection operations until such time as they are actually * necessary. This module is given full information as to which processors * are currently using which maps, and to when physical maps must be made * correct. */ #include "opt_kstack_pages.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "mmu_oea64.h" #include "mmu_if.h" #include "moea64_if.h" void moea64_release_vsid(uint64_t vsid); uintptr_t moea64_get_unique_vsid(void); #define DISABLE_TRANS(msr) msr = mfmsr(); mtmsr(msr & ~PSL_DR) #define ENABLE_TRANS(msr) mtmsr(msr) #define VSID_MAKE(sr, hash) ((sr) | (((hash) & 0xfffff) << 4)) #define VSID_TO_HASH(vsid) (((vsid) >> 4) & 0xfffff) #define VSID_HASH_MASK 0x0000007fffffffffULL /* * Locking semantics: * * There are two locks of interest: the page locks and the pmap locks, which * protect their individual PVO lists and are locked in that order. The contents * of all PVO entries are protected by the locks of their respective pmaps. * The pmap of any PVO is guaranteed not to change so long as the PVO is linked * into any list. * */ -#define PV_LOCK_PER_DOM (PA_LOCK_COUNT * 3) -#define PV_LOCK_COUNT (PV_LOCK_PER_DOM * MAXMEMDOM) +#define PV_LOCK_COUNT PA_LOCK_COUNT static struct mtx_padalign pv_lock[PV_LOCK_COUNT]; /* * Cheap NUMA-izing of the pv locks, to reduce contention across domains. * NUMA domains on POWER9 appear to be indexed as sparse memory spaces, with the * index at (N << 45). */ #ifdef __powerpc64__ -#define PV_LOCK_IDX(pa) (pa_index(pa) % PV_LOCK_PER_DOM + \ - (((pa) >> 45) % MAXMEMDOM) * PV_LOCK_PER_DOM) +#define PV_LOCK_IDX(pa) ((pa_index(pa) * (((pa) >> 45) + 1)) % PV_LOCK_COUNT) #else #define PV_LOCK_IDX(pa) (pa_index(pa) % PV_LOCK_COUNT) #endif #define PV_LOCKPTR(pa) ((struct mtx *)(&pv_lock[PV_LOCK_IDX(pa)])) #define PV_LOCK(pa) mtx_lock(PV_LOCKPTR(pa)) #define PV_UNLOCK(pa) mtx_unlock(PV_LOCKPTR(pa)) #define PV_LOCKASSERT(pa) mtx_assert(PV_LOCKPTR(pa), MA_OWNED) #define PV_PAGE_LOCK(m) PV_LOCK(VM_PAGE_TO_PHYS(m)) #define PV_PAGE_UNLOCK(m) PV_UNLOCK(VM_PAGE_TO_PHYS(m)) #define PV_PAGE_LOCKASSERT(m) PV_LOCKASSERT(VM_PAGE_TO_PHYS(m)) struct ofw_map { cell_t om_va; cell_t om_len; uint64_t om_pa; cell_t om_mode; }; extern unsigned char _etext[]; extern unsigned char _end[]; extern void *slbtrap, *slbtrapend; /* * Map of physical memory regions. */ static struct mem_region *regions; static struct mem_region *pregions; static struct numa_mem_region *numa_pregions; static u_int phys_avail_count; static int regions_sz, pregions_sz, numapregions_sz; extern void bs_remap_earlyboot(void); /* * Lock for the SLB tables. */ struct mtx moea64_slb_mutex; /* * PTEG data. */ u_long moea64_pteg_count; u_long moea64_pteg_mask; /* * PVO data. */ uma_zone_t moea64_pvo_zone; /* zone for pvo entries */ static struct pvo_entry *moea64_bpvo_pool; static int moea64_bpvo_pool_index = 0; static int moea64_bpvo_pool_size = 0; SYSCTL_INT(_machdep, OID_AUTO, moea64_allocated_bpvo_entries, CTLFLAG_RD, &moea64_bpvo_pool_index, 0, ""); #define BPVO_POOL_SIZE 327680 /* Sensible historical default value */ #define BPVO_POOL_EXPANSION_FACTOR 3 #define VSID_NBPW (sizeof(u_int32_t) * 8) #ifdef __powerpc64__ #define NVSIDS (NPMAPS * 16) #define VSID_HASHMASK 0xffffffffUL #else #define NVSIDS NPMAPS #define VSID_HASHMASK 0xfffffUL #endif static u_int moea64_vsid_bitmap[NVSIDS / VSID_NBPW]; static boolean_t moea64_initialized = FALSE; #ifdef MOEA64_STATS /* * Statistics. */ u_int moea64_pte_valid = 0; u_int moea64_pte_overflow = 0; u_int moea64_pvo_entries = 0; u_int moea64_pvo_enter_calls = 0; u_int moea64_pvo_remove_calls = 0; SYSCTL_INT(_machdep, OID_AUTO, moea64_pte_valid, CTLFLAG_RD, &moea64_pte_valid, 0, ""); SYSCTL_INT(_machdep, OID_AUTO, moea64_pte_overflow, CTLFLAG_RD, &moea64_pte_overflow, 0, ""); SYSCTL_INT(_machdep, OID_AUTO, moea64_pvo_entries, CTLFLAG_RD, &moea64_pvo_entries, 0, ""); SYSCTL_INT(_machdep, OID_AUTO, moea64_pvo_enter_calls, CTLFLAG_RD, &moea64_pvo_enter_calls, 0, ""); SYSCTL_INT(_machdep, OID_AUTO, moea64_pvo_remove_calls, CTLFLAG_RD, &moea64_pvo_remove_calls, 0, ""); #endif vm_offset_t moea64_scratchpage_va[2]; struct pvo_entry *moea64_scratchpage_pvo[2]; struct mtx moea64_scratchpage_mtx; uint64_t moea64_large_page_mask = 0; uint64_t moea64_large_page_size = 0; int moea64_large_page_shift = 0; /* * PVO calls. */ static int moea64_pvo_enter(mmu_t mmu, struct pvo_entry *pvo, struct pvo_head *pvo_head, struct pvo_entry **oldpvo); static void moea64_pvo_remove_from_pmap(mmu_t mmu, struct pvo_entry *pvo); static void moea64_pvo_remove_from_page(mmu_t mmu, struct pvo_entry *pvo); static void moea64_pvo_remove_from_page_locked(mmu_t mmu, struct pvo_entry *pvo, vm_page_t m); static struct pvo_entry *moea64_pvo_find_va(pmap_t, vm_offset_t); /* * Utility routines. */ static boolean_t moea64_query_bit(mmu_t, vm_page_t, uint64_t); static u_int moea64_clear_bit(mmu_t, vm_page_t, uint64_t); static void moea64_kremove(mmu_t, vm_offset_t); static void moea64_syncicache(mmu_t, pmap_t pmap, vm_offset_t va, vm_paddr_t pa, vm_size_t sz); static void moea64_pmap_init_qpages(void); /* * Kernel MMU interface */ void moea64_clear_modify(mmu_t, vm_page_t); void moea64_copy_page(mmu_t, vm_page_t, vm_page_t); void moea64_copy_pages(mmu_t mmu, vm_page_t *ma, vm_offset_t a_offset, vm_page_t *mb, vm_offset_t b_offset, int xfersize); int moea64_enter(mmu_t, pmap_t, vm_offset_t, vm_page_t, vm_prot_t, u_int flags, int8_t psind); void moea64_enter_object(mmu_t, pmap_t, vm_offset_t, vm_offset_t, vm_page_t, vm_prot_t); void moea64_enter_quick(mmu_t, pmap_t, vm_offset_t, vm_page_t, vm_prot_t); vm_paddr_t moea64_extract(mmu_t, pmap_t, vm_offset_t); vm_page_t moea64_extract_and_hold(mmu_t, pmap_t, vm_offset_t, vm_prot_t); void moea64_init(mmu_t); boolean_t moea64_is_modified(mmu_t, vm_page_t); boolean_t moea64_is_prefaultable(mmu_t, pmap_t, vm_offset_t); boolean_t moea64_is_referenced(mmu_t, vm_page_t); int moea64_ts_referenced(mmu_t, vm_page_t); vm_offset_t moea64_map(mmu_t, vm_offset_t *, vm_paddr_t, vm_paddr_t, int); boolean_t moea64_page_exists_quick(mmu_t, pmap_t, vm_page_t); void moea64_page_init(mmu_t, vm_page_t); int moea64_page_wired_mappings(mmu_t, vm_page_t); void moea64_pinit(mmu_t, pmap_t); void moea64_pinit0(mmu_t, pmap_t); void moea64_protect(mmu_t, pmap_t, vm_offset_t, vm_offset_t, vm_prot_t); void moea64_qenter(mmu_t, vm_offset_t, vm_page_t *, int); void moea64_qremove(mmu_t, vm_offset_t, int); void moea64_release(mmu_t, pmap_t); void moea64_remove(mmu_t, pmap_t, vm_offset_t, vm_offset_t); void moea64_remove_pages(mmu_t, pmap_t); void moea64_remove_all(mmu_t, vm_page_t); void moea64_remove_write(mmu_t, vm_page_t); void moea64_unwire(mmu_t, pmap_t, vm_offset_t, vm_offset_t); void moea64_zero_page(mmu_t, vm_page_t); void moea64_zero_page_area(mmu_t, vm_page_t, int, int); void moea64_activate(mmu_t, struct thread *); void moea64_deactivate(mmu_t, struct thread *); void *moea64_mapdev(mmu_t, vm_paddr_t, vm_size_t); void *moea64_mapdev_attr(mmu_t, vm_paddr_t, vm_size_t, vm_memattr_t); void moea64_unmapdev(mmu_t, vm_offset_t, vm_size_t); vm_paddr_t moea64_kextract(mmu_t, vm_offset_t); void moea64_page_set_memattr(mmu_t, vm_page_t m, vm_memattr_t ma); void moea64_kenter_attr(mmu_t, vm_offset_t, vm_paddr_t, vm_memattr_t ma); void moea64_kenter(mmu_t, vm_offset_t, vm_paddr_t); boolean_t moea64_dev_direct_mapped(mmu_t, vm_paddr_t, vm_size_t); static void moea64_sync_icache(mmu_t, pmap_t, vm_offset_t, vm_size_t); void moea64_dumpsys_map(mmu_t mmu, vm_paddr_t pa, size_t sz, void **va); void moea64_scan_init(mmu_t mmu); vm_offset_t moea64_quick_enter_page(mmu_t mmu, vm_page_t m); void moea64_quick_remove_page(mmu_t mmu, vm_offset_t addr); +boolean_t moea64_page_is_mapped(mmu_t mmu, vm_page_t m); static int moea64_map_user_ptr(mmu_t mmu, pmap_t pm, volatile const void *uaddr, void **kaddr, size_t ulen, size_t *klen); static int moea64_decode_kernel_ptr(mmu_t mmu, vm_offset_t addr, int *is_user, vm_offset_t *decoded_addr); static size_t moea64_scan_pmap(mmu_t mmu); static void *moea64_dump_pmap_init(mmu_t mmu, unsigned blkpgs); #ifdef __powerpc64__ static void moea64_page_array_startup(mmu_t, long); #endif static mmu_method_t moea64_methods[] = { MMUMETHOD(mmu_clear_modify, moea64_clear_modify), MMUMETHOD(mmu_copy_page, moea64_copy_page), MMUMETHOD(mmu_copy_pages, moea64_copy_pages), MMUMETHOD(mmu_enter, moea64_enter), MMUMETHOD(mmu_enter_object, moea64_enter_object), MMUMETHOD(mmu_enter_quick, moea64_enter_quick), MMUMETHOD(mmu_extract, moea64_extract), MMUMETHOD(mmu_extract_and_hold, moea64_extract_and_hold), MMUMETHOD(mmu_init, moea64_init), MMUMETHOD(mmu_is_modified, moea64_is_modified), MMUMETHOD(mmu_is_prefaultable, moea64_is_prefaultable), MMUMETHOD(mmu_is_referenced, moea64_is_referenced), MMUMETHOD(mmu_ts_referenced, moea64_ts_referenced), MMUMETHOD(mmu_map, moea64_map), MMUMETHOD(mmu_page_exists_quick,moea64_page_exists_quick), MMUMETHOD(mmu_page_init, moea64_page_init), MMUMETHOD(mmu_page_wired_mappings,moea64_page_wired_mappings), MMUMETHOD(mmu_pinit, moea64_pinit), MMUMETHOD(mmu_pinit0, moea64_pinit0), MMUMETHOD(mmu_protect, moea64_protect), MMUMETHOD(mmu_qenter, moea64_qenter), MMUMETHOD(mmu_qremove, moea64_qremove), MMUMETHOD(mmu_release, moea64_release), MMUMETHOD(mmu_remove, moea64_remove), MMUMETHOD(mmu_remove_pages, moea64_remove_pages), MMUMETHOD(mmu_remove_all, moea64_remove_all), MMUMETHOD(mmu_remove_write, moea64_remove_write), MMUMETHOD(mmu_sync_icache, moea64_sync_icache), MMUMETHOD(mmu_unwire, moea64_unwire), MMUMETHOD(mmu_zero_page, moea64_zero_page), MMUMETHOD(mmu_zero_page_area, moea64_zero_page_area), MMUMETHOD(mmu_activate, moea64_activate), MMUMETHOD(mmu_deactivate, moea64_deactivate), MMUMETHOD(mmu_page_set_memattr, moea64_page_set_memattr), MMUMETHOD(mmu_quick_enter_page, moea64_quick_enter_page), MMUMETHOD(mmu_quick_remove_page, moea64_quick_remove_page), + MMUMETHOD(mmu_page_is_mapped, moea64_page_is_mapped), #ifdef __powerpc64__ MMUMETHOD(mmu_page_array_startup, moea64_page_array_startup), #endif /* Internal interfaces */ MMUMETHOD(mmu_mapdev, moea64_mapdev), MMUMETHOD(mmu_mapdev_attr, moea64_mapdev_attr), MMUMETHOD(mmu_unmapdev, moea64_unmapdev), MMUMETHOD(mmu_kextract, moea64_kextract), MMUMETHOD(mmu_kenter, moea64_kenter), MMUMETHOD(mmu_kenter_attr, moea64_kenter_attr), MMUMETHOD(mmu_dev_direct_mapped,moea64_dev_direct_mapped), MMUMETHOD(mmu_scan_init, moea64_scan_init), MMUMETHOD(mmu_scan_pmap, moea64_scan_pmap), MMUMETHOD(mmu_dump_pmap_init, moea64_dump_pmap_init), MMUMETHOD(mmu_dumpsys_map, moea64_dumpsys_map), MMUMETHOD(mmu_map_user_ptr, moea64_map_user_ptr), MMUMETHOD(mmu_decode_kernel_ptr, moea64_decode_kernel_ptr), { 0, 0 } }; MMU_DEF(oea64_mmu, "mmu_oea64_base", moea64_methods, 0); static struct pvo_head * vm_page_to_pvoh(vm_page_t m) { mtx_assert(PV_LOCKPTR(VM_PAGE_TO_PHYS(m)), MA_OWNED); return (&m->md.mdpg_pvoh); } static struct pvo_entry * alloc_pvo_entry(int bootstrap) { struct pvo_entry *pvo; if (!moea64_initialized || bootstrap) { if (moea64_bpvo_pool_index >= moea64_bpvo_pool_size) { panic("%s: bpvo pool exhausted, index=%d, size=%d, bytes=%zd." "Try setting machdep.moea64_bpvo_pool_size tunable", __func__, moea64_bpvo_pool_index, moea64_bpvo_pool_size, moea64_bpvo_pool_size * sizeof(struct pvo_entry)); } pvo = &moea64_bpvo_pool[ atomic_fetchadd_int(&moea64_bpvo_pool_index, 1)]; bzero(pvo, sizeof(*pvo)); pvo->pvo_vaddr = PVO_BOOTSTRAP; } else pvo = uma_zalloc(moea64_pvo_zone, M_NOWAIT | M_ZERO); return (pvo); } static void init_pvo_entry(struct pvo_entry *pvo, pmap_t pmap, vm_offset_t va) { uint64_t vsid; uint64_t hash; int shift; PMAP_LOCK_ASSERT(pmap, MA_OWNED); pvo->pvo_pmap = pmap; va &= ~ADDR_POFF; pvo->pvo_vaddr |= va; vsid = va_to_vsid(pmap, va); pvo->pvo_vpn = (uint64_t)((va & ADDR_PIDX) >> ADDR_PIDX_SHFT) | (vsid << 16); shift = (pvo->pvo_vaddr & PVO_LARGE) ? moea64_large_page_shift : ADDR_PIDX_SHFT; hash = (vsid & VSID_HASH_MASK) ^ (((uint64_t)va & ADDR_PIDX) >> shift); pvo->pvo_pte.slot = (hash & moea64_pteg_mask) << 3; } static void free_pvo_entry(struct pvo_entry *pvo) { if (!(pvo->pvo_vaddr & PVO_BOOTSTRAP)) uma_zfree(moea64_pvo_zone, pvo); } void moea64_pte_from_pvo(const struct pvo_entry *pvo, struct lpte *lpte) { lpte->pte_hi = moea64_pte_vpn_from_pvo_vpn(pvo); lpte->pte_hi |= LPTE_VALID; if (pvo->pvo_vaddr & PVO_LARGE) lpte->pte_hi |= LPTE_BIG; if (pvo->pvo_vaddr & PVO_WIRED) lpte->pte_hi |= LPTE_WIRED; if (pvo->pvo_vaddr & PVO_HID) lpte->pte_hi |= LPTE_HID; lpte->pte_lo = pvo->pvo_pte.pa; /* Includes WIMG bits */ if (pvo->pvo_pte.prot & VM_PROT_WRITE) lpte->pte_lo |= LPTE_BW; else lpte->pte_lo |= LPTE_BR; if (!(pvo->pvo_pte.prot & VM_PROT_EXECUTE)) lpte->pte_lo |= LPTE_NOEXEC; } static __inline uint64_t moea64_calc_wimg(vm_paddr_t pa, vm_memattr_t ma) { uint64_t pte_lo; int i; if (ma != VM_MEMATTR_DEFAULT) { switch (ma) { case VM_MEMATTR_UNCACHEABLE: return (LPTE_I | LPTE_G); case VM_MEMATTR_CACHEABLE: return (LPTE_M); case VM_MEMATTR_WRITE_COMBINING: case VM_MEMATTR_WRITE_BACK: case VM_MEMATTR_PREFETCHABLE: return (LPTE_I); case VM_MEMATTR_WRITE_THROUGH: return (LPTE_W | LPTE_M); } } /* * Assume the page is cache inhibited and access is guarded unless * it's in our available memory array. */ pte_lo = LPTE_I | LPTE_G; for (i = 0; i < pregions_sz; i++) { if ((pa >= pregions[i].mr_start) && (pa < (pregions[i].mr_start + pregions[i].mr_size))) { pte_lo &= ~(LPTE_I | LPTE_G); pte_lo |= LPTE_M; break; } } return pte_lo; } /* * Quick sort callout for comparing memory regions. */ static int om_cmp(const void *a, const void *b); static int om_cmp(const void *a, const void *b) { const struct ofw_map *mapa; const struct ofw_map *mapb; mapa = a; mapb = b; if (mapa->om_pa < mapb->om_pa) return (-1); else if (mapa->om_pa > mapb->om_pa) return (1); else return (0); } static void moea64_add_ofw_mappings(mmu_t mmup, phandle_t mmu, size_t sz) { struct ofw_map translations[sz/(4*sizeof(cell_t))]; /*>= 4 cells per */ pcell_t acells, trans_cells[sz/sizeof(cell_t)]; struct pvo_entry *pvo; register_t msr; vm_offset_t off; vm_paddr_t pa_base; int i, j; bzero(translations, sz); OF_getencprop(OF_finddevice("/"), "#address-cells", &acells, sizeof(acells)); if (OF_getencprop(mmu, "translations", trans_cells, sz) == -1) panic("moea64_bootstrap: can't get ofw translations"); CTR0(KTR_PMAP, "moea64_add_ofw_mappings: translations"); sz /= sizeof(cell_t); for (i = 0, j = 0; i < sz; j++) { translations[j].om_va = trans_cells[i++]; translations[j].om_len = trans_cells[i++]; translations[j].om_pa = trans_cells[i++]; if (acells == 2) { translations[j].om_pa <<= 32; translations[j].om_pa |= trans_cells[i++]; } translations[j].om_mode = trans_cells[i++]; } KASSERT(i == sz, ("Translations map has incorrect cell count (%d/%zd)", i, sz)); sz = j; qsort(translations, sz, sizeof (*translations), om_cmp); for (i = 0; i < sz; i++) { pa_base = translations[i].om_pa; #ifndef __powerpc64__ if ((translations[i].om_pa >> 32) != 0) panic("OFW translations above 32-bit boundary!"); #endif if (pa_base % PAGE_SIZE) panic("OFW translation not page-aligned (phys)!"); if (translations[i].om_va % PAGE_SIZE) panic("OFW translation not page-aligned (virt)!"); CTR3(KTR_PMAP, "translation: pa=%#zx va=%#x len=%#x", pa_base, translations[i].om_va, translations[i].om_len); /* Now enter the pages for this mapping */ DISABLE_TRANS(msr); for (off = 0; off < translations[i].om_len; off += PAGE_SIZE) { /* If this address is direct-mapped, skip remapping */ if (hw_direct_map && translations[i].om_va == PHYS_TO_DMAP(pa_base) && moea64_calc_wimg(pa_base + off, VM_MEMATTR_DEFAULT) == LPTE_M) continue; PMAP_LOCK(kernel_pmap); pvo = moea64_pvo_find_va(kernel_pmap, translations[i].om_va + off); PMAP_UNLOCK(kernel_pmap); if (pvo != NULL) continue; moea64_kenter(mmup, translations[i].om_va + off, pa_base + off); } ENABLE_TRANS(msr); } } #ifdef __powerpc64__ static void moea64_probe_large_page(void) { uint16_t pvr = mfpvr() >> 16; switch (pvr) { case IBM970: case IBM970FX: case IBM970MP: powerpc_sync(); isync(); mtspr(SPR_HID4, mfspr(SPR_HID4) & ~HID4_970_DISABLE_LG_PG); powerpc_sync(); isync(); /* FALLTHROUGH */ default: if (moea64_large_page_size == 0) { moea64_large_page_size = 0x1000000; /* 16 MB */ moea64_large_page_shift = 24; } } moea64_large_page_mask = moea64_large_page_size - 1; } static void moea64_bootstrap_slb_prefault(vm_offset_t va, int large) { struct slb *cache; struct slb entry; uint64_t esid, slbe; uint64_t i; cache = PCPU_GET(aim.slb); esid = va >> ADDR_SR_SHFT; slbe = (esid << SLBE_ESID_SHIFT) | SLBE_VALID; for (i = 0; i < 64; i++) { if (cache[i].slbe == (slbe | i)) return; } entry.slbe = slbe; entry.slbv = KERNEL_VSID(esid) << SLBV_VSID_SHIFT; if (large) entry.slbv |= SLBV_L; slb_insert_kernel(entry.slbe, entry.slbv); } #endif static int moea64_kenter_large(mmu_t mmup, vm_offset_t va, vm_paddr_t pa, uint64_t attr, int bootstrap) { struct pvo_entry *pvo; uint64_t pte_lo; int error; pte_lo = LPTE_M; pte_lo |= attr; pvo = alloc_pvo_entry(bootstrap); pvo->pvo_vaddr |= PVO_WIRED | PVO_LARGE; init_pvo_entry(pvo, kernel_pmap, va); pvo->pvo_pte.prot = VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE; pvo->pvo_pte.pa = pa | pte_lo; error = moea64_pvo_enter(mmup, pvo, NULL, NULL); if (error != 0) panic("Error %d inserting large page\n", error); return (0); } static void moea64_setup_direct_map(mmu_t mmup, vm_offset_t kernelstart, vm_offset_t kernelend) { register_t msr; vm_paddr_t pa, pkernelstart, pkernelend; vm_offset_t size, off; uint64_t pte_lo; int i; if (moea64_large_page_size == 0) hw_direct_map = 0; DISABLE_TRANS(msr); if (hw_direct_map) { PMAP_LOCK(kernel_pmap); for (i = 0; i < pregions_sz; i++) { for (pa = pregions[i].mr_start; pa < pregions[i].mr_start + pregions[i].mr_size; pa += moea64_large_page_size) { pte_lo = LPTE_M; if (pa & moea64_large_page_mask) { pa &= moea64_large_page_mask; pte_lo |= LPTE_G; } if (pa + moea64_large_page_size > pregions[i].mr_start + pregions[i].mr_size) pte_lo |= LPTE_G; moea64_kenter_large(mmup, PHYS_TO_DMAP(pa), pa, pte_lo, 1); } } PMAP_UNLOCK(kernel_pmap); } /* * Make sure the kernel and BPVO pool stay mapped on systems either * without a direct map or on which the kernel is not already executing * out of the direct-mapped region. */ if (kernelstart < DMAP_BASE_ADDRESS) { /* * For pre-dmap execution, we need to use identity mapping * because we will be operating with the mmu on but in the * wrong address configuration until we __restartkernel(). */ for (pa = kernelstart & ~PAGE_MASK; pa < kernelend; pa += PAGE_SIZE) moea64_kenter(mmup, pa, pa); } else if (!hw_direct_map) { pkernelstart = kernelstart & ~DMAP_BASE_ADDRESS; pkernelend = kernelend & ~DMAP_BASE_ADDRESS; for (pa = pkernelstart & ~PAGE_MASK; pa < pkernelend; pa += PAGE_SIZE) moea64_kenter(mmup, pa | DMAP_BASE_ADDRESS, pa); } if (!hw_direct_map) { size = moea64_bpvo_pool_size*sizeof(struct pvo_entry); off = (vm_offset_t)(moea64_bpvo_pool); for (pa = off; pa < off + size; pa += PAGE_SIZE) moea64_kenter(mmup, pa, pa); /* Map exception vectors */ for (pa = EXC_RSVD; pa < EXC_LAST; pa += PAGE_SIZE) moea64_kenter(mmup, pa | DMAP_BASE_ADDRESS, pa); } ENABLE_TRANS(msr); /* * Allow user to override unmapped_buf_allowed for testing. * XXXKIB Only direct map implementation was tested. */ if (!TUNABLE_INT_FETCH("vfs.unmapped_buf_allowed", &unmapped_buf_allowed)) unmapped_buf_allowed = hw_direct_map; } /* Quick sort callout for comparing physical addresses. */ static int pa_cmp(const void *a, const void *b) { const vm_paddr_t *pa = a, *pb = b; if (*pa < *pb) return (-1); else if (*pa > *pb) return (1); else return (0); } void moea64_early_bootstrap(mmu_t mmup, vm_offset_t kernelstart, vm_offset_t kernelend) { int i, j; vm_size_t physsz, hwphyssz; vm_paddr_t kernelphysstart, kernelphysend; int rm_pavail; #ifndef __powerpc64__ /* We don't have a direct map since there is no BAT */ hw_direct_map = 0; /* Make sure battable is zero, since we have no BAT */ for (i = 0; i < 16; i++) { battable[i].batu = 0; battable[i].batl = 0; } #else moea64_probe_large_page(); /* Use a direct map if we have large page support */ if (moea64_large_page_size > 0) hw_direct_map = 1; else hw_direct_map = 0; /* Install trap handlers for SLBs */ bcopy(&slbtrap, (void *)EXC_DSE,(size_t)&slbtrapend - (size_t)&slbtrap); bcopy(&slbtrap, (void *)EXC_ISE,(size_t)&slbtrapend - (size_t)&slbtrap); __syncicache((void *)EXC_DSE, 0x80); __syncicache((void *)EXC_ISE, 0x80); #endif kernelphysstart = kernelstart & ~DMAP_BASE_ADDRESS; kernelphysend = kernelend & ~DMAP_BASE_ADDRESS; /* Get physical memory regions from firmware */ mem_regions(&pregions, &pregions_sz, ®ions, ®ions_sz); CTR0(KTR_PMAP, "moea64_bootstrap: physical memory"); if (PHYS_AVAIL_ENTRIES < regions_sz) panic("moea64_bootstrap: phys_avail too small"); phys_avail_count = 0; physsz = 0; hwphyssz = 0; TUNABLE_ULONG_FETCH("hw.physmem", (u_long *) &hwphyssz); for (i = 0, j = 0; i < regions_sz; i++, j += 2) { CTR3(KTR_PMAP, "region: %#zx - %#zx (%#zx)", regions[i].mr_start, regions[i].mr_start + regions[i].mr_size, regions[i].mr_size); if (hwphyssz != 0 && (physsz + regions[i].mr_size) >= hwphyssz) { if (physsz < hwphyssz) { phys_avail[j] = regions[i].mr_start; phys_avail[j + 1] = regions[i].mr_start + hwphyssz - physsz; physsz = hwphyssz; phys_avail_count++; dump_avail[j] = phys_avail[j]; dump_avail[j + 1] = phys_avail[j + 1]; } break; } phys_avail[j] = regions[i].mr_start; phys_avail[j + 1] = regions[i].mr_start + regions[i].mr_size; phys_avail_count++; physsz += regions[i].mr_size; dump_avail[j] = phys_avail[j]; dump_avail[j + 1] = phys_avail[j + 1]; } /* Check for overlap with the kernel and exception vectors */ rm_pavail = 0; for (j = 0; j < 2*phys_avail_count; j+=2) { if (phys_avail[j] < EXC_LAST) phys_avail[j] += EXC_LAST; if (phys_avail[j] >= kernelphysstart && phys_avail[j+1] <= kernelphysend) { phys_avail[j] = phys_avail[j+1] = ~0; rm_pavail++; continue; } if (kernelphysstart >= phys_avail[j] && kernelphysstart < phys_avail[j+1]) { if (kernelphysend < phys_avail[j+1]) { phys_avail[2*phys_avail_count] = (kernelphysend & ~PAGE_MASK) + PAGE_SIZE; phys_avail[2*phys_avail_count + 1] = phys_avail[j+1]; phys_avail_count++; } phys_avail[j+1] = kernelphysstart & ~PAGE_MASK; } if (kernelphysend >= phys_avail[j] && kernelphysend < phys_avail[j+1]) { if (kernelphysstart > phys_avail[j]) { phys_avail[2*phys_avail_count] = phys_avail[j]; phys_avail[2*phys_avail_count + 1] = kernelphysstart & ~PAGE_MASK; phys_avail_count++; } phys_avail[j] = (kernelphysend & ~PAGE_MASK) + PAGE_SIZE; } } /* Remove physical available regions marked for removal (~0) */ if (rm_pavail) { qsort(phys_avail, 2*phys_avail_count, sizeof(phys_avail[0]), pa_cmp); phys_avail_count -= rm_pavail; for (i = 2*phys_avail_count; i < 2*(phys_avail_count + rm_pavail); i+=2) phys_avail[i] = phys_avail[i+1] = 0; } physmem = btoc(physsz); #ifdef PTEGCOUNT moea64_pteg_count = PTEGCOUNT; #else moea64_pteg_count = 0x1000; while (moea64_pteg_count < physmem) moea64_pteg_count <<= 1; moea64_pteg_count >>= 1; #endif /* PTEGCOUNT */ } void moea64_mid_bootstrap(mmu_t mmup, vm_offset_t kernelstart, vm_offset_t kernelend) { int i; /* * Set PTEG mask */ moea64_pteg_mask = moea64_pteg_count - 1; /* * Initialize SLB table lock and page locks */ mtx_init(&moea64_slb_mutex, "SLB table", NULL, MTX_DEF); for (i = 0; i < PV_LOCK_COUNT; i++) mtx_init(&pv_lock[i], "page pv", NULL, MTX_DEF); /* * Initialise the bootstrap pvo pool. */ TUNABLE_INT_FETCH("machdep.moea64_bpvo_pool_size", &moea64_bpvo_pool_size); if (moea64_bpvo_pool_size == 0) { if (!hw_direct_map) moea64_bpvo_pool_size = ((ptoa((uintmax_t)physmem) * sizeof(struct vm_page)) / (PAGE_SIZE * PAGE_SIZE)) * BPVO_POOL_EXPANSION_FACTOR; else moea64_bpvo_pool_size = BPVO_POOL_SIZE; } if (boothowto & RB_VERBOSE) { printf("mmu_oea64: bpvo pool entries = %d, bpvo pool size = %zu MB\n", moea64_bpvo_pool_size, moea64_bpvo_pool_size*sizeof(struct pvo_entry) / 1048576); } moea64_bpvo_pool = (struct pvo_entry *)moea64_bootstrap_alloc( moea64_bpvo_pool_size*sizeof(struct pvo_entry), PAGE_SIZE); moea64_bpvo_pool_index = 0; /* Place at address usable through the direct map */ if (hw_direct_map) moea64_bpvo_pool = (struct pvo_entry *) PHYS_TO_DMAP((uintptr_t)moea64_bpvo_pool); /* * Make sure kernel vsid is allocated as well as VSID 0. */ #ifndef __powerpc64__ moea64_vsid_bitmap[(KERNEL_VSIDBITS & (NVSIDS - 1)) / VSID_NBPW] |= 1 << (KERNEL_VSIDBITS % VSID_NBPW); moea64_vsid_bitmap[0] |= 1; #endif /* * Initialize the kernel pmap (which is statically allocated). */ #ifdef __powerpc64__ for (i = 0; i < 64; i++) { pcpup->pc_aim.slb[i].slbv = 0; pcpup->pc_aim.slb[i].slbe = 0; } #else for (i = 0; i < 16; i++) kernel_pmap->pm_sr[i] = EMPTY_SEGMENT + i; #endif kernel_pmap->pmap_phys = kernel_pmap; CPU_FILL(&kernel_pmap->pm_active); RB_INIT(&kernel_pmap->pmap_pvo); PMAP_LOCK_INIT(kernel_pmap); /* * Now map in all the other buffers we allocated earlier */ moea64_setup_direct_map(mmup, kernelstart, kernelend); } void moea64_late_bootstrap(mmu_t mmup, vm_offset_t kernelstart, vm_offset_t kernelend) { ihandle_t mmui; phandle_t chosen; phandle_t mmu; ssize_t sz; int i; vm_offset_t pa, va; void *dpcpu; /* * Set up the Open Firmware pmap and add its mappings if not in real * mode. */ chosen = OF_finddevice("/chosen"); if (chosen != -1 && OF_getencprop(chosen, "mmu", &mmui, 4) != -1) { mmu = OF_instance_to_package(mmui); if (mmu == -1 || (sz = OF_getproplen(mmu, "translations")) == -1) sz = 0; if (sz > 6144 /* tmpstksz - 2 KB headroom */) panic("moea64_bootstrap: too many ofw translations"); if (sz > 0) moea64_add_ofw_mappings(mmup, mmu, sz); } /* * Calculate the last available physical address. */ Maxmem = 0; for (i = 0; phys_avail[i + 2] != 0; i += 2) Maxmem = MAX(Maxmem, powerpc_btop(phys_avail[i + 1])); /* * Initialize MMU. */ MMU_CPU_BOOTSTRAP(mmup,0); mtmsr(mfmsr() | PSL_DR | PSL_IR); pmap_bootstrapped++; /* * Set the start and end of kva. */ virtual_avail = VM_MIN_KERNEL_ADDRESS; virtual_end = VM_MAX_SAFE_KERNEL_ADDRESS; /* * Map the entire KVA range into the SLB. We must not fault there. */ #ifdef __powerpc64__ for (va = virtual_avail; va < virtual_end; va += SEGMENT_LENGTH) moea64_bootstrap_slb_prefault(va, 0); #endif /* * Remap any early IO mappings (console framebuffer, etc.) */ bs_remap_earlyboot(); /* * Figure out how far we can extend virtual_end into segment 16 * without running into existing mappings. Segment 16 is guaranteed * to contain neither RAM nor devices (at least on Apple hardware), * but will generally contain some OFW mappings we should not * step on. */ #ifndef __powerpc64__ /* KVA is in high memory on PPC64 */ PMAP_LOCK(kernel_pmap); while (virtual_end < VM_MAX_KERNEL_ADDRESS && moea64_pvo_find_va(kernel_pmap, virtual_end+1) == NULL) virtual_end += PAGE_SIZE; PMAP_UNLOCK(kernel_pmap); #endif /* * Allocate a kernel stack with a guard page for thread0 and map it * into the kernel page map. */ pa = moea64_bootstrap_alloc(kstack_pages * PAGE_SIZE, PAGE_SIZE); va = virtual_avail + KSTACK_GUARD_PAGES * PAGE_SIZE; virtual_avail = va + kstack_pages * PAGE_SIZE; CTR2(KTR_PMAP, "moea64_bootstrap: kstack0 at %#x (%#x)", pa, va); thread0.td_kstack = va; thread0.td_kstack_pages = kstack_pages; for (i = 0; i < kstack_pages; i++) { moea64_kenter(mmup, va, pa); pa += PAGE_SIZE; va += PAGE_SIZE; } /* * Allocate virtual address space for the message buffer. */ pa = msgbuf_phys = moea64_bootstrap_alloc(msgbufsize, PAGE_SIZE); msgbufp = (struct msgbuf *)virtual_avail; va = virtual_avail; virtual_avail += round_page(msgbufsize); while (va < virtual_avail) { moea64_kenter(mmup, va, pa); pa += PAGE_SIZE; va += PAGE_SIZE; } /* * Allocate virtual address space for the dynamic percpu area. */ pa = moea64_bootstrap_alloc(DPCPU_SIZE, PAGE_SIZE); dpcpu = (void *)virtual_avail; va = virtual_avail; virtual_avail += DPCPU_SIZE; while (va < virtual_avail) { moea64_kenter(mmup, va, pa); pa += PAGE_SIZE; va += PAGE_SIZE; } dpcpu_init(dpcpu, curcpu); crashdumpmap = (caddr_t)virtual_avail; virtual_avail += MAXDUMPPGS * PAGE_SIZE; /* * Allocate some things for page zeroing. We put this directly * in the page table and use MOEA64_PTE_REPLACE to avoid any * of the PVO book-keeping or other parts of the VM system * from even knowing that this hack exists. */ if (!hw_direct_map) { mtx_init(&moea64_scratchpage_mtx, "pvo zero page", NULL, MTX_DEF); for (i = 0; i < 2; i++) { moea64_scratchpage_va[i] = (virtual_end+1) - PAGE_SIZE; virtual_end -= PAGE_SIZE; moea64_kenter(mmup, moea64_scratchpage_va[i], 0); PMAP_LOCK(kernel_pmap); moea64_scratchpage_pvo[i] = moea64_pvo_find_va( kernel_pmap, (vm_offset_t)moea64_scratchpage_va[i]); PMAP_UNLOCK(kernel_pmap); } } numa_mem_regions(&numa_pregions, &numapregions_sz); } static void moea64_pmap_init_qpages(void) { struct pcpu *pc; int i; if (hw_direct_map) return; CPU_FOREACH(i) { pc = pcpu_find(i); pc->pc_qmap_addr = kva_alloc(PAGE_SIZE); if (pc->pc_qmap_addr == 0) panic("pmap_init_qpages: unable to allocate KVA"); PMAP_LOCK(kernel_pmap); pc->pc_aim.qmap_pvo = moea64_pvo_find_va(kernel_pmap, pc->pc_qmap_addr); PMAP_UNLOCK(kernel_pmap); mtx_init(&pc->pc_aim.qmap_lock, "qmap lock", NULL, MTX_DEF); } } SYSINIT(qpages_init, SI_SUB_CPU, SI_ORDER_ANY, moea64_pmap_init_qpages, NULL); /* * Activate a user pmap. This mostly involves setting some non-CPU * state. */ void moea64_activate(mmu_t mmu, struct thread *td) { pmap_t pm; pm = &td->td_proc->p_vmspace->vm_pmap; CPU_SET(PCPU_GET(cpuid), &pm->pm_active); #ifdef __powerpc64__ PCPU_SET(aim.userslb, pm->pm_slb); __asm __volatile("slbmte %0, %1; isync" :: "r"(td->td_pcb->pcb_cpu.aim.usr_vsid), "r"(USER_SLB_SLBE)); #else PCPU_SET(curpmap, pm->pmap_phys); mtsrin(USER_SR << ADDR_SR_SHFT, td->td_pcb->pcb_cpu.aim.usr_vsid); #endif } void moea64_deactivate(mmu_t mmu, struct thread *td) { pmap_t pm; __asm __volatile("isync; slbie %0" :: "r"(USER_ADDR)); pm = &td->td_proc->p_vmspace->vm_pmap; CPU_CLR(PCPU_GET(cpuid), &pm->pm_active); #ifdef __powerpc64__ PCPU_SET(aim.userslb, NULL); #else PCPU_SET(curpmap, NULL); #endif } void moea64_unwire(mmu_t mmu, pmap_t pm, vm_offset_t sva, vm_offset_t eva) { struct pvo_entry key, *pvo; vm_page_t m; int64_t refchg; key.pvo_vaddr = sva; PMAP_LOCK(pm); for (pvo = RB_NFIND(pvo_tree, &pm->pmap_pvo, &key); pvo != NULL && PVO_VADDR(pvo) < eva; pvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo)) { if ((pvo->pvo_vaddr & PVO_WIRED) == 0) panic("moea64_unwire: pvo %p is missing PVO_WIRED", pvo); pvo->pvo_vaddr &= ~PVO_WIRED; refchg = MOEA64_PTE_REPLACE(mmu, pvo, 0 /* No invalidation */); if ((pvo->pvo_vaddr & PVO_MANAGED) && (pvo->pvo_pte.prot & VM_PROT_WRITE)) { if (refchg < 0) refchg = LPTE_CHG; m = PHYS_TO_VM_PAGE(pvo->pvo_pte.pa & LPTE_RPGN); refchg |= atomic_readandclear_32(&m->md.mdpg_attrs); if (refchg & LPTE_CHG) vm_page_dirty(m); if (refchg & LPTE_REF) vm_page_aflag_set(m, PGA_REFERENCED); } pm->pm_stats.wired_count--; } PMAP_UNLOCK(pm); } /* * This goes through and sets the physical address of our * special scratch PTE to the PA we want to zero or copy. Because * of locking issues (this can get called in pvo_enter() by * the UMA allocator), we can't use most other utility functions here */ static __inline void moea64_set_scratchpage_pa(mmu_t mmup, int which, vm_paddr_t pa) { struct pvo_entry *pvo; KASSERT(!hw_direct_map, ("Using OEA64 scratchpage with a direct map!")); mtx_assert(&moea64_scratchpage_mtx, MA_OWNED); pvo = moea64_scratchpage_pvo[which]; PMAP_LOCK(pvo->pvo_pmap); pvo->pvo_pte.pa = moea64_calc_wimg(pa, VM_MEMATTR_DEFAULT) | (uint64_t)pa; MOEA64_PTE_REPLACE(mmup, pvo, MOEA64_PTE_INVALIDATE); PMAP_UNLOCK(pvo->pvo_pmap); isync(); } void moea64_copy_page(mmu_t mmu, vm_page_t msrc, vm_page_t mdst) { vm_offset_t dst; vm_offset_t src; dst = VM_PAGE_TO_PHYS(mdst); src = VM_PAGE_TO_PHYS(msrc); if (hw_direct_map) { bcopy((void *)PHYS_TO_DMAP(src), (void *)PHYS_TO_DMAP(dst), PAGE_SIZE); } else { mtx_lock(&moea64_scratchpage_mtx); moea64_set_scratchpage_pa(mmu, 0, src); moea64_set_scratchpage_pa(mmu, 1, dst); bcopy((void *)moea64_scratchpage_va[0], (void *)moea64_scratchpage_va[1], PAGE_SIZE); mtx_unlock(&moea64_scratchpage_mtx); } } static inline void moea64_copy_pages_dmap(mmu_t mmu, vm_page_t *ma, vm_offset_t a_offset, vm_page_t *mb, vm_offset_t b_offset, int xfersize) { void *a_cp, *b_cp; vm_offset_t a_pg_offset, b_pg_offset; int cnt; while (xfersize > 0) { a_pg_offset = a_offset & PAGE_MASK; cnt = min(xfersize, PAGE_SIZE - a_pg_offset); a_cp = (char *)(uintptr_t)PHYS_TO_DMAP( VM_PAGE_TO_PHYS(ma[a_offset >> PAGE_SHIFT])) + a_pg_offset; b_pg_offset = b_offset & PAGE_MASK; cnt = min(cnt, PAGE_SIZE - b_pg_offset); b_cp = (char *)(uintptr_t)PHYS_TO_DMAP( VM_PAGE_TO_PHYS(mb[b_offset >> PAGE_SHIFT])) + b_pg_offset; bcopy(a_cp, b_cp, cnt); a_offset += cnt; b_offset += cnt; xfersize -= cnt; } } static inline void moea64_copy_pages_nodmap(mmu_t mmu, vm_page_t *ma, vm_offset_t a_offset, vm_page_t *mb, vm_offset_t b_offset, int xfersize) { void *a_cp, *b_cp; vm_offset_t a_pg_offset, b_pg_offset; int cnt; mtx_lock(&moea64_scratchpage_mtx); while (xfersize > 0) { a_pg_offset = a_offset & PAGE_MASK; cnt = min(xfersize, PAGE_SIZE - a_pg_offset); moea64_set_scratchpage_pa(mmu, 0, VM_PAGE_TO_PHYS(ma[a_offset >> PAGE_SHIFT])); a_cp = (char *)moea64_scratchpage_va[0] + a_pg_offset; b_pg_offset = b_offset & PAGE_MASK; cnt = min(cnt, PAGE_SIZE - b_pg_offset); moea64_set_scratchpage_pa(mmu, 1, VM_PAGE_TO_PHYS(mb[b_offset >> PAGE_SHIFT])); b_cp = (char *)moea64_scratchpage_va[1] + b_pg_offset; bcopy(a_cp, b_cp, cnt); a_offset += cnt; b_offset += cnt; xfersize -= cnt; } mtx_unlock(&moea64_scratchpage_mtx); } void moea64_copy_pages(mmu_t mmu, vm_page_t *ma, vm_offset_t a_offset, vm_page_t *mb, vm_offset_t b_offset, int xfersize) { if (hw_direct_map) { moea64_copy_pages_dmap(mmu, ma, a_offset, mb, b_offset, xfersize); } else { moea64_copy_pages_nodmap(mmu, ma, a_offset, mb, b_offset, xfersize); } } void moea64_zero_page_area(mmu_t mmu, vm_page_t m, int off, int size) { vm_paddr_t pa = VM_PAGE_TO_PHYS(m); if (size + off > PAGE_SIZE) panic("moea64_zero_page: size + off > PAGE_SIZE"); if (hw_direct_map) { bzero((caddr_t)(uintptr_t)PHYS_TO_DMAP(pa) + off, size); } else { mtx_lock(&moea64_scratchpage_mtx); moea64_set_scratchpage_pa(mmu, 0, pa); bzero((caddr_t)moea64_scratchpage_va[0] + off, size); mtx_unlock(&moea64_scratchpage_mtx); } } /* * Zero a page of physical memory by temporarily mapping it */ void moea64_zero_page(mmu_t mmu, vm_page_t m) { vm_paddr_t pa = VM_PAGE_TO_PHYS(m); vm_offset_t va, off; if (!hw_direct_map) { mtx_lock(&moea64_scratchpage_mtx); moea64_set_scratchpage_pa(mmu, 0, pa); va = moea64_scratchpage_va[0]; } else { va = PHYS_TO_DMAP(pa); } for (off = 0; off < PAGE_SIZE; off += cacheline_size) __asm __volatile("dcbz 0,%0" :: "r"(va + off)); if (!hw_direct_map) mtx_unlock(&moea64_scratchpage_mtx); } vm_offset_t moea64_quick_enter_page(mmu_t mmu, vm_page_t m) { struct pvo_entry *pvo; vm_paddr_t pa = VM_PAGE_TO_PHYS(m); if (hw_direct_map) return (PHYS_TO_DMAP(pa)); /* * MOEA64_PTE_REPLACE does some locking, so we can't just grab * a critical section and access the PCPU data like on i386. * Instead, pin the thread and grab the PCPU lock to prevent * a preempting thread from using the same PCPU data. */ sched_pin(); mtx_assert(PCPU_PTR(aim.qmap_lock), MA_NOTOWNED); pvo = PCPU_GET(aim.qmap_pvo); mtx_lock(PCPU_PTR(aim.qmap_lock)); pvo->pvo_pte.pa = moea64_calc_wimg(pa, pmap_page_get_memattr(m)) | (uint64_t)pa; MOEA64_PTE_REPLACE(mmu, pvo, MOEA64_PTE_INVALIDATE); isync(); return (PCPU_GET(qmap_addr)); } void moea64_quick_remove_page(mmu_t mmu, vm_offset_t addr) { if (hw_direct_map) return; mtx_assert(PCPU_PTR(aim.qmap_lock), MA_OWNED); KASSERT(PCPU_GET(qmap_addr) == addr, ("moea64_quick_remove_page: invalid address")); mtx_unlock(PCPU_PTR(aim.qmap_lock)); sched_unpin(); +} + +boolean_t +moea64_page_is_mapped(mmu_t mmu, vm_page_t m) +{ + return (!LIST_EMPTY(&(m)->md.mdpg_pvoh)); } /* * Map the given physical page at the specified virtual address in the * target pmap with the protection requested. If specified the page * will be wired down. */ int moea64_enter(mmu_t mmu, pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, u_int flags, int8_t psind) { struct pvo_entry *pvo, *oldpvo; struct pvo_head *pvo_head; uint64_t pte_lo; int error; if ((m->oflags & VPO_UNMANAGED) == 0) { if ((flags & PMAP_ENTER_QUICK_LOCKED) == 0) VM_PAGE_OBJECT_BUSY_ASSERT(m); else VM_OBJECT_ASSERT_LOCKED(m->object); } pvo = alloc_pvo_entry(0); if (pvo == NULL) return (KERN_RESOURCE_SHORTAGE); pvo->pvo_pmap = NULL; /* to be filled in later */ pvo->pvo_pte.prot = prot; pte_lo = moea64_calc_wimg(VM_PAGE_TO_PHYS(m), pmap_page_get_memattr(m)); pvo->pvo_pte.pa = VM_PAGE_TO_PHYS(m) | pte_lo; if ((flags & PMAP_ENTER_WIRED) != 0) pvo->pvo_vaddr |= PVO_WIRED; if ((m->oflags & VPO_UNMANAGED) != 0 || !moea64_initialized) { pvo_head = NULL; } else { pvo_head = &m->md.mdpg_pvoh; pvo->pvo_vaddr |= PVO_MANAGED; } PV_PAGE_LOCK(m); PMAP_LOCK(pmap); if (pvo->pvo_pmap == NULL) init_pvo_entry(pvo, pmap, va); if (prot & VM_PROT_WRITE) if (pmap_bootstrapped && (m->oflags & VPO_UNMANAGED) == 0) vm_page_aflag_set(m, PGA_WRITEABLE); error = moea64_pvo_enter(mmu, pvo, pvo_head, &oldpvo); if (error == EEXIST) { if (oldpvo->pvo_vaddr == pvo->pvo_vaddr && oldpvo->pvo_pte.pa == pvo->pvo_pte.pa && oldpvo->pvo_pte.prot == prot) { /* Identical mapping already exists */ error = 0; /* If not in page table, reinsert it */ if (MOEA64_PTE_SYNCH(mmu, oldpvo) < 0) { STAT_MOEA64(moea64_pte_overflow--); MOEA64_PTE_INSERT(mmu, oldpvo); } /* Then just clean up and go home */ PV_PAGE_UNLOCK(m); PMAP_UNLOCK(pmap); free_pvo_entry(pvo); goto out; } else { /* Otherwise, need to kill it first */ KASSERT(oldpvo->pvo_pmap == pmap, ("pmap of old " "mapping does not match new mapping")); moea64_pvo_remove_from_pmap(mmu, oldpvo); moea64_pvo_enter(mmu, pvo, pvo_head, NULL); } } PMAP_UNLOCK(pmap); PV_PAGE_UNLOCK(m); /* Free any dead pages */ if (error == EEXIST) { moea64_pvo_remove_from_page(mmu, oldpvo); free_pvo_entry(oldpvo); } out: /* * Flush the page from the instruction cache if this page is * mapped executable and cacheable. */ if (pmap != kernel_pmap && (m->a.flags & PGA_EXECUTABLE) == 0 && (pte_lo & (LPTE_I | LPTE_G | LPTE_NOEXEC)) == 0) { vm_page_aflag_set(m, PGA_EXECUTABLE); moea64_syncicache(mmu, pmap, va, VM_PAGE_TO_PHYS(m), PAGE_SIZE); } return (KERN_SUCCESS); } static void moea64_syncicache(mmu_t mmu, pmap_t pmap, vm_offset_t va, vm_paddr_t pa, vm_size_t sz) { /* * This is much trickier than on older systems because * we can't sync the icache on physical addresses directly * without a direct map. Instead we check a couple of cases * where the memory is already mapped in and, failing that, * use the same trick we use for page zeroing to create * a temporary mapping for this physical address. */ if (!pmap_bootstrapped) { /* * If PMAP is not bootstrapped, we are likely to be * in real mode. */ __syncicache((void *)(uintptr_t)pa, sz); } else if (pmap == kernel_pmap) { __syncicache((void *)va, sz); } else if (hw_direct_map) { __syncicache((void *)(uintptr_t)PHYS_TO_DMAP(pa), sz); } else { /* Use the scratch page to set up a temp mapping */ mtx_lock(&moea64_scratchpage_mtx); moea64_set_scratchpage_pa(mmu, 1, pa & ~ADDR_POFF); __syncicache((void *)(moea64_scratchpage_va[1] + (va & ADDR_POFF)), sz); mtx_unlock(&moea64_scratchpage_mtx); } } /* * Maps a sequence of resident pages belonging to the same object. * The sequence begins with the given page m_start. This page is * mapped at the given virtual address start. Each subsequent page is * mapped at a virtual address that is offset from start by the same * amount as the page is offset from m_start within the object. The * last page in the sequence is the page with the largest offset from * m_start that can be mapped at a virtual address less than the given * virtual address end. Not every virtual page between start and end * is mapped; only those for which a resident page exists with the * corresponding offset from m_start are mapped. */ void moea64_enter_object(mmu_t mmu, pmap_t pm, vm_offset_t start, vm_offset_t end, vm_page_t m_start, vm_prot_t prot) { vm_page_t m; vm_pindex_t diff, psize; VM_OBJECT_ASSERT_LOCKED(m_start->object); psize = atop(end - start); m = m_start; while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { moea64_enter(mmu, pm, start + ptoa(diff), m, prot & (VM_PROT_READ | VM_PROT_EXECUTE), PMAP_ENTER_NOSLEEP | PMAP_ENTER_QUICK_LOCKED, 0); m = TAILQ_NEXT(m, listq); } } void moea64_enter_quick(mmu_t mmu, pmap_t pm, vm_offset_t va, vm_page_t m, vm_prot_t prot) { moea64_enter(mmu, pm, va, m, prot & (VM_PROT_READ | VM_PROT_EXECUTE), PMAP_ENTER_NOSLEEP | PMAP_ENTER_QUICK_LOCKED, 0); } vm_paddr_t moea64_extract(mmu_t mmu, pmap_t pm, vm_offset_t va) { struct pvo_entry *pvo; vm_paddr_t pa; PMAP_LOCK(pm); pvo = moea64_pvo_find_va(pm, va); if (pvo == NULL) pa = 0; else pa = (pvo->pvo_pte.pa & LPTE_RPGN) | (va - PVO_VADDR(pvo)); PMAP_UNLOCK(pm); return (pa); } /* * Atomically extract and hold the physical page with the given * pmap and virtual address pair if that mapping permits the given * protection. */ vm_page_t moea64_extract_and_hold(mmu_t mmu, pmap_t pmap, vm_offset_t va, vm_prot_t prot) { struct pvo_entry *pvo; vm_page_t m; m = NULL; PMAP_LOCK(pmap); pvo = moea64_pvo_find_va(pmap, va & ~ADDR_POFF); if (pvo != NULL && (pvo->pvo_pte.prot & prot) == prot) { m = PHYS_TO_VM_PAGE(pvo->pvo_pte.pa & LPTE_RPGN); if (!vm_page_wire_mapped(m)) m = NULL; } PMAP_UNLOCK(pmap); return (m); } static mmu_t installed_mmu; static void * moea64_uma_page_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *flags, int wait) { struct pvo_entry *pvo; vm_offset_t va; vm_page_t m; int needed_lock; /* * This entire routine is a horrible hack to avoid bothering kmem * for new KVA addresses. Because this can get called from inside * kmem allocation routines, calling kmem for a new address here * can lead to multiply locking non-recursive mutexes. */ *flags = UMA_SLAB_PRIV; needed_lock = !PMAP_LOCKED(kernel_pmap); m = vm_page_alloc_domain(NULL, 0, domain, malloc2vm_flags(wait) | VM_ALLOC_WIRED | VM_ALLOC_NOOBJ); if (m == NULL) return (NULL); va = VM_PAGE_TO_PHYS(m); pvo = alloc_pvo_entry(1 /* bootstrap */); pvo->pvo_pte.prot = VM_PROT_READ | VM_PROT_WRITE; pvo->pvo_pte.pa = VM_PAGE_TO_PHYS(m) | LPTE_M; if (needed_lock) PMAP_LOCK(kernel_pmap); init_pvo_entry(pvo, kernel_pmap, va); pvo->pvo_vaddr |= PVO_WIRED; moea64_pvo_enter(installed_mmu, pvo, NULL, NULL); if (needed_lock) PMAP_UNLOCK(kernel_pmap); if ((wait & M_ZERO) && (m->flags & PG_ZERO) == 0) bzero((void *)va, PAGE_SIZE); return (void *)va; } extern int elf32_nxstack; void moea64_init(mmu_t mmu) { CTR0(KTR_PMAP, "moea64_init"); moea64_pvo_zone = uma_zcreate("UPVO entry", sizeof (struct pvo_entry), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM | UMA_ZONE_NOFREE); if (!hw_direct_map) { installed_mmu = mmu; uma_zone_set_allocf(moea64_pvo_zone, moea64_uma_page_alloc); } #ifdef COMPAT_FREEBSD32 elf32_nxstack = 1; #endif moea64_initialized = TRUE; } boolean_t moea64_is_referenced(mmu_t mmu, vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("moea64_is_referenced: page %p is not managed", m)); return (moea64_query_bit(mmu, m, LPTE_REF)); } boolean_t moea64_is_modified(mmu_t mmu, vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("moea64_is_modified: page %p is not managed", m)); /* * If the page is not busied then this check is racy. */ if (!pmap_page_is_write_mapped(m)) return (FALSE); return (moea64_query_bit(mmu, m, LPTE_CHG)); } boolean_t moea64_is_prefaultable(mmu_t mmu, pmap_t pmap, vm_offset_t va) { struct pvo_entry *pvo; boolean_t rv = TRUE; PMAP_LOCK(pmap); pvo = moea64_pvo_find_va(pmap, va & ~ADDR_POFF); if (pvo != NULL) rv = FALSE; PMAP_UNLOCK(pmap); return (rv); } void moea64_clear_modify(mmu_t mmu, vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("moea64_clear_modify: page %p is not managed", m)); vm_page_assert_busied(m); if (!pmap_page_is_write_mapped(m)) return; moea64_clear_bit(mmu, m, LPTE_CHG); } /* * Clear the write and modified bits in each of the given page's mappings. */ void moea64_remove_write(mmu_t mmu, vm_page_t m) { struct pvo_entry *pvo; int64_t refchg, ret; pmap_t pmap; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("moea64_remove_write: page %p is not managed", m)); vm_page_assert_busied(m); if (!pmap_page_is_write_mapped(m)) return powerpc_sync(); PV_PAGE_LOCK(m); refchg = 0; LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) { pmap = pvo->pvo_pmap; PMAP_LOCK(pmap); if (!(pvo->pvo_vaddr & PVO_DEAD) && (pvo->pvo_pte.prot & VM_PROT_WRITE)) { pvo->pvo_pte.prot &= ~VM_PROT_WRITE; ret = MOEA64_PTE_REPLACE(mmu, pvo, MOEA64_PTE_PROT_UPDATE); if (ret < 0) ret = LPTE_CHG; refchg |= ret; if (pvo->pvo_pmap == kernel_pmap) isync(); } PMAP_UNLOCK(pmap); } if ((refchg | atomic_readandclear_32(&m->md.mdpg_attrs)) & LPTE_CHG) vm_page_dirty(m); vm_page_aflag_clear(m, PGA_WRITEABLE); PV_PAGE_UNLOCK(m); } /* * moea64_ts_referenced: * * Return a count of reference bits for a page, clearing those bits. * It is not necessary for every reference bit to be cleared, but it * is necessary that 0 only be returned when there are truly no * reference bits set. * * XXX: The exact number of bits to check and clear is a matter that * should be tested and standardized at some point in the future for * optimal aging of shared pages. */ int moea64_ts_referenced(mmu_t mmu, vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("moea64_ts_referenced: page %p is not managed", m)); return (moea64_clear_bit(mmu, m, LPTE_REF)); } /* * Modify the WIMG settings of all mappings for a page. */ void moea64_page_set_memattr(mmu_t mmu, vm_page_t m, vm_memattr_t ma) { struct pvo_entry *pvo; int64_t refchg; pmap_t pmap; uint64_t lo; if ((m->oflags & VPO_UNMANAGED) != 0) { m->md.mdpg_cache_attrs = ma; return; } lo = moea64_calc_wimg(VM_PAGE_TO_PHYS(m), ma); PV_PAGE_LOCK(m); LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) { pmap = pvo->pvo_pmap; PMAP_LOCK(pmap); if (!(pvo->pvo_vaddr & PVO_DEAD)) { pvo->pvo_pte.pa &= ~LPTE_WIMG; pvo->pvo_pte.pa |= lo; refchg = MOEA64_PTE_REPLACE(mmu, pvo, MOEA64_PTE_INVALIDATE); if (refchg < 0) refchg = (pvo->pvo_pte.prot & VM_PROT_WRITE) ? LPTE_CHG : 0; if ((pvo->pvo_vaddr & PVO_MANAGED) && (pvo->pvo_pte.prot & VM_PROT_WRITE)) { refchg |= atomic_readandclear_32(&m->md.mdpg_attrs); if (refchg & LPTE_CHG) vm_page_dirty(m); if (refchg & LPTE_REF) vm_page_aflag_set(m, PGA_REFERENCED); } if (pvo->pvo_pmap == kernel_pmap) isync(); } PMAP_UNLOCK(pmap); } m->md.mdpg_cache_attrs = ma; PV_PAGE_UNLOCK(m); } /* * Map a wired page into kernel virtual address space. */ void moea64_kenter_attr(mmu_t mmu, vm_offset_t va, vm_paddr_t pa, vm_memattr_t ma) { int error; struct pvo_entry *pvo, *oldpvo; do { pvo = alloc_pvo_entry(0); if (pvo == NULL) vm_wait(NULL); } while (pvo == NULL); pvo->pvo_pte.prot = VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE; pvo->pvo_pte.pa = (pa & ~ADDR_POFF) | moea64_calc_wimg(pa, ma); pvo->pvo_vaddr |= PVO_WIRED; PMAP_LOCK(kernel_pmap); oldpvo = moea64_pvo_find_va(kernel_pmap, va); if (oldpvo != NULL) moea64_pvo_remove_from_pmap(mmu, oldpvo); init_pvo_entry(pvo, kernel_pmap, va); error = moea64_pvo_enter(mmu, pvo, NULL, NULL); PMAP_UNLOCK(kernel_pmap); /* Free any dead pages */ if (oldpvo != NULL) { moea64_pvo_remove_from_page(mmu, oldpvo); free_pvo_entry(oldpvo); } if (error != 0) panic("moea64_kenter: failed to enter va %#zx pa %#jx: %d", va, (uintmax_t)pa, error); } void moea64_kenter(mmu_t mmu, vm_offset_t va, vm_paddr_t pa) { moea64_kenter_attr(mmu, va, pa, VM_MEMATTR_DEFAULT); } /* * Extract the physical page address associated with the given kernel virtual * address. */ vm_paddr_t moea64_kextract(mmu_t mmu, vm_offset_t va) { struct pvo_entry *pvo; vm_paddr_t pa; /* * Shortcut the direct-mapped case when applicable. We never put * anything but 1:1 (or 62-bit aliased) mappings below * VM_MIN_KERNEL_ADDRESS. */ if (va < VM_MIN_KERNEL_ADDRESS) return (va & ~DMAP_BASE_ADDRESS); PMAP_LOCK(kernel_pmap); pvo = moea64_pvo_find_va(kernel_pmap, va); KASSERT(pvo != NULL, ("moea64_kextract: no addr found for %#" PRIxPTR, va)); pa = (pvo->pvo_pte.pa & LPTE_RPGN) | (va - PVO_VADDR(pvo)); PMAP_UNLOCK(kernel_pmap); return (pa); } /* * Remove a wired page from kernel virtual address space. */ void moea64_kremove(mmu_t mmu, vm_offset_t va) { moea64_remove(mmu, kernel_pmap, va, va + PAGE_SIZE); } /* * Provide a kernel pointer corresponding to a given userland pointer. * The returned pointer is valid until the next time this function is * called in this thread. This is used internally in copyin/copyout. */ static int moea64_map_user_ptr(mmu_t mmu, pmap_t pm, volatile const void *uaddr, void **kaddr, size_t ulen, size_t *klen) { size_t l; #ifdef __powerpc64__ struct slb *slb; #endif register_t slbv; *kaddr = (char *)USER_ADDR + ((uintptr_t)uaddr & ~SEGMENT_MASK); l = ((char *)USER_ADDR + SEGMENT_LENGTH) - (char *)(*kaddr); if (l > ulen) l = ulen; if (klen) *klen = l; else if (l != ulen) return (EFAULT); #ifdef __powerpc64__ /* Try lockless look-up first */ slb = user_va_to_slb_entry(pm, (vm_offset_t)uaddr); if (slb == NULL) { /* If it isn't there, we need to pre-fault the VSID */ PMAP_LOCK(pm); slbv = va_to_vsid(pm, (vm_offset_t)uaddr) << SLBV_VSID_SHIFT; PMAP_UNLOCK(pm); } else { slbv = slb->slbv; } /* Mark segment no-execute */ slbv |= SLBV_N; #else slbv = va_to_vsid(pm, (vm_offset_t)uaddr); /* Mark segment no-execute */ slbv |= SR_N; #endif /* If we have already set this VSID, we can just return */ if (curthread->td_pcb->pcb_cpu.aim.usr_vsid == slbv) return (0); __asm __volatile("isync"); curthread->td_pcb->pcb_cpu.aim.usr_segm = (uintptr_t)uaddr >> ADDR_SR_SHFT; curthread->td_pcb->pcb_cpu.aim.usr_vsid = slbv; #ifdef __powerpc64__ __asm __volatile ("slbie %0; slbmte %1, %2; isync" :: "r"(USER_ADDR), "r"(slbv), "r"(USER_SLB_SLBE)); #else __asm __volatile("mtsr %0,%1; isync" :: "n"(USER_SR), "r"(slbv)); #endif return (0); } /* * Figure out where a given kernel pointer (usually in a fault) points * to from the VM's perspective, potentially remapping into userland's * address space. */ static int moea64_decode_kernel_ptr(mmu_t mmu, vm_offset_t addr, int *is_user, vm_offset_t *decoded_addr) { vm_offset_t user_sr; if ((addr >> ADDR_SR_SHFT) == (USER_ADDR >> ADDR_SR_SHFT)) { user_sr = curthread->td_pcb->pcb_cpu.aim.usr_segm; addr &= ADDR_PIDX | ADDR_POFF; addr |= user_sr << ADDR_SR_SHFT; *decoded_addr = addr; *is_user = 1; } else { *decoded_addr = addr; *is_user = 0; } return (0); } /* * Map a range of physical addresses into kernel virtual address space. * * The value passed in *virt is a suggested virtual address for the mapping. * Architectures which can support a direct-mapped physical to virtual region * can return the appropriate address within that region, leaving '*virt' * unchanged. Other architectures should map the pages starting at '*virt' and * update '*virt' with the first usable address after the mapped region. */ vm_offset_t moea64_map(mmu_t mmu, vm_offset_t *virt, vm_paddr_t pa_start, vm_paddr_t pa_end, int prot) { vm_offset_t sva, va; if (hw_direct_map) { /* * Check if every page in the region is covered by the direct * map. The direct map covers all of physical memory. Use * moea64_calc_wimg() as a shortcut to see if the page is in * physical memory as a way to see if the direct map covers it. */ for (va = pa_start; va < pa_end; va += PAGE_SIZE) if (moea64_calc_wimg(va, VM_MEMATTR_DEFAULT) != LPTE_M) break; if (va == pa_end) return (PHYS_TO_DMAP(pa_start)); } sva = *virt; va = sva; /* XXX respect prot argument */ for (; pa_start < pa_end; pa_start += PAGE_SIZE, va += PAGE_SIZE) moea64_kenter(mmu, va, pa_start); *virt = va; return (sva); } /* * Returns true if the pmap's pv is one of the first * 16 pvs linked to from this page. This count may * be changed upwards or downwards in the future; it * is only necessary that true be returned for a small * subset of pmaps for proper page aging. */ boolean_t moea64_page_exists_quick(mmu_t mmu, pmap_t pmap, vm_page_t m) { int loops; struct pvo_entry *pvo; boolean_t rv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("moea64_page_exists_quick: page %p is not managed", m)); loops = 0; rv = FALSE; PV_PAGE_LOCK(m); LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) { if (!(pvo->pvo_vaddr & PVO_DEAD) && pvo->pvo_pmap == pmap) { rv = TRUE; break; } if (++loops >= 16) break; } PV_PAGE_UNLOCK(m); return (rv); } void moea64_page_init(mmu_t mmu __unused, vm_page_t m) { m->md.mdpg_attrs = 0; m->md.mdpg_cache_attrs = VM_MEMATTR_DEFAULT; LIST_INIT(&m->md.mdpg_pvoh); } /* * Return the number of managed mappings to the given physical page * that are wired. */ int moea64_page_wired_mappings(mmu_t mmu, vm_page_t m) { struct pvo_entry *pvo; int count; count = 0; if ((m->oflags & VPO_UNMANAGED) != 0) return (count); PV_PAGE_LOCK(m); LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) if ((pvo->pvo_vaddr & (PVO_DEAD | PVO_WIRED)) == PVO_WIRED) count++; PV_PAGE_UNLOCK(m); return (count); } static uintptr_t moea64_vsidcontext; uintptr_t moea64_get_unique_vsid(void) { u_int entropy; register_t hash; uint32_t mask; int i; entropy = 0; __asm __volatile("mftb %0" : "=r"(entropy)); mtx_lock(&moea64_slb_mutex); for (i = 0; i < NVSIDS; i += VSID_NBPW) { u_int n; /* * Create a new value by mutiplying by a prime and adding in * entropy from the timebase register. This is to make the * VSID more random so that the PT hash function collides * less often. (Note that the prime casues gcc to do shifts * instead of a multiply.) */ moea64_vsidcontext = (moea64_vsidcontext * 0x1105) + entropy; hash = moea64_vsidcontext & (NVSIDS - 1); if (hash == 0) /* 0 is special, avoid it */ continue; n = hash >> 5; mask = 1 << (hash & (VSID_NBPW - 1)); hash = (moea64_vsidcontext & VSID_HASHMASK); if (moea64_vsid_bitmap[n] & mask) { /* collision? */ /* anything free in this bucket? */ if (moea64_vsid_bitmap[n] == 0xffffffff) { entropy = (moea64_vsidcontext >> 20); continue; } i = ffs(~moea64_vsid_bitmap[n]) - 1; mask = 1 << i; hash &= rounddown2(VSID_HASHMASK, VSID_NBPW); hash |= i; } if (hash == VSID_VRMA) /* also special, avoid this too */ continue; KASSERT(!(moea64_vsid_bitmap[n] & mask), ("Allocating in-use VSID %#zx\n", hash)); moea64_vsid_bitmap[n] |= mask; mtx_unlock(&moea64_slb_mutex); return (hash); } mtx_unlock(&moea64_slb_mutex); panic("%s: out of segments",__func__); } #ifdef __powerpc64__ void moea64_pinit(mmu_t mmu, pmap_t pmap) { RB_INIT(&pmap->pmap_pvo); pmap->pm_slb_tree_root = slb_alloc_tree(); pmap->pm_slb = slb_alloc_user_cache(); pmap->pm_slb_len = 0; } #else void moea64_pinit(mmu_t mmu, pmap_t pmap) { int i; uint32_t hash; RB_INIT(&pmap->pmap_pvo); if (pmap_bootstrapped) pmap->pmap_phys = (pmap_t)moea64_kextract(mmu, (vm_offset_t)pmap); else pmap->pmap_phys = pmap; /* * Allocate some segment registers for this pmap. */ hash = moea64_get_unique_vsid(); for (i = 0; i < 16; i++) pmap->pm_sr[i] = VSID_MAKE(i, hash); KASSERT(pmap->pm_sr[0] != 0, ("moea64_pinit: pm_sr[0] = 0")); } #endif /* * Initialize the pmap associated with process 0. */ void moea64_pinit0(mmu_t mmu, pmap_t pm) { PMAP_LOCK_INIT(pm); moea64_pinit(mmu, pm); bzero(&pm->pm_stats, sizeof(pm->pm_stats)); } /* * Set the physical protection on the specified range of this map as requested. */ static void moea64_pvo_protect(mmu_t mmu, pmap_t pm, struct pvo_entry *pvo, vm_prot_t prot) { struct vm_page *pg; vm_prot_t oldprot; int32_t refchg; PMAP_LOCK_ASSERT(pm, MA_OWNED); /* * Change the protection of the page. */ oldprot = pvo->pvo_pte.prot; pvo->pvo_pte.prot = prot; pg = PHYS_TO_VM_PAGE(pvo->pvo_pte.pa & LPTE_RPGN); /* * If the PVO is in the page table, update mapping */ refchg = MOEA64_PTE_REPLACE(mmu, pvo, MOEA64_PTE_PROT_UPDATE); if (refchg < 0) refchg = (oldprot & VM_PROT_WRITE) ? LPTE_CHG : 0; if (pm != kernel_pmap && pg != NULL && (pg->a.flags & PGA_EXECUTABLE) == 0 && (pvo->pvo_pte.pa & (LPTE_I | LPTE_G | LPTE_NOEXEC)) == 0) { if ((pg->oflags & VPO_UNMANAGED) == 0) vm_page_aflag_set(pg, PGA_EXECUTABLE); moea64_syncicache(mmu, pm, PVO_VADDR(pvo), pvo->pvo_pte.pa & LPTE_RPGN, PAGE_SIZE); } /* * Update vm about the REF/CHG bits if the page is managed and we have * removed write access. */ if (pg != NULL && (pvo->pvo_vaddr & PVO_MANAGED) && (oldprot & VM_PROT_WRITE)) { refchg |= atomic_readandclear_32(&pg->md.mdpg_attrs); if (refchg & LPTE_CHG) vm_page_dirty(pg); if (refchg & LPTE_REF) vm_page_aflag_set(pg, PGA_REFERENCED); } } void moea64_protect(mmu_t mmu, pmap_t pm, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) { struct pvo_entry *pvo, *tpvo, key; CTR4(KTR_PMAP, "moea64_protect: pm=%p sva=%#x eva=%#x prot=%#x", pm, sva, eva, prot); KASSERT(pm == &curproc->p_vmspace->vm_pmap || pm == kernel_pmap, ("moea64_protect: non current pmap")); if ((prot & VM_PROT_READ) == VM_PROT_NONE) { moea64_remove(mmu, pm, sva, eva); return; } PMAP_LOCK(pm); key.pvo_vaddr = sva; for (pvo = RB_NFIND(pvo_tree, &pm->pmap_pvo, &key); pvo != NULL && PVO_VADDR(pvo) < eva; pvo = tpvo) { tpvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo); moea64_pvo_protect(mmu, pm, pvo, prot); } PMAP_UNLOCK(pm); } /* * Map a list of wired pages into kernel virtual address space. This is * intended for temporary mappings which do not need page modification or * references recorded. Existing mappings in the region are overwritten. */ void moea64_qenter(mmu_t mmu, vm_offset_t va, vm_page_t *m, int count) { while (count-- > 0) { moea64_kenter(mmu, va, VM_PAGE_TO_PHYS(*m)); va += PAGE_SIZE; m++; } } /* * Remove page mappings from kernel virtual address space. Intended for * temporary mappings entered by moea64_qenter. */ void moea64_qremove(mmu_t mmu, vm_offset_t va, int count) { while (count-- > 0) { moea64_kremove(mmu, va); va += PAGE_SIZE; } } void moea64_release_vsid(uint64_t vsid) { int idx, mask; mtx_lock(&moea64_slb_mutex); idx = vsid & (NVSIDS-1); mask = 1 << (idx % VSID_NBPW); idx /= VSID_NBPW; KASSERT(moea64_vsid_bitmap[idx] & mask, ("Freeing unallocated VSID %#jx", vsid)); moea64_vsid_bitmap[idx] &= ~mask; mtx_unlock(&moea64_slb_mutex); } void moea64_release(mmu_t mmu, pmap_t pmap) { /* * Free segment registers' VSIDs */ #ifdef __powerpc64__ slb_free_tree(pmap); slb_free_user_cache(pmap->pm_slb); #else KASSERT(pmap->pm_sr[0] != 0, ("moea64_release: pm_sr[0] = 0")); moea64_release_vsid(VSID_TO_HASH(pmap->pm_sr[0])); #endif } /* * Remove all pages mapped by the specified pmap */ void moea64_remove_pages(mmu_t mmu, pmap_t pm) { struct pvo_entry *pvo, *tpvo; struct pvo_dlist tofree; SLIST_INIT(&tofree); PMAP_LOCK(pm); RB_FOREACH_SAFE(pvo, pvo_tree, &pm->pmap_pvo, tpvo) { if (pvo->pvo_vaddr & PVO_WIRED) continue; /* * For locking reasons, remove this from the page table and * pmap, but save delinking from the vm_page for a second * pass */ moea64_pvo_remove_from_pmap(mmu, pvo); SLIST_INSERT_HEAD(&tofree, pvo, pvo_dlink); } PMAP_UNLOCK(pm); while (!SLIST_EMPTY(&tofree)) { pvo = SLIST_FIRST(&tofree); SLIST_REMOVE_HEAD(&tofree, pvo_dlink); moea64_pvo_remove_from_page(mmu, pvo); free_pvo_entry(pvo); } } /* * Remove the given range of addresses from the specified map. */ void moea64_remove(mmu_t mmu, pmap_t pm, vm_offset_t sva, vm_offset_t eva) { struct pvo_entry *pvo, *tpvo, key; struct pvo_dlist tofree; /* * Perform an unsynchronized read. This is, however, safe. */ if (pm->pm_stats.resident_count == 0) return; key.pvo_vaddr = sva; SLIST_INIT(&tofree); PMAP_LOCK(pm); for (pvo = RB_NFIND(pvo_tree, &pm->pmap_pvo, &key); pvo != NULL && PVO_VADDR(pvo) < eva; pvo = tpvo) { tpvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo); /* * For locking reasons, remove this from the page table and * pmap, but save delinking from the vm_page for a second * pass */ moea64_pvo_remove_from_pmap(mmu, pvo); SLIST_INSERT_HEAD(&tofree, pvo, pvo_dlink); } PMAP_UNLOCK(pm); while (!SLIST_EMPTY(&tofree)) { pvo = SLIST_FIRST(&tofree); SLIST_REMOVE_HEAD(&tofree, pvo_dlink); moea64_pvo_remove_from_page(mmu, pvo); free_pvo_entry(pvo); } } /* * Remove physical page from all pmaps in which it resides. moea64_pvo_remove() * will reflect changes in pte's back to the vm_page. */ void moea64_remove_all(mmu_t mmu, vm_page_t m) { struct pvo_entry *pvo, *next_pvo; struct pvo_head freequeue; int wasdead; pmap_t pmap; LIST_INIT(&freequeue); PV_PAGE_LOCK(m); LIST_FOREACH_SAFE(pvo, vm_page_to_pvoh(m), pvo_vlink, next_pvo) { pmap = pvo->pvo_pmap; PMAP_LOCK(pmap); wasdead = (pvo->pvo_vaddr & PVO_DEAD); if (!wasdead) moea64_pvo_remove_from_pmap(mmu, pvo); moea64_pvo_remove_from_page_locked(mmu, pvo, m); if (!wasdead) LIST_INSERT_HEAD(&freequeue, pvo, pvo_vlink); PMAP_UNLOCK(pmap); } KASSERT(!pmap_page_is_mapped(m), ("Page still has mappings")); KASSERT((m->a.flags & PGA_WRITEABLE) == 0, ("Page still writable")); PV_PAGE_UNLOCK(m); /* Clean up UMA allocations */ LIST_FOREACH_SAFE(pvo, &freequeue, pvo_vlink, next_pvo) free_pvo_entry(pvo); } /* * Allocate a physical page of memory directly from the phys_avail map. * Can only be called from moea64_bootstrap before avail start and end are * calculated. */ vm_offset_t moea64_bootstrap_alloc(vm_size_t size, vm_size_t align) { vm_offset_t s, e; int i, j; size = round_page(size); for (i = 0; phys_avail[i + 1] != 0; i += 2) { if (align != 0) s = roundup2(phys_avail[i], align); else s = phys_avail[i]; e = s + size; if (s < phys_avail[i] || e > phys_avail[i + 1]) continue; if (s + size > platform_real_maxaddr()) continue; if (s == phys_avail[i]) { phys_avail[i] += size; } else if (e == phys_avail[i + 1]) { phys_avail[i + 1] -= size; } else { for (j = phys_avail_count * 2; j > i; j -= 2) { phys_avail[j] = phys_avail[j - 2]; phys_avail[j + 1] = phys_avail[j - 1]; } phys_avail[i + 3] = phys_avail[i + 1]; phys_avail[i + 1] = s; phys_avail[i + 2] = e; phys_avail_count++; } return (s); } panic("moea64_bootstrap_alloc: could not allocate memory"); } static int moea64_pvo_enter(mmu_t mmu, struct pvo_entry *pvo, struct pvo_head *pvo_head, struct pvo_entry **oldpvop) { struct pvo_entry *old_pvo; int err; PMAP_LOCK_ASSERT(pvo->pvo_pmap, MA_OWNED); STAT_MOEA64(moea64_pvo_enter_calls++); /* * Add to pmap list */ old_pvo = RB_INSERT(pvo_tree, &pvo->pvo_pmap->pmap_pvo, pvo); if (old_pvo != NULL) { if (oldpvop != NULL) *oldpvop = old_pvo; return (EEXIST); } if (pvo_head != NULL) { LIST_INSERT_HEAD(pvo_head, pvo, pvo_vlink); } if (pvo->pvo_vaddr & PVO_WIRED) pvo->pvo_pmap->pm_stats.wired_count++; pvo->pvo_pmap->pm_stats.resident_count++; /* * Insert it into the hardware page table */ err = MOEA64_PTE_INSERT(mmu, pvo); if (err != 0) { panic("moea64_pvo_enter: overflow"); } STAT_MOEA64(moea64_pvo_entries++); if (pvo->pvo_pmap == kernel_pmap) isync(); #ifdef __powerpc64__ /* * Make sure all our bootstrap mappings are in the SLB as soon * as virtual memory is switched on. */ if (!pmap_bootstrapped) moea64_bootstrap_slb_prefault(PVO_VADDR(pvo), pvo->pvo_vaddr & PVO_LARGE); #endif return (0); } static void moea64_pvo_remove_from_pmap(mmu_t mmu, struct pvo_entry *pvo) { struct vm_page *pg; int32_t refchg; KASSERT(pvo->pvo_pmap != NULL, ("Trying to remove PVO with no pmap")); PMAP_LOCK_ASSERT(pvo->pvo_pmap, MA_OWNED); KASSERT(!(pvo->pvo_vaddr & PVO_DEAD), ("Trying to remove dead PVO")); /* * If there is an active pte entry, we need to deactivate it */ refchg = MOEA64_PTE_UNSET(mmu, pvo); if (refchg < 0) { /* * If it was evicted from the page table, be pessimistic and * dirty the page. */ if (pvo->pvo_pte.prot & VM_PROT_WRITE) refchg = LPTE_CHG; else refchg = 0; } /* * Update our statistics. */ pvo->pvo_pmap->pm_stats.resident_count--; if (pvo->pvo_vaddr & PVO_WIRED) pvo->pvo_pmap->pm_stats.wired_count--; /* * Remove this PVO from the pmap list. */ RB_REMOVE(pvo_tree, &pvo->pvo_pmap->pmap_pvo, pvo); /* * Mark this for the next sweep */ pvo->pvo_vaddr |= PVO_DEAD; /* Send RC bits to VM */ if ((pvo->pvo_vaddr & PVO_MANAGED) && (pvo->pvo_pte.prot & VM_PROT_WRITE)) { pg = PHYS_TO_VM_PAGE(pvo->pvo_pte.pa & LPTE_RPGN); if (pg != NULL) { refchg |= atomic_readandclear_32(&pg->md.mdpg_attrs); if (refchg & LPTE_CHG) vm_page_dirty(pg); if (refchg & LPTE_REF) vm_page_aflag_set(pg, PGA_REFERENCED); } } } static inline void moea64_pvo_remove_from_page_locked(mmu_t mmu, struct pvo_entry *pvo, vm_page_t m) { KASSERT(pvo->pvo_vaddr & PVO_DEAD, ("Trying to delink live page")); /* Use NULL pmaps as a sentinel for races in page deletion */ if (pvo->pvo_pmap == NULL) return; pvo->pvo_pmap = NULL; /* * Update vm about page writeability/executability if managed */ PV_LOCKASSERT(pvo->pvo_pte.pa & LPTE_RPGN); if (pvo->pvo_vaddr & PVO_MANAGED) { if (m != NULL) { LIST_REMOVE(pvo, pvo_vlink); if (LIST_EMPTY(vm_page_to_pvoh(m))) vm_page_aflag_clear(m, PGA_WRITEABLE | PGA_EXECUTABLE); } } STAT_MOEA64(moea64_pvo_entries--); STAT_MOEA64(moea64_pvo_remove_calls++); } static void moea64_pvo_remove_from_page(mmu_t mmu, struct pvo_entry *pvo) { vm_page_t pg = NULL; if (pvo->pvo_vaddr & PVO_MANAGED) pg = PHYS_TO_VM_PAGE(pvo->pvo_pte.pa & LPTE_RPGN); PV_LOCK(pvo->pvo_pte.pa & LPTE_RPGN); moea64_pvo_remove_from_page_locked(mmu, pvo, pg); PV_UNLOCK(pvo->pvo_pte.pa & LPTE_RPGN); } static struct pvo_entry * moea64_pvo_find_va(pmap_t pm, vm_offset_t va) { struct pvo_entry key; PMAP_LOCK_ASSERT(pm, MA_OWNED); key.pvo_vaddr = va & ~ADDR_POFF; return (RB_FIND(pvo_tree, &pm->pmap_pvo, &key)); } static boolean_t moea64_query_bit(mmu_t mmu, vm_page_t m, uint64_t ptebit) { struct pvo_entry *pvo; int64_t ret; boolean_t rv; /* * See if this bit is stored in the page already. */ if (m->md.mdpg_attrs & ptebit) return (TRUE); /* * Examine each PTE. Sync so that any pending REF/CHG bits are * flushed to the PTEs. */ rv = FALSE; powerpc_sync(); PV_PAGE_LOCK(m); LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) { ret = 0; /* * See if this pvo has a valid PTE. if so, fetch the * REF/CHG bits from the valid PTE. If the appropriate * ptebit is set, return success. */ PMAP_LOCK(pvo->pvo_pmap); if (!(pvo->pvo_vaddr & PVO_DEAD)) ret = MOEA64_PTE_SYNCH(mmu, pvo); PMAP_UNLOCK(pvo->pvo_pmap); if (ret > 0) { atomic_set_32(&m->md.mdpg_attrs, ret & (LPTE_CHG | LPTE_REF)); if (ret & ptebit) { rv = TRUE; break; } } } PV_PAGE_UNLOCK(m); return (rv); } static u_int moea64_clear_bit(mmu_t mmu, vm_page_t m, u_int64_t ptebit) { u_int count; struct pvo_entry *pvo; int64_t ret; /* * Sync so that any pending REF/CHG bits are flushed to the PTEs (so * we can reset the right ones). */ powerpc_sync(); /* * For each pvo entry, clear the pte's ptebit. */ count = 0; PV_PAGE_LOCK(m); LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) { ret = 0; PMAP_LOCK(pvo->pvo_pmap); if (!(pvo->pvo_vaddr & PVO_DEAD)) ret = MOEA64_PTE_CLEAR(mmu, pvo, ptebit); PMAP_UNLOCK(pvo->pvo_pmap); if (ret > 0 && (ret & ptebit)) count++; } atomic_clear_32(&m->md.mdpg_attrs, ptebit); PV_PAGE_UNLOCK(m); return (count); } boolean_t moea64_dev_direct_mapped(mmu_t mmu, vm_paddr_t pa, vm_size_t size) { struct pvo_entry *pvo, key; vm_offset_t ppa; int error = 0; if (hw_direct_map && mem_valid(pa, size) == 0) return (0); PMAP_LOCK(kernel_pmap); ppa = pa & ~ADDR_POFF; key.pvo_vaddr = DMAP_BASE_ADDRESS + ppa; for (pvo = RB_FIND(pvo_tree, &kernel_pmap->pmap_pvo, &key); ppa < pa + size; ppa += PAGE_SIZE, pvo = RB_NEXT(pvo_tree, &kernel_pmap->pmap_pvo, pvo)) { if (pvo == NULL || (pvo->pvo_pte.pa & LPTE_RPGN) != ppa) { error = EFAULT; break; } } PMAP_UNLOCK(kernel_pmap); return (error); } /* * Map a set of physical memory pages into the kernel virtual * address space. Return a pointer to where it is mapped. This * routine is intended to be used for mapping device memory, * NOT real memory. */ void * moea64_mapdev_attr(mmu_t mmu, vm_paddr_t pa, vm_size_t size, vm_memattr_t ma) { vm_offset_t va, tmpva, ppa, offset; ppa = trunc_page(pa); offset = pa & PAGE_MASK; size = roundup2(offset + size, PAGE_SIZE); va = kva_alloc(size); if (!va) panic("moea64_mapdev: Couldn't alloc kernel virtual memory"); for (tmpva = va; size > 0;) { moea64_kenter_attr(mmu, tmpva, ppa, ma); size -= PAGE_SIZE; tmpva += PAGE_SIZE; ppa += PAGE_SIZE; } return ((void *)(va + offset)); } void * moea64_mapdev(mmu_t mmu, vm_paddr_t pa, vm_size_t size) { return moea64_mapdev_attr(mmu, pa, size, VM_MEMATTR_DEFAULT); } void moea64_unmapdev(mmu_t mmu, vm_offset_t va, vm_size_t size) { vm_offset_t base, offset; base = trunc_page(va); offset = va & PAGE_MASK; size = roundup2(offset + size, PAGE_SIZE); kva_free(base, size); } void moea64_sync_icache(mmu_t mmu, pmap_t pm, vm_offset_t va, vm_size_t sz) { struct pvo_entry *pvo; vm_offset_t lim; vm_paddr_t pa; vm_size_t len; if (__predict_false(pm == NULL)) pm = &curthread->td_proc->p_vmspace->vm_pmap; PMAP_LOCK(pm); while (sz > 0) { lim = round_page(va+1); len = MIN(lim - va, sz); pvo = moea64_pvo_find_va(pm, va & ~ADDR_POFF); if (pvo != NULL && !(pvo->pvo_pte.pa & LPTE_I)) { pa = (pvo->pvo_pte.pa & LPTE_RPGN) | (va & ADDR_POFF); moea64_syncicache(mmu, pm, va, pa, len); } va += len; sz -= len; } PMAP_UNLOCK(pm); } void moea64_dumpsys_map(mmu_t mmu, vm_paddr_t pa, size_t sz, void **va) { *va = (void *)(uintptr_t)pa; } extern struct dump_pa dump_map[PHYS_AVAIL_SZ + 1]; void moea64_scan_init(mmu_t mmu) { struct pvo_entry *pvo; vm_offset_t va; int i; if (!do_minidump) { /* Initialize phys. segments for dumpsys(). */ memset(&dump_map, 0, sizeof(dump_map)); mem_regions(&pregions, &pregions_sz, ®ions, ®ions_sz); for (i = 0; i < pregions_sz; i++) { dump_map[i].pa_start = pregions[i].mr_start; dump_map[i].pa_size = pregions[i].mr_size; } return; } /* Virtual segments for minidumps: */ memset(&dump_map, 0, sizeof(dump_map)); /* 1st: kernel .data and .bss. */ dump_map[0].pa_start = trunc_page((uintptr_t)_etext); dump_map[0].pa_size = round_page((uintptr_t)_end) - dump_map[0].pa_start; /* 2nd: msgbuf and tables (see pmap_bootstrap()). */ dump_map[1].pa_start = (vm_paddr_t)(uintptr_t)msgbufp->msg_ptr; dump_map[1].pa_size = round_page(msgbufp->msg_size); /* 3rd: kernel VM. */ va = dump_map[1].pa_start + dump_map[1].pa_size; /* Find start of next chunk (from va). */ while (va < virtual_end) { /* Don't dump the buffer cache. */ if (va >= kmi.buffer_sva && va < kmi.buffer_eva) { va = kmi.buffer_eva; continue; } pvo = moea64_pvo_find_va(kernel_pmap, va & ~ADDR_POFF); if (pvo != NULL && !(pvo->pvo_vaddr & PVO_DEAD)) break; va += PAGE_SIZE; } if (va < virtual_end) { dump_map[2].pa_start = va; va += PAGE_SIZE; /* Find last page in chunk. */ while (va < virtual_end) { /* Don't run into the buffer cache. */ if (va == kmi.buffer_sva) break; pvo = moea64_pvo_find_va(kernel_pmap, va & ~ADDR_POFF); if (pvo == NULL || (pvo->pvo_vaddr & PVO_DEAD)) break; va += PAGE_SIZE; } dump_map[2].pa_size = va - dump_map[2].pa_start; } } #ifdef __powerpc64__ static size_t moea64_scan_pmap(mmu_t mmu) { struct pvo_entry *pvo; vm_paddr_t pa, pa_end; vm_offset_t va, pgva, kstart, kend, kstart_lp, kend_lp; uint64_t lpsize; lpsize = moea64_large_page_size; kstart = trunc_page((vm_offset_t)_etext); kend = round_page((vm_offset_t)_end); kstart_lp = kstart & ~moea64_large_page_mask; kend_lp = (kend + moea64_large_page_mask) & ~moea64_large_page_mask; CTR4(KTR_PMAP, "moea64_scan_pmap: kstart=0x%016lx, kend=0x%016lx, " "kstart_lp=0x%016lx, kend_lp=0x%016lx", kstart, kend, kstart_lp, kend_lp); PMAP_LOCK(kernel_pmap); RB_FOREACH(pvo, pvo_tree, &kernel_pmap->pmap_pvo) { va = pvo->pvo_vaddr; if (va & PVO_DEAD) continue; /* Skip DMAP (except kernel area) */ if (va >= DMAP_BASE_ADDRESS && va <= DMAP_MAX_ADDRESS) { if (va & PVO_LARGE) { pgva = va & ~moea64_large_page_mask; if (pgva < kstart_lp || pgva >= kend_lp) continue; } else { pgva = trunc_page(va); if (pgva < kstart || pgva >= kend) continue; } } pa = pvo->pvo_pte.pa & LPTE_RPGN; if (va & PVO_LARGE) { pa_end = pa + lpsize; for (; pa < pa_end; pa += PAGE_SIZE) { if (is_dumpable(pa)) dump_add_page(pa); } } else { if (is_dumpable(pa)) dump_add_page(pa); } } PMAP_UNLOCK(kernel_pmap); return (sizeof(struct lpte) * moea64_pteg_count * 8); } static struct dump_context dump_ctx; static void * moea64_dump_pmap_init(mmu_t mmu, unsigned blkpgs) { dump_ctx.ptex = 0; dump_ctx.ptex_end = moea64_pteg_count * 8; dump_ctx.blksz = blkpgs * PAGE_SIZE; return (&dump_ctx); } #else static size_t moea64_scan_pmap(mmu_t mmu) { return (0); } static void * moea64_dump_pmap_init(mmu_t mmu, unsigned blkpgs) { return (NULL); } #endif #ifdef __powerpc64__ static void moea64_map_range(mmu_t mmu, vm_offset_t va, vm_paddr_t pa, vm_size_t npages) { for (; npages > 0; --npages) { if (moea64_large_page_size != 0 && (pa & moea64_large_page_mask) == 0 && (va & moea64_large_page_mask) == 0 && npages >= (moea64_large_page_size >> PAGE_SHIFT)) { PMAP_LOCK(kernel_pmap); moea64_kenter_large(mmu, va, pa, 0, 0); PMAP_UNLOCK(kernel_pmap); pa += moea64_large_page_size; va += moea64_large_page_size; npages -= (moea64_large_page_size >> PAGE_SHIFT) - 1; } else { moea64_kenter(mmu, va, pa); pa += PAGE_SIZE; va += PAGE_SIZE; } } } static void moea64_page_array_startup(mmu_t mmu, long pages) { long dom_pages[MAXMEMDOM]; vm_paddr_t pa; vm_offset_t va, vm_page_base; vm_size_t needed, size; long page; int domain; int i; vm_page_base = 0xd000000000000000ULL; /* Short-circuit single-domain systems. */ if (vm_ndomains == 1) { size = round_page(pages * sizeof(struct vm_page)); pa = vm_phys_early_alloc(0, size); vm_page_base = moea64_map(mmu, &vm_page_base, pa, pa + size, VM_PROT_READ | VM_PROT_WRITE); vm_page_array_size = pages; vm_page_array = (vm_page_t)vm_page_base; return; } page = 0; for (i = 0; i < MAXMEMDOM; i++) dom_pages[i] = 0; /* Now get the number of pages required per domain. */ for (i = 0; i < vm_phys_nsegs; i++) { domain = vm_phys_segs[i].domain; KASSERT(domain < MAXMEMDOM, ("Invalid vm_phys_segs NUMA domain %d!\n", domain)); /* Get size of vm_page_array needed for this segment. */ size = btoc(vm_phys_segs[i].end - vm_phys_segs[i].start); dom_pages[domain] += size; } for (i = 0; phys_avail[i + 1] != 0; i+= 2) { domain = _vm_phys_domain(phys_avail[i]); KASSERT(domain < MAXMEMDOM, ("Invalid phys_avail NUMA domain %d!\n", domain)); size = btoc(phys_avail[i + 1] - phys_avail[i]); dom_pages[domain] += size; } /* * Map in chunks that can get us all 16MB pages. There will be some * overlap between domains, but that's acceptable for now. */ vm_page_array_size = 0; va = vm_page_base; for (i = 0; i < MAXMEMDOM && vm_page_array_size < pages; i++) { if (dom_pages[i] == 0) continue; size = ulmin(pages - vm_page_array_size, dom_pages[i]); size = round_page(size * sizeof(struct vm_page)); needed = size; size = roundup2(size, moea64_large_page_size); pa = vm_phys_early_alloc(i, size); vm_page_array_size += size / sizeof(struct vm_page); moea64_map_range(mmu, va, pa, size >> PAGE_SHIFT); /* Scoot up domain 0, to reduce the domain page overlap. */ if (i == 0) vm_page_base += size - needed; va += size; } vm_page_array = (vm_page_t)vm_page_base; vm_page_array_size = pages; } #endif Index: head/sys/powerpc/aim/mmu_radix.c =================================================================== --- head/sys/powerpc/aim/mmu_radix.c (nonexistent) +++ head/sys/powerpc/aim/mmu_radix.c (revision 360887) @@ -0,0 +1,6507 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2018 Matthew Macy + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + + +#include +__FBSDID("$FreeBSD$"); + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef INVARIANTS +#include +#endif + +#define PPC_BITLSHIFT(bit) (sizeof(long)*NBBY - 1 - (bit)) +#define PPC_BIT(bit) (1UL << PPC_BITLSHIFT(bit)) +#define PPC_BITLSHIFT_VAL(val, bit) ((val) << PPC_BITLSHIFT(bit)) + +#include "opt_ddb.h" +#ifdef DDB +static void pmap_pte_walk(pml1_entry_t *l1, vm_offset_t va); +#endif + +#define PG_W RPTE_WIRED +#define PG_V RPTE_VALID +#define PG_MANAGED RPTE_MANAGED +#define PG_PROMOTED RPTE_PROMOTED +#define PG_M RPTE_C +#define PG_A RPTE_R +#define PG_X RPTE_EAA_X +#define PG_RW RPTE_EAA_W +#define PG_PTE_CACHE RPTE_ATTR_MASK + +#define RPTE_SHIFT 9 +#define NLS_MASK ((1UL<<5)-1) +#define RPTE_ENTRIES (1UL<> L3_PAGE_SIZE_SHIFT); +} + +static __inline vm_pindex_t +pmap_pml3e_index(vm_offset_t va) +{ + + return ((va >> L3_PAGE_SIZE_SHIFT) & RPTE_MASK); +} + +static __inline vm_pindex_t +pmap_pml2e_index(vm_offset_t va) +{ + return ((va >> L2_PAGE_SIZE_SHIFT) & RPTE_MASK); +} + +static __inline vm_pindex_t +pmap_pml1e_index(vm_offset_t va) +{ + return ((va & PG_FRAME) >> L1_PAGE_SIZE_SHIFT); +} + +/* Return various clipped indexes for a given VA */ +static __inline vm_pindex_t +pmap_pte_index(vm_offset_t va) +{ + + return ((va >> PAGE_SHIFT) & RPTE_MASK); +} + +/* Return a pointer to the PT slot that corresponds to a VA */ +static __inline pt_entry_t * +pmap_l3e_to_pte(pt_entry_t *l3e, vm_offset_t va) +{ + pt_entry_t *pte; + vm_paddr_t ptepa; + + ptepa = (*l3e & NLB_MASK); + pte = (pt_entry_t *)PHYS_TO_DMAP(ptepa); + return (&pte[pmap_pte_index(va)]); +} + +/* Return a pointer to the PD slot that corresponds to a VA */ +static __inline pt_entry_t * +pmap_l2e_to_l3e(pt_entry_t *l2e, vm_offset_t va) +{ + pt_entry_t *l3e; + vm_paddr_t l3pa; + + l3pa = (*l2e & NLB_MASK); + l3e = (pml3_entry_t *)PHYS_TO_DMAP(l3pa); + return (&l3e[pmap_pml3e_index(va)]); +} + +/* Return a pointer to the PD slot that corresponds to a VA */ +static __inline pt_entry_t * +pmap_l1e_to_l2e(pt_entry_t *l1e, vm_offset_t va) +{ + pt_entry_t *l2e; + vm_paddr_t l2pa; + + l2pa = (*l1e & NLB_MASK); + + l2e = (pml2_entry_t *)PHYS_TO_DMAP(l2pa); + return (&l2e[pmap_pml2e_index(va)]); +} + +static __inline pml1_entry_t * +pmap_pml1e(pmap_t pmap, vm_offset_t va) +{ + + return (&pmap->pm_pml1[pmap_pml1e_index(va)]); +} + +static pt_entry_t * +pmap_pml2e(pmap_t pmap, vm_offset_t va) +{ + pt_entry_t *l1e; + + l1e = pmap_pml1e(pmap, va); + if (l1e == NULL || (*l1e & RPTE_VALID) == 0) + return (NULL); + return (pmap_l1e_to_l2e(l1e, va)); +} + +static __inline pt_entry_t * +pmap_pml3e(pmap_t pmap, vm_offset_t va) +{ + pt_entry_t *l2e; + + l2e = pmap_pml2e(pmap, va); + if (l2e == NULL || (*l2e & RPTE_VALID) == 0) + return (NULL); + return (pmap_l2e_to_l3e(l2e, va)); +} + +static __inline pt_entry_t * +pmap_pte(pmap_t pmap, vm_offset_t va) +{ + pt_entry_t *l3e; + + l3e = pmap_pml3e(pmap, va); + if (l3e == NULL || (*l3e & RPTE_VALID) == 0) + return (NULL); + return (pmap_l3e_to_pte(l3e, va)); +} + +int nkpt = 64; +SYSCTL_INT(_machdep, OID_AUTO, nkpt, CTLFLAG_RD, &nkpt, 0, + "Number of kernel page table pages allocated on bootup"); + +vm_paddr_t dmaplimit; + +SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters"); + +static int pg_ps_enabled = 1; +SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, + &pg_ps_enabled, 0, "Are large page mappings enabled?"); +#ifdef INVARIANTS +#define VERBOSE_PMAP 0 +#define VERBOSE_PROTECT 0 +static int pmap_logging; +SYSCTL_INT(_vm_pmap, OID_AUTO, pmap_logging, CTLFLAG_RWTUN, + &pmap_logging, 0, "verbose debug logging"); +#endif + +static u_int64_t KPTphys; /* phys addr of kernel level 1 */ + +//static vm_paddr_t KERNend; /* phys addr of end of bootstrap data */ + +static vm_offset_t qframe = 0; +static struct mtx qframe_mtx; +static epoch_t pmap_epoch; + +void mmu_radix_activate(mmu_t mmu, struct thread *); +void mmu_radix_advise(mmu_t mmu, pmap_t, vm_offset_t, vm_offset_t, int); +void mmu_radix_align_superpage(mmu_t mmu, vm_object_t, vm_ooffset_t, vm_offset_t *, + vm_size_t); +void mmu_radix_clear_modify(mmu_t, vm_page_t); +void mmu_radix_copy(mmu_t, pmap_t, pmap_t, vm_offset_t, vm_size_t, vm_offset_t); +int mmu_radix_map_user_ptr(mmu_t mmu, pmap_t pm, + volatile const void *uaddr, void **kaddr, size_t ulen, size_t *klen); +int mmu_radix_decode_kernel_ptr(mmu_t, vm_offset_t, int *, vm_offset_t *); +int mmu_radix_enter(mmu_t, pmap_t, vm_offset_t, vm_page_t, vm_prot_t, u_int, int8_t); +void mmu_radix_enter_object(mmu_t, pmap_t, vm_offset_t, vm_offset_t, vm_page_t, + vm_prot_t); +void mmu_radix_enter_quick(mmu_t, pmap_t, vm_offset_t, vm_page_t, vm_prot_t); +vm_paddr_t mmu_radix_extract(mmu_t, pmap_t pmap, vm_offset_t va); +vm_page_t mmu_radix_extract_and_hold(mmu_t, pmap_t, vm_offset_t, vm_prot_t); +void mmu_radix_kenter(mmu_t, vm_offset_t, vm_paddr_t); +vm_paddr_t mmu_radix_kextract(mmu_t, vm_offset_t); +void mmu_radix_kremove(mmu_t, vm_offset_t); +boolean_t mmu_radix_is_modified(mmu_t, vm_page_t); +boolean_t mmu_radix_is_prefaultable(mmu_t, pmap_t, vm_offset_t); +boolean_t mmu_radix_is_referenced(mmu_t, vm_page_t); +void mmu_radix_object_init_pt(mmu_t, pmap_t, vm_offset_t, vm_object_t, + vm_pindex_t, vm_size_t); +boolean_t mmu_radix_page_exists_quick(mmu_t, pmap_t, vm_page_t); +void mmu_radix_page_init(mmu_t, vm_page_t); +boolean_t mmu_radix_page_is_mapped(mmu_t, vm_page_t m); +void mmu_radix_page_set_memattr(mmu_t, vm_page_t, vm_memattr_t); +int mmu_radix_page_wired_mappings(mmu_t, vm_page_t); +void mmu_radix_pinit(mmu_t, pmap_t); +void mmu_radix_protect(mmu_t, pmap_t, vm_offset_t, vm_offset_t, vm_prot_t); +boolean_t mmu_radix_ps_enabled(mmu_t, pmap_t); +void mmu_radix_qenter(mmu_t, vm_offset_t, vm_page_t *, int); +void mmu_radix_qremove(mmu_t, vm_offset_t, int); +vm_offset_t mmu_radix_quick_enter_page(mmu_t, vm_page_t); +void mmu_radix_quick_remove_page(mmu_t, vm_offset_t); +boolean_t mmu_radix_ts_referenced(mmu_t, vm_page_t); +void mmu_radix_release(mmu_t, pmap_t); +void mmu_radix_remove(mmu_t, pmap_t, vm_offset_t, vm_offset_t); +void mmu_radix_remove_all(mmu_t, vm_page_t); +void mmu_radix_remove_pages(mmu_t, pmap_t); +void mmu_radix_remove_write(mmu_t, vm_page_t); +void mmu_radix_unwire(mmu_t, pmap_t, vm_offset_t, vm_offset_t); +void mmu_radix_zero_page(mmu_t, vm_page_t); +void mmu_radix_zero_page_area(mmu_t, vm_page_t, int, int); +int mmu_radix_change_attr(mmu_t, vm_offset_t, vm_size_t, vm_memattr_t); +void mmu_radix_page_array_startup(mmu_t mmu, long pages); + +#include "mmu_oea64.h" +#include "mmu_if.h" +#include "moea64_if.h" + +/* + * Kernel MMU interface + */ + +static void mmu_radix_bootstrap(mmu_t mmup, + vm_offset_t kernelstart, vm_offset_t kernelend); + +static void mmu_radix_copy_page(mmu_t, vm_page_t, vm_page_t); +static void mmu_radix_copy_pages(mmu_t mmu, vm_page_t *ma, vm_offset_t a_offset, + vm_page_t *mb, vm_offset_t b_offset, int xfersize); +static void mmu_radix_growkernel(mmu_t, vm_offset_t); +static void mmu_radix_init(mmu_t); +static int mmu_radix_mincore(mmu_t, pmap_t, vm_offset_t, vm_paddr_t *); +static vm_offset_t mmu_radix_map(mmu_t, vm_offset_t *, vm_paddr_t, vm_paddr_t, int); +static void mmu_radix_pinit0(mmu_t, pmap_t); + +static void *mmu_radix_mapdev(mmu_t, vm_paddr_t, vm_size_t); +static void *mmu_radix_mapdev_attr(mmu_t, vm_paddr_t, vm_size_t, vm_memattr_t); +static void mmu_radix_unmapdev(mmu_t, vm_offset_t, vm_size_t); +static void mmu_radix_kenter_attr(mmu_t, vm_offset_t, vm_paddr_t, vm_memattr_t ma); +static boolean_t mmu_radix_dev_direct_mapped(mmu_t, vm_paddr_t, vm_size_t); +static void mmu_radix_dumpsys_map(mmu_t mmu, vm_paddr_t pa, size_t sz, + void **va); +static void mmu_radix_scan_init(mmu_t mmu); +static void mmu_radix_cpu_bootstrap(mmu_t, int ap); + +static mmu_method_t mmu_radix_methods[] = { + MMUMETHOD(mmu_bootstrap, mmu_radix_bootstrap), + MMUMETHOD(mmu_copy_page, mmu_radix_copy_page), + MMUMETHOD(mmu_copy_pages, mmu_radix_copy_pages), + MMUMETHOD(mmu_cpu_bootstrap, mmu_radix_cpu_bootstrap), + MMUMETHOD(mmu_growkernel, mmu_radix_growkernel), + MMUMETHOD(mmu_init, mmu_radix_init), + MMUMETHOD(mmu_map, mmu_radix_map), + MMUMETHOD(mmu_mincore, mmu_radix_mincore), + MMUMETHOD(mmu_pinit, mmu_radix_pinit), + MMUMETHOD(mmu_pinit0, mmu_radix_pinit0), + + MMUMETHOD(mmu_mapdev, mmu_radix_mapdev), + MMUMETHOD(mmu_mapdev_attr, mmu_radix_mapdev_attr), + MMUMETHOD(mmu_unmapdev, mmu_radix_unmapdev), + MMUMETHOD(mmu_kenter_attr, mmu_radix_kenter_attr), + MMUMETHOD(mmu_dev_direct_mapped,mmu_radix_dev_direct_mapped), + MMUMETHOD(mmu_scan_init, mmu_radix_scan_init), + MMUMETHOD(mmu_dumpsys_map, mmu_radix_dumpsys_map), + MMUMETHOD(mmu_page_is_mapped, mmu_radix_page_is_mapped), + MMUMETHOD(mmu_ps_enabled, mmu_radix_ps_enabled), + MMUMETHOD(mmu_object_init_pt, mmu_radix_object_init_pt), + MMUMETHOD(mmu_protect, mmu_radix_protect), + /* pmap dispatcher interface */ + MMUMETHOD(mmu_clear_modify, mmu_radix_clear_modify), + MMUMETHOD(mmu_copy, mmu_radix_copy), + MMUMETHOD(mmu_enter, mmu_radix_enter), + MMUMETHOD(mmu_enter_object, mmu_radix_enter_object), + MMUMETHOD(mmu_enter_quick, mmu_radix_enter_quick), + MMUMETHOD(mmu_extract, mmu_radix_extract), + MMUMETHOD(mmu_extract_and_hold, mmu_radix_extract_and_hold), + MMUMETHOD(mmu_is_modified, mmu_radix_is_modified), + MMUMETHOD(mmu_is_prefaultable, mmu_radix_is_prefaultable), + MMUMETHOD(mmu_is_referenced, mmu_radix_is_referenced), + MMUMETHOD(mmu_ts_referenced, mmu_radix_ts_referenced), + MMUMETHOD(mmu_page_exists_quick,mmu_radix_page_exists_quick), + MMUMETHOD(mmu_page_init, mmu_radix_page_init), + MMUMETHOD(mmu_page_wired_mappings, mmu_radix_page_wired_mappings), + MMUMETHOD(mmu_qenter, mmu_radix_qenter), + MMUMETHOD(mmu_qremove, mmu_radix_qremove), + MMUMETHOD(mmu_release, mmu_radix_release), + MMUMETHOD(mmu_remove, mmu_radix_remove), + MMUMETHOD(mmu_remove_all, mmu_radix_remove_all), + MMUMETHOD(mmu_remove_write, mmu_radix_remove_write), + MMUMETHOD(mmu_unwire, mmu_radix_unwire), + MMUMETHOD(mmu_zero_page, mmu_radix_zero_page), + MMUMETHOD(mmu_zero_page_area, mmu_radix_zero_page_area), + MMUMETHOD(mmu_activate, mmu_radix_activate), + MMUMETHOD(mmu_quick_enter_page, mmu_radix_quick_enter_page), + MMUMETHOD(mmu_quick_remove_page, mmu_radix_quick_remove_page), + MMUMETHOD(mmu_page_set_memattr, mmu_radix_page_set_memattr), + MMUMETHOD(mmu_page_array_startup, mmu_radix_page_array_startup), + + /* Internal interfaces */ + MMUMETHOD(mmu_kenter, mmu_radix_kenter), + MMUMETHOD(mmu_kextract, mmu_radix_kextract), + MMUMETHOD(mmu_kremove, mmu_radix_kremove), + MMUMETHOD(mmu_change_attr, mmu_radix_change_attr), + MMUMETHOD(mmu_map_user_ptr, mmu_radix_map_user_ptr), + MMUMETHOD(mmu_decode_kernel_ptr, mmu_radix_decode_kernel_ptr), + { 0, 0 } +}; + +MMU_DEF(mmu_radix, MMU_TYPE_RADIX, mmu_radix_methods, 0); + +#define METHODVOID(m) mmu_radix_ ## m(mmu_t mmup) + +static boolean_t pmap_demote_l3e_locked(pmap_t pmap, pml3_entry_t *l3e, vm_offset_t va, + struct rwlock **lockp); +static boolean_t pmap_demote_l3e(pmap_t pmap, pml3_entry_t *pde, vm_offset_t va); +static int pmap_unuse_pt(pmap_t, vm_offset_t, pml3_entry_t, struct spglist *); +static int pmap_remove_l3e(pmap_t pmap, pml3_entry_t *pdq, vm_offset_t sva, + struct spglist *free, struct rwlock **lockp); +static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva, + pml3_entry_t ptepde, struct spglist *free, struct rwlock **lockp); +static vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va); +static bool pmap_remove_page(pmap_t pmap, vm_offset_t va, pml3_entry_t *pde, + struct spglist *free); +static bool pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, + pml3_entry_t *l3e, struct spglist *free, struct rwlock **lockp); + +static bool pmap_pv_insert_l3e(pmap_t pmap, vm_offset_t va, pml3_entry_t l3e, + u_int flags, struct rwlock **lockp); +#if VM_NRESERVLEVEL > 0 +static void pmap_pv_promote_l3e(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, + struct rwlock **lockp); +#endif +static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va); +static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte); +static vm_page_t mmu_radix_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, + vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp, bool *invalidate); + +static bool pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, + vm_prot_t prot, struct rwlock **lockp); +static int pmap_enter_l3e(pmap_t pmap, vm_offset_t va, pml3_entry_t newpde, + u_int flags, vm_page_t m, struct rwlock **lockp); + +static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp); +static void free_pv_chunk(struct pv_chunk *pc); +static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp); +static vm_page_t pmap_allocl3e(pmap_t pmap, vm_offset_t va, + struct rwlock **lockp); +static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, + struct rwlock **lockp); +static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, + struct spglist *free); +static boolean_t pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free); + +static void pmap_invalidate_page(pmap_t pmap, vm_offset_t start); +static void pmap_invalidate_all(pmap_t pmap); +static int pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode, bool flush); + +/* + * Internal flags for pmap_enter()'s helper functions. + */ +#define PMAP_ENTER_NORECLAIM 0x1000000 /* Don't reclaim PV entries. */ +#define PMAP_ENTER_NOREPLACE 0x2000000 /* Don't replace mappings. */ + +#define UNIMPLEMENTED() panic("%s not implemented", __func__) +#define UNTESTED() panic("%s not yet tested", __func__) + + + +/* Number of supported PID bits */ +static unsigned int isa3_pid_bits; + +/* PID to start allocating from */ +static unsigned int isa3_base_pid; + +#define PROCTAB_SIZE_SHIFT (isa3_pid_bits + 4) +#define PROCTAB_ENTRIES (1ul << isa3_pid_bits) + + +/* + * Map of physical memory regions. + */ +static struct mem_region *regions, *pregions; +static struct numa_mem_region *numa_pregions; +static u_int phys_avail_count; +static int regions_sz, pregions_sz, numa_pregions_sz; +static struct pate *isa3_parttab; +static struct prte *isa3_proctab; +static vmem_t *asid_arena; + +extern void bs_remap_earlyboot(void); + +#define RADIX_PGD_SIZE_SHIFT 16 +#define RADIX_PGD_SIZE (1UL << RADIX_PGD_SIZE_SHIFT) + +#define RADIX_PGD_INDEX_SHIFT (RADIX_PGD_SIZE_SHIFT-3) +#define NL2EPG (PAGE_SIZE/sizeof(pml2_entry_t)) +#define NL3EPG (PAGE_SIZE/sizeof(pml3_entry_t)) + +#define NUPML1E (RADIX_PGD_SIZE/sizeof(uint64_t)) /* number of userland PML1 pages */ +#define NUPDPE (NUPML1E * NL2EPG)/* number of userland PDP pages */ +#define NUPDE (NUPDPE * NL3EPG) /* number of userland PD entries */ + +/* POWER9 only permits a 64k partition table size. */ +#define PARTTAB_SIZE_SHIFT 16 +#define PARTTAB_SIZE (1UL << PARTTAB_SIZE_SHIFT) + +#define PARTTAB_HR (1UL << 63) /* host uses radix */ +#define PARTTAB_GR (1UL << 63) /* guest uses radix must match host */ + +/* TLB flush actions. Used as argument to tlbiel_all() */ +enum { + TLB_INVAL_SCOPE_LPID = 0, /* invalidate TLBs for current LPID */ + TLB_INVAL_SCOPE_GLOBAL = 1, /* invalidate all TLBs */ +}; + +#define NPV_LIST_LOCKS MAXCPU +static int pmap_initialized; +static vm_paddr_t proctab0pa; +static vm_paddr_t parttab_phys; +CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE); + +/* + * Data for the pv entry allocation mechanism. + * Updates to pv_invl_gen are protected by the pv_list_locks[] + * elements, but reads are not. + */ +static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks); +static struct mtx __exclusive_cache_line pv_chunks_mutex; +static struct rwlock __exclusive_cache_line pv_list_locks[NPV_LIST_LOCKS]; +static struct md_page *pv_table; +static struct md_page pv_dummy; + +#ifdef PV_STATS +#define PV_STAT(x) do { x ; } while (0) +#else +#define PV_STAT(x) do { } while (0) +#endif + +#define pa_radix_index(pa) ((pa) >> L3_PAGE_SIZE_SHIFT) +#define pa_to_pvh(pa) (&pv_table[pa_radix_index(pa)]) + +#define PHYS_TO_PV_LIST_LOCK(pa) \ + (&pv_list_locks[pa_radix_index(pa) % NPV_LIST_LOCKS]) + +#define CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa) do { \ + struct rwlock **_lockp = (lockp); \ + struct rwlock *_new_lock; \ + \ + _new_lock = PHYS_TO_PV_LIST_LOCK(pa); \ + if (_new_lock != *_lockp) { \ + if (*_lockp != NULL) \ + rw_wunlock(*_lockp); \ + *_lockp = _new_lock; \ + rw_wlock(*_lockp); \ + } \ +} while (0) + +#define CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m) \ + CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m)) + +#define RELEASE_PV_LIST_LOCK(lockp) do { \ + struct rwlock **_lockp = (lockp); \ + \ + if (*_lockp != NULL) { \ + rw_wunlock(*_lockp); \ + *_lockp = NULL; \ + } \ +} while (0) + +#define VM_PAGE_TO_PV_LIST_LOCK(m) \ + PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m)) + +/* + * We support 52 bits, hence: + * bits 52 - 31 = 21, 0b10101 + * RTS encoding details + * bits 0 - 3 of rts -> bits 6 - 8 unsigned long + * bits 4 - 5 of rts -> bits 62 - 63 of unsigned long + */ +#define RTS_SIZE ((0x2UL << 61) | (0x5UL << 5)) + + +static int powernv_enabled = 1; + +static inline void +tlbiel_radix_set_isa300(uint32_t set, uint32_t is, + uint32_t pid, uint32_t ric, uint32_t prs) +{ + uint64_t rb; + uint64_t rs; + + rb = PPC_BITLSHIFT_VAL(set, 51) | PPC_BITLSHIFT_VAL(is, 53); + rs = PPC_BITLSHIFT_VAL((uint64_t)pid, 31); + + __asm __volatile(PPC_TLBIEL(%0, %1, %2, %3, 1) + : : "r"(rb), "r"(rs), "i"(ric), "i"(prs) + : "memory"); +} + +static void +tlbiel_flush_isa3(uint32_t num_sets, uint32_t is) +{ + uint32_t set; + + __asm __volatile("ptesync": : :"memory"); + + /* + * Flush the first set of the TLB, and the entire Page Walk Cache + * and partition table entries. Then flush the remaining sets of the + * TLB. + */ + tlbiel_radix_set_isa300(0, is, 0, RIC_FLUSH_ALL, 0); + for (set = 1; set < num_sets; set++) + tlbiel_radix_set_isa300(set, is, 0, RIC_FLUSH_TLB, 0); + + /* Do the same for process scoped entries. */ + tlbiel_radix_set_isa300(0, is, 0, RIC_FLUSH_ALL, 1); + for (set = 1; set < num_sets; set++) + tlbiel_radix_set_isa300(set, is, 0, RIC_FLUSH_TLB, 1); + + __asm __volatile("ptesync": : :"memory"); +} + +static void +mmu_radix_tlbiel_flush(int scope) +{ + int is; + + MPASS(scope == TLB_INVAL_SCOPE_LPID || + scope == TLB_INVAL_SCOPE_GLOBAL); + is = scope + 2; + + tlbiel_flush_isa3(POWER9_TLB_SETS_RADIX, is); + __asm __volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory"); +} + +static void +mmu_radix_init_amor(void) +{ + /* + * In HV mode, we init AMOR (Authority Mask Override Register) so that + * the hypervisor and guest can setup IAMR (Instruction Authority Mask + * Register), enable key 0 and set it to 1. + * + * AMOR = 0b1100 .... 0000 (Mask for key 0 is 11) + */ + mtspr(SPR_AMOR, (3ul << 62)); +} + +static void +mmu_radix_init_iamr(void) +{ + /* + * Radix always uses key0 of the IAMR to determine if an access is + * allowed. We set bit 0 (IBM bit 1) of key0, to prevent instruction + * fetch. + */ + mtspr(SPR_IAMR, (1ul << 62)); +} + +static void +mmu_radix_pid_set(pmap_t pmap) +{ + + mtspr(SPR_PID, pmap->pm_pid); + isync(); +} + +/* Quick sort callout for comparing physical addresses. */ +static int +pa_cmp(const void *a, const void *b) +{ + const vm_paddr_t *pa = a, *pb = b; + + if (*pa < *pb) + return (-1); + else if (*pa > *pb) + return (1); + else + return (0); +} + +#define pte_load_store(ptep, pte) atomic_swap_long(ptep, pte) +#define pte_load_clear(ptep) atomic_swap_long(ptep, 0) +#define pte_store(ptep, pte) do { \ + MPASS((pte) & (RPTE_EAA_R | RPTE_EAA_W | RPTE_EAA_X)); \ + *(u_long *)(ptep) = (u_long)((pte) | PG_V | RPTE_LEAF); \ +} while (0) +/* + * NB: should only be used for adding directories - not for direct mappings + */ +#define pde_store(ptep, pa) do { \ + *(u_long *)(ptep) = (u_long)(pa|RPTE_VALID|RPTE_SHIFT); \ +} while (0) + +#define pte_clear(ptep) do { \ + *(u_long *)(ptep) = (u_long)(0); \ +} while (0) + +#define PMAP_PDE_SUPERPAGE (1 << 8) /* supports 2MB superpages */ + +/* + * Promotion to a 2MB (PDE) page mapping requires that the corresponding 4KB + * (PTE) page mappings have identical settings for the following fields: + */ +#define PG_PTE_PROMOTE (PG_X | PG_MANAGED | PG_W | PG_PTE_CACHE | \ + PG_M | PG_A | RPTE_EAA_MASK | PG_V) + + +static void +pmap_epoch_init(void *arg __unused) +{ + pmap_epoch = epoch_alloc("pmap", EPOCH_PREEMPT | EPOCH_LOCKED); +} +SYSINIT(epoch, SI_SUB_EPOCH + 1, SI_ORDER_ANY, pmap_epoch_init, NULL); + +static bool +pmap_not_in_di(void) +{ + + return (curthread->td_md.md_invl_gen.gen == 0); +} + +#define PMAP_ASSERT_NOT_IN_DI() \ + KASSERT(pmap_not_in_di(), ("DI already started")) + +static void +pmap_delayed_invl_started(epoch_tracker_t et) +{ + epoch_enter_preempt(pmap_epoch, et); + curthread->td_md.md_invl_gen.gen = 1; +} + +static void +pmap_delayed_invl_finished(epoch_tracker_t et) +{ + curthread->td_md.md_invl_gen.gen = 0; + epoch_exit_preempt(pmap_epoch, et); +} + +static void +pmap_delayed_invl_wait(vm_page_t m __unused) +{ + epoch_wait_preempt(pmap_epoch); +} + +static __inline void +pmap_resident_count_inc(pmap_t pmap, int count) +{ + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + pmap->pm_stats.resident_count += count; +} + +static __inline void +pmap_resident_count_dec(pmap_t pmap, int count) +{ + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + KASSERT(pmap->pm_stats.resident_count >= count, + ("pmap %p resident count underflow %ld %d", pmap, + pmap->pm_stats.resident_count, count)); + pmap->pm_stats.resident_count -= count; +} + +static void +pagezero(vm_offset_t va) +{ + va = trunc_page(va); + int off; + + for (off = 0; off < PAGE_SIZE; off += cacheline_size) + __asm __volatile("dcbz 0,%0" :: "r"(va + off)); +} + +static uint64_t +allocpages(int n) +{ + u_int64_t ret; + + ret = moea64_bootstrap_alloc(n * PAGE_SIZE, PAGE_SIZE); + for (int i = 0; i < n; i++) + pagezero(PHYS_TO_DMAP(ret + i * PAGE_SIZE)); + return (ret); +} + +static pt_entry_t * +kvtopte(vm_offset_t va) +{ + pt_entry_t *l3e; + + l3e = pmap_pml3e(kernel_pmap, va); + if ((*l3e & RPTE_VALID) == 0) + return (NULL); + return (pmap_l3e_to_pte(l3e, va)); +} + +void +mmu_radix_kenter(mmu_t mmu, vm_offset_t va, vm_paddr_t pa) +{ + pt_entry_t *pte; + + pte = kvtopte(va); + MPASS(pte != NULL); + *pte = pa | RPTE_VALID | RPTE_LEAF | RPTE_EAA_R | RPTE_EAA_W | \ + RPTE_EAA_P | PG_M | PG_A; +} + +boolean_t +mmu_radix_ps_enabled(mmu_t mmu, pmap_t pmap) +{ + return (pg_ps_enabled && (pmap->pm_flags & PMAP_PDE_SUPERPAGE) != 0); +} + +static pt_entry_t * +pmap_nofault_pte(pmap_t pmap, vm_offset_t va, int *is_l3e) +{ + pml3_entry_t *l3e; + pt_entry_t *pte; + + va &= PG_PS_FRAME; + l3e = pmap_pml3e(pmap, va); + if (l3e == NULL || (*l3e & PG_V) == 0) + return (NULL); + + if (*l3e & RPTE_LEAF) { + *is_l3e = 1; + return (l3e); + } + *is_l3e = 0; + va &= PG_FRAME; + pte = pmap_l3e_to_pte(l3e, va); + if (pte == NULL || (*pte & PG_V) == 0) + return (NULL); + return (pte); +} + +int +pmap_nofault(pmap_t pmap, vm_offset_t va, vm_prot_t flags) +{ + pt_entry_t *pte; + pt_entry_t startpte, origpte, newpte; + vm_page_t m; + int is_l3e; + + startpte = 0; + retry: + if ((pte = pmap_nofault_pte(pmap, va, &is_l3e)) == NULL) + return (KERN_INVALID_ADDRESS); + origpte = newpte = *pte; + if (startpte == 0) { + startpte = origpte; + if (((flags & VM_PROT_WRITE) && (startpte & PG_M)) || + ((flags & VM_PROT_READ) && (startpte & PG_A))) { + pmap_invalidate_all(pmap); +#ifdef INVARIANTS + if (VERBOSE_PMAP || pmap_logging) + printf("%s(%p, %#lx, %#x) (%#lx) -- invalidate all\n", + __func__, pmap, va, flags, origpte); +#endif + return (KERN_FAILURE); + } + } +#ifdef INVARIANTS + if (VERBOSE_PMAP || pmap_logging) + printf("%s(%p, %#lx, %#x) (%#lx)\n", __func__, pmap, va, + flags, origpte); +#endif + PMAP_LOCK(pmap); + if ((pte = pmap_nofault_pte(pmap, va, &is_l3e)) == NULL || + *pte != origpte) { + PMAP_UNLOCK(pmap); + return (KERN_FAILURE); + } + m = PHYS_TO_VM_PAGE(newpte & PG_FRAME); + MPASS(m != NULL); + switch (flags) { + case VM_PROT_READ: + if ((newpte & (RPTE_EAA_R|RPTE_EAA_X)) == 0) + goto protfail; + newpte |= PG_A; + vm_page_aflag_set(m, PGA_REFERENCED); + break; + case VM_PROT_WRITE: + if ((newpte & RPTE_EAA_W) == 0) + goto protfail; + if (is_l3e) + goto protfail; + newpte |= PG_M; + vm_page_dirty(m); + break; + case VM_PROT_EXECUTE: + if ((newpte & RPTE_EAA_X) == 0) + goto protfail; + newpte |= PG_A; + vm_page_aflag_set(m, PGA_REFERENCED); + break; + } + + if (!atomic_cmpset_long(pte, origpte, newpte)) + goto retry; + ptesync(); + PMAP_UNLOCK(pmap); + if (startpte == newpte) + return (KERN_FAILURE); + return (0); + protfail: + PMAP_UNLOCK(pmap); + return (KERN_PROTECTION_FAILURE); +} + +/* + * Returns TRUE if the given page is mapped individually or as part of + * a 2mpage. Otherwise, returns FALSE. + */ +boolean_t +mmu_radix_page_is_mapped(mmu_t mmu, vm_page_t m) +{ + struct rwlock *lock; + boolean_t rv; + + if ((m->oflags & VPO_UNMANAGED) != 0) + return (FALSE); + lock = VM_PAGE_TO_PV_LIST_LOCK(m); + rw_rlock(lock); + rv = !TAILQ_EMPTY(&m->md.pv_list) || + ((m->flags & PG_FICTITIOUS) == 0 && + !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list)); + rw_runlock(lock); + return (rv); +} + +/* + * Determine the appropriate bits to set in a PTE or PDE for a specified + * caching mode. + */ +static int +pmap_cache_bits(vm_memattr_t ma) +{ + if (ma != VM_MEMATTR_DEFAULT) { + switch (ma) { + case VM_MEMATTR_UNCACHEABLE: + return (RPTE_ATTR_GUARDEDIO); + case VM_MEMATTR_CACHEABLE: + return (RPTE_ATTR_MEM); + case VM_MEMATTR_WRITE_BACK: + case VM_MEMATTR_PREFETCHABLE: + case VM_MEMATTR_WRITE_COMBINING: + return (RPTE_ATTR_UNGUARDEDIO); + } + } + return (0); +} + +static void +pmap_invalidate_page(pmap_t pmap, vm_offset_t start) +{ + ptesync(); + if (pmap == kernel_pmap) + radix_tlbie_invlpg_kernel_4k(start); + else + radix_tlbie_invlpg_user_4k(pmap->pm_pid, start); + ttusync(); +} + +static void +pmap_invalidate_page_2m(pmap_t pmap, vm_offset_t start) +{ + ptesync(); + if (pmap == kernel_pmap) + radix_tlbie_invlpg_kernel_2m(start); + else + radix_tlbie_invlpg_user_2m(pmap->pm_pid, start); + ttusync(); +} + +static void +pmap_invalidate_pwc(pmap_t pmap) +{ + ptesync(); + if (pmap == kernel_pmap) + radix_tlbie_invlpwc_kernel(); + else + radix_tlbie_invlpwc_user(pmap->pm_pid); + ttusync(); +} + +static void +pmap_invalidate_range(pmap_t pmap, vm_offset_t start, vm_offset_t end) +{ + if (((start - end) >> PAGE_SHIFT) > 8) { + pmap_invalidate_all(pmap); + return; + } + ptesync(); + if (pmap == kernel_pmap) { + while (start < end) { + radix_tlbie_invlpg_kernel_4k(start); + start += PAGE_SIZE; + } + } else { + while (start < end) { + radix_tlbie_invlpg_user_4k(pmap->pm_pid, start); + start += PAGE_SIZE; + } + } + ttusync(); +} + +static void +pmap_invalidate_all(pmap_t pmap) +{ + ptesync(); + if (pmap == kernel_pmap) + radix_tlbie_flush_kernel(); + else + radix_tlbie_flush_user(pmap->pm_pid); + ttusync(); +} + +static void +pmap_invalidate_l3e_page(pmap_t pmap, vm_offset_t va, pml3_entry_t l3e) +{ + + /* + * When the PDE has PG_PROMOTED set, the 2MB page mapping was created + * by a promotion that did not invalidate the 512 4KB page mappings + * that might exist in the TLB. Consequently, at this point, the TLB + * may hold both 4KB and 2MB page mappings for the address range [va, + * va + L3_PAGE_SIZE). Therefore, the entire range must be invalidated here. + * In contrast, when PG_PROMOTED is clear, the TLB will not hold any + * 4KB page mappings for the address range [va, va + L3_PAGE_SIZE), and so a + * single INVLPG suffices to invalidate the 2MB page mapping from the + * TLB. + */ + ptesync(); + if ((l3e & PG_PROMOTED) != 0) + pmap_invalidate_range(pmap, va, va + L3_PAGE_SIZE - 1); + else + pmap_invalidate_page_2m(pmap, va); + + pmap_invalidate_pwc(pmap); +} + +static __inline struct pv_chunk * +pv_to_chunk(pv_entry_t pv) +{ + + return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK)); +} + +#define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap) + +#define PC_FREE0 0xfffffffffffffffful +#define PC_FREE1 0x3ffffffffffffffful + +static const uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1 }; + +/* + * Ensure that the number of spare PV entries in the specified pmap meets or + * exceeds the given count, "needed". + * + * The given PV list lock may be released. + */ +static void +reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp) +{ + struct pch new_tail; + struct pv_chunk *pc; + vm_page_t m; + int avail, free; + bool reclaimed; + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL")); + + /* + * Newly allocated PV chunks must be stored in a private list until + * the required number of PV chunks have been allocated. Otherwise, + * reclaim_pv_chunk() could recycle one of these chunks. In + * contrast, these chunks must be added to the pmap upon allocation. + */ + TAILQ_INIT(&new_tail); +retry: + avail = 0; + TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) { + // if ((cpu_feature2 & CPUID2_POPCNT) == 0) + bit_count((bitstr_t *)pc->pc_map, 0, + sizeof(pc->pc_map) * NBBY, &free); +#if 0 + free = popcnt_pc_map_pq(pc->pc_map); +#endif + if (free == 0) + break; + avail += free; + if (avail >= needed) + break; + } + for (reclaimed = false; avail < needed; avail += _NPCPV) { + m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | + VM_ALLOC_WIRED); + if (m == NULL) { + m = reclaim_pv_chunk(pmap, lockp); + if (m == NULL) + goto retry; + reclaimed = true; + } + PV_STAT(atomic_add_int(&pc_chunk_count, 1)); + PV_STAT(atomic_add_int(&pc_chunk_allocs, 1)); + pc = (void *)PHYS_TO_DMAP(m->phys_addr); + pc->pc_pmap = pmap; + pc->pc_map[0] = PC_FREE0; + pc->pc_map[1] = PC_FREE1; + TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); + TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru); + PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV)); + + /* + * The reclaim might have freed a chunk from the current pmap. + * If that chunk contained available entries, we need to + * re-count the number of available entries. + */ + if (reclaimed) + goto retry; + } + if (!TAILQ_EMPTY(&new_tail)) { + mtx_lock(&pv_chunks_mutex); + TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru); + mtx_unlock(&pv_chunks_mutex); + } +} + +/* + * First find and then remove the pv entry for the specified pmap and virtual + * address from the specified pv list. Returns the pv entry if found and NULL + * otherwise. This operation can be performed on pv lists for either 4KB or + * 2MB page mappings. + */ +static __inline pv_entry_t +pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) +{ + pv_entry_t pv; + + TAILQ_FOREACH(pv, &pvh->pv_list, pv_link) { +#ifdef INVARIANTS + if (PV_PMAP(pv) == NULL) { + printf("corrupted pv_chunk/pv %p\n", pv); + printf("pv_chunk: %64D\n", pv_to_chunk(pv), ":"); + } + MPASS(PV_PMAP(pv) != NULL); + MPASS(pv->pv_va != 0); +#endif + if (pmap == PV_PMAP(pv) && va == pv->pv_va) { + TAILQ_REMOVE(&pvh->pv_list, pv, pv_link); + pvh->pv_gen++; + break; + } + } + return (pv); +} + +/* + * After demotion from a 2MB page mapping to 512 4KB page mappings, + * destroy the pv entry for the 2MB page mapping and reinstantiate the pv + * entries for each of the 4KB page mappings. + */ +static void +pmap_pv_demote_l3e(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, + struct rwlock **lockp) +{ + struct md_page *pvh; + struct pv_chunk *pc; + pv_entry_t pv; + vm_offset_t va_last; + vm_page_t m; + int bit, field; + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + KASSERT((pa & L3_PAGE_MASK) == 0, + ("pmap_pv_demote_pde: pa is not 2mpage aligned")); + CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); + + /* + * Transfer the 2mpage's pv entry for this mapping to the first + * page's pv list. Once this transfer begins, the pv list lock + * must not be released until the last pv entry is reinstantiated. + */ + pvh = pa_to_pvh(pa); + va = trunc_2mpage(va); + pv = pmap_pvh_remove(pvh, pmap, va); + KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found")); + m = PHYS_TO_VM_PAGE(pa); + TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_link); + + m->md.pv_gen++; + /* Instantiate the remaining NPTEPG - 1 pv entries. */ + PV_STAT(atomic_add_long(&pv_entry_allocs, NPTEPG - 1)); + va_last = va + L3_PAGE_SIZE - PAGE_SIZE; + for (;;) { + pc = TAILQ_FIRST(&pmap->pm_pvchunk); + KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0 + , ("pmap_pv_demote_pde: missing spare")); + for (field = 0; field < _NPCM; field++) { + while (pc->pc_map[field]) { + bit = cnttzd(pc->pc_map[field]); + pc->pc_map[field] &= ~(1ul << bit); + pv = &pc->pc_pventry[field * 64 + bit]; + va += PAGE_SIZE; + pv->pv_va = va; + m++; + KASSERT((m->oflags & VPO_UNMANAGED) == 0, + ("pmap_pv_demote_pde: page %p is not managed", m)); + TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_link); + + m->md.pv_gen++; + if (va == va_last) + goto out; + } + } + TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); + TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); + } +out: + if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0) { + TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); + TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); + } + PV_STAT(atomic_add_long(&pv_entry_count, NPTEPG - 1)); + PV_STAT(atomic_subtract_int(&pv_entry_spare, NPTEPG - 1)); +} + +static void +reclaim_pv_chunk_leave_pmap(pmap_t pmap, pmap_t locked_pmap, bool start_di, + epoch_tracker_t et) +{ + + if (pmap == NULL) + return; + pmap_invalidate_all(pmap); + if (pmap != locked_pmap) + PMAP_UNLOCK(pmap); + if (start_di) + pmap_delayed_invl_finished(et); +} + +/* + * We are in a serious low memory condition. Resort to + * drastic measures to free some pages so we can allocate + * another pv entry chunk. + * + * Returns NULL if PV entries were reclaimed from the specified pmap. + * + * We do not, however, unmap 2mpages because subsequent accesses will + * allocate per-page pv entries until repromotion occurs, thereby + * exacerbating the shortage of free pv entries. + */ +static int active_reclaims = 0; +static vm_page_t +reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp) +{ + struct pv_chunk *pc, *pc_marker, *pc_marker_end; + struct pv_chunk_header pc_marker_b, pc_marker_end_b; + struct md_page *pvh; + pml3_entry_t *l3e; + pmap_t next_pmap, pmap; + pt_entry_t *pte, tpte; + pv_entry_t pv; + vm_offset_t va; + vm_page_t m, m_pc; + struct spglist free; + uint64_t inuse; + int bit, field, freed; + bool start_di; + struct epoch_tracker et; + + PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED); + KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL")); + pmap = NULL; + m_pc = NULL; + SLIST_INIT(&free); + bzero(&pc_marker_b, sizeof(pc_marker_b)); + bzero(&pc_marker_end_b, sizeof(pc_marker_end_b)); + pc_marker = (struct pv_chunk *)&pc_marker_b; + pc_marker_end = (struct pv_chunk *)&pc_marker_end_b; + + /* + * A delayed invalidation block should already be active if + * pmap_advise() or pmap_remove() called this function by way + * of pmap_demote_l3e_locked(). + */ + start_di = pmap_not_in_di(); + + mtx_lock(&pv_chunks_mutex); + active_reclaims++; + TAILQ_INSERT_HEAD(&pv_chunks, pc_marker, pc_lru); + TAILQ_INSERT_TAIL(&pv_chunks, pc_marker_end, pc_lru); + while ((pc = TAILQ_NEXT(pc_marker, pc_lru)) != pc_marker_end && + SLIST_EMPTY(&free)) { + next_pmap = pc->pc_pmap; + if (next_pmap == NULL) { + /* + * The next chunk is a marker. However, it is + * not our marker, so active_reclaims must be + * > 1. Consequently, the next_chunk code + * will not rotate the pv_chunks list. + */ + goto next_chunk; + } + mtx_unlock(&pv_chunks_mutex); + + /* + * A pv_chunk can only be removed from the pc_lru list + * when both pc_chunks_mutex is owned and the + * corresponding pmap is locked. + */ + if (pmap != next_pmap) { + reclaim_pv_chunk_leave_pmap(pmap, locked_pmap, + start_di, &et); + pmap = next_pmap; + /* Avoid deadlock and lock recursion. */ + if (pmap > locked_pmap) { + RELEASE_PV_LIST_LOCK(lockp); + PMAP_LOCK(pmap); + if (start_di) + pmap_delayed_invl_started(&et); + mtx_lock(&pv_chunks_mutex); + continue; + } else if (pmap != locked_pmap) { + if (PMAP_TRYLOCK(pmap)) { + if (start_di) + pmap_delayed_invl_started(&et); + mtx_lock(&pv_chunks_mutex); + continue; + } else { + pmap = NULL; /* pmap is not locked */ + mtx_lock(&pv_chunks_mutex); + pc = TAILQ_NEXT(pc_marker, pc_lru); + if (pc == NULL || + pc->pc_pmap != next_pmap) + continue; + goto next_chunk; + } + } else if (start_di) + pmap_delayed_invl_started(&et); + } + + /* + * Destroy every non-wired, 4 KB page mapping in the chunk. + */ + freed = 0; + for (field = 0; field < _NPCM; field++) { + for (inuse = ~pc->pc_map[field] & pc_freemask[field]; + inuse != 0; inuse &= ~(1UL << bit)) { + bit = cnttzd(inuse); + pv = &pc->pc_pventry[field * 64 + bit]; + va = pv->pv_va; + l3e = pmap_pml3e(pmap, va); + if ((*l3e & RPTE_LEAF) != 0) + continue; + pte = pmap_l3e_to_pte(l3e, va); + if ((*pte & PG_W) != 0) + continue; + tpte = pte_load_clear(pte); + m = PHYS_TO_VM_PAGE(tpte & PG_FRAME); + if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) + vm_page_dirty(m); + if ((tpte & PG_A) != 0) + vm_page_aflag_set(m, PGA_REFERENCED); + CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); + TAILQ_REMOVE(&m->md.pv_list, pv, pv_link); + + m->md.pv_gen++; + if (TAILQ_EMPTY(&m->md.pv_list) && + (m->flags & PG_FICTITIOUS) == 0) { + pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); + if (TAILQ_EMPTY(&pvh->pv_list)) { + vm_page_aflag_clear(m, + PGA_WRITEABLE); + } + } + pc->pc_map[field] |= 1UL << bit; + pmap_unuse_pt(pmap, va, *l3e, &free); + freed++; + } + } + if (freed == 0) { + mtx_lock(&pv_chunks_mutex); + goto next_chunk; + } + /* Every freed mapping is for a 4 KB page. */ + pmap_resident_count_dec(pmap, freed); + PV_STAT(atomic_add_long(&pv_entry_frees, freed)); + PV_STAT(atomic_add_int(&pv_entry_spare, freed)); + PV_STAT(atomic_subtract_long(&pv_entry_count, freed)); + TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); + if (pc->pc_map[0] == PC_FREE0 && pc->pc_map[1] == PC_FREE1) { + PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV)); + PV_STAT(atomic_subtract_int(&pc_chunk_count, 1)); + PV_STAT(atomic_add_int(&pc_chunk_frees, 1)); + /* Entire chunk is free; return it. */ + m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); + mtx_lock(&pv_chunks_mutex); + TAILQ_REMOVE(&pv_chunks, pc, pc_lru); + break; + } + TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); + mtx_lock(&pv_chunks_mutex); + /* One freed pv entry in locked_pmap is sufficient. */ + if (pmap == locked_pmap) + break; +next_chunk: + TAILQ_REMOVE(&pv_chunks, pc_marker, pc_lru); + TAILQ_INSERT_AFTER(&pv_chunks, pc, pc_marker, pc_lru); + if (active_reclaims == 1 && pmap != NULL) { + /* + * Rotate the pv chunks list so that we do not + * scan the same pv chunks that could not be + * freed (because they contained a wired + * and/or superpage mapping) on every + * invocation of reclaim_pv_chunk(). + */ + while ((pc = TAILQ_FIRST(&pv_chunks)) != pc_marker) { + MPASS(pc->pc_pmap != NULL); + TAILQ_REMOVE(&pv_chunks, pc, pc_lru); + TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); + } + } + } + TAILQ_REMOVE(&pv_chunks, pc_marker, pc_lru); + TAILQ_REMOVE(&pv_chunks, pc_marker_end, pc_lru); + active_reclaims--; + mtx_unlock(&pv_chunks_mutex); + reclaim_pv_chunk_leave_pmap(pmap, locked_pmap, start_di, &et); + if (m_pc == NULL && !SLIST_EMPTY(&free)) { + m_pc = SLIST_FIRST(&free); + SLIST_REMOVE_HEAD(&free, plinks.s.ss); + /* Recycle a freed page table page. */ + m_pc->ref_count = 1; + } + vm_page_free_pages_toq(&free, true); + return (m_pc); +} + +/* + * free the pv_entry back to the free list + */ +static void +free_pv_entry(pmap_t pmap, pv_entry_t pv) +{ + struct pv_chunk *pc; + int idx, field, bit; + +#ifdef VERBOSE_PV + if (pmap != kernel_pmap) + printf("%s(%p, %p)\n", __func__, pmap, pv); +#endif + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + PV_STAT(atomic_add_long(&pv_entry_frees, 1)); + PV_STAT(atomic_add_int(&pv_entry_spare, 1)); + PV_STAT(atomic_subtract_long(&pv_entry_count, 1)); + pc = pv_to_chunk(pv); + idx = pv - &pc->pc_pventry[0]; + field = idx / 64; + bit = idx % 64; + pc->pc_map[field] |= 1ul << bit; + if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1) { + /* 98% of the time, pc is already at the head of the list. */ + if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) { + TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); + TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); + } + return; + } + TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); + free_pv_chunk(pc); +} + +static void +free_pv_chunk(struct pv_chunk *pc) +{ + vm_page_t m; + + mtx_lock(&pv_chunks_mutex); + TAILQ_REMOVE(&pv_chunks, pc, pc_lru); + mtx_unlock(&pv_chunks_mutex); + PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV)); + PV_STAT(atomic_subtract_int(&pc_chunk_count, 1)); + PV_STAT(atomic_add_int(&pc_chunk_frees, 1)); + /* entire chunk is free, return it */ + m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); + vm_page_unwire_noq(m); + vm_page_free(m); +} + +/* + * Returns a new PV entry, allocating a new PV chunk from the system when + * needed. If this PV chunk allocation fails and a PV list lock pointer was + * given, a PV chunk is reclaimed from an arbitrary pmap. Otherwise, NULL is + * returned. + * + * The given PV list lock may be released. + */ +static pv_entry_t +get_pv_entry(pmap_t pmap, struct rwlock **lockp) +{ + int bit, field; + pv_entry_t pv; + struct pv_chunk *pc; + vm_page_t m; + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + PV_STAT(atomic_add_long(&pv_entry_allocs, 1)); +retry: + pc = TAILQ_FIRST(&pmap->pm_pvchunk); + if (pc != NULL) { + for (field = 0; field < _NPCM; field++) { + if (pc->pc_map[field]) { + bit = cnttzd(pc->pc_map[field]); + break; + } + } + if (field < _NPCM) { + pv = &pc->pc_pventry[field * 64 + bit]; + pc->pc_map[field] &= ~(1ul << bit); + /* If this was the last item, move it to tail */ + if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0) { + TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); + TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, + pc_list); + } + PV_STAT(atomic_add_long(&pv_entry_count, 1)); + PV_STAT(atomic_subtract_int(&pv_entry_spare, 1)); + MPASS(PV_PMAP(pv) != NULL); + return (pv); + } + } + /* No free items, allocate another chunk */ + m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | + VM_ALLOC_WIRED); + if (m == NULL) { + if (lockp == NULL) { + PV_STAT(pc_chunk_tryfail++); + return (NULL); + } + m = reclaim_pv_chunk(pmap, lockp); + if (m == NULL) + goto retry; + } + PV_STAT(atomic_add_int(&pc_chunk_count, 1)); + PV_STAT(atomic_add_int(&pc_chunk_allocs, 1)); + pc = (void *)PHYS_TO_DMAP(m->phys_addr); + pc->pc_pmap = pmap; + pc->pc_map[0] = PC_FREE0 & ~1ul; /* preallocated bit 0 */ + pc->pc_map[1] = PC_FREE1; + mtx_lock(&pv_chunks_mutex); + TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); + mtx_unlock(&pv_chunks_mutex); + pv = &pc->pc_pventry[0]; + TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); + PV_STAT(atomic_add_long(&pv_entry_count, 1)); + PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1)); + MPASS(PV_PMAP(pv) != NULL); + return (pv); +} + +#if VM_NRESERVLEVEL > 0 +/* + * After promotion from 512 4KB page mappings to a single 2MB page mapping, + * replace the many pv entries for the 4KB page mappings by a single pv entry + * for the 2MB page mapping. + */ +static void +pmap_pv_promote_l3e(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, + struct rwlock **lockp) +{ + struct md_page *pvh; + pv_entry_t pv; + vm_offset_t va_last; + vm_page_t m; + + KASSERT((pa & L3_PAGE_MASK) == 0, + ("pmap_pv_promote_pde: pa is not 2mpage aligned")); + CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); + + /* + * Transfer the first page's pv entry for this mapping to the 2mpage's + * pv list. Aside from avoiding the cost of a call to get_pv_entry(), + * a transfer avoids the possibility that get_pv_entry() calls + * reclaim_pv_chunk() and that reclaim_pv_chunk() removes one of the + * mappings that is being promoted. + */ + m = PHYS_TO_VM_PAGE(pa); + va = trunc_2mpage(va); + pv = pmap_pvh_remove(&m->md, pmap, va); + KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found")); + pvh = pa_to_pvh(pa); + TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_link); + pvh->pv_gen++; + /* Free the remaining NPTEPG - 1 pv entries. */ + va_last = va + L3_PAGE_SIZE - PAGE_SIZE; + do { + m++; + va += PAGE_SIZE; + pmap_pvh_free(&m->md, pmap, va); + } while (va < va_last); +} +#endif /* VM_NRESERVLEVEL > 0 */ + +/* + * First find and then destroy the pv entry for the specified pmap and virtual + * address. This operation can be performed on pv lists for either 4KB or 2MB + * page mappings. + */ +static void +pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) +{ + pv_entry_t pv; + + pv = pmap_pvh_remove(pvh, pmap, va); + KASSERT(pv != NULL, ("pmap_pvh_free: pv not found")); + free_pv_entry(pmap, pv); +} + +/* + * Conditionally create the PV entry for a 4KB page mapping if the required + * memory can be allocated without resorting to reclamation. + */ +static boolean_t +pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m, + struct rwlock **lockp) +{ + pv_entry_t pv; + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + /* Pass NULL instead of the lock pointer to disable reclamation. */ + if ((pv = get_pv_entry(pmap, NULL)) != NULL) { + pv->pv_va = va; + CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); + TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_link); + m->md.pv_gen++; + return (TRUE); + } else + return (FALSE); +} + +vm_paddr_t phys_avail_debug[2 * VM_PHYSSEG_MAX]; +#ifdef INVARIANTS +static void +validate_addr(vm_paddr_t addr, vm_size_t size) +{ + vm_paddr_t end = addr + size; + bool found = false; + + for (int i = 0; i < 2 * phys_avail_count; i += 2) { + if (addr >= phys_avail_debug[i] && + end <= phys_avail_debug[i + 1]) { + found = true; + break; + } + } + KASSERT(found, ("%#lx-%#lx outside of initial phys_avail array", + addr, end)); +} +#else +static void validate_addr(vm_paddr_t addr, vm_size_t size) {} +#endif +#define DMAP_PAGE_BITS (RPTE_VALID | RPTE_LEAF | RPTE_EAA_MASK | PG_M | PG_A) + +static vm_paddr_t +alloc_pt_page(void) +{ + vm_paddr_t page; + + page = allocpages(1); + pagezero(PHYS_TO_DMAP(page)); + return (page); +} + +static void +mmu_radix_dmap_range(vm_paddr_t start, vm_paddr_t end) +{ + pt_entry_t *pte, pteval; + vm_paddr_t page; + + if (bootverbose) + printf("%s %lx -> %lx\n", __func__, start, end); + while (start < end) { + pteval = start | DMAP_PAGE_BITS; + pte = pmap_pml1e(kernel_pmap, PHYS_TO_DMAP(start)); + if ((*pte & RPTE_VALID) == 0) { + page = alloc_pt_page(); + pde_store(pte, page); + } + pte = pmap_l1e_to_l2e(pte, PHYS_TO_DMAP(start)); + if ((start & L2_PAGE_MASK) == 0 && + end - start >= L2_PAGE_SIZE) { + start += L2_PAGE_SIZE; + goto done; + } else if ((*pte & RPTE_VALID) == 0) { + page = alloc_pt_page(); + pde_store(pte, page); + } + + pte = pmap_l2e_to_l3e(pte, PHYS_TO_DMAP(start)); + if ((start & L3_PAGE_MASK) == 0 && + end - start >= L3_PAGE_SIZE) { + start += L3_PAGE_SIZE; + goto done; + } else if ((*pte & RPTE_VALID) == 0) { + page = alloc_pt_page(); + pde_store(pte, page); + } + pte = pmap_l3e_to_pte(pte, PHYS_TO_DMAP(start)); + start += PAGE_SIZE; + done: + pte_store(pte, pteval); + } +} + +static void +mmu_radix_dmap_populate(vm_size_t hwphyssz) +{ + vm_paddr_t start, end; + + for (int i = 0; i < pregions_sz; i++) { + start = pregions[i].mr_start; + end = start + pregions[i].mr_size; + if (hwphyssz && start >= hwphyssz) + break; + if (hwphyssz && hwphyssz < end) + end = hwphyssz; + mmu_radix_dmap_range(start, end); + } +} + +static void +mmu_radix_setup_pagetables(vm_size_t hwphyssz) +{ + vm_paddr_t ptpages, pages; + pt_entry_t *pte; + vm_paddr_t l1phys; + + bzero(kernel_pmap, sizeof(struct pmap)); + PMAP_LOCK_INIT(kernel_pmap); + + ptpages = allocpages(2); + l1phys = moea64_bootstrap_alloc(RADIX_PGD_SIZE, RADIX_PGD_SIZE); + validate_addr(l1phys, RADIX_PGD_SIZE); + if (bootverbose) + printf("l1phys=%lx\n", l1phys); + MPASS((l1phys & (RADIX_PGD_SIZE-1)) == 0); + for (int i = 0; i < RADIX_PGD_SIZE/PAGE_SIZE; i++) + pagezero(PHYS_TO_DMAP(l1phys + i * PAGE_SIZE)); + kernel_pmap->pm_pml1 = (pml1_entry_t *)PHYS_TO_DMAP(l1phys); + + mmu_radix_dmap_populate(hwphyssz); + + /* + * Create page tables for first 128MB of KVA + */ + pages = ptpages; + pte = pmap_pml1e(kernel_pmap, VM_MIN_KERNEL_ADDRESS); + *pte = (pages | RPTE_VALID | RPTE_SHIFT); + pages += PAGE_SIZE; + pte = pmap_l1e_to_l2e(pte, VM_MIN_KERNEL_ADDRESS); + *pte = (pages | RPTE_VALID | RPTE_SHIFT); + pages += PAGE_SIZE; + pte = pmap_l2e_to_l3e(pte, VM_MIN_KERNEL_ADDRESS); + /* + * the kernel page table pages need to be preserved in + * phys_avail and not overlap with previous allocations + */ + pages = allocpages(nkpt); + if (bootverbose) { + printf("phys_avail after dmap populate and nkpt allocation\n"); + for (int j = 0; j < 2 * phys_avail_count; j+=2) + printf("phys_avail[%d]=%08lx - phys_avail[%d]=%08lx\n", + j, phys_avail[j], j + 1, phys_avail[j + 1]); + } + KPTphys = pages; + for (int i = 0; i < nkpt; i++, pte++, pages += PAGE_SIZE) + *pte = (pages | RPTE_VALID | RPTE_SHIFT); + kernel_vm_end = VM_MIN_KERNEL_ADDRESS + nkpt * L3_PAGE_SIZE; + if (bootverbose) + printf("kernel_pmap pml1 %p\n", kernel_pmap->pm_pml1); + /* + * Add a physical memory segment (vm_phys_seg) corresponding to the + * preallocated kernel page table pages so that vm_page structures + * representing these pages will be created. The vm_page structures + * are required for promotion of the corresponding kernel virtual + * addresses to superpage mappings. + */ + vm_phys_add_seg(KPTphys, KPTphys + ptoa(nkpt)); +} + +static void +mmu_radix_early_bootstrap(vm_offset_t start, vm_offset_t end) +{ + vm_paddr_t kpstart, kpend; + vm_size_t physsz, hwphyssz; + //uint64_t l2virt; + int rm_pavail, proctab_size; + int i, j; + + kpstart = start & ~DMAP_BASE_ADDRESS; + kpend = end & ~DMAP_BASE_ADDRESS; + + /* Get physical memory regions from firmware */ + mem_regions(&pregions, &pregions_sz, ®ions, ®ions_sz); + CTR0(KTR_PMAP, "mmu_radix_early_bootstrap: physical memory"); + + if (2 * VM_PHYSSEG_MAX < regions_sz) + panic("mmu_radix_early_bootstrap: phys_avail too small"); + + if (bootverbose) + for (int i = 0; i < regions_sz; i++) + printf("regions[%d].mr_start=%lx regions[%d].mr_size=%lx\n", + i, regions[i].mr_start, i, regions[i].mr_size); + /* + * XXX workaround a simulator bug + */ + for (int i = 0; i < regions_sz; i++) + if (regions[i].mr_start & PAGE_MASK) { + regions[i].mr_start += PAGE_MASK; + regions[i].mr_start &= ~PAGE_MASK; + regions[i].mr_size &= ~PAGE_MASK; + } + if (bootverbose) + for (int i = 0; i < pregions_sz; i++) + printf("pregions[%d].mr_start=%lx pregions[%d].mr_size=%lx\n", + i, pregions[i].mr_start, i, pregions[i].mr_size); + + phys_avail_count = 0; + physsz = 0; + hwphyssz = 0; + TUNABLE_ULONG_FETCH("hw.physmem", (u_long *) &hwphyssz); + for (i = 0, j = 0; i < regions_sz; i++) { + if (bootverbose) + printf("regions[%d].mr_start=%016lx regions[%d].mr_size=%016lx\n", + i, regions[i].mr_start, i, regions[i].mr_size); + + if (regions[i].mr_size < PAGE_SIZE) + continue; + + if (hwphyssz != 0 && + (physsz + regions[i].mr_size) >= hwphyssz) { + if (physsz < hwphyssz) { + phys_avail[j] = regions[i].mr_start; + phys_avail[j + 1] = regions[i].mr_start + + (hwphyssz - physsz); + physsz = hwphyssz; + phys_avail_count++; + dump_avail[j] = phys_avail[j]; + dump_avail[j + 1] = phys_avail[j + 1]; + } + break; + } + phys_avail[j] = regions[i].mr_start; + phys_avail[j + 1] = regions[i].mr_start + regions[i].mr_size; + dump_avail[j] = phys_avail[j]; + dump_avail[j + 1] = phys_avail[j + 1]; + + phys_avail_count++; + physsz += regions[i].mr_size; + j += 2; + } + + /* Check for overlap with the kernel and exception vectors */ + rm_pavail = 0; + for (j = 0; j < 2 * phys_avail_count; j+=2) { + if (phys_avail[j] < EXC_LAST) + phys_avail[j] += EXC_LAST; + + if (phys_avail[j] >= kpstart && + phys_avail[j + 1] <= kpend) { + phys_avail[j] = phys_avail[j + 1] = ~0; + rm_pavail++; + continue; + } + + if (kpstart >= phys_avail[j] && + kpstart < phys_avail[j + 1]) { + if (kpend < phys_avail[j + 1]) { + phys_avail[2 * phys_avail_count] = + (kpend & ~PAGE_MASK) + PAGE_SIZE; + phys_avail[2 * phys_avail_count + 1] = + phys_avail[j + 1]; + phys_avail_count++; + } + + phys_avail[j + 1] = kpstart & ~PAGE_MASK; + } + + if (kpend >= phys_avail[j] && + kpend < phys_avail[j + 1]) { + if (kpstart > phys_avail[j]) { + phys_avail[2 * phys_avail_count] = phys_avail[j]; + phys_avail[2 * phys_avail_count + 1] = + kpstart & ~PAGE_MASK; + phys_avail_count++; + } + + phys_avail[j] = (kpend & ~PAGE_MASK) + + PAGE_SIZE; + } + } + qsort(phys_avail, 2 * phys_avail_count, sizeof(phys_avail[0]), pa_cmp); + for (i = 0; i < 2 * phys_avail_count; i++) + phys_avail_debug[i] = phys_avail[i]; + + /* Remove physical available regions marked for removal (~0) */ + if (rm_pavail) { + phys_avail_count -= rm_pavail; + for (i = 2 * phys_avail_count; + i < 2*(phys_avail_count + rm_pavail); i+=2) + phys_avail[i] = phys_avail[i + 1] = 0; + } + if (bootverbose) { + printf("phys_avail ranges after filtering:\n"); + for (j = 0; j < 2 * phys_avail_count; j+=2) + printf("phys_avail[%d]=%08lx - phys_avail[%d]=%08lx\n", + j, phys_avail[j], j + 1, phys_avail[j + 1]); + } + physmem = btoc(physsz); + + /* XXX assume we're running non-virtualized and + * we don't support BHYVE + */ + if (isa3_pid_bits == 0) + isa3_pid_bits = 20; + parttab_phys = moea64_bootstrap_alloc(PARTTAB_SIZE, PARTTAB_SIZE); + validate_addr(parttab_phys, PARTTAB_SIZE); + for (int i = 0; i < PARTTAB_SIZE/PAGE_SIZE; i++) + pagezero(PHYS_TO_DMAP(parttab_phys + i * PAGE_SIZE)); + + proctab_size = 1UL << PROCTAB_SIZE_SHIFT; + proctab0pa = moea64_bootstrap_alloc(proctab_size, proctab_size); + validate_addr(proctab0pa, proctab_size); + for (int i = 0; i < proctab_size/PAGE_SIZE; i++) + pagezero(PHYS_TO_DMAP(proctab0pa + i * PAGE_SIZE)); + + mmu_radix_setup_pagetables(hwphyssz); +} + +static void +mmu_radix_late_bootstrap(mmu_t mmu, vm_offset_t start, vm_offset_t end) +{ + int i; + vm_paddr_t pa; + void *dpcpu; + vm_offset_t va; + + /* + * Set up the Open Firmware pmap and add its mappings if not in real + * mode. + */ + if (bootverbose) + printf("%s enter\n", __func__); + + /* + * Calculate the last available physical address, and reserve the + * vm_page_array (upper bound). + */ + Maxmem = 0; + for (i = 0; phys_avail[i + 2] != 0; i += 2) + Maxmem = MAX(Maxmem, powerpc_btop(phys_avail[i + 1])); + + /* + * Set the start and end of kva. + */ + virtual_avail = VM_MIN_KERNEL_ADDRESS; + virtual_end = VM_MAX_SAFE_KERNEL_ADDRESS; + + /* + * Remap any early IO mappings (console framebuffer, etc.) + */ + bs_remap_earlyboot(); + + /* + * Allocate a kernel stack with a guard page for thread0 and map it + * into the kernel page map. + */ + pa = allocpages(kstack_pages); + va = virtual_avail + KSTACK_GUARD_PAGES * PAGE_SIZE; + virtual_avail = va + kstack_pages * PAGE_SIZE; + CTR2(KTR_PMAP, "moea64_bootstrap: kstack0 at %#x (%#x)", pa, va); + thread0.td_kstack = va; + for (i = 0; i < kstack_pages; i++) { + mmu_radix_kenter(mmu, va, pa); + pa += PAGE_SIZE; + va += PAGE_SIZE; + } + thread0.td_kstack_pages = kstack_pages; + + /* + * Allocate virtual address space for the message buffer. + */ + pa = msgbuf_phys = allocpages((msgbufsize + PAGE_MASK) >> PAGE_SHIFT); + msgbufp = (struct msgbuf *)PHYS_TO_DMAP(pa); + + /* + * Allocate virtual address space for the dynamic percpu area. + */ + pa = allocpages(DPCPU_SIZE >> PAGE_SHIFT); + dpcpu = (void *)PHYS_TO_DMAP(pa); + dpcpu_init(dpcpu, curcpu); + /* + * Reserve some special page table entries/VA space for temporary + * mapping of pages. + */ +} + +static void +mmu_parttab_init(void) +{ + uint64_t ptcr; + + isa3_parttab = (struct pate *)PHYS_TO_DMAP(parttab_phys); + + if (bootverbose) + printf("%s parttab: %p\n", __func__, isa3_parttab); + ptcr = parttab_phys | (PARTTAB_SIZE_SHIFT-12); + if (bootverbose) + printf("setting ptcr %lx\n", ptcr); + mtspr(SPR_PTCR, ptcr); +} + +static void +mmu_parttab_update(uint64_t lpid, uint64_t pagetab, uint64_t proctab) +{ + uint64_t prev; + + if (bootverbose) + printf("%s isa3_parttab %p lpid %lx pagetab %lx proctab %lx\n", __func__, isa3_parttab, + lpid, pagetab, proctab); + prev = be64toh(isa3_parttab[lpid].pagetab); + isa3_parttab[lpid].pagetab = htobe64(pagetab); + isa3_parttab[lpid].proctab = htobe64(proctab); + + if (prev & PARTTAB_HR) { + __asm __volatile(PPC_TLBIE_5(%0,%1,2,0,1) : : + "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid)); + __asm __volatile(PPC_TLBIE_5(%0,%1,2,1,1) : : + "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid)); + } else { + __asm __volatile(PPC_TLBIE_5(%0,%1,2,0,0) : : + "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid)); + } + ttusync(); +} + +static void +mmu_radix_parttab_init(void) +{ + uint64_t pagetab; + + mmu_parttab_init(); + pagetab = RTS_SIZE | DMAP_TO_PHYS((vm_offset_t)kernel_pmap->pm_pml1) | \ + RADIX_PGD_INDEX_SHIFT | PARTTAB_HR; + mmu_parttab_update(0, pagetab, 0); +} + +static void +mmu_radix_proctab_register(vm_paddr_t proctabpa, uint64_t table_size) +{ + uint64_t pagetab, proctab; + + pagetab = be64toh(isa3_parttab[0].pagetab); + proctab = proctabpa | table_size | PARTTAB_GR; + mmu_parttab_update(0, pagetab, proctab); +} + +static void +mmu_radix_proctab_init(void) +{ + + isa3_base_pid = 1; + + isa3_proctab = (void*)PHYS_TO_DMAP(proctab0pa); + isa3_proctab->proctab0 = + htobe64(RTS_SIZE | DMAP_TO_PHYS((vm_offset_t)kernel_pmap->pm_pml1) | + RADIX_PGD_INDEX_SHIFT); + + mmu_radix_proctab_register(proctab0pa, PROCTAB_SIZE_SHIFT - 12); + + __asm __volatile("ptesync" : : : "memory"); + __asm __volatile(PPC_TLBIE_5(%0,%1,2,1,1) : : + "r" (TLBIEL_INVAL_SET_LPID), "r" (0)); + __asm __volatile("eieio; tlbsync; ptesync" : : : "memory"); + if (bootverbose) + printf("process table %p and kernel radix PDE: %p\n", + isa3_proctab, kernel_pmap->pm_pml1); + mtmsr(mfmsr() | PSL_DR ); + mtmsr(mfmsr() & ~PSL_DR); + kernel_pmap->pm_pid = isa3_base_pid; + isa3_base_pid++; +} + +void +mmu_radix_advise(mmu_t mmu, pmap_t pmap, vm_offset_t sva, vm_offset_t eva, + int advice) +{ + struct rwlock *lock; + pml1_entry_t *l1e; + pml2_entry_t *l2e; + pml3_entry_t oldl3e, *l3e; + pt_entry_t *pte; + vm_offset_t va, va_next; + vm_page_t m; + boolean_t anychanged; + struct epoch_tracker et; + + if (advice != MADV_DONTNEED && advice != MADV_FREE) + return; + anychanged = FALSE; + pmap_delayed_invl_started(&et); + PMAP_LOCK(pmap); + for (; sva < eva; sva = va_next) { + l1e = pmap_pml1e(pmap, sva); + if ((*l1e & PG_V) == 0) { + va_next = (sva + L1_PAGE_SIZE) & ~L1_PAGE_MASK; + if (va_next < sva) + va_next = eva; + continue; + } + l2e = pmap_l1e_to_l2e(l1e, sva); + if ((*l2e & PG_V) == 0) { + va_next = (sva + L2_PAGE_SIZE) & ~L2_PAGE_MASK; + if (va_next < sva) + va_next = eva; + continue; + } + va_next = (sva + L3_PAGE_SIZE) & ~L3_PAGE_MASK; + if (va_next < sva) + va_next = eva; + l3e = pmap_l2e_to_l3e(l2e, sva); + oldl3e = *l3e; + if ((oldl3e & PG_V) == 0) + continue; + else if ((oldl3e & RPTE_LEAF) != 0) { + if ((oldl3e & PG_MANAGED) == 0) + continue; + lock = NULL; + if (!pmap_demote_l3e_locked(pmap, l3e, sva, &lock)) { + if (lock != NULL) + rw_wunlock(lock); + + /* + * The large page mapping was destroyed. + */ + continue; + } + + /* + * Unless the page mappings are wired, remove the + * mapping to a single page so that a subsequent + * access may repromote. Since the underlying page + * table page is fully populated, this removal never + * frees a page table page. + */ + if ((oldl3e & PG_W) == 0) { + pte = pmap_l3e_to_pte(l3e, sva); + KASSERT((*pte & PG_V) != 0, + ("pmap_advise: invalid PTE")); + pmap_remove_pte(pmap, pte, sva, *l3e, NULL, + &lock); + anychanged = TRUE; + } + if (lock != NULL) + rw_wunlock(lock); + } + if (va_next > eva) + va_next = eva; + va = va_next; + for (pte = pmap_l3e_to_pte(l3e, sva); sva != va_next; + pte++, sva += PAGE_SIZE) { + MPASS(pte == pmap_pte(pmap, sva)); + + if ((*pte & (PG_MANAGED | PG_V)) != (PG_MANAGED | PG_V)) + goto maybe_invlrng; + else if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { + if (advice == MADV_DONTNEED) { + /* + * Future calls to pmap_is_modified() + * can be avoided by making the page + * dirty now. + */ + m = PHYS_TO_VM_PAGE(*pte & PG_FRAME); + vm_page_dirty(m); + } + atomic_clear_long(pte, PG_M | PG_A); + } else if ((*pte & PG_A) != 0) + atomic_clear_long(pte, PG_A); + else + goto maybe_invlrng; + anychanged = TRUE; + continue; +maybe_invlrng: + if (va != va_next) { + anychanged = true; + va = va_next; + } + } + if (va != va_next) + anychanged = true; + } + if (anychanged) + pmap_invalidate_all(pmap); + PMAP_UNLOCK(pmap); + pmap_delayed_invl_finished(&et); +} + +/* + * Routines used in machine-dependent code + */ +static void +mmu_radix_bootstrap(mmu_t mmu, vm_offset_t start, vm_offset_t end) +{ + uint64_t lpcr; + + if (bootverbose) + printf("%s\n", __func__); + hw_direct_map = 1; + mmu_radix_early_bootstrap(start, end); + if (bootverbose) + printf("early bootstrap complete\n"); + if (powernv_enabled) { + lpcr = mfspr(SPR_LPCR); + mtspr(SPR_LPCR, lpcr | LPCR_UPRT | LPCR_HR); + mmu_radix_parttab_init(); + mmu_radix_init_amor(); + if (bootverbose) + printf("powernv init complete\n"); + } + mmu_radix_init_iamr(); + mmu_radix_proctab_init(); + mmu_radix_pid_set(kernel_pmap); + /* XXX assume CPU_FTR_HVMODE */ + mmu_radix_tlbiel_flush(TLB_INVAL_SCOPE_GLOBAL); + + mmu_radix_late_bootstrap(mmu, start, end); + numa_mem_regions(&numa_pregions, &numa_pregions_sz); + if (bootverbose) + printf("%s done\n", __func__); + pmap_bootstrapped = 1; + dmaplimit = roundup2(powerpc_ptob(Maxmem), L2_PAGE_SIZE); +} + +static void +mmu_radix_cpu_bootstrap(mmu_t mmu, int ap) +{ + uint64_t lpcr; + uint64_t ptcr; + + if (powernv_enabled) { + lpcr = mfspr(SPR_LPCR); + mtspr(SPR_LPCR, lpcr | LPCR_UPRT | LPCR_HR); + + ptcr = parttab_phys | (PARTTAB_SIZE_SHIFT-12); + mtspr(SPR_PTCR, ptcr); + mmu_radix_init_amor(); + } + mmu_radix_init_iamr(); + mmu_radix_pid_set(kernel_pmap); + mmu_radix_tlbiel_flush(TLB_INVAL_SCOPE_GLOBAL); +} + +static SYSCTL_NODE(_vm_pmap, OID_AUTO, l3e, CTLFLAG_RD, 0, + "2MB page mapping counters"); + +static u_long pmap_l3e_demotions; +SYSCTL_ULONG(_vm_pmap_l3e, OID_AUTO, demotions, CTLFLAG_RD, + &pmap_l3e_demotions, 0, "2MB page demotions"); + +static u_long pmap_l3e_mappings; +SYSCTL_ULONG(_vm_pmap_l3e, OID_AUTO, mappings, CTLFLAG_RD, + &pmap_l3e_mappings, 0, "2MB page mappings"); + +static u_long pmap_l3e_p_failures; +SYSCTL_ULONG(_vm_pmap_l3e, OID_AUTO, p_failures, CTLFLAG_RD, + &pmap_l3e_p_failures, 0, "2MB page promotion failures"); + +static u_long pmap_l3e_promotions; +SYSCTL_ULONG(_vm_pmap_l3e, OID_AUTO, promotions, CTLFLAG_RD, + &pmap_l3e_promotions, 0, "2MB page promotions"); + +static SYSCTL_NODE(_vm_pmap, OID_AUTO, l2e, CTLFLAG_RD, 0, + "1GB page mapping counters"); + +static u_long pmap_l2e_demotions; +SYSCTL_ULONG(_vm_pmap_l2e, OID_AUTO, demotions, CTLFLAG_RD, + &pmap_l2e_demotions, 0, "1GB page demotions"); + +void +mmu_radix_clear_modify(mmu_t mmu, vm_page_t m) +{ + struct md_page *pvh; + pmap_t pmap; + pv_entry_t next_pv, pv; + pml3_entry_t oldl3e, *l3e; + pt_entry_t oldpte, *pte; + struct rwlock *lock; + vm_offset_t va; + int md_gen, pvh_gen; + + KASSERT((m->oflags & VPO_UNMANAGED) == 0, + ("pmap_clear_modify: page %p is not managed", m)); + vm_page_assert_busied(m); + CTR2(KTR_PMAP, "%s(%p)", __func__, m); + + /* + * If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set. + * If the object containing the page is locked and the page is not + * exclusive busied, then PGA_WRITEABLE cannot be concurrently set. + */ + if ((m->a.flags & PGA_WRITEABLE) == 0) + return; + pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : + pa_to_pvh(VM_PAGE_TO_PHYS(m)); + lock = VM_PAGE_TO_PV_LIST_LOCK(m); + rw_wlock(lock); +restart: + TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_link, next_pv) { + pmap = PV_PMAP(pv); + if (!PMAP_TRYLOCK(pmap)) { + pvh_gen = pvh->pv_gen; + rw_wunlock(lock); + PMAP_LOCK(pmap); + rw_wlock(lock); + if (pvh_gen != pvh->pv_gen) { + PMAP_UNLOCK(pmap); + goto restart; + } + } + va = pv->pv_va; + l3e = pmap_pml3e(pmap, va); + oldl3e = *l3e; + if ((oldl3e & PG_RW) != 0) { + if (pmap_demote_l3e_locked(pmap, l3e, va, &lock)) { + if ((oldl3e & PG_W) == 0) { + /* + * Write protect the mapping to a + * single page so that a subsequent + * write access may repromote. + */ + va += VM_PAGE_TO_PHYS(m) - (oldl3e & + PG_PS_FRAME); + pte = pmap_l3e_to_pte(l3e, va); + oldpte = *pte; + if ((oldpte & PG_V) != 0) { + while (!atomic_cmpset_long(pte, + oldpte, + (oldpte | RPTE_EAA_R) & ~(PG_M | PG_RW))) + oldpte = *pte; + vm_page_dirty(m); + pmap_invalidate_page(pmap, va); + } + } + } + } + PMAP_UNLOCK(pmap); + } + TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) { + pmap = PV_PMAP(pv); + if (!PMAP_TRYLOCK(pmap)) { + md_gen = m->md.pv_gen; + pvh_gen = pvh->pv_gen; + rw_wunlock(lock); + PMAP_LOCK(pmap); + rw_wlock(lock); + if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { + PMAP_UNLOCK(pmap); + goto restart; + } + } + l3e = pmap_pml3e(pmap, pv->pv_va); + KASSERT((*l3e & RPTE_LEAF) == 0, ("pmap_clear_modify: found" + " a 2mpage in page %p's pv list", m)); + pte = pmap_l3e_to_pte(l3e, pv->pv_va); + if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { + atomic_clear_long(pte, PG_M); + pmap_invalidate_page(pmap, pv->pv_va); + } + PMAP_UNLOCK(pmap); + } + rw_wunlock(lock); +} + +void +mmu_radix_copy(mmu_t mmu, pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, + vm_size_t len, vm_offset_t src_addr) +{ + struct rwlock *lock; + struct spglist free; + vm_offset_t addr; + vm_offset_t end_addr = src_addr + len; + vm_offset_t va_next; + vm_page_t dst_pdpg, dstmpte, srcmpte; + bool invalidate_all; + + CTR6(KTR_PMAP, + "%s(dst_pmap=%p, src_pmap=%p, dst_addr=%lx, len=%lu, src_addr=%lx)\n", + __func__, dst_pmap, src_pmap, dst_addr, len, src_addr); + + if (dst_addr != src_addr) + return; + lock = NULL; + invalidate_all = false; + if (dst_pmap < src_pmap) { + PMAP_LOCK(dst_pmap); + PMAP_LOCK(src_pmap); + } else { + PMAP_LOCK(src_pmap); + PMAP_LOCK(dst_pmap); + } + + for (addr = src_addr; addr < end_addr; addr = va_next) { + pml1_entry_t *l1e; + pml2_entry_t *l2e; + pml3_entry_t srcptepaddr, *l3e; + pt_entry_t *src_pte, *dst_pte; + + l1e = pmap_pml1e(src_pmap, addr); + if ((*l1e & PG_V) == 0) { + va_next = (addr + L1_PAGE_SIZE) & ~L1_PAGE_MASK; + if (va_next < addr) + va_next = end_addr; + continue; + } + + l2e = pmap_l1e_to_l2e(l1e, addr); + if ((*l2e & PG_V) == 0) { + va_next = (addr + L2_PAGE_SIZE) & ~L2_PAGE_MASK; + if (va_next < addr) + va_next = end_addr; + continue; + } + + va_next = (addr + L3_PAGE_SIZE) & ~L3_PAGE_MASK; + if (va_next < addr) + va_next = end_addr; + + l3e = pmap_l2e_to_l3e(l2e, addr); + srcptepaddr = *l3e; + if (srcptepaddr == 0) + continue; + + if (srcptepaddr & RPTE_LEAF) { + if ((addr & L3_PAGE_MASK) != 0 || + addr + L3_PAGE_SIZE > end_addr) + continue; + dst_pdpg = pmap_allocl3e(dst_pmap, addr, NULL); + if (dst_pdpg == NULL) + break; + l3e = (pml3_entry_t *) + PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dst_pdpg)); + l3e = &l3e[pmap_pml3e_index(addr)]; + if (*l3e == 0 && ((srcptepaddr & PG_MANAGED) == 0 || + pmap_pv_insert_l3e(dst_pmap, addr, srcptepaddr, + PMAP_ENTER_NORECLAIM, &lock))) { + *l3e = srcptepaddr & ~PG_W; + pmap_resident_count_inc(dst_pmap, + L3_PAGE_SIZE / PAGE_SIZE); + atomic_add_long(&pmap_l3e_mappings, 1); + } else + dst_pdpg->ref_count--; + continue; + } + + srcptepaddr &= PG_FRAME; + srcmpte = PHYS_TO_VM_PAGE(srcptepaddr); + KASSERT(srcmpte->ref_count > 0, + ("pmap_copy: source page table page is unused")); + + if (va_next > end_addr) + va_next = end_addr; + + src_pte = (pt_entry_t *)PHYS_TO_DMAP(srcptepaddr); + src_pte = &src_pte[pmap_pte_index(addr)]; + dstmpte = NULL; + while (addr < va_next) { + pt_entry_t ptetemp; + ptetemp = *src_pte; + /* + * we only virtual copy managed pages + */ + if ((ptetemp & PG_MANAGED) != 0) { + if (dstmpte != NULL && + dstmpte->pindex == pmap_l3e_pindex(addr)) + dstmpte->ref_count++; + else if ((dstmpte = pmap_allocpte(dst_pmap, + addr, NULL)) == NULL) + goto out; + dst_pte = (pt_entry_t *) + PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte)); + dst_pte = &dst_pte[pmap_pte_index(addr)]; + if (*dst_pte == 0 && + pmap_try_insert_pv_entry(dst_pmap, addr, + PHYS_TO_VM_PAGE(ptetemp & PG_FRAME), + &lock)) { + /* + * Clear the wired, modified, and + * accessed (referenced) bits + * during the copy. + */ + *dst_pte = ptetemp & ~(PG_W | PG_M | + PG_A); + pmap_resident_count_inc(dst_pmap, 1); + } else { + SLIST_INIT(&free); + if (pmap_unwire_ptp(dst_pmap, addr, + dstmpte, &free)) { + /* + * Although "addr" is not + * mapped, paging-structure + * caches could nonetheless + * have entries that refer to + * the freed page table pages. + * Invalidate those entries. + */ + invalidate_all = true; + vm_page_free_pages_toq(&free, + true); + } + goto out; + } + if (dstmpte->ref_count >= srcmpte->ref_count) + break; + } + addr += PAGE_SIZE; + if (__predict_false((addr & L3_PAGE_MASK) == 0)) + src_pte = pmap_pte(src_pmap, addr); + else + src_pte++; + } + } +out: + if (invalidate_all) + pmap_invalidate_all(dst_pmap); + if (lock != NULL) + rw_wunlock(lock); + PMAP_UNLOCK(src_pmap); + PMAP_UNLOCK(dst_pmap); +} + +static void +mmu_radix_copy_page(mmu_t mmu, vm_page_t msrc, vm_page_t mdst) +{ + vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc)); + vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst)); + + CTR3(KTR_PMAP, "%s(%p, %p)", __func__, src, dst); + /* + * XXX slow + */ + bcopy((void *)src, (void *)dst, PAGE_SIZE); +} + +static void +mmu_radix_copy_pages(mmu_t mmu, vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], + vm_offset_t b_offset, int xfersize) +{ + + CTR6(KTR_PMAP, "%s(%p, %#x, %p, %#x, %#x)", __func__, ma, + a_offset, mb, b_offset, xfersize); + UNIMPLEMENTED(); +} + +#if VM_NRESERVLEVEL > 0 +/* + * Tries to promote the 512, contiguous 4KB page mappings that are within a + * single page table page (PTP) to a single 2MB page mapping. For promotion + * to occur, two conditions must be met: (1) the 4KB page mappings must map + * aligned, contiguous physical memory and (2) the 4KB page mappings must have + * identical characteristics. + */ +static int +pmap_promote_l3e(pmap_t pmap, pml3_entry_t *pde, vm_offset_t va, + struct rwlock **lockp) +{ + pml3_entry_t newpde; + pt_entry_t *firstpte, oldpte, pa, *pte; + vm_page_t mpte; + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + + /* + * Examine the first PTE in the specified PTP. Abort if this PTE is + * either invalid, unused, or does not map the first 4KB physical page + * within a 2MB page. + */ + firstpte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME); +setpde: + newpde = *firstpte; + if ((newpde & ((PG_FRAME & L3_PAGE_MASK) | PG_A | PG_V)) != (PG_A | PG_V)) { + CTR2(KTR_PMAP, "pmap_promote_l3e: failure for va %#lx" + " in pmap %p", va, pmap); + goto fail; + } + if ((newpde & (PG_M | PG_RW)) == PG_RW) { + /* + * When PG_M is already clear, PG_RW can be cleared without + * a TLB invalidation. + */ + if (!atomic_cmpset_long(firstpte, newpde, (newpde | RPTE_EAA_R) & ~RPTE_EAA_W)) + goto setpde; + newpde &= ~RPTE_EAA_W; + } + + /* + * Examine each of the other PTEs in the specified PTP. Abort if this + * PTE maps an unexpected 4KB physical page or does not have identical + * characteristics to the first PTE. + */ + pa = (newpde & (PG_PS_FRAME | PG_A | PG_V)) + L3_PAGE_SIZE - PAGE_SIZE; + for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) { +setpte: + oldpte = *pte; + if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) { + CTR2(KTR_PMAP, "pmap_promote_l3e: failure for va %#lx" + " in pmap %p", va, pmap); + goto fail; + } + if ((oldpte & (PG_M | PG_RW)) == PG_RW) { + /* + * When PG_M is already clear, PG_RW can be cleared + * without a TLB invalidation. + */ + if (!atomic_cmpset_long(pte, oldpte, (oldpte | RPTE_EAA_R) & ~RPTE_EAA_W)) + goto setpte; + oldpte &= ~RPTE_EAA_W; + CTR2(KTR_PMAP, "pmap_promote_l3e: protect for va %#lx" + " in pmap %p", (oldpte & PG_FRAME & L3_PAGE_MASK) | + (va & ~L3_PAGE_MASK), pmap); + } + if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) { + CTR2(KTR_PMAP, "pmap_promote_l3e: failure for va %#lx" + " in pmap %p", va, pmap); + goto fail; + } + pa -= PAGE_SIZE; + } + + /* + * Save the page table page in its current state until the PDE + * mapping the superpage is demoted by pmap_demote_pde() or + * destroyed by pmap_remove_pde(). + */ + mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); + KASSERT(mpte >= vm_page_array && + mpte < &vm_page_array[vm_page_array_size], + ("pmap_promote_l3e: page table page is out of range")); + KASSERT(mpte->pindex == pmap_l3e_pindex(va), + ("pmap_promote_l3e: page table page's pindex is wrong")); + if (pmap_insert_pt_page(pmap, mpte)) { + CTR2(KTR_PMAP, + "pmap_promote_l3e: failure for va %#lx in pmap %p", va, + pmap); + goto fail; + } + + /* + * Promote the pv entries. + */ + if ((newpde & PG_MANAGED) != 0) + pmap_pv_promote_l3e(pmap, va, newpde & PG_PS_FRAME, lockp); + + pte_store(pde, PG_PROMOTED | newpde); + atomic_add_long(&pmap_l3e_promotions, 1); + CTR2(KTR_PMAP, "pmap_promote_l3e: success for va %#lx" + " in pmap %p", va, pmap); + return (0); + fail: + atomic_add_long(&pmap_l3e_p_failures, 1); + return (KERN_FAILURE); +} +#endif /* VM_NRESERVLEVEL > 0 */ + +int +mmu_radix_enter(mmu_t mmu, pmap_t pmap, vm_offset_t va, vm_page_t m, + vm_prot_t prot, u_int flags, int8_t psind) +{ + struct rwlock *lock; + pml3_entry_t *l3e; + pt_entry_t *pte; + pt_entry_t newpte, origpte; + pv_entry_t pv; + vm_paddr_t opa, pa; + vm_page_t mpte, om; + int rv, retrycount; + boolean_t nosleep, invalidate_all, invalidate_page; + + va = trunc_page(va); + retrycount = 0; + invalidate_page = invalidate_all = false; + CTR6(KTR_PMAP, "pmap_enter(%p, %#lx, %p, %#x, %#x, %d)", pmap, va, + m, prot, flags, psind); + KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig")); + KASSERT((m->oflags & VPO_UNMANAGED) != 0 || va < kmi.clean_sva || + va >= kmi.clean_eva, + ("pmap_enter: managed mapping within the clean submap")); + if ((m->oflags & VPO_UNMANAGED) == 0) + VM_PAGE_OBJECT_BUSY_ASSERT(m); + + KASSERT((flags & PMAP_ENTER_RESERVED) == 0, + ("pmap_enter: flags %u has reserved bits set", flags)); + pa = VM_PAGE_TO_PHYS(m); + newpte = (pt_entry_t)(pa | PG_A | PG_V | RPTE_LEAF); + if ((flags & VM_PROT_WRITE) != 0) + newpte |= PG_M; + if ((flags & VM_PROT_READ) != 0) + newpte |= PG_A; + if (prot & VM_PROT_READ) + newpte |= RPTE_EAA_R; + if ((prot & VM_PROT_WRITE) != 0) + newpte |= RPTE_EAA_W; + KASSERT((newpte & (PG_M | PG_RW)) != PG_M, + ("pmap_enter: flags includes VM_PROT_WRITE but prot doesn't")); + + if (prot & VM_PROT_EXECUTE) + newpte |= PG_X; + if ((flags & PMAP_ENTER_WIRED) != 0) + newpte |= PG_W; + if (va >= DMAP_MIN_ADDRESS) + newpte |= RPTE_EAA_P; + newpte |= pmap_cache_bits(m->md.mdpg_cache_attrs); + /* + * Set modified bit gratuitously for writeable mappings if + * the page is unmanaged. We do not want to take a fault + * to do the dirty bit accounting for these mappings. + */ + if ((m->oflags & VPO_UNMANAGED) != 0) { + if ((newpte & PG_RW) != 0) + newpte |= PG_M; + } else + newpte |= PG_MANAGED; + + lock = NULL; + PMAP_LOCK(pmap); + if (psind == 1) { + /* Assert the required virtual and physical alignment. */ + KASSERT((va & L3_PAGE_MASK) == 0, ("pmap_enter: va unaligned")); + KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind")); + rv = pmap_enter_l3e(pmap, va, newpte | RPTE_LEAF, flags, m, &lock); + goto out; + } + mpte = NULL; + + /* + * In the case that a page table page is not + * resident, we are creating it here. + */ +retry: + l3e = pmap_pml3e(pmap, va); + if (l3e != NULL && (*l3e & PG_V) != 0 && ((*l3e & RPTE_LEAF) == 0 || + pmap_demote_l3e_locked(pmap, l3e, va, &lock))) { + pte = pmap_l3e_to_pte(l3e, va); + if (va < VM_MAXUSER_ADDRESS && mpte == NULL) { + mpte = PHYS_TO_VM_PAGE(*l3e & PG_FRAME); + mpte->ref_count++; + } + } else if (va < VM_MAXUSER_ADDRESS) { + /* + * Here if the pte page isn't mapped, or if it has been + * deallocated. + */ + nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0; + mpte = _pmap_allocpte(pmap, pmap_l3e_pindex(va), + nosleep ? NULL : &lock); + if (mpte == NULL && nosleep) { + rv = KERN_RESOURCE_SHORTAGE; + goto out; + } + if (__predict_false(retrycount++ == 6)) + panic("too many retries"); + invalidate_all = true; + goto retry; + } else + panic("pmap_enter: invalid page directory va=%#lx", va); + + origpte = *pte; + pv = NULL; + + /* + * Is the specified virtual address already mapped? + */ + if ((origpte & PG_V) != 0) { +#ifdef INVARIANTS + if (VERBOSE_PMAP || pmap_logging) { + printf("cow fault pmap_enter(%p, %#lx, %p, %#x, %x, %d) --" + " asid=%lu curpid=%d name=%s origpte0x%lx\n", + pmap, va, m, prot, flags, psind, pmap->pm_pid, + curproc->p_pid, curproc->p_comm, origpte); + pmap_pte_walk(pmap->pm_pml1, va); + } +#endif + /* + * Wiring change, just update stats. We don't worry about + * wiring PT pages as they remain resident as long as there + * are valid mappings in them. Hence, if a user page is wired, + * the PT page will be also. + */ + if ((newpte & PG_W) != 0 && (origpte & PG_W) == 0) + pmap->pm_stats.wired_count++; + else if ((newpte & PG_W) == 0 && (origpte & PG_W) != 0) + pmap->pm_stats.wired_count--; + + /* + * Remove the extra PT page reference. + */ + if (mpte != NULL) { + mpte->ref_count--; + KASSERT(mpte->ref_count > 0, + ("pmap_enter: missing reference to page table page," + " va: 0x%lx", va)); + } + + /* + * Has the physical page changed? + */ + opa = origpte & PG_FRAME; + if (opa == pa) { + /* + * No, might be a protection or wiring change. + */ + if ((origpte & PG_MANAGED) != 0 && + (newpte & PG_RW) != 0) + vm_page_aflag_set(m, PGA_WRITEABLE); + if (((origpte ^ newpte) & ~(PG_M | PG_A)) == 0) { + if ((newpte & (PG_A|PG_M)) != (origpte & (PG_A|PG_M))) { + if (!atomic_cmpset_long(pte, origpte, newpte)) + goto retry; + if ((newpte & PG_M) != (origpte & PG_M)) + vm_page_dirty(m); + if ((newpte & PG_A) != (origpte & PG_A)) + vm_page_aflag_set(m, PGA_REFERENCED); + ptesync(); + } else + invalidate_all = true; + if (((origpte ^ newpte) & ~(PG_M | PG_A)) == 0) + goto unchanged; + } + goto validate; + } + + /* + * The physical page has changed. Temporarily invalidate + * the mapping. This ensures that all threads sharing the + * pmap keep a consistent view of the mapping, which is + * necessary for the correct handling of COW faults. It + * also permits reuse of the old mapping's PV entry, + * avoiding an allocation. + * + * For consistency, handle unmanaged mappings the same way. + */ + origpte = pte_load_clear(pte); + KASSERT((origpte & PG_FRAME) == opa, + ("pmap_enter: unexpected pa update for %#lx", va)); + if ((origpte & PG_MANAGED) != 0) { + om = PHYS_TO_VM_PAGE(opa); + + /* + * The pmap lock is sufficient to synchronize with + * concurrent calls to pmap_page_test_mappings() and + * pmap_ts_referenced(). + */ + if ((origpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) + vm_page_dirty(om); + if ((origpte & PG_A) != 0) + vm_page_aflag_set(om, PGA_REFERENCED); + CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa); + pv = pmap_pvh_remove(&om->md, pmap, va); + if ((newpte & PG_MANAGED) == 0) + free_pv_entry(pmap, pv); +#ifdef INVARIANTS + else if (origpte & PG_MANAGED) { + if (pv == NULL) { + pmap_page_print_mappings(om); + MPASS(pv != NULL); + } + } +#endif + if ((om->a.flags & PGA_WRITEABLE) != 0 && + TAILQ_EMPTY(&om->md.pv_list) && + ((om->flags & PG_FICTITIOUS) != 0 || + TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list))) + vm_page_aflag_clear(om, PGA_WRITEABLE); + } + if ((origpte & PG_A) != 0) + invalidate_page = true; + origpte = 0; + } else { + if (pmap != kernel_pmap) { +#ifdef INVARIANTS + if (VERBOSE_PMAP || pmap_logging) + printf("pmap_enter(%p, %#lx, %p, %#x, %x, %d) -- asid=%lu curpid=%d name=%s\n", + pmap, va, m, prot, flags, psind, + pmap->pm_pid, curproc->p_pid, + curproc->p_comm); +#endif + } + + /* + * Increment the counters. + */ + if ((newpte & PG_W) != 0) + pmap->pm_stats.wired_count++; + pmap_resident_count_inc(pmap, 1); + } + + /* + * Enter on the PV list if part of our managed memory. + */ + if ((newpte & PG_MANAGED) != 0) { + if (pv == NULL) { + pv = get_pv_entry(pmap, &lock); + pv->pv_va = va; + } +#ifdef VERBOSE_PV + else + printf("reassigning pv: %p to pmap: %p\n", + pv, pmap); +#endif + CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa); + TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_link); + m->md.pv_gen++; + if ((newpte & PG_RW) != 0) + vm_page_aflag_set(m, PGA_WRITEABLE); + } + + /* + * Update the PTE. + */ + if ((origpte & PG_V) != 0) { +validate: + origpte = pte_load_store(pte, newpte); + KASSERT((origpte & PG_FRAME) == pa, + ("pmap_enter: unexpected pa update for %#lx", va)); + if ((newpte & PG_M) == 0 && (origpte & (PG_M | PG_RW)) == + (PG_M | PG_RW)) { + if ((origpte & PG_MANAGED) != 0) + vm_page_dirty(m); + invalidate_page = true; + + /* + * Although the PTE may still have PG_RW set, TLB + * invalidation may nonetheless be required because + * the PTE no longer has PG_M set. + */ + } else if ((origpte & PG_X) != 0 || (newpte & PG_X) == 0) { + /* + * Removing capabilities requires invalidation on POWER + */ + invalidate_page = true; + goto unchanged; + } + if ((origpte & PG_A) != 0) + invalidate_page = true; + } else { + pte_store(pte, newpte); + ptesync(); + } +unchanged: + +#if VM_NRESERVLEVEL > 0 + /* + * If both the page table page and the reservation are fully + * populated, then attempt promotion. + */ + if ((mpte == NULL || mpte->ref_count == NPTEPG) && + mmu_radix_ps_enabled(mmu, pmap) && + (m->flags & PG_FICTITIOUS) == 0 && + vm_reserv_level_iffullpop(m) == 0 && + pmap_promote_l3e(pmap, l3e, va, &lock) == 0) + invalidate_all = true; +#endif + if (invalidate_all) + pmap_invalidate_all(pmap); + else if (invalidate_page) + pmap_invalidate_page(pmap, va); + + rv = KERN_SUCCESS; +out: + if (lock != NULL) + rw_wunlock(lock); + PMAP_UNLOCK(pmap); + + return (rv); +} + + +/* + * Tries to create a read- and/or execute-only 2MB page mapping. Returns true + * if successful. Returns false if (1) a page table page cannot be allocated + * without sleeping, (2) a mapping already exists at the specified virtual + * address, or (3) a PV entry cannot be allocated without reclaiming another + * PV entry. + */ +static bool +pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, + struct rwlock **lockp) +{ + pml3_entry_t newpde; + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.mdpg_cache_attrs) | + RPTE_LEAF | PG_V; + if ((m->oflags & VPO_UNMANAGED) == 0) + newpde |= PG_MANAGED; + if (prot & VM_PROT_EXECUTE) + newpde |= PG_X; + if (prot & VM_PROT_READ) + newpde |= RPTE_EAA_R; + if (va >= DMAP_MIN_ADDRESS) + newpde |= RPTE_EAA_P; + return (pmap_enter_l3e(pmap, va, newpde, PMAP_ENTER_NOSLEEP | + PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, NULL, lockp) == + KERN_SUCCESS); +} + +/* + * Tries to create the specified 2MB page mapping. Returns KERN_SUCCESS if + * the mapping was created, and either KERN_FAILURE or KERN_RESOURCE_SHORTAGE + * otherwise. Returns KERN_FAILURE if PMAP_ENTER_NOREPLACE was specified and + * a mapping already exists at the specified virtual address. Returns + * KERN_RESOURCE_SHORTAGE if PMAP_ENTER_NOSLEEP was specified and a page table + * page allocation failed. Returns KERN_RESOURCE_SHORTAGE if + * PMAP_ENTER_NORECLAIM was specified and a PV entry allocation failed. + * + * The parameter "m" is only used when creating a managed, writeable mapping. + */ +static int +pmap_enter_l3e(pmap_t pmap, vm_offset_t va, pml3_entry_t newpde, u_int flags, + vm_page_t m, struct rwlock **lockp) +{ + struct spglist free; + pml3_entry_t oldl3e, *l3e; + vm_page_t mt, pdpg; + struct epoch_tracker et; + + KASSERT((newpde & (PG_M | PG_RW)) != PG_RW, + ("pmap_enter_pde: newpde is missing PG_M")); + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + + if ((pdpg = pmap_allocl3e(pmap, va, (flags & PMAP_ENTER_NOSLEEP) != 0 ? + NULL : lockp)) == NULL) { + CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" + " in pmap %p", va, pmap); + return (KERN_RESOURCE_SHORTAGE); + } + l3e = (pml3_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg)); + l3e = &l3e[pmap_pml3e_index(va)]; + oldl3e = *l3e; + if ((oldl3e & PG_V) != 0) { + KASSERT(pdpg->ref_count > 1, + ("pmap_enter_pde: pdpg's wire count is too low")); + if ((flags & PMAP_ENTER_NOREPLACE) != 0) { + pdpg->ref_count--; + CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" + " in pmap %p", va, pmap); + return (KERN_FAILURE); + } + /* Break the existing mapping(s). */ + SLIST_INIT(&free); + if ((oldl3e & RPTE_LEAF) != 0) { + /* + * The reference to the PD page that was acquired by + * pmap_allocl3e() ensures that it won't be freed. + * However, if the PDE resulted from a promotion, then + * a reserved PT page could be freed. + */ + (void)pmap_remove_l3e(pmap, l3e, va, &free, lockp); + } else { + pmap_delayed_invl_started(&et); + if (pmap_remove_ptes(pmap, va, va + L3_PAGE_SIZE, l3e, + &free, lockp)) + pmap_invalidate_all(pmap); + pmap_delayed_invl_finished(&et); + } + vm_page_free_pages_toq(&free, true); + if (va >= VM_MAXUSER_ADDRESS) { + mt = PHYS_TO_VM_PAGE(*l3e & PG_FRAME); + if (pmap_insert_pt_page(pmap, mt)) { + /* + * XXX Currently, this can't happen because + * we do not perform pmap_enter(psind == 1) + * on the kernel pmap. + */ + panic("pmap_enter_pde: trie insert failed"); + } + } else + KASSERT(*l3e == 0, ("pmap_enter_pde: non-zero pde %p", + l3e)); + } + if ((newpde & PG_MANAGED) != 0) { + /* + * Abort this mapping if its PV entry could not be created. + */ + if (!pmap_pv_insert_l3e(pmap, va, newpde, flags, lockp)) { + SLIST_INIT(&free); + if (pmap_unwire_ptp(pmap, va, pdpg, &free)) { + /* + * Although "va" is not mapped, paging- + * structure caches could nonetheless have + * entries that refer to the freed page table + * pages. Invalidate those entries. + */ + pmap_invalidate_page(pmap, va); + vm_page_free_pages_toq(&free, true); + } + CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" + " in pmap %p", va, pmap); + return (KERN_RESOURCE_SHORTAGE); + } + if ((newpde & PG_RW) != 0) { + for (mt = m; mt < &m[L3_PAGE_SIZE / PAGE_SIZE]; mt++) + vm_page_aflag_set(mt, PGA_WRITEABLE); + } + } + + /* + * Increment counters. + */ + if ((newpde & PG_W) != 0) + pmap->pm_stats.wired_count += L3_PAGE_SIZE / PAGE_SIZE; + pmap_resident_count_inc(pmap, L3_PAGE_SIZE / PAGE_SIZE); + + /* + * Map the superpage. (This is not a promoted mapping; there will not + * be any lingering 4KB page mappings in the TLB.) + */ + pte_store(l3e, newpde); + + atomic_add_long(&pmap_l3e_mappings, 1); + CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx" + " in pmap %p", va, pmap); + return (KERN_SUCCESS); +} + +void +mmu_radix_enter_object(mmu_t mmu, pmap_t pmap, vm_offset_t start, + vm_offset_t end, vm_page_t m_start, vm_prot_t prot) +{ + + struct rwlock *lock; + vm_offset_t va; + vm_page_t m, mpte; + vm_pindex_t diff, psize; + bool invalidate; + VM_OBJECT_ASSERT_LOCKED(m_start->object); + + CTR6(KTR_PMAP, "%s(%p, %#x, %#x, %p, %#x)", __func__, pmap, start, + end, m_start, prot); + + invalidate = false; + psize = atop(end - start); + mpte = NULL; + m = m_start; + lock = NULL; + PMAP_LOCK(pmap); + while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { + va = start + ptoa(diff); + if ((va & L3_PAGE_MASK) == 0 && va + L3_PAGE_SIZE <= end && + m->psind == 1 && mmu_radix_ps_enabled(mmu, pmap) && + pmap_enter_2mpage(pmap, va, m, prot, &lock)) + m = &m[L3_PAGE_SIZE / PAGE_SIZE - 1]; + else + mpte = mmu_radix_enter_quick_locked(pmap, va, m, prot, + mpte, &lock, &invalidate); + m = TAILQ_NEXT(m, listq); + } + ptesync(); + if (lock != NULL) + rw_wunlock(lock); + if (invalidate) + pmap_invalidate_all(pmap); + PMAP_UNLOCK(pmap); +} + +static vm_page_t +mmu_radix_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, + vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp, bool *invalidate) +{ + struct spglist free; + pt_entry_t *pte; + vm_paddr_t pa; + + KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva || + (m->oflags & VPO_UNMANAGED) != 0, + ("mmu_radix_enter_quick_locked: managed mapping within the clean submap")); + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + + /* + * In the case that a page table page is not + * resident, we are creating it here. + */ + if (va < VM_MAXUSER_ADDRESS) { + vm_pindex_t ptepindex; + pml3_entry_t *ptepa; + + /* + * Calculate pagetable page index + */ + ptepindex = pmap_l3e_pindex(va); + if (mpte && (mpte->pindex == ptepindex)) { + mpte->ref_count++; + } else { + /* + * Get the page directory entry + */ + ptepa = pmap_pml3e(pmap, va); + + /* + * If the page table page is mapped, we just increment + * the hold count, and activate it. Otherwise, we + * attempt to allocate a page table page. If this + * attempt fails, we don't retry. Instead, we give up. + */ + if (ptepa && (*ptepa & PG_V) != 0) { + if (*ptepa & RPTE_LEAF) + return (NULL); + mpte = PHYS_TO_VM_PAGE(*ptepa & PG_FRAME); + mpte->ref_count++; + } else { + /* + * Pass NULL instead of the PV list lock + * pointer, because we don't intend to sleep. + */ + mpte = _pmap_allocpte(pmap, ptepindex, NULL); + if (mpte == NULL) + return (mpte); + } + } + pte = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte)); + pte = &pte[pmap_pte_index(va)]; + } else { + mpte = NULL; + pte = pmap_pte(pmap, va); + } + if (*pte) { + if (mpte != NULL) { + mpte->ref_count--; + mpte = NULL; + } + return (mpte); + } + + /* + * Enter on the PV list if part of our managed memory. + */ + if ((m->oflags & VPO_UNMANAGED) == 0 && + !pmap_try_insert_pv_entry(pmap, va, m, lockp)) { + if (mpte != NULL) { + SLIST_INIT(&free); + if (pmap_unwire_ptp(pmap, va, mpte, &free)) { + /* + * Although "va" is not mapped, paging- + * structure caches could nonetheless have + * entries that refer to the freed page table + * pages. Invalidate those entries. + */ + *invalidate = true; + vm_page_free_pages_toq(&free, true); + } + mpte = NULL; + } + return (mpte); + } + + /* + * Increment counters + */ + pmap_resident_count_inc(pmap, 1); + + pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.mdpg_cache_attrs); + if (prot & VM_PROT_EXECUTE) + pa |= PG_X; + else + pa |= RPTE_EAA_R; + if ((m->oflags & VPO_UNMANAGED) == 0) + pa |= PG_MANAGED; + + pte_store(pte, pa); + return (mpte); +} + +void +mmu_radix_enter_quick(mmu_t mmu, pmap_t pmap, vm_offset_t va, vm_page_t m, + vm_prot_t prot) +{ + struct rwlock *lock; + bool invalidate; + + lock = NULL; + invalidate = false; + PMAP_LOCK(pmap); + mmu_radix_enter_quick_locked(pmap, va, m, prot, NULL, &lock, + &invalidate); + ptesync(); + if (lock != NULL) + rw_wunlock(lock); + if (invalidate) + pmap_invalidate_all(pmap); + PMAP_UNLOCK(pmap); +} + +vm_paddr_t +mmu_radix_extract(mmu_t mmu, pmap_t pmap, vm_offset_t va) +{ + pml3_entry_t *l3e; + pt_entry_t *pte; + vm_paddr_t pa; + + l3e = pmap_pml3e(pmap, va); + if (__predict_false(l3e == NULL)) + return (0); + if (*l3e & RPTE_LEAF) { + pa = (*l3e & PG_PS_FRAME) | (va & L3_PAGE_MASK); + pa |= (va & L3_PAGE_MASK); + } else { + /* + * Beware of a concurrent promotion that changes the + * PDE at this point! For example, vtopte() must not + * be used to access the PTE because it would use the + * new PDE. It is, however, safe to use the old PDE + * because the page table page is preserved by the + * promotion. + */ + pte = pmap_l3e_to_pte(l3e, va); + if (__predict_false(pte == NULL)) + return (0); + pa = *pte; + pa = (pa & PG_FRAME) | (va & PAGE_MASK); + pa |= (va & PAGE_MASK); + } + return (pa); +} + +vm_page_t +mmu_radix_extract_and_hold(mmu_t mmu, pmap_t pmap, vm_offset_t va, vm_prot_t prot) +{ + pml3_entry_t l3e, *l3ep; + pt_entry_t pte; + vm_paddr_t pa; + vm_page_t m; + + pa = 0; + m = NULL; + CTR4(KTR_PMAP, "%s(%p, %#x, %#x)", __func__, pmap, va, prot); + PMAP_LOCK(pmap); + l3ep = pmap_pml3e(pmap, va); + if (l3ep != NULL && (l3e = *l3ep)) { + if (l3e & RPTE_LEAF) { + if ((l3e & PG_RW) || (prot & VM_PROT_WRITE) == 0) + m = PHYS_TO_VM_PAGE((l3e & PG_PS_FRAME) | + (va & L3_PAGE_MASK)); + } else { + pte = *pmap_l3e_to_pte(l3ep, va); + if ((pte & PG_V) && + ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) + m = PHYS_TO_VM_PAGE(pte & PG_FRAME); + } + if (m != NULL && !vm_page_wire_mapped(m)) + m = NULL; + } + PMAP_UNLOCK(pmap); + return (m); +} + +static void +mmu_radix_growkernel(mmu_t mmu, vm_offset_t addr) +{ + vm_paddr_t paddr; + vm_page_t nkpg; + pml3_entry_t *l3e; + pml2_entry_t *l2e; + + CTR2(KTR_PMAP, "%s(%#x)", __func__, addr); + if (VM_MIN_KERNEL_ADDRESS < addr && + addr < (VM_MIN_KERNEL_ADDRESS + nkpt * L3_PAGE_SIZE)) + return; + + addr = roundup2(addr, L3_PAGE_SIZE); + if (addr - 1 >= vm_map_max(kernel_map)) + addr = vm_map_max(kernel_map); + while (kernel_vm_end < addr) { + l2e = pmap_pml2e(kernel_pmap, kernel_vm_end); + if ((*l2e & PG_V) == 0) { + /* We need a new PDP entry */ + nkpg = vm_page_alloc(NULL, kernel_vm_end >> L2_PAGE_SIZE_SHIFT, + VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | + VM_ALLOC_WIRED | VM_ALLOC_ZERO); + if (nkpg == NULL) + panic("pmap_growkernel: no memory to grow kernel"); + if ((nkpg->flags & PG_ZERO) == 0) + mmu_radix_zero_page(mmu, nkpg); + paddr = VM_PAGE_TO_PHYS(nkpg); + pde_store(l2e, paddr); + continue; /* try again */ + } + l3e = pmap_l2e_to_l3e(l2e, kernel_vm_end); + if ((*l3e & PG_V) != 0) { + kernel_vm_end = (kernel_vm_end + L3_PAGE_SIZE) & ~L3_PAGE_MASK; + if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { + kernel_vm_end = vm_map_max(kernel_map); + break; + } + continue; + } + + nkpg = vm_page_alloc(NULL, pmap_l3e_pindex(kernel_vm_end), + VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | + VM_ALLOC_ZERO); + if (nkpg == NULL) + panic("pmap_growkernel: no memory to grow kernel"); + if ((nkpg->flags & PG_ZERO) == 0) + mmu_radix_zero_page(mmu, nkpg); + paddr = VM_PAGE_TO_PHYS(nkpg); + pde_store(l3e, paddr); + + kernel_vm_end = (kernel_vm_end + L3_PAGE_SIZE) & ~L3_PAGE_MASK; + if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { + kernel_vm_end = vm_map_max(kernel_map); + break; + } + } + ptesync(); +} + +static MALLOC_DEFINE(M_RADIX_PGD, "radix_pgd", "radix page table root directory"); +static uma_zone_t zone_radix_pgd; + +static int +radix_pgd_import(void *arg __unused, void **store, int count, int domain __unused, + int flags) +{ + + for (int i = 0; i < count; i++) { + vm_page_t m = vm_page_alloc_contig(NULL, 0, + VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | + VM_ALLOC_ZERO | VM_ALLOC_WAITOK, RADIX_PGD_SIZE/PAGE_SIZE, + 0, (vm_paddr_t)-1, RADIX_PGD_SIZE, L1_PAGE_SIZE, + VM_MEMATTR_DEFAULT); + /* XXX zero on alloc here so we don't have to later */ + store[i] = (void *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); + } + return (count); +} + +static void +radix_pgd_release(void *arg __unused, void **store, int count) +{ + vm_page_t m; + struct spglist free; + int page_count; + + SLIST_INIT(&free); + page_count = RADIX_PGD_SIZE/PAGE_SIZE; + + for (int i = 0; i < count; i++) { + /* + * XXX selectively remove dmap and KVA entries so we don't + * need to bzero + */ + m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)store[i])); + for (int j = page_count-1; j >= 0; j--) { + vm_page_unwire_noq(&m[j]); + SLIST_INSERT_HEAD(&free, &m[j], plinks.s.ss); + } + vm_page_free_pages_toq(&free, false); + } +} + +static void +mmu_radix_init(mmu_t mmu) +{ + vm_page_t mpte; + vm_size_t s; + int error, i, pv_npg; + + /* L1TF, reserve page @0 unconditionally */ + vm_page_blacklist_add(0, bootverbose); + + zone_radix_pgd = uma_zcache_create("radix_pgd_cache", + RADIX_PGD_SIZE, NULL, NULL, +#ifdef INVARIANTS + trash_init, trash_fini, +#else + NULL, NULL, +#endif + radix_pgd_import, radix_pgd_release, + NULL, UMA_ZONE_NOBUCKET); + + /* + * Initialize the vm page array entries for the kernel pmap's + * page table pages. + */ + PMAP_LOCK(kernel_pmap); + for (i = 0; i < nkpt; i++) { + mpte = PHYS_TO_VM_PAGE(KPTphys + (i << PAGE_SHIFT)); + KASSERT(mpte >= vm_page_array && + mpte < &vm_page_array[vm_page_array_size], + ("pmap_init: page table page is out of range size: %lu", + vm_page_array_size)); + mpte->pindex = pmap_l3e_pindex(VM_MIN_KERNEL_ADDRESS) + i; + mpte->phys_addr = KPTphys + (i << PAGE_SHIFT); + MPASS(PHYS_TO_VM_PAGE(mpte->phys_addr) == mpte); + //pmap_insert_pt_page(kernel_pmap, mpte); + mpte->ref_count = 1; + } + PMAP_UNLOCK(kernel_pmap); + vm_wire_add(nkpt); + + CTR1(KTR_PMAP, "%s()", __func__); + TAILQ_INIT(&pv_dummy.pv_list); + + /* + * Are large page mappings enabled? + */ + TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled); + if (pg_ps_enabled) { + KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0, + ("pmap_init: can't assign to pagesizes[1]")); + pagesizes[1] = L3_PAGE_SIZE; + } + + /* + * Initialize the pv chunk list mutex. + */ + mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF); + + /* + * Initialize the pool of pv list locks. + */ + for (i = 0; i < NPV_LIST_LOCKS; i++) + rw_init(&pv_list_locks[i], "pmap pv list"); + + /* + * Calculate the size of the pv head table for superpages. + */ + pv_npg = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, L3_PAGE_SIZE); + + /* + * Allocate memory for the pv head table for superpages. + */ + s = (vm_size_t)(pv_npg * sizeof(struct md_page)); + s = round_page(s); + pv_table = (struct md_page *)kmem_malloc(s, M_WAITOK | M_ZERO); + for (i = 0; i < pv_npg; i++) + TAILQ_INIT(&pv_table[i].pv_list); + TAILQ_INIT(&pv_dummy.pv_list); + + pmap_initialized = 1; + mtx_init(&qframe_mtx, "qfrmlk", NULL, MTX_SPIN); + error = vmem_alloc(kernel_arena, PAGE_SIZE, M_BESTFIT | M_WAITOK, + (vmem_addr_t *)&qframe); + + if (error != 0) + panic("qframe allocation failed"); + asid_arena = vmem_create("ASID", isa3_base_pid + 1, (1<md.pv_list, pv_link) { + pmap = PV_PMAP(pv); + if (!PMAP_TRYLOCK(pmap)) { + md_gen = m->md.pv_gen; + rw_runlock(lock); + PMAP_LOCK(pmap); + rw_rlock(lock); + if (md_gen != m->md.pv_gen) { + PMAP_UNLOCK(pmap); + goto restart; + } + } + pte = pmap_pte(pmap, pv->pv_va); + mask = 0; + if (modified) + mask |= PG_RW | PG_M; + if (accessed) + mask |= PG_V | PG_A; + rv = (*pte & mask) == mask; + PMAP_UNLOCK(pmap); + if (rv) + goto out; + } + if ((m->flags & PG_FICTITIOUS) == 0) { + pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); + TAILQ_FOREACH(pv, &pvh->pv_list, pv_link) { + pmap = PV_PMAP(pv); + if (!PMAP_TRYLOCK(pmap)) { + md_gen = m->md.pv_gen; + pvh_gen = pvh->pv_gen; + rw_runlock(lock); + PMAP_LOCK(pmap); + rw_rlock(lock); + if (md_gen != m->md.pv_gen || + pvh_gen != pvh->pv_gen) { + PMAP_UNLOCK(pmap); + goto restart; + } + } + pte = pmap_pml3e(pmap, pv->pv_va); + mask = 0; + if (modified) + mask |= PG_RW | PG_M; + if (accessed) + mask |= PG_V | PG_A; + rv = (*pte & mask) == mask; + PMAP_UNLOCK(pmap); + if (rv) + goto out; + } + } +out: + rw_runlock(lock); + return (rv); +} + +/* + * pmap_is_modified: + * + * Return whether or not the specified physical page was modified + * in any physical maps. + */ +boolean_t +mmu_radix_is_modified(mmu_t mmu, vm_page_t m) +{ + + KASSERT((m->oflags & VPO_UNMANAGED) == 0, + ("pmap_is_modified: page %p is not managed", m)); + + CTR2(KTR_PMAP, "%s(%p)", __func__, m); + /* + * If the page is not busied then this check is racy. + */ + if (!pmap_page_is_write_mapped(m)) + return (FALSE); + return (pmap_page_test_mappings(m, FALSE, TRUE)); +} + +boolean_t +mmu_radix_is_prefaultable(mmu_t mmu, pmap_t pmap, vm_offset_t addr) +{ + pml3_entry_t *l3e; + pt_entry_t *pte; + boolean_t rv; + + CTR3(KTR_PMAP, "%s(%p, %#x)", __func__, pmap, addr); + rv = FALSE; + PMAP_LOCK(pmap); + l3e = pmap_pml3e(pmap, addr); + if (l3e != NULL && (*l3e & (RPTE_LEAF | PG_V)) == PG_V) { + pte = pmap_l3e_to_pte(l3e, addr); + rv = (*pte & PG_V) == 0; + } + PMAP_UNLOCK(pmap); + return (rv); +} + +boolean_t +mmu_radix_is_referenced(mmu_t mmu, vm_page_t m) +{ + KASSERT((m->oflags & VPO_UNMANAGED) == 0, + ("pmap_is_referenced: page %p is not managed", m)); + CTR2(KTR_PMAP, "%s(%p)", __func__, m); + return (pmap_page_test_mappings(m, TRUE, FALSE)); +} + +/* + * pmap_ts_referenced: + * + * Return a count of reference bits for a page, clearing those bits. + * It is not necessary for every reference bit to be cleared, but it + * is necessary that 0 only be returned when there are truly no + * reference bits set. + * + * As an optimization, update the page's dirty field if a modified bit is + * found while counting reference bits. This opportunistic update can be + * performed at low cost and can eliminate the need for some future calls + * to pmap_is_modified(). However, since this function stops after + * finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some + * dirty pages. Those dirty pages will only be detected by a future call + * to pmap_is_modified(). + * + * A DI block is not needed within this function, because + * invalidations are performed before the PV list lock is + * released. + */ +boolean_t +mmu_radix_ts_referenced(mmu_t mmu, vm_page_t m) +{ + struct md_page *pvh; + pv_entry_t pv, pvf; + pmap_t pmap; + struct rwlock *lock; + pml3_entry_t oldl3e, *l3e; + pt_entry_t *pte; + vm_paddr_t pa; + int cleared, md_gen, not_cleared, pvh_gen; + struct spglist free; + + CTR2(KTR_PMAP, "%s(%p)", __func__, m); + KASSERT((m->oflags & VPO_UNMANAGED) == 0, + ("pmap_ts_referenced: page %p is not managed", m)); + SLIST_INIT(&free); + cleared = 0; + pa = VM_PAGE_TO_PHYS(m); + lock = PHYS_TO_PV_LIST_LOCK(pa); + pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(pa); + rw_wlock(lock); +retry: + not_cleared = 0; + if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL) + goto small_mappings; + pv = pvf; + do { + if (pvf == NULL) + pvf = pv; + pmap = PV_PMAP(pv); + if (!PMAP_TRYLOCK(pmap)) { + pvh_gen = pvh->pv_gen; + rw_wunlock(lock); + PMAP_LOCK(pmap); + rw_wlock(lock); + if (pvh_gen != pvh->pv_gen) { + PMAP_UNLOCK(pmap); + goto retry; + } + } + l3e = pmap_pml3e(pmap, pv->pv_va); + oldl3e = *l3e; + if ((oldl3e & (PG_M | PG_RW)) == (PG_M | PG_RW)) { + /* + * Although "oldpde" is mapping a 2MB page, because + * this function is called at a 4KB page granularity, + * we only update the 4KB page under test. + */ + vm_page_dirty(m); + } + if ((oldl3e & PG_A) != 0) { + /* + * Since this reference bit is shared by 512 4KB + * pages, it should not be cleared every time it is + * tested. Apply a simple "hash" function on the + * physical page number, the virtual superpage number, + * and the pmap address to select one 4KB page out of + * the 512 on which testing the reference bit will + * result in clearing that reference bit. This + * function is designed to avoid the selection of the + * same 4KB page for every 2MB page mapping. + * + * On demotion, a mapping that hasn't been referenced + * is simply destroyed. To avoid the possibility of a + * subsequent page fault on a demoted wired mapping, + * always leave its reference bit set. Moreover, + * since the superpage is wired, the current state of + * its reference bit won't affect page replacement. + */ + if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> L3_PAGE_SIZE_SHIFT) ^ + (uintptr_t)pmap) & (NPTEPG - 1)) == 0 && + (oldl3e & PG_W) == 0) { + atomic_clear_long(l3e, PG_A); + pmap_invalidate_page(pmap, pv->pv_va); + cleared++; + KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), + ("inconsistent pv lock %p %p for page %p", + lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); + } else + not_cleared++; + } + PMAP_UNLOCK(pmap); + /* Rotate the PV list if it has more than one entry. */ + if (pv != NULL && TAILQ_NEXT(pv, pv_link) != NULL) { + TAILQ_REMOVE(&pvh->pv_list, pv, pv_link); + TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_link); + pvh->pv_gen++; + } + if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX) + goto out; + } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf); +small_mappings: + if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL) + goto out; + pv = pvf; + do { + if (pvf == NULL) + pvf = pv; + pmap = PV_PMAP(pv); + if (!PMAP_TRYLOCK(pmap)) { + pvh_gen = pvh->pv_gen; + md_gen = m->md.pv_gen; + rw_wunlock(lock); + PMAP_LOCK(pmap); + rw_wlock(lock); + if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { + PMAP_UNLOCK(pmap); + goto retry; + } + } + l3e = pmap_pml3e(pmap, pv->pv_va); + KASSERT((*l3e & RPTE_LEAF) == 0, + ("pmap_ts_referenced: found a 2mpage in page %p's pv list", + m)); + pte = pmap_l3e_to_pte(l3e, pv->pv_va); + if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) + vm_page_dirty(m); + if ((*pte & PG_A) != 0) { + atomic_clear_long(pte, PG_A); + pmap_invalidate_page(pmap, pv->pv_va); + cleared++; + } + PMAP_UNLOCK(pmap); + /* Rotate the PV list if it has more than one entry. */ + if (pv != NULL && TAILQ_NEXT(pv, pv_link) != NULL) { + TAILQ_REMOVE(&m->md.pv_list, pv, pv_link); + TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_link); + m->md.pv_gen++; + } + } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared + + not_cleared < PMAP_TS_REFERENCED_MAX); +out: + rw_wunlock(lock); + vm_page_free_pages_toq(&free, true); + return (cleared + not_cleared); +} + +static vm_offset_t +mmu_radix_map(mmu_t mmu, vm_offset_t *virt __unused, vm_paddr_t start, + vm_paddr_t end, int prot __unused) +{ + + CTR5(KTR_PMAP, "%s(%p, %#x, %#x, %#x)", __func__, virt, start, end, + prot); + return (PHYS_TO_DMAP(start)); +} + +void +mmu_radix_object_init_pt(mmu_t mmu, pmap_t pmap, vm_offset_t addr, + vm_object_t object, vm_pindex_t pindex, vm_size_t size) +{ + pml3_entry_t *l3e; + vm_paddr_t pa, ptepa; + vm_page_t p, pdpg; + vm_memattr_t ma; + + CTR6(KTR_PMAP, "%s(%p, %#x, %p, %u, %#x)", __func__, pmap, addr, + object, pindex, size); + VM_OBJECT_ASSERT_WLOCKED(object); + KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, + ("pmap_object_init_pt: non-device object")); + /* NB: size can be logically ored with addr here */ + if ((addr & L3_PAGE_MASK) == 0 && (size & L3_PAGE_MASK) == 0) { + if (!mmu_radix_ps_enabled(mmu, pmap)) + return; + if (!vm_object_populate(object, pindex, pindex + atop(size))) + return; + p = vm_page_lookup(object, pindex); + KASSERT(p->valid == VM_PAGE_BITS_ALL, + ("pmap_object_init_pt: invalid page %p", p)); + ma = p->md.mdpg_cache_attrs; + + /* + * Abort the mapping if the first page is not physically + * aligned to a 2MB page boundary. + */ + ptepa = VM_PAGE_TO_PHYS(p); + if (ptepa & L3_PAGE_MASK) + return; + + /* + * Skip the first page. Abort the mapping if the rest of + * the pages are not physically contiguous or have differing + * memory attributes. + */ + p = TAILQ_NEXT(p, listq); + for (pa = ptepa + PAGE_SIZE; pa < ptepa + size; + pa += PAGE_SIZE) { + KASSERT(p->valid == VM_PAGE_BITS_ALL, + ("pmap_object_init_pt: invalid page %p", p)); + if (pa != VM_PAGE_TO_PHYS(p) || + ma != p->md.mdpg_cache_attrs) + return; + p = TAILQ_NEXT(p, listq); + } + + PMAP_LOCK(pmap); + for (pa = ptepa | pmap_cache_bits(ma); + pa < ptepa + size; pa += L3_PAGE_SIZE) { + pdpg = pmap_allocl3e(pmap, addr, NULL); + if (pdpg == NULL) { + /* + * The creation of mappings below is only an + * optimization. If a page directory page + * cannot be allocated without blocking, + * continue on to the next mapping rather than + * blocking. + */ + addr += L3_PAGE_SIZE; + continue; + } + l3e = (pml3_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg)); + l3e = &l3e[pmap_pml3e_index(addr)]; + if ((*l3e & PG_V) == 0) { + pa |= PG_M | PG_A | PG_RW; + pte_store(l3e, pa); + pmap_resident_count_inc(pmap, L3_PAGE_SIZE / PAGE_SIZE); + atomic_add_long(&pmap_l3e_mappings, 1); + } else { + /* Continue on if the PDE is already valid. */ + pdpg->ref_count--; + KASSERT(pdpg->ref_count > 0, + ("pmap_object_init_pt: missing reference " + "to page directory page, va: 0x%lx", addr)); + } + addr += L3_PAGE_SIZE; + } + ptesync(); + PMAP_UNLOCK(pmap); + } +} + +boolean_t +mmu_radix_page_exists_quick(mmu_t mmu, pmap_t pmap, vm_page_t m) +{ + struct md_page *pvh; + struct rwlock *lock; + pv_entry_t pv; + int loops = 0; + boolean_t rv; + + KASSERT((m->oflags & VPO_UNMANAGED) == 0, + ("pmap_page_exists_quick: page %p is not managed", m)); + CTR3(KTR_PMAP, "%s(%p, %p)", __func__, pmap, m); + rv = FALSE; + lock = VM_PAGE_TO_PV_LIST_LOCK(m); + rw_rlock(lock); + TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) { + if (PV_PMAP(pv) == pmap) { + rv = TRUE; + break; + } + loops++; + if (loops >= 16) + break; + } + if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) { + pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); + TAILQ_FOREACH(pv, &pvh->pv_list, pv_link) { + if (PV_PMAP(pv) == pmap) { + rv = TRUE; + break; + } + loops++; + if (loops >= 16) + break; + } + } + rw_runlock(lock); + return (rv); +} + +void +mmu_radix_page_init(mmu_t mmu, vm_page_t m) +{ + + CTR2(KTR_PMAP, "%s(%p)", __func__, m); + TAILQ_INIT(&m->md.pv_list); + m->md.mdpg_cache_attrs = VM_MEMATTR_DEFAULT; +} + +int +mmu_radix_page_wired_mappings(mmu_t mmu, vm_page_t m) +{ + struct rwlock *lock; + struct md_page *pvh; + pmap_t pmap; + pt_entry_t *pte; + pv_entry_t pv; + int count, md_gen, pvh_gen; + + if ((m->oflags & VPO_UNMANAGED) != 0) + return (0); + CTR2(KTR_PMAP, "%s(%p)", __func__, m); + lock = VM_PAGE_TO_PV_LIST_LOCK(m); + rw_rlock(lock); +restart: + count = 0; + TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) { + pmap = PV_PMAP(pv); + if (!PMAP_TRYLOCK(pmap)) { + md_gen = m->md.pv_gen; + rw_runlock(lock); + PMAP_LOCK(pmap); + rw_rlock(lock); + if (md_gen != m->md.pv_gen) { + PMAP_UNLOCK(pmap); + goto restart; + } + } + pte = pmap_pte(pmap, pv->pv_va); + if ((*pte & PG_W) != 0) + count++; + PMAP_UNLOCK(pmap); + } + if ((m->flags & PG_FICTITIOUS) == 0) { + pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); + TAILQ_FOREACH(pv, &pvh->pv_list, pv_link) { + pmap = PV_PMAP(pv); + if (!PMAP_TRYLOCK(pmap)) { + md_gen = m->md.pv_gen; + pvh_gen = pvh->pv_gen; + rw_runlock(lock); + PMAP_LOCK(pmap); + rw_rlock(lock); + if (md_gen != m->md.pv_gen || + pvh_gen != pvh->pv_gen) { + PMAP_UNLOCK(pmap); + goto restart; + } + } + pte = pmap_pml3e(pmap, pv->pv_va); + if ((*pte & PG_W) != 0) + count++; + PMAP_UNLOCK(pmap); + } + } + rw_runlock(lock); + return (count); +} + +static void +mmu_radix_update_proctab(int pid, pml1_entry_t l1pa) +{ + isa3_proctab[pid].proctab0 = htobe64(RTS_SIZE | l1pa | RADIX_PGD_INDEX_SHIFT); +} + +void +mmu_radix_pinit(mmu_t mmu, pmap_t pmap) +{ + vmem_addr_t pid; + vm_paddr_t l1pa; + + CTR2(KTR_PMAP, "%s(%p)", __func__, pmap); + + /* + * allocate the page directory page + */ + pmap->pm_pml1 = uma_zalloc(zone_radix_pgd, M_WAITOK); + + for (int j = 0; j < RADIX_PGD_SIZE_SHIFT; j++) + pagezero((vm_offset_t)pmap->pm_pml1 + j * PAGE_SIZE); + pmap->pm_radix.rt_root = 0; + TAILQ_INIT(&pmap->pm_pvchunk); + bzero(&pmap->pm_stats, sizeof pmap->pm_stats); + pmap->pm_flags = PMAP_PDE_SUPERPAGE; + vmem_alloc(asid_arena, 1, M_FIRSTFIT|M_WAITOK, &pid); + + pmap->pm_pid = pid; + l1pa = DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml1); + mmu_radix_update_proctab(pid, l1pa); + __asm __volatile("ptesync;isync" : : : "memory"); +} + +/* + * This routine is called if the desired page table page does not exist. + * + * If page table page allocation fails, this routine may sleep before + * returning NULL. It sleeps only if a lock pointer was given. + * + * Note: If a page allocation fails at page table level two or three, + * one or two pages may be held during the wait, only to be released + * afterwards. This conservative approach is easily argued to avoid + * race conditions. + */ +static vm_page_t +_pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp) +{ + vm_page_t m, pdppg, pdpg; + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + + /* + * Allocate a page table page. + */ + if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ | + VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) { + if (lockp != NULL) { + RELEASE_PV_LIST_LOCK(lockp); + PMAP_UNLOCK(pmap); + PMAP_ASSERT_NOT_IN_DI(); + vm_wait(NULL); + PMAP_LOCK(pmap); + } + /* + * Indicate the need to retry. While waiting, the page table + * page may have been allocated. + */ + return (NULL); + } + if ((m->flags & PG_ZERO) == 0) + mmu_radix_zero_page(NULL, m); + + /* + * Map the pagetable page into the process address space, if + * it isn't already there. + */ + + if (ptepindex >= (NUPDE + NUPDPE)) { + pml1_entry_t *l1e; + vm_pindex_t pml1index; + + /* Wire up a new PDPE page */ + pml1index = ptepindex - (NUPDE + NUPDPE); + l1e = &pmap->pm_pml1[pml1index]; + pde_store(l1e, VM_PAGE_TO_PHYS(m)); + + } else if (ptepindex >= NUPDE) { + vm_pindex_t pml1index; + vm_pindex_t pdpindex; + pml1_entry_t *l1e; + pml2_entry_t *l2e; + + /* Wire up a new l2e page */ + pdpindex = ptepindex - NUPDE; + pml1index = pdpindex >> RPTE_SHIFT; + + l1e = &pmap->pm_pml1[pml1index]; + if ((*l1e & PG_V) == 0) { + /* Have to allocate a new pdp, recurse */ + if (_pmap_allocpte(pmap, NUPDE + NUPDPE + pml1index, + lockp) == NULL) { + vm_page_unwire_noq(m); + vm_page_free_zero(m); + return (NULL); + } + } else { + /* Add reference to l2e page */ + pdppg = PHYS_TO_VM_PAGE(*l1e & PG_FRAME); + pdppg->ref_count++; + } + l2e = (pml2_entry_t *)PHYS_TO_DMAP(*l1e & PG_FRAME); + + /* Now find the pdp page */ + l2e = &l2e[pdpindex & RPTE_MASK]; + pde_store(l2e, VM_PAGE_TO_PHYS(m)); + + } else { + vm_pindex_t pml1index; + vm_pindex_t pdpindex; + pml1_entry_t *l1e; + pml2_entry_t *l2e; + pml3_entry_t *l3e; + + /* Wire up a new PTE page */ + pdpindex = ptepindex >> RPTE_SHIFT; + pml1index = pdpindex >> RPTE_SHIFT; + + /* First, find the pdp and check that its valid. */ + l1e = &pmap->pm_pml1[pml1index]; + if ((*l1e & PG_V) == 0) { + /* Have to allocate a new pd, recurse */ + if (_pmap_allocpte(pmap, NUPDE + pdpindex, + lockp) == NULL) { + vm_page_unwire_noq(m); + vm_page_free_zero(m); + return (NULL); + } + l2e = (pml2_entry_t *)PHYS_TO_DMAP(*l1e & PG_FRAME); + l2e = &l2e[pdpindex & RPTE_MASK]; + } else { + l2e = (pml2_entry_t *)PHYS_TO_DMAP(*l1e & PG_FRAME); + l2e = &l2e[pdpindex & RPTE_MASK]; + if ((*l2e & PG_V) == 0) { + /* Have to allocate a new pd, recurse */ + if (_pmap_allocpte(pmap, NUPDE + pdpindex, + lockp) == NULL) { + vm_page_unwire_noq(m); + vm_page_free_zero(m); + return (NULL); + } + } else { + /* Add reference to the pd page */ + pdpg = PHYS_TO_VM_PAGE(*l2e & PG_FRAME); + pdpg->ref_count++; + } + } + l3e = (pml3_entry_t *)PHYS_TO_DMAP(*l2e & PG_FRAME); + + /* Now we know where the page directory page is */ + l3e = &l3e[ptepindex & RPTE_MASK]; + pde_store(l3e, VM_PAGE_TO_PHYS(m)); + } + + pmap_resident_count_inc(pmap, 1); + return (m); +} +static vm_page_t +pmap_allocl3e(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) +{ + vm_pindex_t pdpindex, ptepindex; + pml2_entry_t *pdpe; + vm_page_t pdpg; + +retry: + pdpe = pmap_pml2e(pmap, va); + if (pdpe != NULL && (*pdpe & PG_V) != 0) { + /* Add a reference to the pd page. */ + pdpg = PHYS_TO_VM_PAGE(*pdpe & PG_FRAME); + pdpg->ref_count++; + } else { + /* Allocate a pd page. */ + ptepindex = pmap_l3e_pindex(va); + pdpindex = ptepindex >> RPTE_SHIFT; + pdpg = _pmap_allocpte(pmap, NUPDE + pdpindex, lockp); + if (pdpg == NULL && lockp != NULL) + goto retry; + } + return (pdpg); +} + +static vm_page_t +pmap_allocpte(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) +{ + vm_pindex_t ptepindex; + pml3_entry_t *pd; + vm_page_t m; + + /* + * Calculate pagetable page index + */ + ptepindex = pmap_l3e_pindex(va); +retry: + /* + * Get the page directory entry + */ + pd = pmap_pml3e(pmap, va); + + /* + * This supports switching from a 2MB page to a + * normal 4K page. + */ + if (pd != NULL && (*pd & (RPTE_LEAF | PG_V)) == (RPTE_LEAF | PG_V)) { + if (!pmap_demote_l3e_locked(pmap, pd, va, lockp)) { + /* + * Invalidation of the 2MB page mapping may have caused + * the deallocation of the underlying PD page. + */ + pd = NULL; + } + } + + /* + * If the page table page is mapped, we just increment the + * hold count, and activate it. + */ + if (pd != NULL && (*pd & PG_V) != 0) { + m = PHYS_TO_VM_PAGE(*pd & PG_FRAME); + m->ref_count++; + } else { + /* + * Here if the pte page isn't mapped, or if it has been + * deallocated. + */ + m = _pmap_allocpte(pmap, ptepindex, lockp); + if (m == NULL && lockp != NULL) + goto retry; + } + return (m); +} + +static void +mmu_radix_pinit0(mmu_t mmu, pmap_t pmap) +{ + + CTR2(KTR_PMAP, "%s(%p)", __func__, pmap); + PMAP_LOCK_INIT(pmap); + pmap->pm_pml1 = kernel_pmap->pm_pml1; + pmap->pm_pid = kernel_pmap->pm_pid; + + pmap->pm_radix.rt_root = 0; + TAILQ_INIT(&pmap->pm_pvchunk); + bzero(&pmap->pm_stats, sizeof pmap->pm_stats); + kernel_pmap->pm_flags = + pmap->pm_flags = PMAP_PDE_SUPERPAGE; +} +/* + * pmap_protect_l3e: do the things to protect a 2mpage in a process + */ +static boolean_t +pmap_protect_l3e(pmap_t pmap, pt_entry_t *l3e, vm_offset_t sva, vm_prot_t prot) +{ + pt_entry_t newpde, oldpde; + vm_offset_t eva, va; + vm_page_t m; + boolean_t anychanged; + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + KASSERT((sva & L3_PAGE_MASK) == 0, + ("pmap_protect_l3e: sva is not 2mpage aligned")); + anychanged = FALSE; +retry: + oldpde = newpde = *l3e; + if ((oldpde & (PG_MANAGED | PG_M | PG_RW)) == + (PG_MANAGED | PG_M | PG_RW)) { + eva = sva + L3_PAGE_SIZE; + for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); + va < eva; va += PAGE_SIZE, m++) + vm_page_dirty(m); + } + if ((prot & VM_PROT_WRITE) == 0) { + newpde &= ~(PG_RW | PG_M); + newpde |= RPTE_EAA_R; + } + if (prot & VM_PROT_EXECUTE) + newpde |= PG_X; + if (newpde != oldpde) { + /* + * As an optimization to future operations on this PDE, clear + * PG_PROMOTED. The impending invalidation will remove any + * lingering 4KB page mappings from the TLB. + */ + if (!atomic_cmpset_long(l3e, oldpde, newpde & ~PG_PROMOTED)) + goto retry; + anychanged = TRUE; + } + return (anychanged); +} + +void +mmu_radix_protect(mmu_t mmu, pmap_t pmap, vm_offset_t sva, vm_offset_t eva, + vm_prot_t prot) +{ + vm_offset_t va_next; + pml1_entry_t *l1e; + pml2_entry_t *l2e; + pml3_entry_t ptpaddr, *l3e; + pt_entry_t *pte; + boolean_t anychanged; + + CTR5(KTR_PMAP, "%s(%p, %#x, %#x, %#x)", __func__, pmap, sva, eva, + prot); + + KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot)); + if (prot == VM_PROT_NONE) { + mmu_radix_remove(mmu, pmap, sva, eva); + return; + } + + if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) == + (VM_PROT_WRITE|VM_PROT_EXECUTE)) + return; + +#ifdef INVARIANTS + if (VERBOSE_PROTECT || pmap_logging) + printf("pmap_protect(%p, %#lx, %#lx, %x) - asid: %lu\n", + pmap, sva, eva, prot, pmap->pm_pid); +#endif + anychanged = FALSE; + + /* + * Although this function delays and batches the invalidation + * of stale TLB entries, it does not need to call + * pmap_delayed_invl_started() and + * pmap_delayed_invl_finished(), because it does not + * ordinarily destroy mappings. Stale TLB entries from + * protection-only changes need only be invalidated before the + * pmap lock is released, because protection-only changes do + * not destroy PV entries. Even operations that iterate over + * a physical page's PV list of mappings, like + * pmap_remove_write(), acquire the pmap lock for each + * mapping. Consequently, for protection-only changes, the + * pmap lock suffices to synchronize both page table and TLB + * updates. + * + * This function only destroys a mapping if pmap_demote_l3e() + * fails. In that case, stale TLB entries are immediately + * invalidated. + */ + + PMAP_LOCK(pmap); + for (; sva < eva; sva = va_next) { + l1e = pmap_pml1e(pmap, sva); + if ((*l1e & PG_V) == 0) { + va_next = (sva + L1_PAGE_SIZE) & ~L1_PAGE_MASK; + if (va_next < sva) + va_next = eva; + continue; + } + + l2e = pmap_l1e_to_l2e(l1e, sva); + if ((*l2e & PG_V) == 0) { + va_next = (sva + L2_PAGE_SIZE) & ~L2_PAGE_MASK; + if (va_next < sva) + va_next = eva; + continue; + } + + va_next = (sva + L3_PAGE_SIZE) & ~L3_PAGE_MASK; + if (va_next < sva) + va_next = eva; + + l3e = pmap_l2e_to_l3e(l2e, sva); + ptpaddr = *l3e; + + /* + * Weed out invalid mappings. + */ + if (ptpaddr == 0) + continue; + + /* + * Check for large page. + */ + if ((ptpaddr & RPTE_LEAF) != 0) { + /* + * Are we protecting the entire large page? If not, + * demote the mapping and fall through. + */ + if (sva + L3_PAGE_SIZE == va_next && eva >= va_next) { + if (pmap_protect_l3e(pmap, l3e, sva, prot)) + anychanged = TRUE; + continue; + } else if (!pmap_demote_l3e(pmap, l3e, sva)) { + /* + * The large page mapping was destroyed. + */ + continue; + } + } + + if (va_next > eva) + va_next = eva; + + for (pte = pmap_l3e_to_pte(l3e, sva); sva != va_next; pte++, + sva += PAGE_SIZE) { + pt_entry_t obits, pbits; + vm_page_t m; + +retry: + MPASS(pte == pmap_pte(pmap, sva)); + obits = pbits = *pte; + if ((pbits & PG_V) == 0) + continue; + + if ((prot & VM_PROT_WRITE) == 0) { + if ((pbits & (PG_MANAGED | PG_M | PG_RW)) == + (PG_MANAGED | PG_M | PG_RW)) { + m = PHYS_TO_VM_PAGE(pbits & PG_FRAME); + vm_page_dirty(m); + } + pbits &= ~(PG_RW | PG_M); + pbits |= RPTE_EAA_R; + } + if (prot & VM_PROT_EXECUTE) + pbits |= PG_X; + + if (pbits != obits) { + if (!atomic_cmpset_long(pte, obits, pbits)) + goto retry; + if (obits & (PG_A|PG_M)) { + anychanged = TRUE; +#ifdef INVARIANTS + if (VERBOSE_PROTECT || pmap_logging) + printf("%#lx %#lx -> %#lx\n", + sva, obits, pbits); +#endif + } + } + } + } + if (anychanged) + pmap_invalidate_all(pmap); + PMAP_UNLOCK(pmap); +} + +void +mmu_radix_qenter(mmu_t mmu, vm_offset_t sva, vm_page_t *ma, int count) +{ + + CTR4(KTR_PMAP, "%s(%#x, %p, %d)", __func__, sva, ma, count); + pt_entry_t oldpte, pa, *pte; + vm_page_t m; + uint64_t cache_bits, attr_bits; + vm_offset_t va; + + oldpte = 0; + attr_bits = RPTE_EAA_R | RPTE_EAA_W | RPTE_EAA_P | PG_M | PG_A; + va = sva; + pte = kvtopte(va); + while (va < sva + PAGE_SIZE * count) { + if (__predict_false((va & L3_PAGE_MASK) == 0)) + pte = kvtopte(va); + MPASS(pte == pmap_pte(kernel_pmap, va)); + + /* + * XXX there has to be a more efficient way than traversing + * the page table every time - but go for correctness for + * today + */ + + m = *ma++; + cache_bits = pmap_cache_bits(m->md.mdpg_cache_attrs); + pa = VM_PAGE_TO_PHYS(m) | cache_bits | attr_bits; + if (*pte != pa) { + oldpte |= *pte; + pte_store(pte, pa); + } + va += PAGE_SIZE; + pte++; + } + if (__predict_false((oldpte & RPTE_VALID) != 0)) + pmap_invalidate_range(kernel_pmap, sva, sva + count * + PAGE_SIZE); + else + ptesync(); +} + +void +mmu_radix_qremove(mmu_t mmu, vm_offset_t sva, int count) +{ + vm_offset_t va; + pt_entry_t *pte; + + CTR3(KTR_PMAP, "%s(%#x, %d)", __func__, sva, count); + KASSERT(sva >= VM_MIN_KERNEL_ADDRESS, ("usermode or dmap va %lx", sva)); + + va = sva; + pte = kvtopte(va); + while (va < sva + PAGE_SIZE * count) { + if (__predict_false((va & L3_PAGE_MASK) == 0)) + pte = kvtopte(va); + pte_clear(pte); + pte++; + va += PAGE_SIZE; + } + pmap_invalidate_range(kernel_pmap, sva, va); +} + +/*************************************************** + * Page table page management routines..... + ***************************************************/ +/* + * Schedule the specified unused page table page to be freed. Specifically, + * add the page to the specified list of pages that will be released to the + * physical memory manager after the TLB has been updated. + */ +static __inline void +pmap_add_delayed_free_list(vm_page_t m, struct spglist *free, + boolean_t set_PG_ZERO) +{ + + if (set_PG_ZERO) + m->flags |= PG_ZERO; + else + m->flags &= ~PG_ZERO; + SLIST_INSERT_HEAD(free, m, plinks.s.ss); +} + +/* + * Inserts the specified page table page into the specified pmap's collection + * of idle page table pages. Each of a pmap's page table pages is responsible + * for mapping a distinct range of virtual addresses. The pmap's collection is + * ordered by this virtual address range. + */ +static __inline int +pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte) +{ + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + return (vm_radix_insert(&pmap->pm_radix, mpte)); +} + +/* + * Removes the page table page mapping the specified virtual address from the + * specified pmap's collection of idle page table pages, and returns it. + * Otherwise, returns NULL if there is no page table page corresponding to the + * specified virtual address. + */ +static __inline vm_page_t +pmap_remove_pt_page(pmap_t pmap, vm_offset_t va) +{ + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + return (vm_radix_remove(&pmap->pm_radix, pmap_l3e_pindex(va))); +} + +/* + * Decrements a page table page's wire count, which is used to record the + * number of valid page table entries within the page. If the wire count + * drops to zero, then the page table page is unmapped. Returns TRUE if the + * page table page was unmapped and FALSE otherwise. + */ +static inline boolean_t +pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) +{ + + --m->ref_count; + if (m->ref_count == 0) { + _pmap_unwire_ptp(pmap, va, m, free); + return (TRUE); + } else + return (FALSE); +} + +static void +_pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) +{ + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + /* + * unmap the page table page + */ + if (m->pindex >= (NUPDE + NUPDPE)) { + /* PDP page */ + pml1_entry_t *pml1; + pml1 = pmap_pml1e(pmap, va); + *pml1 = 0; + } else if (m->pindex >= NUPDE) { + /* PD page */ + pml2_entry_t *l2e; + l2e = pmap_pml2e(pmap, va); + *l2e = 0; + } else { + /* PTE page */ + pml3_entry_t *l3e; + l3e = pmap_pml3e(pmap, va); + *l3e = 0; + } + pmap_resident_count_dec(pmap, 1); + if (m->pindex < NUPDE) { + /* We just released a PT, unhold the matching PD */ + vm_page_t pdpg; + + pdpg = PHYS_TO_VM_PAGE(*pmap_pml2e(pmap, va) & PG_FRAME); + pmap_unwire_ptp(pmap, va, pdpg, free); + } + if (m->pindex >= NUPDE && m->pindex < (NUPDE + NUPDPE)) { + /* We just released a PD, unhold the matching PDP */ + vm_page_t pdppg; + + pdppg = PHYS_TO_VM_PAGE(*pmap_pml1e(pmap, va) & PG_FRAME); + pmap_unwire_ptp(pmap, va, pdppg, free); + } + + /* + * Put page on a list so that it is released after + * *ALL* TLB shootdown is done + */ + pmap_add_delayed_free_list(m, free, TRUE); +} + +/* + * After removing a page table entry, this routine is used to + * conditionally free the page, and manage the hold/wire counts. + */ +static int +pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pml3_entry_t ptepde, + struct spglist *free) +{ + vm_page_t mpte; + + if (va >= VM_MAXUSER_ADDRESS) + return (0); + KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0")); + mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME); + return (pmap_unwire_ptp(pmap, va, mpte, free)); +} + +void +mmu_radix_release(mmu_t mmu, pmap_t pmap) +{ + + CTR2(KTR_PMAP, "%s(%p)", __func__, pmap); + KASSERT(pmap->pm_stats.resident_count == 0, + ("pmap_release: pmap resident count %ld != 0", + pmap->pm_stats.resident_count)); + KASSERT(vm_radix_is_empty(&pmap->pm_radix), + ("pmap_release: pmap has reserved page table page(s)")); + + pmap_invalidate_all(pmap); + isa3_proctab[pmap->pm_pid].proctab0 = 0; + uma_zfree(zone_radix_pgd, pmap->pm_pml1); + vmem_free(asid_arena, pmap->pm_pid, 1); +} + +/* + * Create the PV entry for a 2MB page mapping. Always returns true unless the + * flag PMAP_ENTER_NORECLAIM is specified. If that flag is specified, returns + * false if the PV entry cannot be allocated without resorting to reclamation. + */ +static bool +pmap_pv_insert_l3e(pmap_t pmap, vm_offset_t va, pml3_entry_t pde, u_int flags, + struct rwlock **lockp) +{ + struct md_page *pvh; + pv_entry_t pv; + vm_paddr_t pa; + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + /* Pass NULL instead of the lock pointer to disable reclamation. */ + if ((pv = get_pv_entry(pmap, (flags & PMAP_ENTER_NORECLAIM) != 0 ? + NULL : lockp)) == NULL) + return (false); + pv->pv_va = va; + pa = pde & PG_PS_FRAME; + CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); + pvh = pa_to_pvh(pa); + TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_link); + pvh->pv_gen++; + return (true); +} + +/* + * Fills a page table page with mappings to consecutive physical pages. + */ +static void +pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte) +{ + pt_entry_t *pte; + + for (pte = firstpte; pte < firstpte + NPTEPG; pte++) { + *pte = newpte; + newpte += PAGE_SIZE; + } +} + +static boolean_t +pmap_demote_l3e(pmap_t pmap, pml3_entry_t *pde, vm_offset_t va) +{ + struct rwlock *lock; + boolean_t rv; + + lock = NULL; + rv = pmap_demote_l3e_locked(pmap, pde, va, &lock); + if (lock != NULL) + rw_wunlock(lock); + return (rv); +} + +static boolean_t +pmap_demote_l3e_locked(pmap_t pmap, pml3_entry_t *l3e, vm_offset_t va, + struct rwlock **lockp) +{ + pml3_entry_t oldpde; + pt_entry_t *firstpte; + vm_paddr_t mptepa; + vm_page_t mpte; + struct spglist free; + vm_offset_t sva; + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + oldpde = *l3e; + KASSERT((oldpde & (RPTE_LEAF | PG_V)) == (RPTE_LEAF | PG_V), + ("pmap_demote_l3e: oldpde is missing RPTE_LEAF and/or PG_V %lx", + oldpde)); + if ((oldpde & PG_A) == 0 || (mpte = pmap_remove_pt_page(pmap, va)) == + NULL) { + KASSERT((oldpde & PG_W) == 0, + ("pmap_demote_l3e: page table page for a wired mapping" + " is missing")); + + /* + * Invalidate the 2MB page mapping and return "failure" if the + * mapping was never accessed or the allocation of the new + * page table page fails. If the 2MB page mapping belongs to + * the direct map region of the kernel's address space, then + * the page allocation request specifies the highest possible + * priority (VM_ALLOC_INTERRUPT). Otherwise, the priority is + * normal. Page table pages are preallocated for every other + * part of the kernel address space, so the direct map region + * is the only part of the kernel address space that must be + * handled here. + */ + if ((oldpde & PG_A) == 0 || (mpte = vm_page_alloc(NULL, + pmap_l3e_pindex(va), (va >= DMAP_MIN_ADDRESS && va < + DMAP_MAX_ADDRESS ? VM_ALLOC_INTERRUPT : VM_ALLOC_NORMAL) | + VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) { + SLIST_INIT(&free); + sva = trunc_2mpage(va); + pmap_remove_l3e(pmap, l3e, sva, &free, lockp); + pmap_invalidate_l3e_page(pmap, sva, oldpde); + vm_page_free_pages_toq(&free, true); + CTR2(KTR_PMAP, "pmap_demote_l3e: failure for va %#lx" + " in pmap %p", va, pmap); + return (FALSE); + } + if (va < VM_MAXUSER_ADDRESS) + pmap_resident_count_inc(pmap, 1); + } + mptepa = VM_PAGE_TO_PHYS(mpte); + firstpte = (pt_entry_t *)PHYS_TO_DMAP(mptepa); + KASSERT((oldpde & PG_A) != 0, + ("pmap_demote_l3e: oldpde is missing PG_A")); + KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW, + ("pmap_demote_l3e: oldpde is missing PG_M")); + + /* + * If the page table page is new, initialize it. + */ + if (mpte->ref_count == 1) { + mpte->ref_count = NPTEPG; + pmap_fill_ptp(firstpte, oldpde); + } + + KASSERT((*firstpte & PG_FRAME) == (oldpde & PG_FRAME), + ("pmap_demote_l3e: firstpte and newpte map different physical" + " addresses")); + + /* + * If the mapping has changed attributes, update the page table + * entries. + */ + if ((*firstpte & PG_PTE_PROMOTE) != (oldpde & PG_PTE_PROMOTE)) + pmap_fill_ptp(firstpte, oldpde); + + /* + * The spare PV entries must be reserved prior to demoting the + * mapping, that is, prior to changing the PDE. Otherwise, the state + * of the PDE and the PV lists will be inconsistent, which can result + * in reclaim_pv_chunk() attempting to remove a PV entry from the + * wrong PV list and pmap_pv_demote_l3e() failing to find the expected + * PV entry for the 2MB page mapping that is being demoted. + */ + if ((oldpde & PG_MANAGED) != 0) + reserve_pv_entries(pmap, NPTEPG - 1, lockp); + + /* + * Demote the mapping. This pmap is locked. The old PDE has + * PG_A set. If the old PDE has PG_RW set, it also has PG_M + * set. Thus, there is no danger of a race with another + * processor changing the setting of PG_A and/or PG_M between + * the read above and the store below. + */ + pde_store(l3e, mptepa); + ptesync(); + /* + * Demote the PV entry. + */ + if ((oldpde & PG_MANAGED) != 0) + pmap_pv_demote_l3e(pmap, va, oldpde & PG_PS_FRAME, lockp); + + + atomic_add_long(&pmap_l3e_demotions, 1); + CTR2(KTR_PMAP, "pmap_demote_l3e: success for va %#lx" + " in pmap %p", va, pmap); + return (TRUE); +} + +/* + * pmap_remove_kernel_pde: Remove a kernel superpage mapping. + */ +static void +pmap_remove_kernel_l3e(pmap_t pmap, pml3_entry_t *l3e, vm_offset_t va) +{ + vm_paddr_t mptepa; + vm_page_t mpte; + + KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap)); + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + mpte = pmap_remove_pt_page(pmap, va); + if (mpte == NULL) + panic("pmap_remove_kernel_pde: Missing pt page."); + + mptepa = VM_PAGE_TO_PHYS(mpte); + + /* + * Initialize the page table page. + */ + pagezero(PHYS_TO_DMAP(mptepa)); + + /* + * Demote the mapping. + */ + pde_store(l3e, mptepa); + ptesync(); +} + +/* + * pmap_remove_l3e: do the things to unmap a superpage in a process + */ +static int +pmap_remove_l3e(pmap_t pmap, pml3_entry_t *pdq, vm_offset_t sva, + struct spglist *free, struct rwlock **lockp) +{ + struct md_page *pvh; + pml3_entry_t oldpde; + vm_offset_t eva, va; + vm_page_t m, mpte; + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + KASSERT((sva & L3_PAGE_MASK) == 0, + ("pmap_remove_l3e: sva is not 2mpage aligned")); + oldpde = pte_load_clear(pdq); + if (oldpde & PG_W) + pmap->pm_stats.wired_count -= (L3_PAGE_SIZE / PAGE_SIZE); + pmap_resident_count_dec(pmap, L3_PAGE_SIZE / PAGE_SIZE); + if (oldpde & PG_MANAGED) { + CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, oldpde & PG_PS_FRAME); + pvh = pa_to_pvh(oldpde & PG_PS_FRAME); + pmap_pvh_free(pvh, pmap, sva); + eva = sva + L3_PAGE_SIZE; + for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); + va < eva; va += PAGE_SIZE, m++) { + if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) + vm_page_dirty(m); + if (oldpde & PG_A) + vm_page_aflag_set(m, PGA_REFERENCED); + if (TAILQ_EMPTY(&m->md.pv_list) && + TAILQ_EMPTY(&pvh->pv_list)) + vm_page_aflag_clear(m, PGA_WRITEABLE); + } + } + if (pmap == kernel_pmap) { + pmap_remove_kernel_l3e(pmap, pdq, sva); + } else { + mpte = pmap_remove_pt_page(pmap, sva); + if (mpte != NULL) { + pmap_resident_count_dec(pmap, 1); + KASSERT(mpte->ref_count == NPTEPG, + ("pmap_remove_l3e: pte page wire count error")); + mpte->ref_count = 0; + pmap_add_delayed_free_list(mpte, free, FALSE); + } + } + return (pmap_unuse_pt(pmap, sva, *pmap_pml2e(pmap, sva), free)); +} + + +/* + * pmap_remove_pte: do the things to unmap a page in a process + */ +static int +pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va, + pml3_entry_t ptepde, struct spglist *free, struct rwlock **lockp) +{ + struct md_page *pvh; + pt_entry_t oldpte; + vm_page_t m; + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + oldpte = pte_load_clear(ptq); + if (oldpte & RPTE_WIRED) + pmap->pm_stats.wired_count -= 1; + pmap_resident_count_dec(pmap, 1); + if (oldpte & RPTE_MANAGED) { + m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME); + if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) + vm_page_dirty(m); + if (oldpte & PG_A) + vm_page_aflag_set(m, PGA_REFERENCED); + CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); + pmap_pvh_free(&m->md, pmap, va); + if (TAILQ_EMPTY(&m->md.pv_list) && + (m->flags & PG_FICTITIOUS) == 0) { + pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); + if (TAILQ_EMPTY(&pvh->pv_list)) + vm_page_aflag_clear(m, PGA_WRITEABLE); + } + } + return (pmap_unuse_pt(pmap, va, ptepde, free)); +} + +/* + * Remove a single page from a process address space + */ +static bool +pmap_remove_page(pmap_t pmap, vm_offset_t va, pml3_entry_t *l3e, + struct spglist *free) +{ + struct rwlock *lock; + pt_entry_t *pte; + bool invalidate_all; + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + if ((*l3e & RPTE_VALID) == 0) { + return (false); + } + pte = pmap_l3e_to_pte(l3e, va); + if ((*pte & RPTE_VALID) == 0) { + return (false); + } + lock = NULL; + + invalidate_all = pmap_remove_pte(pmap, pte, va, *l3e, free, &lock); + if (lock != NULL) + rw_wunlock(lock); + if (!invalidate_all) + pmap_invalidate_page(pmap, va); + return (invalidate_all); +} + +/* + * Removes the specified range of addresses from the page table page. + */ +static bool +pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, + pml3_entry_t *l3e, struct spglist *free, struct rwlock **lockp) +{ + pt_entry_t *pte; + vm_offset_t va; + bool anyvalid; + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + anyvalid = false; + va = eva; + for (pte = pmap_l3e_to_pte(l3e, sva); sva != eva; pte++, + sva += PAGE_SIZE) { + MPASS(pte == pmap_pte(pmap, sva)); + if (*pte == 0) { + if (va != eva) { + anyvalid = true; + va = eva; + } + continue; + } + if (va == eva) + va = sva; + if (pmap_remove_pte(pmap, pte, sva, *l3e, free, lockp)) { + anyvalid = true; + sva += PAGE_SIZE; + break; + } + } + if (anyvalid) + pmap_invalidate_all(pmap); + else if (va != eva) + pmap_invalidate_range(pmap, va, sva); + return (anyvalid); +} + + +void +mmu_radix_remove(mmu_t mmu, pmap_t pmap, vm_offset_t sva, vm_offset_t eva) +{ + struct rwlock *lock; + vm_offset_t va_next; + pml1_entry_t *l1e; + pml2_entry_t *l2e; + pml3_entry_t ptpaddr, *l3e; + struct spglist free; + struct epoch_tracker et; + bool anyvalid; + + CTR4(KTR_PMAP, "%s(%p, %#x, %#x)", __func__, pmap, sva, eva); + + /* + * Perform an unsynchronized read. This is, however, safe. + */ + if (pmap->pm_stats.resident_count == 0) + return; + + anyvalid = false; + SLIST_INIT(&free); + + /* XXX something fishy here */ + sva = (sva + PAGE_MASK) & ~PAGE_MASK; + eva = (eva + PAGE_MASK) & ~PAGE_MASK; + + pmap_delayed_invl_started(&et); + PMAP_LOCK(pmap); + + /* + * special handling of removing one page. a very + * common operation and easy to short circuit some + * code. + */ + if (sva + PAGE_SIZE == eva) { + l3e = pmap_pml3e(pmap, sva); + if (l3e && (*l3e & RPTE_LEAF) == 0) { + anyvalid = pmap_remove_page(pmap, sva, l3e, &free); + goto out; + } + } + + lock = NULL; + for (; sva < eva; sva = va_next) { + + if (pmap->pm_stats.resident_count == 0) + break; + l1e = pmap_pml1e(pmap, sva); + if (l1e == NULL || (*l1e & PG_V) == 0) { + va_next = (sva + L1_PAGE_SIZE) & ~L1_PAGE_MASK; + if (va_next < sva) + va_next = eva; + continue; + } + + l2e = pmap_l1e_to_l2e(l1e, sva); + if (l2e == NULL || (*l2e & PG_V) == 0) { + va_next = (sva + L2_PAGE_SIZE) & ~L2_PAGE_MASK; + if (va_next < sva) + va_next = eva; + continue; + } + + /* + * Calculate index for next page table. + */ + va_next = (sva + L3_PAGE_SIZE) & ~L3_PAGE_MASK; + if (va_next < sva) + va_next = eva; + + l3e = pmap_l2e_to_l3e(l2e, sva); + ptpaddr = *l3e; + + /* + * Weed out invalid mappings. + */ + if (ptpaddr == 0) + continue; + + /* + * Check for large page. + */ + if ((ptpaddr & RPTE_LEAF) != 0) { + /* + * Are we removing the entire large page? If not, + * demote the mapping and fall through. + */ + if (sva + L3_PAGE_SIZE == va_next && eva >= va_next) { + pmap_remove_l3e(pmap, l3e, sva, &free, &lock); + continue; + } else if (!pmap_demote_l3e_locked(pmap, l3e, sva, + &lock)) { + /* The large page mapping was destroyed. */ + continue; + } else + ptpaddr = *l3e; + } + + /* + * Limit our scan to either the end of the va represented + * by the current page table page, or to the end of the + * range being removed. + */ + if (va_next > eva) + va_next = eva; + + if (pmap_remove_ptes(pmap, sva, va_next, l3e, &free, &lock)) + anyvalid = true; + } + if (lock != NULL) + rw_wunlock(lock); +out: + if (anyvalid) + pmap_invalidate_all(pmap); + PMAP_UNLOCK(pmap); + pmap_delayed_invl_finished(&et); + vm_page_free_pages_toq(&free, true); +} + +void +mmu_radix_remove_all(mmu_t mmu, vm_page_t m) +{ + struct md_page *pvh; + pv_entry_t pv; + pmap_t pmap; + struct rwlock *lock; + pt_entry_t *pte, tpte; + pml3_entry_t *l3e; + vm_offset_t va; + struct spglist free; + int pvh_gen, md_gen; + + CTR2(KTR_PMAP, "%s(%p)", __func__, m); + KASSERT((m->oflags & VPO_UNMANAGED) == 0, + ("pmap_remove_all: page %p is not managed", m)); + SLIST_INIT(&free); + lock = VM_PAGE_TO_PV_LIST_LOCK(m); + pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : + pa_to_pvh(VM_PAGE_TO_PHYS(m)); +retry: + rw_wlock(lock); + while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) { + pmap = PV_PMAP(pv); + if (!PMAP_TRYLOCK(pmap)) { + pvh_gen = pvh->pv_gen; + rw_wunlock(lock); + PMAP_LOCK(pmap); + rw_wlock(lock); + if (pvh_gen != pvh->pv_gen) { + rw_wunlock(lock); + PMAP_UNLOCK(pmap); + goto retry; + } + } + va = pv->pv_va; + l3e = pmap_pml3e(pmap, va); + (void)pmap_demote_l3e_locked(pmap, l3e, va, &lock); + PMAP_UNLOCK(pmap); + } + while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { + pmap = PV_PMAP(pv); + if (!PMAP_TRYLOCK(pmap)) { + pvh_gen = pvh->pv_gen; + md_gen = m->md.pv_gen; + rw_wunlock(lock); + PMAP_LOCK(pmap); + rw_wlock(lock); + if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { + rw_wunlock(lock); + PMAP_UNLOCK(pmap); + goto retry; + } + } + pmap_resident_count_dec(pmap, 1); + l3e = pmap_pml3e(pmap, pv->pv_va); + KASSERT((*l3e & RPTE_LEAF) == 0, ("pmap_remove_all: found" + " a 2mpage in page %p's pv list", m)); + pte = pmap_l3e_to_pte(l3e, pv->pv_va); + tpte = pte_load_clear(pte); + if (tpte & PG_W) + pmap->pm_stats.wired_count--; + if (tpte & PG_A) + vm_page_aflag_set(m, PGA_REFERENCED); + + /* + * Update the vm_page_t clean and reference bits. + */ + if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) + vm_page_dirty(m); + pmap_unuse_pt(pmap, pv->pv_va, *l3e, &free); + pmap_invalidate_page(pmap, pv->pv_va); + TAILQ_REMOVE(&m->md.pv_list, pv, pv_link); + m->md.pv_gen++; + free_pv_entry(pmap, pv); + PMAP_UNLOCK(pmap); + } + vm_page_aflag_clear(m, PGA_WRITEABLE); + rw_wunlock(lock); + pmap_delayed_invl_wait(m); + vm_page_free_pages_toq(&free, true); +} + +/* + * Destroy all managed, non-wired mappings in the given user-space + * pmap. This pmap cannot be active on any processor besides the + * caller. + * + * This function cannot be applied to the kernel pmap. Moreover, it + * is not intended for general use. It is only to be used during + * process termination. Consequently, it can be implemented in ways + * that make it faster than pmap_remove(). First, it can more quickly + * destroy mappings by iterating over the pmap's collection of PV + * entries, rather than searching the page table. Second, it doesn't + * have to test and clear the page table entries atomically, because + * no processor is currently accessing the user address space. In + * particular, a page table entry's dirty bit won't change state once + * this function starts. + * + * Although this function destroys all of the pmap's managed, + * non-wired mappings, it can delay and batch the invalidation of TLB + * entries without calling pmap_delayed_invl_started() and + * pmap_delayed_invl_finished(). Because the pmap is not active on + * any other processor, none of these TLB entries will ever be used + * before their eventual invalidation. Consequently, there is no need + * for either pmap_remove_all() or pmap_remove_write() to wait for + * that eventual TLB invalidation. + */ + +void +mmu_radix_remove_pages(mmu_t mmu, pmap_t pmap) +{ + + CTR2(KTR_PMAP, "%s(%p)", __func__, pmap); + pml3_entry_t ptel3e; + pt_entry_t *pte, tpte; + struct spglist free; + vm_page_t m, mpte, mt; + pv_entry_t pv; + struct md_page *pvh; + struct pv_chunk *pc, *npc; + struct rwlock *lock; + int64_t bit; + uint64_t inuse, bitmask; + int allfree, field, freed, idx; + boolean_t superpage; + vm_paddr_t pa; + + /* + * Assert that the given pmap is only active on the current + * CPU. Unfortunately, we cannot block another CPU from + * activating the pmap while this function is executing. + */ + KASSERT(pmap->pm_pid == mfspr(SPR_PID), + ("non-current asid %lu - expected %lu", pmap->pm_pid, + mfspr(SPR_PID))); + + lock = NULL; + + SLIST_INIT(&free); + PMAP_LOCK(pmap); + TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { + allfree = 1; + freed = 0; + for (field = 0; field < _NPCM; field++) { + inuse = ~pc->pc_map[field] & pc_freemask[field]; + while (inuse != 0) { + bit = cnttzd(inuse); + bitmask = 1UL << bit; + idx = field * 64 + bit; + pv = &pc->pc_pventry[idx]; + inuse &= ~bitmask; + + pte = pmap_pml2e(pmap, pv->pv_va); + ptel3e = *pte; + pte = pmap_l2e_to_l3e(pte, pv->pv_va); + tpte = *pte; + if ((tpte & (RPTE_LEAF | PG_V)) == PG_V) { + superpage = FALSE; + ptel3e = tpte; + pte = (pt_entry_t *)PHYS_TO_DMAP(tpte & + PG_FRAME); + pte = &pte[pmap_pte_index(pv->pv_va)]; + tpte = *pte; + } else { + /* + * Keep track whether 'tpte' is a + * superpage explicitly instead of + * relying on RPTE_LEAF being set. + * + * This is because RPTE_LEAF is numerically + * identical to PG_PTE_PAT and thus a + * regular page could be mistaken for + * a superpage. + */ + superpage = TRUE; + } + + if ((tpte & PG_V) == 0) { + panic("bad pte va %lx pte %lx", + pv->pv_va, tpte); + } + +/* + * We cannot remove wired pages from a process' mapping at this time + */ + if (tpte & PG_W) { + allfree = 0; + continue; + } + + if (superpage) + pa = tpte & PG_PS_FRAME; + else + pa = tpte & PG_FRAME; + + m = PHYS_TO_VM_PAGE(pa); + KASSERT(m->phys_addr == pa, + ("vm_page_t %p phys_addr mismatch %016jx %016jx", + m, (uintmax_t)m->phys_addr, + (uintmax_t)tpte)); + + KASSERT((m->flags & PG_FICTITIOUS) != 0 || + m < &vm_page_array[vm_page_array_size], + ("pmap_remove_pages: bad tpte %#jx", + (uintmax_t)tpte)); + + pte_clear(pte); + + /* + * Update the vm_page_t clean/reference bits. + */ + if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { + if (superpage) { + for (mt = m; mt < &m[L3_PAGE_SIZE / PAGE_SIZE]; mt++) + vm_page_dirty(mt); + } else + vm_page_dirty(m); + } + + CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m); + + /* Mark free */ + pc->pc_map[field] |= bitmask; + if (superpage) { + pmap_resident_count_dec(pmap, L3_PAGE_SIZE / PAGE_SIZE); + pvh = pa_to_pvh(tpte & PG_PS_FRAME); + TAILQ_REMOVE(&pvh->pv_list, pv, pv_link); + pvh->pv_gen++; + if (TAILQ_EMPTY(&pvh->pv_list)) { + for (mt = m; mt < &m[L3_PAGE_SIZE / PAGE_SIZE]; mt++) + if ((mt->a.flags & PGA_WRITEABLE) != 0 && + TAILQ_EMPTY(&mt->md.pv_list)) + vm_page_aflag_clear(mt, PGA_WRITEABLE); + } + mpte = pmap_remove_pt_page(pmap, pv->pv_va); + if (mpte != NULL) { + pmap_resident_count_dec(pmap, 1); + KASSERT(mpte->ref_count == NPTEPG, + ("pmap_remove_pages: pte page wire count error")); + mpte->ref_count = 0; + pmap_add_delayed_free_list(mpte, &free, FALSE); + } + } else { + pmap_resident_count_dec(pmap, 1); +#ifdef VERBOSE_PV + printf("freeing pv (%p, %p)\n", + pmap, pv); +#endif + TAILQ_REMOVE(&m->md.pv_list, pv, pv_link); + m->md.pv_gen++; + if ((m->a.flags & PGA_WRITEABLE) != 0 && + TAILQ_EMPTY(&m->md.pv_list) && + (m->flags & PG_FICTITIOUS) == 0) { + pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); + if (TAILQ_EMPTY(&pvh->pv_list)) + vm_page_aflag_clear(m, PGA_WRITEABLE); + } + } + pmap_unuse_pt(pmap, pv->pv_va, ptel3e, &free); + freed++; + } + } + PV_STAT(atomic_add_long(&pv_entry_frees, freed)); + PV_STAT(atomic_add_int(&pv_entry_spare, freed)); + PV_STAT(atomic_subtract_long(&pv_entry_count, freed)); + if (allfree) { + TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); + free_pv_chunk(pc); + } + } + if (lock != NULL) + rw_wunlock(lock); + pmap_invalidate_all(pmap); + PMAP_UNLOCK(pmap); + vm_page_free_pages_toq(&free, true); +} + +void +mmu_radix_remove_write(mmu_t mmu, vm_page_t m) +{ + struct md_page *pvh; + pmap_t pmap; + struct rwlock *lock; + pv_entry_t next_pv, pv; + pml3_entry_t *l3e; + pt_entry_t oldpte, *pte; + int pvh_gen, md_gen; + + CTR2(KTR_PMAP, "%s(%p)", __func__, m); + KASSERT((m->oflags & VPO_UNMANAGED) == 0, + ("pmap_remove_write: page %p is not managed", m)); + vm_page_assert_busied(m); + + if (!pmap_page_is_write_mapped(m)) + return; + lock = VM_PAGE_TO_PV_LIST_LOCK(m); + pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : + pa_to_pvh(VM_PAGE_TO_PHYS(m)); +retry_pv_loop: + rw_wlock(lock); + TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_link, next_pv) { + pmap = PV_PMAP(pv); + if (!PMAP_TRYLOCK(pmap)) { + pvh_gen = pvh->pv_gen; + rw_wunlock(lock); + PMAP_LOCK(pmap); + rw_wlock(lock); + if (pvh_gen != pvh->pv_gen) { + PMAP_UNLOCK(pmap); + rw_wunlock(lock); + goto retry_pv_loop; + } + } + l3e = pmap_pml3e(pmap, pv->pv_va); + if ((*l3e & PG_RW) != 0) + (void)pmap_demote_l3e_locked(pmap, l3e, pv->pv_va, &lock); + KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), + ("inconsistent pv lock %p %p for page %p", + lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); + PMAP_UNLOCK(pmap); + } + TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) { + pmap = PV_PMAP(pv); + if (!PMAP_TRYLOCK(pmap)) { + pvh_gen = pvh->pv_gen; + md_gen = m->md.pv_gen; + rw_wunlock(lock); + PMAP_LOCK(pmap); + rw_wlock(lock); + if (pvh_gen != pvh->pv_gen || + md_gen != m->md.pv_gen) { + PMAP_UNLOCK(pmap); + rw_wunlock(lock); + goto retry_pv_loop; + } + } + l3e = pmap_pml3e(pmap, pv->pv_va); + KASSERT((*l3e & RPTE_LEAF) == 0, + ("pmap_remove_write: found a 2mpage in page %p's pv list", + m)); + pte = pmap_l3e_to_pte(l3e, pv->pv_va); +retry: + oldpte = *pte; + if (oldpte & PG_RW) { + if (!atomic_cmpset_long(pte, oldpte, + (oldpte | RPTE_EAA_R) & ~(PG_RW | PG_M))) + goto retry; + if ((oldpte & PG_M) != 0) + vm_page_dirty(m); + pmap_invalidate_page(pmap, pv->pv_va); + } + PMAP_UNLOCK(pmap); + } + rw_wunlock(lock); + vm_page_aflag_clear(m, PGA_WRITEABLE); + pmap_delayed_invl_wait(m); +} + +/* + * Clear the wired attribute from the mappings for the specified range of + * addresses in the given pmap. Every valid mapping within that range + * must have the wired attribute set. In contrast, invalid mappings + * cannot have the wired attribute set, so they are ignored. + * + * The wired attribute of the page table entry is not a hardware + * feature, so there is no need to invalidate any TLB entries. + * Since pmap_demote_l3e() for the wired entry must never fail, + * pmap_delayed_invl_started()/finished() calls around the + * function are not needed. + */ +void +mmu_radix_unwire(mmu_t mmu, pmap_t pmap, vm_offset_t sva, vm_offset_t eva) +{ + vm_offset_t va_next; + pml1_entry_t *l1e; + pml2_entry_t *l2e; + pml3_entry_t *l3e; + pt_entry_t *pte; + + CTR4(KTR_PMAP, "%s(%p, %#x, %#x)", __func__, pmap, sva, eva); + PMAP_LOCK(pmap); + for (; sva < eva; sva = va_next) { + l1e = pmap_pml1e(pmap, sva); + if ((*l1e & PG_V) == 0) { + va_next = (sva + L1_PAGE_SIZE) & ~L1_PAGE_MASK; + if (va_next < sva) + va_next = eva; + continue; + } + l2e = pmap_l1e_to_l2e(l1e, sva); + if ((*l2e & PG_V) == 0) { + va_next = (sva + L2_PAGE_SIZE) & ~L2_PAGE_MASK; + if (va_next < sva) + va_next = eva; + continue; + } + va_next = (sva + L3_PAGE_SIZE) & ~L3_PAGE_MASK; + if (va_next < sva) + va_next = eva; + l3e = pmap_l2e_to_l3e(l2e, sva); + if ((*l3e & PG_V) == 0) + continue; + if ((*l3e & RPTE_LEAF) != 0) { + if ((*l3e & PG_W) == 0) + panic("pmap_unwire: pde %#jx is missing PG_W", + (uintmax_t)*l3e); + + /* + * Are we unwiring the entire large page? If not, + * demote the mapping and fall through. + */ + if (sva + L3_PAGE_SIZE == va_next && eva >= va_next) { + atomic_clear_long(l3e, PG_W); + pmap->pm_stats.wired_count -= L3_PAGE_SIZE / + PAGE_SIZE; + continue; + } else if (!pmap_demote_l3e(pmap, l3e, sva)) + panic("pmap_unwire: demotion failed"); + } + if (va_next > eva) + va_next = eva; + for (pte = pmap_l3e_to_pte(l3e, sva); sva != va_next; pte++, + sva += PAGE_SIZE) { + MPASS(pte == pmap_pte(pmap, sva)); + if ((*pte & PG_V) == 0) + continue; + if ((*pte & PG_W) == 0) + panic("pmap_unwire: pte %#jx is missing PG_W", + (uintmax_t)*pte); + + /* + * PG_W must be cleared atomically. Although the pmap + * lock synchronizes access to PG_W, another processor + * could be setting PG_M and/or PG_A concurrently. + */ + atomic_clear_long(pte, PG_W); + pmap->pm_stats.wired_count--; + } + } + PMAP_UNLOCK(pmap); +} + +void +mmu_radix_zero_page(mmu_t mmu, vm_page_t m) +{ + vm_offset_t addr; + + CTR2(KTR_PMAP, "%s(%p)", __func__, m); + addr = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); + pagezero(addr); +} + +void +mmu_radix_zero_page_area(mmu_t mmu, vm_page_t m, int off, int size) +{ + caddr_t addr; + + CTR4(KTR_PMAP, "%s(%p, %d, %d)", __func__, m, off, size); + MPASS(off + size <= PAGE_SIZE); + addr = (caddr_t)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); + memset(addr + off, 0, size); +} + + + + +static int +mmu_radix_mincore(mmu_t mmu, pmap_t pmap, vm_offset_t addr, + vm_paddr_t *locked_pa) +{ + pml3_entry_t *l3ep; + pt_entry_t pte; + vm_paddr_t pa; + int val; + + CTR3(KTR_PMAP, "%s(%p, %#x)", __func__, pmap, addr); + PMAP_LOCK(pmap); + + l3ep = pmap_pml3e(pmap, addr); + if (l3ep != NULL && (*l3ep & PG_V)) { + if (*l3ep & RPTE_LEAF) { + pte = *l3ep; + /* Compute the physical address of the 4KB page. */ + pa = ((*l3ep & PG_PS_FRAME) | (addr & L3_PAGE_MASK)) & + PG_FRAME; + val = MINCORE_SUPER; + } else { + pte = *pmap_l3e_to_pte(l3ep, addr); + pa = pte & PG_FRAME; + val = 0; + } + } else { + pte = 0; + pa = 0; + val = 0; + } + if ((pte & PG_V) != 0) { + val |= MINCORE_INCORE; + if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) + val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; + if ((pte & PG_A) != 0) + val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; + } + if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != + (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && + (pte & (PG_MANAGED | PG_V)) == (PG_MANAGED | PG_V)) { + *locked_pa = pa; + } + PMAP_UNLOCK(pmap); + return (val); +} + +void +mmu_radix_activate(mmu_t mmu, struct thread *td) +{ + pmap_t pmap; + uint32_t curpid; + + CTR2(KTR_PMAP, "%s(%p)", __func__, td); + critical_enter(); + pmap = vmspace_pmap(td->td_proc->p_vmspace); + curpid = mfspr(SPR_PID); + if (pmap->pm_pid > isa3_base_pid && + curpid != pmap->pm_pid) { + mmu_radix_pid_set(pmap); + } + critical_exit(); +} + +/* + * Increase the starting virtual address of the given mapping if a + * different alignment might result in more superpage mappings. + */ +void +mmu_radix_align_superpage(mmu_t mmu, vm_object_t object, vm_ooffset_t offset, + vm_offset_t *addr, vm_size_t size) +{ + + CTR5(KTR_PMAP, "%s(%p, %#x, %p, %#x)", __func__, object, offset, addr, + size); + vm_offset_t superpage_offset; + + if (size < L3_PAGE_SIZE) + return; + if (object != NULL && (object->flags & OBJ_COLORED) != 0) + offset += ptoa(object->pg_color); + superpage_offset = offset & L3_PAGE_MASK; + if (size - ((L3_PAGE_SIZE - superpage_offset) & L3_PAGE_MASK) < L3_PAGE_SIZE || + (*addr & L3_PAGE_MASK) == superpage_offset) + return; + if ((*addr & L3_PAGE_MASK) < superpage_offset) + *addr = (*addr & ~L3_PAGE_MASK) + superpage_offset; + else + *addr = ((*addr + L3_PAGE_MASK) & ~L3_PAGE_MASK) + superpage_offset; +} + +static void * +mmu_radix_mapdev_attr(mmu_t mmu, vm_paddr_t pa, vm_size_t size, vm_memattr_t attr) +{ + vm_offset_t va, tmpva, ppa, offset; + + ppa = trunc_page(pa); + offset = pa & PAGE_MASK; + size = roundup2(offset + size, PAGE_SIZE); + if (pa < powerpc_ptob(Maxmem)) + panic("bad pa: %#lx less than Maxmem %#lx\n", + pa, powerpc_ptob(Maxmem)); + va = kva_alloc(size); + if (bootverbose) + printf("%s(%#lx, %lu, %d)\n", __func__, pa, size, attr); + KASSERT(size > 0, ("%s(%#lx, %lu, %d)", __func__, pa, size, attr)); + + if (!va) + panic("%s: Couldn't alloc kernel virtual memory", __func__); + + for (tmpva = va; size > 0;) { + mmu_radix_kenter_attr(mmu, tmpva, ppa, attr); + size -= PAGE_SIZE; + tmpva += PAGE_SIZE; + ppa += PAGE_SIZE; + } + ptesync(); + + return ((void *)(va + offset)); +} + +static void * +mmu_radix_mapdev(mmu_t mmu, vm_paddr_t pa, vm_size_t size) +{ + + CTR3(KTR_PMAP, "%s(%#x, %#x)", __func__, pa, size); + + return (mmu_radix_mapdev_attr(mmu, pa, size, VM_MEMATTR_DEFAULT)); +} + +void +mmu_radix_page_set_memattr(mmu_t mmu, vm_page_t m, vm_memattr_t ma) +{ + + CTR3(KTR_PMAP, "%s(%p, %#x)", __func__, m, ma); + m->md.mdpg_cache_attrs = ma; + + /* + * If "m" is a normal page, update its direct mapping. This update + * can be relied upon to perform any cache operations that are + * required for data coherence. + */ + if ((m->flags & PG_FICTITIOUS) == 0 && + mmu_radix_change_attr(mmu, PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), + PAGE_SIZE, m->md.mdpg_cache_attrs)) + panic("memory attribute change on the direct map failed"); +} + +static void +mmu_radix_unmapdev(mmu_t mmu, vm_offset_t va, vm_size_t size) +{ + vm_offset_t offset; + + CTR3(KTR_PMAP, "%s(%#x, %#x)", __func__, va, size); + /* If we gave a direct map region in pmap_mapdev, do nothing */ + if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) + return; + + offset = va & PAGE_MASK; + size = round_page(offset + size); + va = trunc_page(va); + + if (pmap_initialized) + kva_free(va, size); +} + +static __inline void +pmap_pte_attr(pt_entry_t *pte, uint64_t cache_bits, uint64_t mask) +{ + uint64_t opte, npte; + + /* + * The cache mode bits are all in the low 32-bits of the + * PTE, so we can just spin on updating the low 32-bits. + */ + do { + opte = *pte; + npte = opte & ~mask; + npte |= cache_bits; + } while (npte != opte && !atomic_cmpset_long(pte, opte, npte)); +} + +/* + * Tries to demote a 1GB page mapping. + */ +static boolean_t +pmap_demote_l2e(pmap_t pmap, pml2_entry_t *l2e, vm_offset_t va) +{ + pml2_entry_t oldpdpe; + pml3_entry_t *firstpde, newpde, *pde; + vm_paddr_t pdpgpa; + vm_page_t pdpg; + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + oldpdpe = *l2e; + KASSERT((oldpdpe & (RPTE_LEAF | PG_V)) == (RPTE_LEAF | PG_V), + ("pmap_demote_pdpe: oldpdpe is missing PG_PS and/or PG_V")); + pdpg = vm_page_alloc(NULL, va >> L2_PAGE_SIZE_SHIFT, + VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED); + if (pdpg == NULL) { + CTR2(KTR_PMAP, "pmap_demote_pdpe: failure for va %#lx" + " in pmap %p", va, pmap); + return (FALSE); + } + pdpgpa = VM_PAGE_TO_PHYS(pdpg); + firstpde = (pml3_entry_t *)PHYS_TO_DMAP(pdpgpa); + KASSERT((oldpdpe & PG_A) != 0, + ("pmap_demote_pdpe: oldpdpe is missing PG_A")); + KASSERT((oldpdpe & (PG_M | PG_RW)) != PG_RW, + ("pmap_demote_pdpe: oldpdpe is missing PG_M")); + newpde = oldpdpe; + + /* + * Initialize the page directory page. + */ + for (pde = firstpde; pde < firstpde + NPDEPG; pde++) { + *pde = newpde; + newpde += L3_PAGE_SIZE; + } + + /* + * Demote the mapping. + */ + pde_store(l2e, pdpgpa); + + /* + * Flush PWC --- XXX revisit + */ + pmap_invalidate_all(pmap); + + pmap_l2e_demotions++; + CTR2(KTR_PMAP, "pmap_demote_pdpe: success for va %#lx" + " in pmap %p", va, pmap); + return (TRUE); +} + +vm_paddr_t +mmu_radix_kextract(mmu_t mmu, vm_offset_t va) +{ + pml3_entry_t l3e; + vm_paddr_t pa; + + CTR2(KTR_PMAP, "%s(%#x)", __func__, va); + if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) { + pa = DMAP_TO_PHYS(va); + } else { + l3e = *pmap_pml3e(kernel_pmap, va); + if (l3e & RPTE_LEAF) { + pa = (l3e & PG_PS_FRAME) | (va & L3_PAGE_MASK); + pa |= (va & L3_PAGE_MASK); + } else { + /* + * Beware of a concurrent promotion that changes the + * PDE at this point! For example, vtopte() must not + * be used to access the PTE because it would use the + * new PDE. It is, however, safe to use the old PDE + * because the page table page is preserved by the + * promotion. + */ + pa = *pmap_l3e_to_pte(&l3e, va); + pa = (pa & PG_FRAME) | (va & PAGE_MASK); + pa |= (va & PAGE_MASK); + } + } + return (pa); +} + +static pt_entry_t +mmu_radix_calc_wimg(vm_paddr_t pa, vm_memattr_t ma) +{ + + if (ma != VM_MEMATTR_DEFAULT) { + return pmap_cache_bits(ma); + } + + /* + * Assume the page is cache inhibited and access is guarded unless + * it's in our available memory array. + */ + for (int i = 0; i < pregions_sz; i++) { + if ((pa >= pregions[i].mr_start) && + (pa < (pregions[i].mr_start + pregions[i].mr_size))) + return (RPTE_ATTR_MEM); + } + return (RPTE_ATTR_GUARDEDIO); +} + +static void +mmu_radix_kenter_attr(mmu_t mmu, vm_offset_t va, vm_paddr_t pa, vm_memattr_t ma) +{ + pt_entry_t *pte, pteval; + uint64_t cache_bits; + + pte = kvtopte(va); + MPASS(pte != NULL); + pteval = pa | RPTE_EAA_R | RPTE_EAA_W | RPTE_EAA_P | PG_M | PG_A; + cache_bits = mmu_radix_calc_wimg(pa, ma); + pte_store(pte, pteval | cache_bits); +} + +void +mmu_radix_kremove(mmu_t mmu, vm_offset_t va) +{ + pt_entry_t *pte; + + CTR2(KTR_PMAP, "%s(%#x)", __func__, va); + + pte = kvtopte(va); + pte_clear(pte); +} + +int mmu_radix_map_user_ptr(mmu_t mmu, pmap_t pm, + volatile const void *uaddr, void **kaddr, size_t ulen, size_t *klen) +{ + if ((uintptr_t)uaddr + ulen >= VM_MAXUSER_ADDRESS) + return (EFAULT); + + *kaddr = (void *)(uintptr_t)uaddr; + if (klen) + *klen = ulen; + + return (0); +} + +int +mmu_radix_decode_kernel_ptr(mmu_t mmu, vm_offset_t addr, + int *is_user, vm_offset_t *decoded) +{ + + CTR2(KTR_PMAP, "%s(%#jx)", __func__, (uintmax_t)addr); + *decoded = addr; + *is_user = (addr < VM_MAXUSER_ADDRESS); + return (0); +} + +static boolean_t +mmu_radix_dev_direct_mapped(mmu_t mmu, vm_paddr_t pa, vm_size_t size) +{ + + CTR3(KTR_PMAP, "%s(%#x, %#x)", __func__, pa, size); + return (mem_valid(pa, size)); +} + +static void +mmu_radix_scan_init(mmu_t mmup) +{ + + CTR1(KTR_PMAP, "%s()", __func__); + UNIMPLEMENTED(); +} + +static void +mmu_radix_dumpsys_map(mmu_t mmu, vm_paddr_t pa, size_t sz, + void **va) +{ + CTR4(KTR_PMAP, "%s(%#jx, %#zx, %p)", __func__, (uintmax_t)pa, sz, va); + UNIMPLEMENTED(); +} + +vm_offset_t +mmu_radix_quick_enter_page(mmu_t mmu, vm_page_t m) +{ + vm_paddr_t paddr; + + CTR2(KTR_PMAP, "%s(%p)", __func__, m); + paddr = VM_PAGE_TO_PHYS(m); + return (PHYS_TO_DMAP(paddr)); +} + +void +mmu_radix_quick_remove_page(mmu_t mmu, vm_offset_t addr __unused) +{ + /* no work to do here */ + CTR2(KTR_PMAP, "%s(%#x)", __func__, addr); +} + +static void +pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva) +{ + cpu_flush_dcache((void *)sva, eva - sva); +} + +int +mmu_radix_change_attr(mmu_t mmu, vm_offset_t va, vm_size_t size, + vm_memattr_t mode) +{ + int error; + + CTR4(KTR_PMAP, "%s(%#x, %#zx, %d)", __func__, va, size, mode); + PMAP_LOCK(kernel_pmap); + error = pmap_change_attr_locked(va, size, mode, true); + PMAP_UNLOCK(kernel_pmap); + return (error); +} + +static int +pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode, bool flush) +{ + vm_offset_t base, offset, tmpva; + vm_paddr_t pa_start, pa_end, pa_end1; + pml2_entry_t *l2e; + pml3_entry_t *l3e; + pt_entry_t *pte; + int cache_bits, error; + boolean_t changed; + + PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED); + base = trunc_page(va); + offset = va & PAGE_MASK; + size = round_page(offset + size); + + /* + * Only supported on kernel virtual addresses, including the direct + * map but excluding the recursive map. + */ + if (base < DMAP_MIN_ADDRESS) + return (EINVAL); + + cache_bits = pmap_cache_bits(mode); + changed = FALSE; + + /* + * Pages that aren't mapped aren't supported. Also break down 2MB pages + * into 4KB pages if required. + */ + for (tmpva = base; tmpva < base + size; ) { + l2e = pmap_pml2e(kernel_pmap, tmpva); + if (l2e == NULL || *l2e == 0) + return (EINVAL); + if (*l2e & RPTE_LEAF) { + /* + * If the current 1GB page already has the required + * memory type, then we need not demote this page. Just + * increment tmpva to the next 1GB page frame. + */ + if ((*l2e & RPTE_ATTR_MASK) == cache_bits) { + tmpva = trunc_1gpage(tmpva) + L2_PAGE_SIZE; + continue; + } + + /* + * If the current offset aligns with a 1GB page frame + * and there is at least 1GB left within the range, then + * we need not break down this page into 2MB pages. + */ + if ((tmpva & L2_PAGE_MASK) == 0 && + tmpva + L2_PAGE_MASK < base + size) { + tmpva += L2_PAGE_MASK; + continue; + } + if (!pmap_demote_l2e(kernel_pmap, l2e, tmpva)) + return (ENOMEM); + } + l3e = pmap_l2e_to_l3e(l2e, tmpva); + KASSERT(l3e != NULL, ("no l3e entry for %#lx in %p\n", + tmpva, l2e)); + if (*l3e == 0) + return (EINVAL); + if (*l3e & RPTE_LEAF) { + /* + * If the current 2MB page already has the required + * memory type, then we need not demote this page. Just + * increment tmpva to the next 2MB page frame. + */ + if ((*l3e & RPTE_ATTR_MASK) == cache_bits) { + tmpva = trunc_2mpage(tmpva) + L3_PAGE_SIZE; + continue; + } + + /* + * If the current offset aligns with a 2MB page frame + * and there is at least 2MB left within the range, then + * we need not break down this page into 4KB pages. + */ + if ((tmpva & L3_PAGE_MASK) == 0 && + tmpva + L3_PAGE_MASK < base + size) { + tmpva += L3_PAGE_SIZE; + continue; + } + if (!pmap_demote_l3e(kernel_pmap, l3e, tmpva)) + return (ENOMEM); + } + pte = pmap_l3e_to_pte(l3e, tmpva); + if (*pte == 0) + return (EINVAL); + tmpva += PAGE_SIZE; + } + error = 0; + + /* + * Ok, all the pages exist, so run through them updating their + * cache mode if required. + */ + pa_start = pa_end = 0; + for (tmpva = base; tmpva < base + size; ) { + l2e = pmap_pml2e(kernel_pmap, tmpva); + if (*l2e & RPTE_LEAF) { + if ((*l2e & RPTE_ATTR_MASK) != cache_bits) { + pmap_pte_attr(l2e, cache_bits, + RPTE_ATTR_MASK); + changed = TRUE; + } + if (tmpva >= VM_MIN_KERNEL_ADDRESS && + (*l2e & PG_PS_FRAME) < dmaplimit) { + if (pa_start == pa_end) { + /* Start physical address run. */ + pa_start = *l2e & PG_PS_FRAME; + pa_end = pa_start + L2_PAGE_SIZE; + } else if (pa_end == (*l2e & PG_PS_FRAME)) + pa_end += L2_PAGE_SIZE; + else { + /* Run ended, update direct map. */ + error = pmap_change_attr_locked( + PHYS_TO_DMAP(pa_start), + pa_end - pa_start, mode, flush); + if (error != 0) + break; + /* Start physical address run. */ + pa_start = *l2e & PG_PS_FRAME; + pa_end = pa_start + L2_PAGE_SIZE; + } + } + tmpva = trunc_1gpage(tmpva) + L2_PAGE_SIZE; + continue; + } + l3e = pmap_l2e_to_l3e(l2e, tmpva); + if (*l3e & RPTE_LEAF) { + if ((*l3e & RPTE_ATTR_MASK) != cache_bits) { + pmap_pte_attr(l3e, cache_bits, + RPTE_ATTR_MASK); + changed = TRUE; + } + if (tmpva >= VM_MIN_KERNEL_ADDRESS && + (*l3e & PG_PS_FRAME) < dmaplimit) { + if (pa_start == pa_end) { + /* Start physical address run. */ + pa_start = *l3e & PG_PS_FRAME; + pa_end = pa_start + L3_PAGE_SIZE; + } else if (pa_end == (*l3e & PG_PS_FRAME)) + pa_end += L3_PAGE_SIZE; + else { + /* Run ended, update direct map. */ + error = pmap_change_attr_locked( + PHYS_TO_DMAP(pa_start), + pa_end - pa_start, mode, flush); + if (error != 0) + break; + /* Start physical address run. */ + pa_start = *l3e & PG_PS_FRAME; + pa_end = pa_start + L3_PAGE_SIZE; + } + } + tmpva = trunc_2mpage(tmpva) + L3_PAGE_SIZE; + } else { + pte = pmap_l3e_to_pte(l3e, tmpva); + if ((*pte & RPTE_ATTR_MASK) != cache_bits) { + pmap_pte_attr(pte, cache_bits, + RPTE_ATTR_MASK); + changed = TRUE; + } + if (tmpva >= VM_MIN_KERNEL_ADDRESS && + (*pte & PG_FRAME) < dmaplimit) { + if (pa_start == pa_end) { + /* Start physical address run. */ + pa_start = *pte & PG_FRAME; + pa_end = pa_start + PAGE_SIZE; + } else if (pa_end == (*pte & PG_FRAME)) + pa_end += PAGE_SIZE; + else { + /* Run ended, update direct map. */ + error = pmap_change_attr_locked( + PHYS_TO_DMAP(pa_start), + pa_end - pa_start, mode, flush); + if (error != 0) + break; + /* Start physical address run. */ + pa_start = *pte & PG_FRAME; + pa_end = pa_start + PAGE_SIZE; + } + } + tmpva += PAGE_SIZE; + } + } + if (error == 0 && pa_start != pa_end && pa_start < dmaplimit) { + pa_end1 = MIN(pa_end, dmaplimit); + if (pa_start != pa_end1) + error = pmap_change_attr_locked(PHYS_TO_DMAP(pa_start), + pa_end1 - pa_start, mode, flush); + } + + /* + * Flush CPU caches if required to make sure any data isn't cached that + * shouldn't be, etc. + */ + if (changed) { + pmap_invalidate_all(kernel_pmap); + + if (flush) + pmap_invalidate_cache_range(base, tmpva); + + } + return (error); +} + +/* + * Allocate physical memory for the vm_page array and map it into KVA, + * attempting to back the vm_pages with domain-local memory. + */ +void +mmu_radix_page_array_startup(mmu_t mmu, long pages) +{ +#ifdef notyet + pml2_entry_t *l2e; + pml3_entry_t *pde; + pml3_entry_t newl3; + vm_offset_t va; + long pfn; + int domain, i; +#endif + vm_paddr_t pa; + vm_offset_t start, end; + + vm_page_array_size = pages; + + start = VM_MIN_KERNEL_ADDRESS; + end = start + pages * sizeof(struct vm_page); + + pa = vm_phys_early_alloc(0, end - start); + + start = mmu_radix_map(mmu, &start, pa, end - start, VM_MEMATTR_DEFAULT); +#ifdef notyet + /* TODO: NUMA vm_page_array. Blocked out until then (copied from amd64). */ + for (va = start; va < end; va += L3_PAGE_SIZE) { + pfn = first_page + (va - start) / sizeof(struct vm_page); + domain = _vm_phys_domain(ptoa(pfn)); + l2e = pmap_pml2e(kernel_pmap, va); + if ((*l2e & PG_V) == 0) { + pa = vm_phys_early_alloc(domain, PAGE_SIZE); + dump_add_page(pa); + pagezero(PHYS_TO_DMAP(pa)); + pde_store(l2e, (pml2_entry_t)pa); + } + pde = pmap_l2e_to_l3e(l2e, va); + if ((*pde & PG_V) != 0) + panic("Unexpected pde %p", pde); + pa = vm_phys_early_alloc(domain, L3_PAGE_SIZE); + for (i = 0; i < NPDEPG; i++) + dump_add_page(pa + i * PAGE_SIZE); + newl3 = (pml3_entry_t)(pa | RPTE_EAA_P | RPTE_EAA_R | RPTE_EAA_W); + pte_store(pde, newl3); + } +#endif + vm_page_array = (vm_page_t)start; +} + +#ifdef DDB +#include +#include + +static void +pmap_pte_walk(pml1_entry_t *l1, vm_offset_t va) +{ + pml1_entry_t *l1e; + pml2_entry_t *l2e; + pml3_entry_t *l3e; + pt_entry_t *pte; + + l1e = &l1[pmap_pml1e_index(va)]; + db_printf("VA %#016lx l1e %#016lx", va, *l1e); + if ((*l1e & PG_V) == 0) { + db_printf("\n"); + return; + } + l2e = pmap_l1e_to_l2e(l1e, va); + db_printf(" l2e %#016lx", *l2e); + if ((*l2e & PG_V) == 0 || (*l2e & RPTE_LEAF) != 0) { + db_printf("\n"); + return; + } + l3e = pmap_l2e_to_l3e(l2e, va); + db_printf(" l3e %#016lx", *l3e); + if ((*l3e & PG_V) == 0 || (*l3e & RPTE_LEAF) != 0) { + db_printf("\n"); + return; + } + pte = pmap_l3e_to_pte(l3e, va); + db_printf(" pte %#016lx\n", *pte); +} + +void +pmap_page_print_mappings(vm_page_t m) +{ + pmap_t pmap; + pv_entry_t pv; + + db_printf("page %p(%lx)\n", m, m->phys_addr); + /* need to elide locks if running in ddb */ + TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) { + db_printf("pv: %p ", pv); + db_printf("va: %#016lx ", pv->pv_va); + pmap = PV_PMAP(pv); + db_printf("pmap %p ", pmap); + if (pmap != NULL) { + db_printf("asid: %lu\n", pmap->pm_pid); + pmap_pte_walk(pmap->pm_pml1, pv->pv_va); + } + } +} + +DB_SHOW_COMMAND(pte, pmap_print_pte) +{ + vm_offset_t va; + pmap_t pmap; + + if (!have_addr) { + db_printf("show pte addr\n"); + return; + } + va = (vm_offset_t)addr; + + if (va >= DMAP_MIN_ADDRESS) + pmap = kernel_pmap; + else if (kdb_thread != NULL) + pmap = vmspace_pmap(kdb_thread->td_proc->p_vmspace); + else + pmap = vmspace_pmap(curthread->td_proc->p_vmspace); + + pmap_pte_walk(pmap->pm_pml1, va); +} + +#endif + Property changes on: head/sys/powerpc/aim/mmu_radix.c ___________________________________________________________________ Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Index: head/sys/powerpc/booke/pmap.c =================================================================== --- head/sys/powerpc/booke/pmap.c (revision 360886) +++ head/sys/powerpc/booke/pmap.c (revision 360887) @@ -1,3138 +1,3147 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (C) 2007-2009 Semihalf, Rafal Jaworowski * Copyright (C) 2006 Semihalf, Marian Balakowicz * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN * NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * Some hw specific parts of this pmap were derived or influenced * by NetBSD's ibm4xx pmap module. More generic code is shared with * a few other pmap modules from the FreeBSD tree. */ /* * VM layout notes: * * Kernel and user threads run within one common virtual address space * defined by AS=0. * * 32-bit pmap: * Virtual address space layout: * ----------------------------- * 0x0000_0000 - 0x7fff_ffff : user process * 0x8000_0000 - 0xbfff_ffff : pmap_mapdev()-ed area (PCI/PCIE etc.) * 0xc000_0000 - 0xc0ff_ffff : kernel reserved * 0xc000_0000 - data_end : kernel code+data, env, metadata etc. * 0xc100_0000 - 0xffff_ffff : KVA * 0xc100_0000 - 0xc100_3fff : reserved for page zero/copy * 0xc100_4000 - 0xc200_3fff : reserved for ptbl bufs * 0xc200_4000 - 0xc200_8fff : guard page + kstack0 * 0xc200_9000 - 0xfeef_ffff : actual free KVA space * * 64-bit pmap: * Virtual address space layout: * ----------------------------- * 0x0000_0000_0000_0000 - 0xbfff_ffff_ffff_ffff : user process * 0x0000_0000_0000_0000 - 0x8fff_ffff_ffff_ffff : text, data, heap, maps, libraries * 0x9000_0000_0000_0000 - 0xafff_ffff_ffff_ffff : mmio region * 0xb000_0000_0000_0000 - 0xbfff_ffff_ffff_ffff : stack * 0xc000_0000_0000_0000 - 0xcfff_ffff_ffff_ffff : kernel reserved * 0xc000_0000_0000_0000 - endkernel-1 : kernel code & data * endkernel - msgbufp-1 : flat device tree * msgbufp - kernel_pdir-1 : message buffer * kernel_pdir - kernel_pp2d-1 : kernel page directory * kernel_pp2d - . : kernel pointers to page directory * pmap_zero_copy_min - crashdumpmap-1 : reserved for page zero/copy * crashdumpmap - ptbl_buf_pool_vabase-1 : reserved for ptbl bufs * ptbl_buf_pool_vabase - virtual_avail-1 : user page directories and page tables * virtual_avail - 0xcfff_ffff_ffff_ffff : actual free KVA space * 0xd000_0000_0000_0000 - 0xdfff_ffff_ffff_ffff : coprocessor region * 0xe000_0000_0000_0000 - 0xefff_ffff_ffff_ffff : mmio region * 0xf000_0000_0000_0000 - 0xffff_ffff_ffff_ffff : direct map * 0xf000_0000_0000_0000 - +Maxmem : physmem map * - 0xffff_ffff_ffff_ffff : device direct map */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_kstack_pages.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "mmu_if.h" #define SPARSE_MAPDEV /* Use power-of-two mappings in mmu_booke_mapdev(), to save entries. */ #define POW2_MAPPINGS #ifdef DEBUG #define debugf(fmt, args...) printf(fmt, ##args) #else #define debugf(fmt, args...) #endif #ifdef __powerpc64__ #define PRI0ptrX "016lx" #else #define PRI0ptrX "08x" #endif #define TODO panic("%s: not implemented", __func__); extern unsigned char _etext[]; extern unsigned char _end[]; extern uint32_t *bootinfo; vm_paddr_t kernload; vm_offset_t kernstart; vm_size_t kernsize; /* Message buffer and tables. */ static vm_offset_t data_start; static vm_size_t data_end; /* Phys/avail memory regions. */ static struct mem_region *availmem_regions; static int availmem_regions_sz; static struct mem_region *physmem_regions; static int physmem_regions_sz; #ifndef __powerpc64__ /* Reserved KVA space and mutex for mmu_booke_zero_page. */ static vm_offset_t zero_page_va; static struct mtx zero_page_mutex; /* Reserved KVA space and mutex for mmu_booke_copy_page. */ static vm_offset_t copy_page_src_va; static vm_offset_t copy_page_dst_va; static struct mtx copy_page_mutex; #endif static struct mtx tlbivax_mutex; /**************************************************************************/ /* PMAP */ /**************************************************************************/ static int mmu_booke_enter_locked(mmu_t, pmap_t, vm_offset_t, vm_page_t, vm_prot_t, u_int flags, int8_t psind); unsigned int kptbl_min; /* Index of the first kernel ptbl. */ static uma_zone_t ptbl_root_zone; /* * If user pmap is processed with mmu_booke_remove and the resident count * drops to 0, there are no more pages to remove, so we need not continue. */ #define PMAP_REMOVE_DONE(pmap) \ ((pmap) != kernel_pmap && (pmap)->pm_stats.resident_count == 0) #if defined(COMPAT_FREEBSD32) || !defined(__powerpc64__) extern int elf32_nxstack; #endif /**************************************************************************/ /* TLB and TID handling */ /**************************************************************************/ /* Translation ID busy table */ static volatile pmap_t tidbusy[MAXCPU][TID_MAX + 1]; /* * TLB0 capabilities (entry, way numbers etc.). These can vary between e500 * core revisions and should be read from h/w registers during early config. */ uint32_t tlb0_entries; uint32_t tlb0_ways; uint32_t tlb0_entries_per_way; uint32_t tlb1_entries; #define TLB0_ENTRIES (tlb0_entries) #define TLB0_WAYS (tlb0_ways) #define TLB0_ENTRIES_PER_WAY (tlb0_entries_per_way) #define TLB1_ENTRIES (tlb1_entries) static tlbtid_t tid_alloc(struct pmap *); #ifdef DDB #ifdef __powerpc64__ static void tlb_print_entry(int, uint32_t, uint64_t, uint32_t, uint32_t); #else static void tlb_print_entry(int, uint32_t, uint32_t, uint32_t, uint32_t); #endif #endif static void tlb1_read_entry(tlb_entry_t *, unsigned int); static void tlb1_write_entry(tlb_entry_t *, unsigned int); static int tlb1_iomapped(int, vm_paddr_t, vm_size_t, vm_offset_t *); static vm_size_t tlb1_mapin_region(vm_offset_t, vm_paddr_t, vm_size_t, int); static __inline uint32_t tlb_calc_wimg(vm_paddr_t pa, vm_memattr_t ma); static vm_size_t tsize2size(unsigned int); static unsigned int size2tsize(vm_size_t); static unsigned long ilog2(unsigned long); static void set_mas4_defaults(void); static inline void tlb0_flush_entry(vm_offset_t); static inline unsigned int tlb0_tableidx(vm_offset_t, unsigned int); /**************************************************************************/ /* Page table management */ /**************************************************************************/ static struct rwlock_padalign pvh_global_lock; /* Data for the pv entry allocation mechanism */ static uma_zone_t pvzone; static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0; #define PV_ENTRY_ZONE_MIN 2048 /* min pv entries in uma zone */ #ifndef PMAP_SHPGPERPROC #define PMAP_SHPGPERPROC 200 #endif static vm_paddr_t pte_vatopa(mmu_t, pmap_t, vm_offset_t); static int pte_enter(mmu_t, pmap_t, vm_page_t, vm_offset_t, uint32_t, boolean_t); static int pte_remove(mmu_t, pmap_t, vm_offset_t, uint8_t); static pte_t *pte_find(mmu_t, pmap_t, vm_offset_t); static void kernel_pte_alloc(vm_offset_t, vm_offset_t); static pv_entry_t pv_alloc(void); static void pv_free(pv_entry_t); static void pv_insert(pmap_t, vm_offset_t, vm_page_t); static void pv_remove(pmap_t, vm_offset_t, vm_page_t); static void booke_pmap_init_qpages(void); static inline void tlb_miss_lock(void); static inline void tlb_miss_unlock(void); #ifdef SMP extern tlb_entry_t __boot_tlb1[]; void pmap_bootstrap_ap(volatile uint32_t *); #endif /* * Kernel MMU interface */ static void mmu_booke_clear_modify(mmu_t, vm_page_t); static void mmu_booke_copy(mmu_t, pmap_t, pmap_t, vm_offset_t, vm_size_t, vm_offset_t); static void mmu_booke_copy_page(mmu_t, vm_page_t, vm_page_t); static void mmu_booke_copy_pages(mmu_t, vm_page_t *, vm_offset_t, vm_page_t *, vm_offset_t, int); static int mmu_booke_enter(mmu_t, pmap_t, vm_offset_t, vm_page_t, vm_prot_t, u_int flags, int8_t psind); static void mmu_booke_enter_object(mmu_t, pmap_t, vm_offset_t, vm_offset_t, vm_page_t, vm_prot_t); static void mmu_booke_enter_quick(mmu_t, pmap_t, vm_offset_t, vm_page_t, vm_prot_t); static vm_paddr_t mmu_booke_extract(mmu_t, pmap_t, vm_offset_t); static vm_page_t mmu_booke_extract_and_hold(mmu_t, pmap_t, vm_offset_t, vm_prot_t); static void mmu_booke_init(mmu_t); static boolean_t mmu_booke_is_modified(mmu_t, vm_page_t); static boolean_t mmu_booke_is_prefaultable(mmu_t, pmap_t, vm_offset_t); static boolean_t mmu_booke_is_referenced(mmu_t, vm_page_t); static int mmu_booke_ts_referenced(mmu_t, vm_page_t); static vm_offset_t mmu_booke_map(mmu_t, vm_offset_t *, vm_paddr_t, vm_paddr_t, int); static int mmu_booke_mincore(mmu_t, pmap_t, vm_offset_t, vm_paddr_t *); static void mmu_booke_object_init_pt(mmu_t, pmap_t, vm_offset_t, vm_object_t, vm_pindex_t, vm_size_t); static boolean_t mmu_booke_page_exists_quick(mmu_t, pmap_t, vm_page_t); static void mmu_booke_page_init(mmu_t, vm_page_t); static int mmu_booke_page_wired_mappings(mmu_t, vm_page_t); static void mmu_booke_pinit(mmu_t, pmap_t); static void mmu_booke_pinit0(mmu_t, pmap_t); static void mmu_booke_protect(mmu_t, pmap_t, vm_offset_t, vm_offset_t, vm_prot_t); static void mmu_booke_qenter(mmu_t, vm_offset_t, vm_page_t *, int); static void mmu_booke_qremove(mmu_t, vm_offset_t, int); static void mmu_booke_release(mmu_t, pmap_t); static void mmu_booke_remove(mmu_t, pmap_t, vm_offset_t, vm_offset_t); static void mmu_booke_remove_all(mmu_t, vm_page_t); static void mmu_booke_remove_write(mmu_t, vm_page_t); static void mmu_booke_unwire(mmu_t, pmap_t, vm_offset_t, vm_offset_t); static void mmu_booke_zero_page(mmu_t, vm_page_t); static void mmu_booke_zero_page_area(mmu_t, vm_page_t, int, int); static void mmu_booke_activate(mmu_t, struct thread *); static void mmu_booke_deactivate(mmu_t, struct thread *); static void mmu_booke_bootstrap(mmu_t, vm_offset_t, vm_offset_t); static void *mmu_booke_mapdev(mmu_t, vm_paddr_t, vm_size_t); static void *mmu_booke_mapdev_attr(mmu_t, vm_paddr_t, vm_size_t, vm_memattr_t); static void mmu_booke_unmapdev(mmu_t, vm_offset_t, vm_size_t); static vm_paddr_t mmu_booke_kextract(mmu_t, vm_offset_t); static void mmu_booke_kenter(mmu_t, vm_offset_t, vm_paddr_t); static void mmu_booke_kenter_attr(mmu_t, vm_offset_t, vm_paddr_t, vm_memattr_t); static void mmu_booke_kremove(mmu_t, vm_offset_t); static boolean_t mmu_booke_dev_direct_mapped(mmu_t, vm_paddr_t, vm_size_t); static void mmu_booke_sync_icache(mmu_t, pmap_t, vm_offset_t, vm_size_t); static void mmu_booke_dumpsys_map(mmu_t, vm_paddr_t pa, size_t, void **); static void mmu_booke_dumpsys_unmap(mmu_t, vm_paddr_t pa, size_t, void *); static void mmu_booke_scan_init(mmu_t); static vm_offset_t mmu_booke_quick_enter_page(mmu_t mmu, vm_page_t m); static void mmu_booke_quick_remove_page(mmu_t mmu, vm_offset_t addr); static int mmu_booke_change_attr(mmu_t mmu, vm_offset_t addr, vm_size_t sz, vm_memattr_t mode); static int mmu_booke_map_user_ptr(mmu_t mmu, pmap_t pm, volatile const void *uaddr, void **kaddr, size_t ulen, size_t *klen); static int mmu_booke_decode_kernel_ptr(mmu_t mmu, vm_offset_t addr, int *is_user, vm_offset_t *decoded_addr); static void mmu_booke_page_array_startup(mmu_t , long); +static boolean_t mmu_booke_page_is_mapped(mmu_t mmu, vm_page_t m); static mmu_method_t mmu_booke_methods[] = { /* pmap dispatcher interface */ MMUMETHOD(mmu_clear_modify, mmu_booke_clear_modify), MMUMETHOD(mmu_copy, mmu_booke_copy), MMUMETHOD(mmu_copy_page, mmu_booke_copy_page), MMUMETHOD(mmu_copy_pages, mmu_booke_copy_pages), MMUMETHOD(mmu_enter, mmu_booke_enter), MMUMETHOD(mmu_enter_object, mmu_booke_enter_object), MMUMETHOD(mmu_enter_quick, mmu_booke_enter_quick), MMUMETHOD(mmu_extract, mmu_booke_extract), MMUMETHOD(mmu_extract_and_hold, mmu_booke_extract_and_hold), MMUMETHOD(mmu_init, mmu_booke_init), MMUMETHOD(mmu_is_modified, mmu_booke_is_modified), MMUMETHOD(mmu_is_prefaultable, mmu_booke_is_prefaultable), MMUMETHOD(mmu_is_referenced, mmu_booke_is_referenced), MMUMETHOD(mmu_ts_referenced, mmu_booke_ts_referenced), MMUMETHOD(mmu_map, mmu_booke_map), MMUMETHOD(mmu_mincore, mmu_booke_mincore), MMUMETHOD(mmu_object_init_pt, mmu_booke_object_init_pt), MMUMETHOD(mmu_page_exists_quick,mmu_booke_page_exists_quick), MMUMETHOD(mmu_page_init, mmu_booke_page_init), MMUMETHOD(mmu_page_wired_mappings, mmu_booke_page_wired_mappings), MMUMETHOD(mmu_pinit, mmu_booke_pinit), MMUMETHOD(mmu_pinit0, mmu_booke_pinit0), MMUMETHOD(mmu_protect, mmu_booke_protect), MMUMETHOD(mmu_qenter, mmu_booke_qenter), MMUMETHOD(mmu_qremove, mmu_booke_qremove), MMUMETHOD(mmu_release, mmu_booke_release), MMUMETHOD(mmu_remove, mmu_booke_remove), MMUMETHOD(mmu_remove_all, mmu_booke_remove_all), MMUMETHOD(mmu_remove_write, mmu_booke_remove_write), MMUMETHOD(mmu_sync_icache, mmu_booke_sync_icache), MMUMETHOD(mmu_unwire, mmu_booke_unwire), MMUMETHOD(mmu_zero_page, mmu_booke_zero_page), MMUMETHOD(mmu_zero_page_area, mmu_booke_zero_page_area), MMUMETHOD(mmu_activate, mmu_booke_activate), MMUMETHOD(mmu_deactivate, mmu_booke_deactivate), MMUMETHOD(mmu_quick_enter_page, mmu_booke_quick_enter_page), MMUMETHOD(mmu_quick_remove_page, mmu_booke_quick_remove_page), MMUMETHOD(mmu_page_array_startup, mmu_booke_page_array_startup), + MMUMETHOD(mmu_page_is_mapped, mmu_booke_page_is_mapped), /* Internal interfaces */ MMUMETHOD(mmu_bootstrap, mmu_booke_bootstrap), MMUMETHOD(mmu_dev_direct_mapped,mmu_booke_dev_direct_mapped), MMUMETHOD(mmu_mapdev, mmu_booke_mapdev), MMUMETHOD(mmu_mapdev_attr, mmu_booke_mapdev_attr), MMUMETHOD(mmu_kenter, mmu_booke_kenter), MMUMETHOD(mmu_kenter_attr, mmu_booke_kenter_attr), MMUMETHOD(mmu_kextract, mmu_booke_kextract), MMUMETHOD(mmu_kremove, mmu_booke_kremove), MMUMETHOD(mmu_unmapdev, mmu_booke_unmapdev), MMUMETHOD(mmu_change_attr, mmu_booke_change_attr), MMUMETHOD(mmu_map_user_ptr, mmu_booke_map_user_ptr), MMUMETHOD(mmu_decode_kernel_ptr, mmu_booke_decode_kernel_ptr), /* dumpsys() support */ MMUMETHOD(mmu_dumpsys_map, mmu_booke_dumpsys_map), MMUMETHOD(mmu_dumpsys_unmap, mmu_booke_dumpsys_unmap), MMUMETHOD(mmu_scan_init, mmu_booke_scan_init), { 0, 0 } }; MMU_DEF(booke_mmu, MMU_TYPE_BOOKE, mmu_booke_methods, 0); #ifdef __powerpc64__ #include "pmap_64.c" #else #include "pmap_32.c" #endif static vm_offset_t tlb1_map_base = VM_MAPDEV_BASE; static __inline uint32_t tlb_calc_wimg(vm_paddr_t pa, vm_memattr_t ma) { uint32_t attrib; int i; if (ma != VM_MEMATTR_DEFAULT) { switch (ma) { case VM_MEMATTR_UNCACHEABLE: return (MAS2_I | MAS2_G); case VM_MEMATTR_WRITE_COMBINING: case VM_MEMATTR_WRITE_BACK: case VM_MEMATTR_PREFETCHABLE: return (MAS2_I); case VM_MEMATTR_WRITE_THROUGH: return (MAS2_W | MAS2_M); case VM_MEMATTR_CACHEABLE: return (MAS2_M); } } /* * Assume the page is cache inhibited and access is guarded unless * it's in our available memory array. */ attrib = _TLB_ENTRY_IO; for (i = 0; i < physmem_regions_sz; i++) { if ((pa >= physmem_regions[i].mr_start) && (pa < (physmem_regions[i].mr_start + physmem_regions[i].mr_size))) { attrib = _TLB_ENTRY_MEM; break; } } return (attrib); } static inline void tlb_miss_lock(void) { #ifdef SMP struct pcpu *pc; if (!smp_started) return; STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) { if (pc != pcpup) { CTR3(KTR_PMAP, "%s: tlb miss LOCK of CPU=%d, " "tlb_lock=%p", __func__, pc->pc_cpuid, pc->pc_booke.tlb_lock); KASSERT((pc->pc_cpuid != PCPU_GET(cpuid)), ("tlb_miss_lock: tried to lock self")); tlb_lock(pc->pc_booke.tlb_lock); CTR1(KTR_PMAP, "%s: locked", __func__); } } #endif } static inline void tlb_miss_unlock(void) { #ifdef SMP struct pcpu *pc; if (!smp_started) return; STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) { if (pc != pcpup) { CTR2(KTR_PMAP, "%s: tlb miss UNLOCK of CPU=%d", __func__, pc->pc_cpuid); tlb_unlock(pc->pc_booke.tlb_lock); CTR1(KTR_PMAP, "%s: unlocked", __func__); } } #endif } /* Return number of entries in TLB0. */ static __inline void tlb0_get_tlbconf(void) { uint32_t tlb0_cfg; tlb0_cfg = mfspr(SPR_TLB0CFG); tlb0_entries = tlb0_cfg & TLBCFG_NENTRY_MASK; tlb0_ways = (tlb0_cfg & TLBCFG_ASSOC_MASK) >> TLBCFG_ASSOC_SHIFT; tlb0_entries_per_way = tlb0_entries / tlb0_ways; } /* Return number of entries in TLB1. */ static __inline void tlb1_get_tlbconf(void) { uint32_t tlb1_cfg; tlb1_cfg = mfspr(SPR_TLB1CFG); tlb1_entries = tlb1_cfg & TLBCFG_NENTRY_MASK; } /**************************************************************************/ /* Page table related */ /**************************************************************************/ /* Allocate pv_entry structure. */ pv_entry_t pv_alloc(void) { pv_entry_t pv; pv_entry_count++; if (pv_entry_count > pv_entry_high_water) pagedaemon_wakeup(0); /* XXX powerpc NUMA */ pv = uma_zalloc(pvzone, M_NOWAIT); return (pv); } /* Free pv_entry structure. */ static __inline void pv_free(pv_entry_t pve) { pv_entry_count--; uma_zfree(pvzone, pve); } /* Allocate and initialize pv_entry structure. */ static void pv_insert(pmap_t pmap, vm_offset_t va, vm_page_t m) { pv_entry_t pve; //int su = (pmap == kernel_pmap); //debugf("pv_insert: s (su = %d pmap = 0x%08x va = 0x%08x m = 0x%08x)\n", su, // (u_int32_t)pmap, va, (u_int32_t)m); pve = pv_alloc(); if (pve == NULL) panic("pv_insert: no pv entries!"); pve->pv_pmap = pmap; pve->pv_va = va; /* add to pv_list */ PMAP_LOCK_ASSERT(pmap, MA_OWNED); rw_assert(&pvh_global_lock, RA_WLOCKED); TAILQ_INSERT_TAIL(&m->md.pv_list, pve, pv_link); //debugf("pv_insert: e\n"); } /* Destroy pv entry. */ static void pv_remove(pmap_t pmap, vm_offset_t va, vm_page_t m) { pv_entry_t pve; //int su = (pmap == kernel_pmap); //debugf("pv_remove: s (su = %d pmap = 0x%08x va = 0x%08x)\n", su, (u_int32_t)pmap, va); PMAP_LOCK_ASSERT(pmap, MA_OWNED); rw_assert(&pvh_global_lock, RA_WLOCKED); /* find pv entry */ TAILQ_FOREACH(pve, &m->md.pv_list, pv_link) { if ((pmap == pve->pv_pmap) && (va == pve->pv_va)) { /* remove from pv_list */ TAILQ_REMOVE(&m->md.pv_list, pve, pv_link); if (TAILQ_EMPTY(&m->md.pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); /* free pv entry struct */ pv_free(pve); break; } } //debugf("pv_remove: e\n"); } /**************************************************************************/ /* PMAP related */ /**************************************************************************/ /* * This is called during booke_init, before the system is really initialized. */ static void mmu_booke_bootstrap(mmu_t mmu, vm_offset_t start, vm_offset_t kernelend) { vm_paddr_t phys_kernelend; struct mem_region *mp, *mp1; int cnt, i, j; vm_paddr_t s, e, sz; vm_paddr_t physsz, hwphyssz; u_int phys_avail_count; vm_size_t kstack0_sz; vm_paddr_t kstack0_phys; vm_offset_t kstack0; void *dpcpu; debugf("mmu_booke_bootstrap: entered\n"); /* Set interesting system properties */ #ifdef __powerpc64__ hw_direct_map = 1; #else hw_direct_map = 0; #endif #if defined(COMPAT_FREEBSD32) || !defined(__powerpc64__) elf32_nxstack = 1; #endif /* Initialize invalidation mutex */ mtx_init(&tlbivax_mutex, "tlbivax", NULL, MTX_SPIN); /* Read TLB0 size and associativity. */ tlb0_get_tlbconf(); /* * Align kernel start and end address (kernel image). * Note that kernel end does not necessarily relate to kernsize. * kernsize is the size of the kernel that is actually mapped. */ data_start = round_page(kernelend); data_end = data_start; /* Allocate the dynamic per-cpu area. */ dpcpu = (void *)data_end; data_end += DPCPU_SIZE; /* Allocate space for the message buffer. */ msgbufp = (struct msgbuf *)data_end; data_end += msgbufsize; debugf(" msgbufp at 0x%"PRI0ptrX" end = 0x%"PRI0ptrX"\n", (uintptr_t)msgbufp, data_end); data_end = round_page(data_end); data_end = round_page(mmu_booke_alloc_kernel_pgtables(data_end)); /* Retrieve phys/avail mem regions */ mem_regions(&physmem_regions, &physmem_regions_sz, &availmem_regions, &availmem_regions_sz); if (PHYS_AVAIL_ENTRIES < availmem_regions_sz) panic("mmu_booke_bootstrap: phys_avail too small"); data_end = round_page(data_end); vm_page_array = (vm_page_t)data_end; /* * Get a rough idea (upper bound) on the size of the page array. The * vm_page_array will not handle any more pages than we have in the * avail_regions array, and most likely much less. */ sz = 0; for (mp = availmem_regions; mp->mr_size; mp++) { sz += mp->mr_size; } sz = (round_page(sz) / (PAGE_SIZE + sizeof(struct vm_page))); data_end += round_page(sz * sizeof(struct vm_page)); /* Pre-round up to 1MB. This wastes some space, but saves TLB entries */ data_end = roundup2(data_end, 1 << 20); debugf(" data_end: 0x%"PRI0ptrX"\n", data_end); debugf(" kernstart: %#zx\n", kernstart); debugf(" kernsize: %#zx\n", kernsize); if (data_end - kernstart > kernsize) { kernsize += tlb1_mapin_region(kernstart + kernsize, kernload + kernsize, (data_end - kernstart) - kernsize, _TLB_ENTRY_MEM); } data_end = kernstart + kernsize; debugf(" updated data_end: 0x%"PRI0ptrX"\n", data_end); /* * Clear the structures - note we can only do it safely after the * possible additional TLB1 translations are in place (above) so that * all range up to the currently calculated 'data_end' is covered. */ bzero((void *)data_start, data_end - data_start); dpcpu_init(dpcpu, 0); /*******************************************************/ /* Set the start and end of kva. */ /*******************************************************/ virtual_avail = round_page(data_end); virtual_end = VM_MAX_KERNEL_ADDRESS; #ifndef __powerpc64__ /* Allocate KVA space for page zero/copy operations. */ zero_page_va = virtual_avail; virtual_avail += PAGE_SIZE; copy_page_src_va = virtual_avail; virtual_avail += PAGE_SIZE; copy_page_dst_va = virtual_avail; virtual_avail += PAGE_SIZE; debugf("zero_page_va = 0x%"PRI0ptrX"\n", zero_page_va); debugf("copy_page_src_va = 0x%"PRI0ptrX"\n", copy_page_src_va); debugf("copy_page_dst_va = 0x%"PRI0ptrX"\n", copy_page_dst_va); /* Initialize page zero/copy mutexes. */ mtx_init(&zero_page_mutex, "mmu_booke_zero_page", NULL, MTX_DEF); mtx_init(©_page_mutex, "mmu_booke_copy_page", NULL, MTX_DEF); /* Allocate KVA space for ptbl bufs. */ ptbl_buf_pool_vabase = virtual_avail; virtual_avail += PTBL_BUFS * PTBL_PAGES * PAGE_SIZE; debugf("ptbl_buf_pool_vabase = 0x%"PRI0ptrX" end = 0x%"PRI0ptrX"\n", ptbl_buf_pool_vabase, virtual_avail); #endif /* Calculate corresponding physical addresses for the kernel region. */ phys_kernelend = kernload + kernsize; debugf("kernel image and allocated data:\n"); debugf(" kernload = 0x%09jx\n", (uintmax_t)kernload); debugf(" kernstart = 0x%"PRI0ptrX"\n", kernstart); debugf(" kernsize = 0x%"PRI0ptrX"\n", kernsize); /* * Remove kernel physical address range from avail regions list. Page * align all regions. Non-page aligned memory isn't very interesting * to us. Also, sort the entries for ascending addresses. */ sz = 0; cnt = availmem_regions_sz; debugf("processing avail regions:\n"); for (mp = availmem_regions; mp->mr_size; mp++) { s = mp->mr_start; e = mp->mr_start + mp->mr_size; debugf(" %09jx-%09jx -> ", (uintmax_t)s, (uintmax_t)e); /* Check whether this region holds all of the kernel. */ if (s < kernload && e > phys_kernelend) { availmem_regions[cnt].mr_start = phys_kernelend; availmem_regions[cnt++].mr_size = e - phys_kernelend; e = kernload; } /* Look whether this regions starts within the kernel. */ if (s >= kernload && s < phys_kernelend) { if (e <= phys_kernelend) goto empty; s = phys_kernelend; } /* Now look whether this region ends within the kernel. */ if (e > kernload && e <= phys_kernelend) { if (s >= kernload) goto empty; e = kernload; } /* Now page align the start and size of the region. */ s = round_page(s); e = trunc_page(e); if (e < s) e = s; sz = e - s; debugf("%09jx-%09jx = %jx\n", (uintmax_t)s, (uintmax_t)e, (uintmax_t)sz); /* Check whether some memory is left here. */ if (sz == 0) { empty: memmove(mp, mp + 1, (cnt - (mp - availmem_regions)) * sizeof(*mp)); cnt--; mp--; continue; } /* Do an insertion sort. */ for (mp1 = availmem_regions; mp1 < mp; mp1++) if (s < mp1->mr_start) break; if (mp1 < mp) { memmove(mp1 + 1, mp1, (char *)mp - (char *)mp1); mp1->mr_start = s; mp1->mr_size = sz; } else { mp->mr_start = s; mp->mr_size = sz; } } availmem_regions_sz = cnt; /*******************************************************/ /* Steal physical memory for kernel stack from the end */ /* of the first avail region */ /*******************************************************/ kstack0_sz = kstack_pages * PAGE_SIZE; kstack0_phys = availmem_regions[0].mr_start + availmem_regions[0].mr_size; kstack0_phys -= kstack0_sz; availmem_regions[0].mr_size -= kstack0_sz; /*******************************************************/ /* Fill in phys_avail table, based on availmem_regions */ /*******************************************************/ phys_avail_count = 0; physsz = 0; hwphyssz = 0; TUNABLE_ULONG_FETCH("hw.physmem", (u_long *) &hwphyssz); debugf("fill in phys_avail:\n"); for (i = 0, j = 0; i < availmem_regions_sz; i++, j += 2) { debugf(" region: 0x%jx - 0x%jx (0x%jx)\n", (uintmax_t)availmem_regions[i].mr_start, (uintmax_t)availmem_regions[i].mr_start + availmem_regions[i].mr_size, (uintmax_t)availmem_regions[i].mr_size); if (hwphyssz != 0 && (physsz + availmem_regions[i].mr_size) >= hwphyssz) { debugf(" hw.physmem adjust\n"); if (physsz < hwphyssz) { phys_avail[j] = availmem_regions[i].mr_start; phys_avail[j + 1] = availmem_regions[i].mr_start + hwphyssz - physsz; physsz = hwphyssz; phys_avail_count++; dump_avail[j] = phys_avail[j]; dump_avail[j + 1] = phys_avail[j + 1]; } break; } phys_avail[j] = availmem_regions[i].mr_start; phys_avail[j + 1] = availmem_regions[i].mr_start + availmem_regions[i].mr_size; phys_avail_count++; physsz += availmem_regions[i].mr_size; dump_avail[j] = phys_avail[j]; dump_avail[j + 1] = phys_avail[j + 1]; } physmem = btoc(physsz); /* Calculate the last available physical address. */ for (i = 0; phys_avail[i + 2] != 0; i += 2) ; Maxmem = powerpc_btop(phys_avail[i + 1]); debugf("Maxmem = 0x%08lx\n", Maxmem); debugf("phys_avail_count = %d\n", phys_avail_count); debugf("physsz = 0x%09jx physmem = %jd (0x%09jx)\n", (uintmax_t)physsz, (uintmax_t)physmem, (uintmax_t)physmem); #ifdef __powerpc64__ /* * Map the physical memory contiguously in TLB1. * Round so it fits into a single mapping. */ tlb1_mapin_region(DMAP_BASE_ADDRESS, 0, phys_avail[i + 1], _TLB_ENTRY_MEM); #endif /*******************************************************/ /* Initialize (statically allocated) kernel pmap. */ /*******************************************************/ PMAP_LOCK_INIT(kernel_pmap); debugf("kernel_pmap = 0x%"PRI0ptrX"\n", (uintptr_t)kernel_pmap); kernel_pte_alloc(virtual_avail, kernstart); for (i = 0; i < MAXCPU; i++) { kernel_pmap->pm_tid[i] = TID_KERNEL; /* Initialize each CPU's tidbusy entry 0 with kernel_pmap */ tidbusy[i][TID_KERNEL] = kernel_pmap; } /* Mark kernel_pmap active on all CPUs */ CPU_FILL(&kernel_pmap->pm_active); /* * Initialize the global pv list lock. */ rw_init(&pvh_global_lock, "pmap pv global"); /*******************************************************/ /* Final setup */ /*******************************************************/ /* Enter kstack0 into kernel map, provide guard page */ kstack0 = virtual_avail + KSTACK_GUARD_PAGES * PAGE_SIZE; thread0.td_kstack = kstack0; thread0.td_kstack_pages = kstack_pages; debugf("kstack_sz = 0x%08jx\n", (uintmax_t)kstack0_sz); debugf("kstack0_phys at 0x%09jx - 0x%09jx\n", (uintmax_t)kstack0_phys, (uintmax_t)kstack0_phys + kstack0_sz); debugf("kstack0 at 0x%"PRI0ptrX" - 0x%"PRI0ptrX"\n", kstack0, kstack0 + kstack0_sz); virtual_avail += KSTACK_GUARD_PAGES * PAGE_SIZE + kstack0_sz; for (i = 0; i < kstack_pages; i++) { mmu_booke_kenter(mmu, kstack0, kstack0_phys); kstack0 += PAGE_SIZE; kstack0_phys += PAGE_SIZE; } pmap_bootstrapped = 1; debugf("virtual_avail = %"PRI0ptrX"\n", virtual_avail); debugf("virtual_end = %"PRI0ptrX"\n", virtual_end); debugf("mmu_booke_bootstrap: exit\n"); } #ifdef SMP void tlb1_ap_prep(void) { tlb_entry_t *e, tmp; unsigned int i; /* Prepare TLB1 image for AP processors */ e = __boot_tlb1; for (i = 0; i < TLB1_ENTRIES; i++) { tlb1_read_entry(&tmp, i); if ((tmp.mas1 & MAS1_VALID) && (tmp.mas2 & _TLB_ENTRY_SHARED)) memcpy(e++, &tmp, sizeof(tmp)); } } void pmap_bootstrap_ap(volatile uint32_t *trcp __unused) { int i; /* * Finish TLB1 configuration: the BSP already set up its TLB1 and we * have the snapshot of its contents in the s/w __boot_tlb1[] table * created by tlb1_ap_prep(), so use these values directly to * (re)program AP's TLB1 hardware. * * Start at index 1 because index 0 has the kernel map. */ for (i = 1; i < TLB1_ENTRIES; i++) { if (__boot_tlb1[i].mas1 & MAS1_VALID) tlb1_write_entry(&__boot_tlb1[i], i); } set_mas4_defaults(); } #endif static void booke_pmap_init_qpages(void) { struct pcpu *pc; int i; CPU_FOREACH(i) { pc = pcpu_find(i); pc->pc_qmap_addr = kva_alloc(PAGE_SIZE); if (pc->pc_qmap_addr == 0) panic("pmap_init_qpages: unable to allocate KVA"); } } SYSINIT(qpages_init, SI_SUB_CPU, SI_ORDER_ANY, booke_pmap_init_qpages, NULL); /* * Get the physical page address for the given pmap/virtual address. */ static vm_paddr_t mmu_booke_extract(mmu_t mmu, pmap_t pmap, vm_offset_t va) { vm_paddr_t pa; PMAP_LOCK(pmap); pa = pte_vatopa(mmu, pmap, va); PMAP_UNLOCK(pmap); return (pa); } /* * Extract the physical page address associated with the given * kernel virtual address. */ static vm_paddr_t mmu_booke_kextract(mmu_t mmu, vm_offset_t va) { tlb_entry_t e; vm_paddr_t p = 0; int i; #ifdef __powerpc64__ if (va >= DMAP_BASE_ADDRESS && va <= DMAP_MAX_ADDRESS) return (DMAP_TO_PHYS(va)); #endif if (va >= VM_MIN_KERNEL_ADDRESS && va <= VM_MAX_KERNEL_ADDRESS) p = pte_vatopa(mmu, kernel_pmap, va); if (p == 0) { /* Check TLB1 mappings */ for (i = 0; i < TLB1_ENTRIES; i++) { tlb1_read_entry(&e, i); if (!(e.mas1 & MAS1_VALID)) continue; if (va >= e.virt && va < e.virt + e.size) return (e.phys + (va - e.virt)); } } return (p); } /* * Initialize the pmap module. * Called by vm_init, to initialize any structures that the pmap * system needs to map virtual memory. */ static void mmu_booke_init(mmu_t mmu) { int shpgperproc = PMAP_SHPGPERPROC; /* * Initialize the address space (zone) for the pv entries. Set a * high water mark so that the system can recover from excessive * numbers of pv entries. */ pvzone = uma_zcreate("PV ENTRY", sizeof(struct pv_entry), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM | UMA_ZONE_NOFREE); TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc); pv_entry_max = shpgperproc * maxproc + vm_cnt.v_page_count; TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max); pv_entry_high_water = 9 * (pv_entry_max / 10); uma_zone_reserve_kva(pvzone, pv_entry_max); /* Pre-fill pvzone with initial number of pv entries. */ uma_prealloc(pvzone, PV_ENTRY_ZONE_MIN); /* Create a UMA zone for page table roots. */ ptbl_root_zone = uma_zcreate("pmap root", PMAP_ROOT_SIZE, NULL, NULL, NULL, NULL, UMA_ALIGN_CACHE, UMA_ZONE_VM); /* Initialize ptbl allocation. */ ptbl_init(); } /* * Map a list of wired pages into kernel virtual address space. This is * intended for temporary mappings which do not need page modification or * references recorded. Existing mappings in the region are overwritten. */ static void mmu_booke_qenter(mmu_t mmu, vm_offset_t sva, vm_page_t *m, int count) { vm_offset_t va; va = sva; while (count-- > 0) { mmu_booke_kenter(mmu, va, VM_PAGE_TO_PHYS(*m)); va += PAGE_SIZE; m++; } } /* * Remove page mappings from kernel virtual address space. Intended for * temporary mappings entered by mmu_booke_qenter. */ static void mmu_booke_qremove(mmu_t mmu, vm_offset_t sva, int count) { vm_offset_t va; va = sva; while (count-- > 0) { mmu_booke_kremove(mmu, va); va += PAGE_SIZE; } } /* * Map a wired page into kernel virtual address space. */ static void mmu_booke_kenter(mmu_t mmu, vm_offset_t va, vm_paddr_t pa) { mmu_booke_kenter_attr(mmu, va, pa, VM_MEMATTR_DEFAULT); } static void mmu_booke_kenter_attr(mmu_t mmu, vm_offset_t va, vm_paddr_t pa, vm_memattr_t ma) { uint32_t flags; pte_t *pte; KASSERT(((va >= VM_MIN_KERNEL_ADDRESS) && (va <= VM_MAX_KERNEL_ADDRESS)), ("mmu_booke_kenter: invalid va")); flags = PTE_SR | PTE_SW | PTE_SX | PTE_WIRED | PTE_VALID; flags |= tlb_calc_wimg(pa, ma) << PTE_MAS2_SHIFT; flags |= PTE_PS_4KB; pte = pte_find(mmu, kernel_pmap, va); KASSERT((pte != NULL), ("mmu_booke_kenter: invalid va. NULL PTE")); mtx_lock_spin(&tlbivax_mutex); tlb_miss_lock(); if (PTE_ISVALID(pte)) { CTR1(KTR_PMAP, "%s: replacing entry!", __func__); /* Flush entry from TLB0 */ tlb0_flush_entry(va); } *pte = PTE_RPN_FROM_PA(pa) | flags; //debugf("mmu_booke_kenter: pdir_idx = %d ptbl_idx = %d va=0x%08x " // "pa=0x%08x rpn=0x%08x flags=0x%08x\n", // pdir_idx, ptbl_idx, va, pa, pte->rpn, pte->flags); /* Flush the real memory from the instruction cache. */ if ((flags & (PTE_I | PTE_G)) == 0) __syncicache((void *)va, PAGE_SIZE); tlb_miss_unlock(); mtx_unlock_spin(&tlbivax_mutex); } /* * Remove a page from kernel page table. */ static void mmu_booke_kremove(mmu_t mmu, vm_offset_t va) { pte_t *pte; CTR2(KTR_PMAP,"%s: s (va = 0x%"PRI0ptrX")\n", __func__, va); KASSERT(((va >= VM_MIN_KERNEL_ADDRESS) && (va <= VM_MAX_KERNEL_ADDRESS)), ("mmu_booke_kremove: invalid va")); pte = pte_find(mmu, kernel_pmap, va); if (!PTE_ISVALID(pte)) { CTR1(KTR_PMAP, "%s: invalid pte", __func__); return; } mtx_lock_spin(&tlbivax_mutex); tlb_miss_lock(); /* Invalidate entry in TLB0, update PTE. */ tlb0_flush_entry(va); *pte = 0; tlb_miss_unlock(); mtx_unlock_spin(&tlbivax_mutex); } /* * Provide a kernel pointer corresponding to a given userland pointer. * The returned pointer is valid until the next time this function is * called in this thread. This is used internally in copyin/copyout. */ int mmu_booke_map_user_ptr(mmu_t mmu, pmap_t pm, volatile const void *uaddr, void **kaddr, size_t ulen, size_t *klen) { if (trunc_page((uintptr_t)uaddr + ulen) > VM_MAXUSER_ADDRESS) return (EFAULT); *kaddr = (void *)(uintptr_t)uaddr; if (klen) *klen = ulen; return (0); } /* * Figure out where a given kernel pointer (usually in a fault) points * to from the VM's perspective, potentially remapping into userland's * address space. */ static int mmu_booke_decode_kernel_ptr(mmu_t mmu, vm_offset_t addr, int *is_user, vm_offset_t *decoded_addr) { if (trunc_page(addr) <= VM_MAXUSER_ADDRESS) *is_user = 1; else *is_user = 0; *decoded_addr = addr; return (0); +} + +static boolean_t +mmu_booke_page_is_mapped(mmu_t mmu, vm_page_t m) +{ + + return (!TAILQ_EMPTY(&(m)->md.pv_list)); } /* * Initialize pmap associated with process 0. */ static void mmu_booke_pinit0(mmu_t mmu, pmap_t pmap) { PMAP_LOCK_INIT(pmap); mmu_booke_pinit(mmu, pmap); PCPU_SET(curpmap, pmap); } /* * Insert the given physical page at the specified virtual address in the * target physical map with the protection requested. If specified the page * will be wired down. */ static int mmu_booke_enter(mmu_t mmu, pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, u_int flags, int8_t psind) { int error; rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); error = mmu_booke_enter_locked(mmu, pmap, va, m, prot, flags, psind); PMAP_UNLOCK(pmap); rw_wunlock(&pvh_global_lock); return (error); } static int mmu_booke_enter_locked(mmu_t mmu, pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, u_int pmap_flags, int8_t psind __unused) { pte_t *pte; vm_paddr_t pa; pte_t flags; int error, su, sync; pa = VM_PAGE_TO_PHYS(m); su = (pmap == kernel_pmap); sync = 0; //debugf("mmu_booke_enter_locked: s (pmap=0x%08x su=%d tid=%d m=0x%08x va=0x%08x " // "pa=0x%08x prot=0x%08x flags=%#x)\n", // (u_int32_t)pmap, su, pmap->pm_tid, // (u_int32_t)m, va, pa, prot, flags); if (su) { KASSERT(((va >= virtual_avail) && (va <= VM_MAX_KERNEL_ADDRESS)), ("mmu_booke_enter_locked: kernel pmap, non kernel va")); } else { KASSERT((va <= VM_MAXUSER_ADDRESS), ("mmu_booke_enter_locked: user pmap, non user va")); } if ((m->oflags & VPO_UNMANAGED) == 0) { if ((pmap_flags & PMAP_ENTER_QUICK_LOCKED) == 0) VM_PAGE_OBJECT_BUSY_ASSERT(m); else VM_OBJECT_ASSERT_LOCKED(m->object); } PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * If there is an existing mapping, and the physical address has not * changed, must be protection or wiring change. */ if (((pte = pte_find(mmu, pmap, va)) != NULL) && (PTE_ISVALID(pte)) && (PTE_PA(pte) == pa)) { /* * Before actually updating pte->flags we calculate and * prepare its new value in a helper var. */ flags = *pte; flags &= ~(PTE_UW | PTE_UX | PTE_SW | PTE_SX | PTE_MODIFIED); /* Wiring change, just update stats. */ if ((pmap_flags & PMAP_ENTER_WIRED) != 0) { if (!PTE_ISWIRED(pte)) { flags |= PTE_WIRED; pmap->pm_stats.wired_count++; } } else { if (PTE_ISWIRED(pte)) { flags &= ~PTE_WIRED; pmap->pm_stats.wired_count--; } } if (prot & VM_PROT_WRITE) { /* Add write permissions. */ flags |= PTE_SW; if (!su) flags |= PTE_UW; if ((flags & PTE_MANAGED) != 0) vm_page_aflag_set(m, PGA_WRITEABLE); } else { /* Handle modified pages, sense modify status. */ /* * The PTE_MODIFIED flag could be set by underlying * TLB misses since we last read it (above), possibly * other CPUs could update it so we check in the PTE * directly rather than rely on that saved local flags * copy. */ if (PTE_ISMODIFIED(pte)) vm_page_dirty(m); } if (prot & VM_PROT_EXECUTE) { flags |= PTE_SX; if (!su) flags |= PTE_UX; /* * Check existing flags for execute permissions: if we * are turning execute permissions on, icache should * be flushed. */ if ((*pte & (PTE_UX | PTE_SX)) == 0) sync++; } flags &= ~PTE_REFERENCED; /* * The new flags value is all calculated -- only now actually * update the PTE. */ mtx_lock_spin(&tlbivax_mutex); tlb_miss_lock(); tlb0_flush_entry(va); *pte &= ~PTE_FLAGS_MASK; *pte |= flags; tlb_miss_unlock(); mtx_unlock_spin(&tlbivax_mutex); } else { /* * If there is an existing mapping, but it's for a different * physical address, pte_enter() will delete the old mapping. */ //if ((pte != NULL) && PTE_ISVALID(pte)) // debugf("mmu_booke_enter_locked: replace\n"); //else // debugf("mmu_booke_enter_locked: new\n"); /* Now set up the flags and install the new mapping. */ flags = (PTE_SR | PTE_VALID); flags |= PTE_M; if (!su) flags |= PTE_UR; if (prot & VM_PROT_WRITE) { flags |= PTE_SW; if (!su) flags |= PTE_UW; if ((m->oflags & VPO_UNMANAGED) == 0) vm_page_aflag_set(m, PGA_WRITEABLE); } if (prot & VM_PROT_EXECUTE) { flags |= PTE_SX; if (!su) flags |= PTE_UX; } /* If its wired update stats. */ if ((pmap_flags & PMAP_ENTER_WIRED) != 0) flags |= PTE_WIRED; error = pte_enter(mmu, pmap, m, va, flags, (pmap_flags & PMAP_ENTER_NOSLEEP) != 0); if (error != 0) return (KERN_RESOURCE_SHORTAGE); if ((flags & PMAP_ENTER_WIRED) != 0) pmap->pm_stats.wired_count++; /* Flush the real memory from the instruction cache. */ if (prot & VM_PROT_EXECUTE) sync++; } if (sync && (su || pmap == PCPU_GET(curpmap))) { __syncicache((void *)va, PAGE_SIZE); sync = 0; } return (KERN_SUCCESS); } /* * Maps a sequence of resident pages belonging to the same object. * The sequence begins with the given page m_start. This page is * mapped at the given virtual address start. Each subsequent page is * mapped at a virtual address that is offset from start by the same * amount as the page is offset from m_start within the object. The * last page in the sequence is the page with the largest offset from * m_start that can be mapped at a virtual address less than the given * virtual address end. Not every virtual page between start and end * is mapped; only those for which a resident page exists with the * corresponding offset from m_start are mapped. */ static void mmu_booke_enter_object(mmu_t mmu, pmap_t pmap, vm_offset_t start, vm_offset_t end, vm_page_t m_start, vm_prot_t prot) { vm_page_t m; vm_pindex_t diff, psize; VM_OBJECT_ASSERT_LOCKED(m_start->object); psize = atop(end - start); m = m_start; rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { mmu_booke_enter_locked(mmu, pmap, start + ptoa(diff), m, prot & (VM_PROT_READ | VM_PROT_EXECUTE), PMAP_ENTER_NOSLEEP | PMAP_ENTER_QUICK_LOCKED, 0); m = TAILQ_NEXT(m, listq); } PMAP_UNLOCK(pmap); rw_wunlock(&pvh_global_lock); } static void mmu_booke_enter_quick(mmu_t mmu, pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) { rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); mmu_booke_enter_locked(mmu, pmap, va, m, prot & (VM_PROT_READ | VM_PROT_EXECUTE), PMAP_ENTER_NOSLEEP | PMAP_ENTER_QUICK_LOCKED, 0); PMAP_UNLOCK(pmap); rw_wunlock(&pvh_global_lock); } /* * Remove the given range of addresses from the specified map. * * It is assumed that the start and end are properly rounded to the page size. */ static void mmu_booke_remove(mmu_t mmu, pmap_t pmap, vm_offset_t va, vm_offset_t endva) { pte_t *pte; uint8_t hold_flag; int su = (pmap == kernel_pmap); //debugf("mmu_booke_remove: s (su = %d pmap=0x%08x tid=%d va=0x%08x endva=0x%08x)\n", // su, (u_int32_t)pmap, pmap->pm_tid, va, endva); if (su) { KASSERT(((va >= virtual_avail) && (va <= VM_MAX_KERNEL_ADDRESS)), ("mmu_booke_remove: kernel pmap, non kernel va")); } else { KASSERT((va <= VM_MAXUSER_ADDRESS), ("mmu_booke_remove: user pmap, non user va")); } if (PMAP_REMOVE_DONE(pmap)) { //debugf("mmu_booke_remove: e (empty)\n"); return; } hold_flag = PTBL_HOLD_FLAG(pmap); //debugf("mmu_booke_remove: hold_flag = %d\n", hold_flag); rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); for (; va < endva; va += PAGE_SIZE) { pte = pte_find_next(mmu, pmap, &va); if ((pte == NULL) || !PTE_ISVALID(pte)) break; if (va >= endva) break; pte_remove(mmu, pmap, va, hold_flag); } PMAP_UNLOCK(pmap); rw_wunlock(&pvh_global_lock); //debugf("mmu_booke_remove: e\n"); } /* * Remove physical page from all pmaps in which it resides. */ static void mmu_booke_remove_all(mmu_t mmu, vm_page_t m) { pv_entry_t pv, pvn; uint8_t hold_flag; rw_wlock(&pvh_global_lock); TAILQ_FOREACH_SAFE(pv, &m->md.pv_list, pv_link, pvn) { PMAP_LOCK(pv->pv_pmap); hold_flag = PTBL_HOLD_FLAG(pv->pv_pmap); pte_remove(mmu, pv->pv_pmap, pv->pv_va, hold_flag); PMAP_UNLOCK(pv->pv_pmap); } vm_page_aflag_clear(m, PGA_WRITEABLE); rw_wunlock(&pvh_global_lock); } /* * Map a range of physical addresses into kernel virtual address space. */ static vm_offset_t mmu_booke_map(mmu_t mmu, vm_offset_t *virt, vm_paddr_t pa_start, vm_paddr_t pa_end, int prot) { vm_offset_t sva = *virt; vm_offset_t va = sva; #ifdef __powerpc64__ /* XXX: Handle memory not starting at 0x0. */ if (pa_end < ctob(Maxmem)) return (PHYS_TO_DMAP(pa_start)); #endif while (pa_start < pa_end) { mmu_booke_kenter(mmu, va, pa_start); va += PAGE_SIZE; pa_start += PAGE_SIZE; } *virt = va; return (sva); } /* * The pmap must be activated before it's address space can be accessed in any * way. */ static void mmu_booke_activate(mmu_t mmu, struct thread *td) { pmap_t pmap; u_int cpuid; pmap = &td->td_proc->p_vmspace->vm_pmap; CTR5(KTR_PMAP, "%s: s (td = %p, proc = '%s', id = %d, pmap = 0x%"PRI0ptrX")", __func__, td, td->td_proc->p_comm, td->td_proc->p_pid, pmap); KASSERT((pmap != kernel_pmap), ("mmu_booke_activate: kernel_pmap!")); sched_pin(); cpuid = PCPU_GET(cpuid); CPU_SET_ATOMIC(cpuid, &pmap->pm_active); PCPU_SET(curpmap, pmap); if (pmap->pm_tid[cpuid] == TID_NONE) tid_alloc(pmap); /* Load PID0 register with pmap tid value. */ mtspr(SPR_PID0, pmap->pm_tid[cpuid]); __asm __volatile("isync"); mtspr(SPR_DBCR0, td->td_pcb->pcb_cpu.booke.dbcr0); sched_unpin(); CTR3(KTR_PMAP, "%s: e (tid = %d for '%s')", __func__, pmap->pm_tid[PCPU_GET(cpuid)], td->td_proc->p_comm); } /* * Deactivate the specified process's address space. */ static void mmu_booke_deactivate(mmu_t mmu, struct thread *td) { pmap_t pmap; pmap = &td->td_proc->p_vmspace->vm_pmap; CTR5(KTR_PMAP, "%s: td=%p, proc = '%s', id = %d, pmap = 0x%"PRI0ptrX, __func__, td, td->td_proc->p_comm, td->td_proc->p_pid, pmap); td->td_pcb->pcb_cpu.booke.dbcr0 = mfspr(SPR_DBCR0); CPU_CLR_ATOMIC(PCPU_GET(cpuid), &pmap->pm_active); PCPU_SET(curpmap, NULL); } /* * Copy the range specified by src_addr/len * from the source map to the range dst_addr/len * in the destination map. * * This routine is only advisory and need not do anything. */ static void mmu_booke_copy(mmu_t mmu, pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, vm_offset_t src_addr) { } /* * Set the physical protection on the specified range of this map as requested. */ static void mmu_booke_protect(mmu_t mmu, pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) { vm_offset_t va; vm_page_t m; pte_t *pte; if ((prot & VM_PROT_READ) == VM_PROT_NONE) { mmu_booke_remove(mmu, pmap, sva, eva); return; } if (prot & VM_PROT_WRITE) return; PMAP_LOCK(pmap); for (va = sva; va < eva; va += PAGE_SIZE) { if ((pte = pte_find(mmu, pmap, va)) != NULL) { if (PTE_ISVALID(pte)) { m = PHYS_TO_VM_PAGE(PTE_PA(pte)); mtx_lock_spin(&tlbivax_mutex); tlb_miss_lock(); /* Handle modified pages. */ if (PTE_ISMODIFIED(pte) && PTE_ISMANAGED(pte)) vm_page_dirty(m); tlb0_flush_entry(va); *pte &= ~(PTE_UW | PTE_SW | PTE_MODIFIED); tlb_miss_unlock(); mtx_unlock_spin(&tlbivax_mutex); } } } PMAP_UNLOCK(pmap); } /* * Clear the write and modified bits in each of the given page's mappings. */ static void mmu_booke_remove_write(mmu_t mmu, vm_page_t m) { pv_entry_t pv; pte_t *pte; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("mmu_booke_remove_write: page %p is not managed", m)); vm_page_assert_busied(m); if (!pmap_page_is_write_mapped(m)) return; rw_wlock(&pvh_global_lock); TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) { PMAP_LOCK(pv->pv_pmap); if ((pte = pte_find(mmu, pv->pv_pmap, pv->pv_va)) != NULL) { if (PTE_ISVALID(pte)) { m = PHYS_TO_VM_PAGE(PTE_PA(pte)); mtx_lock_spin(&tlbivax_mutex); tlb_miss_lock(); /* Handle modified pages. */ if (PTE_ISMODIFIED(pte)) vm_page_dirty(m); /* Flush mapping from TLB0. */ *pte &= ~(PTE_UW | PTE_SW | PTE_MODIFIED); tlb_miss_unlock(); mtx_unlock_spin(&tlbivax_mutex); } } PMAP_UNLOCK(pv->pv_pmap); } vm_page_aflag_clear(m, PGA_WRITEABLE); rw_wunlock(&pvh_global_lock); } /* * Atomically extract and hold the physical page with the given * pmap and virtual address pair if that mapping permits the given * protection. */ static vm_page_t mmu_booke_extract_and_hold(mmu_t mmu, pmap_t pmap, vm_offset_t va, vm_prot_t prot) { pte_t *pte; vm_page_t m; uint32_t pte_wbit; m = NULL; PMAP_LOCK(pmap); pte = pte_find(mmu, pmap, va); if ((pte != NULL) && PTE_ISVALID(pte)) { if (pmap == kernel_pmap) pte_wbit = PTE_SW; else pte_wbit = PTE_UW; if ((*pte & pte_wbit) != 0 || (prot & VM_PROT_WRITE) == 0) { m = PHYS_TO_VM_PAGE(PTE_PA(pte)); if (!vm_page_wire_mapped(m)) m = NULL; } } PMAP_UNLOCK(pmap); return (m); } /* * Initialize a vm_page's machine-dependent fields. */ static void mmu_booke_page_init(mmu_t mmu, vm_page_t m) { m->md.pv_tracked = 0; TAILQ_INIT(&m->md.pv_list); } /* * Return whether or not the specified physical page was modified * in any of physical maps. */ static boolean_t mmu_booke_is_modified(mmu_t mmu, vm_page_t m) { pte_t *pte; pv_entry_t pv; boolean_t rv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("mmu_booke_is_modified: page %p is not managed", m)); rv = FALSE; /* * If the page is not busied then this check is racy. */ if (!pmap_page_is_write_mapped(m)) return (FALSE); rw_wlock(&pvh_global_lock); TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) { PMAP_LOCK(pv->pv_pmap); if ((pte = pte_find(mmu, pv->pv_pmap, pv->pv_va)) != NULL && PTE_ISVALID(pte)) { if (PTE_ISMODIFIED(pte)) rv = TRUE; } PMAP_UNLOCK(pv->pv_pmap); if (rv) break; } rw_wunlock(&pvh_global_lock); return (rv); } /* * Return whether or not the specified virtual address is eligible * for prefault. */ static boolean_t mmu_booke_is_prefaultable(mmu_t mmu, pmap_t pmap, vm_offset_t addr) { return (FALSE); } /* * Return whether or not the specified physical page was referenced * in any physical maps. */ static boolean_t mmu_booke_is_referenced(mmu_t mmu, vm_page_t m) { pte_t *pte; pv_entry_t pv; boolean_t rv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("mmu_booke_is_referenced: page %p is not managed", m)); rv = FALSE; rw_wlock(&pvh_global_lock); TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) { PMAP_LOCK(pv->pv_pmap); if ((pte = pte_find(mmu, pv->pv_pmap, pv->pv_va)) != NULL && PTE_ISVALID(pte)) { if (PTE_ISREFERENCED(pte)) rv = TRUE; } PMAP_UNLOCK(pv->pv_pmap); if (rv) break; } rw_wunlock(&pvh_global_lock); return (rv); } /* * Clear the modify bits on the specified physical page. */ static void mmu_booke_clear_modify(mmu_t mmu, vm_page_t m) { pte_t *pte; pv_entry_t pv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("mmu_booke_clear_modify: page %p is not managed", m)); vm_page_assert_busied(m); if (!pmap_page_is_write_mapped(m)) return; rw_wlock(&pvh_global_lock); TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) { PMAP_LOCK(pv->pv_pmap); if ((pte = pte_find(mmu, pv->pv_pmap, pv->pv_va)) != NULL && PTE_ISVALID(pte)) { mtx_lock_spin(&tlbivax_mutex); tlb_miss_lock(); if (*pte & (PTE_SW | PTE_UW | PTE_MODIFIED)) { tlb0_flush_entry(pv->pv_va); *pte &= ~(PTE_SW | PTE_UW | PTE_MODIFIED | PTE_REFERENCED); } tlb_miss_unlock(); mtx_unlock_spin(&tlbivax_mutex); } PMAP_UNLOCK(pv->pv_pmap); } rw_wunlock(&pvh_global_lock); } /* * Return a count of reference bits for a page, clearing those bits. * It is not necessary for every reference bit to be cleared, but it * is necessary that 0 only be returned when there are truly no * reference bits set. * * As an optimization, update the page's dirty field if a modified bit is * found while counting reference bits. This opportunistic update can be * performed at low cost and can eliminate the need for some future calls * to pmap_is_modified(). However, since this function stops after * finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some * dirty pages. Those dirty pages will only be detected by a future call * to pmap_is_modified(). */ static int mmu_booke_ts_referenced(mmu_t mmu, vm_page_t m) { pte_t *pte; pv_entry_t pv; int count; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("mmu_booke_ts_referenced: page %p is not managed", m)); count = 0; rw_wlock(&pvh_global_lock); TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) { PMAP_LOCK(pv->pv_pmap); if ((pte = pte_find(mmu, pv->pv_pmap, pv->pv_va)) != NULL && PTE_ISVALID(pte)) { if (PTE_ISMODIFIED(pte)) vm_page_dirty(m); if (PTE_ISREFERENCED(pte)) { mtx_lock_spin(&tlbivax_mutex); tlb_miss_lock(); tlb0_flush_entry(pv->pv_va); *pte &= ~PTE_REFERENCED; tlb_miss_unlock(); mtx_unlock_spin(&tlbivax_mutex); if (++count >= PMAP_TS_REFERENCED_MAX) { PMAP_UNLOCK(pv->pv_pmap); break; } } } PMAP_UNLOCK(pv->pv_pmap); } rw_wunlock(&pvh_global_lock); return (count); } /* * Clear the wired attribute from the mappings for the specified range of * addresses in the given pmap. Every valid mapping within that range must * have the wired attribute set. In contrast, invalid mappings cannot have * the wired attribute set, so they are ignored. * * The wired attribute of the page table entry is not a hardware feature, so * there is no need to invalidate any TLB entries. */ static void mmu_booke_unwire(mmu_t mmu, pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t va; pte_t *pte; PMAP_LOCK(pmap); for (va = sva; va < eva; va += PAGE_SIZE) { if ((pte = pte_find(mmu, pmap, va)) != NULL && PTE_ISVALID(pte)) { if (!PTE_ISWIRED(pte)) panic("mmu_booke_unwire: pte %p isn't wired", pte); *pte &= ~PTE_WIRED; pmap->pm_stats.wired_count--; } } PMAP_UNLOCK(pmap); } /* * Return true if the pmap's pv is one of the first 16 pvs linked to from this * page. This count may be changed upwards or downwards in the future; it is * only necessary that true be returned for a small subset of pmaps for proper * page aging. */ static boolean_t mmu_booke_page_exists_quick(mmu_t mmu, pmap_t pmap, vm_page_t m) { pv_entry_t pv; int loops; boolean_t rv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("mmu_booke_page_exists_quick: page %p is not managed", m)); loops = 0; rv = FALSE; rw_wlock(&pvh_global_lock); TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) { if (pv->pv_pmap == pmap) { rv = TRUE; break; } if (++loops >= 16) break; } rw_wunlock(&pvh_global_lock); return (rv); } /* * Return the number of managed mappings to the given physical page that are * wired. */ static int mmu_booke_page_wired_mappings(mmu_t mmu, vm_page_t m) { pv_entry_t pv; pte_t *pte; int count = 0; if ((m->oflags & VPO_UNMANAGED) != 0) return (count); rw_wlock(&pvh_global_lock); TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) { PMAP_LOCK(pv->pv_pmap); if ((pte = pte_find(mmu, pv->pv_pmap, pv->pv_va)) != NULL) if (PTE_ISVALID(pte) && PTE_ISWIRED(pte)) count++; PMAP_UNLOCK(pv->pv_pmap); } rw_wunlock(&pvh_global_lock); return (count); } static int mmu_booke_dev_direct_mapped(mmu_t mmu, vm_paddr_t pa, vm_size_t size) { int i; vm_offset_t va; /* * This currently does not work for entries that * overlap TLB1 entries. */ for (i = 0; i < TLB1_ENTRIES; i ++) { if (tlb1_iomapped(i, pa, size, &va) == 0) return (0); } return (EFAULT); } void mmu_booke_dumpsys_map(mmu_t mmu, vm_paddr_t pa, size_t sz, void **va) { vm_paddr_t ppa; vm_offset_t ofs; vm_size_t gran; /* Minidumps are based on virtual memory addresses. */ if (do_minidump) { *va = (void *)(vm_offset_t)pa; return; } /* Raw physical memory dumps don't have a virtual address. */ /* We always map a 256MB page at 256M. */ gran = 256 * 1024 * 1024; ppa = rounddown2(pa, gran); ofs = pa - ppa; *va = (void *)gran; tlb1_set_entry((vm_offset_t)va, ppa, gran, _TLB_ENTRY_IO); if (sz > (gran - ofs)) tlb1_set_entry((vm_offset_t)(va + gran), ppa + gran, gran, _TLB_ENTRY_IO); } void mmu_booke_dumpsys_unmap(mmu_t mmu, vm_paddr_t pa, size_t sz, void *va) { vm_paddr_t ppa; vm_offset_t ofs; vm_size_t gran; tlb_entry_t e; int i; /* Minidumps are based on virtual memory addresses. */ /* Nothing to do... */ if (do_minidump) return; for (i = 0; i < TLB1_ENTRIES; i++) { tlb1_read_entry(&e, i); if (!(e.mas1 & MAS1_VALID)) break; } /* Raw physical memory dumps don't have a virtual address. */ i--; e.mas1 = 0; e.mas2 = 0; e.mas3 = 0; tlb1_write_entry(&e, i); gran = 256 * 1024 * 1024; ppa = rounddown2(pa, gran); ofs = pa - ppa; if (sz > (gran - ofs)) { i--; e.mas1 = 0; e.mas2 = 0; e.mas3 = 0; tlb1_write_entry(&e, i); } } extern struct dump_pa dump_map[PHYS_AVAIL_SZ + 1]; void mmu_booke_scan_init(mmu_t mmu) { vm_offset_t va; pte_t *pte; int i; if (!do_minidump) { /* Initialize phys. segments for dumpsys(). */ memset(&dump_map, 0, sizeof(dump_map)); mem_regions(&physmem_regions, &physmem_regions_sz, &availmem_regions, &availmem_regions_sz); for (i = 0; i < physmem_regions_sz; i++) { dump_map[i].pa_start = physmem_regions[i].mr_start; dump_map[i].pa_size = physmem_regions[i].mr_size; } return; } /* Virtual segments for minidumps: */ memset(&dump_map, 0, sizeof(dump_map)); /* 1st: kernel .data and .bss. */ dump_map[0].pa_start = trunc_page((uintptr_t)_etext); dump_map[0].pa_size = round_page((uintptr_t)_end) - dump_map[0].pa_start; /* 2nd: msgbuf and tables (see pmap_bootstrap()). */ dump_map[1].pa_start = data_start; dump_map[1].pa_size = data_end - data_start; /* 3rd: kernel VM. */ va = dump_map[1].pa_start + dump_map[1].pa_size; /* Find start of next chunk (from va). */ while (va < virtual_end) { /* Don't dump the buffer cache. */ if (va >= kmi.buffer_sva && va < kmi.buffer_eva) { va = kmi.buffer_eva; continue; } pte = pte_find(mmu, kernel_pmap, va); if (pte != NULL && PTE_ISVALID(pte)) break; va += PAGE_SIZE; } if (va < virtual_end) { dump_map[2].pa_start = va; va += PAGE_SIZE; /* Find last page in chunk. */ while (va < virtual_end) { /* Don't run into the buffer cache. */ if (va == kmi.buffer_sva) break; pte = pte_find(mmu, kernel_pmap, va); if (pte == NULL || !PTE_ISVALID(pte)) break; va += PAGE_SIZE; } dump_map[2].pa_size = va - dump_map[2].pa_start; } } /* * Map a set of physical memory pages into the kernel virtual address space. * Return a pointer to where it is mapped. This routine is intended to be used * for mapping device memory, NOT real memory. */ static void * mmu_booke_mapdev(mmu_t mmu, vm_paddr_t pa, vm_size_t size) { return (mmu_booke_mapdev_attr(mmu, pa, size, VM_MEMATTR_DEFAULT)); } static int tlb1_find_pa(vm_paddr_t pa, tlb_entry_t *e) { int i; for (i = 0; i < TLB1_ENTRIES; i++) { tlb1_read_entry(e, i); if ((e->mas1 & MAS1_VALID) == 0) continue; if (e->phys == pa) return (i); } return (-1); } static void * mmu_booke_mapdev_attr(mmu_t mmu, vm_paddr_t pa, vm_size_t size, vm_memattr_t ma) { tlb_entry_t e; vm_paddr_t tmppa; #ifndef __powerpc64__ uintptr_t tmpva; #endif uintptr_t va, retva; vm_size_t sz; int i; int wimge; /* * Check if this is premapped in TLB1. */ sz = size; tmppa = pa; va = ~0; wimge = tlb_calc_wimg(pa, ma); for (i = 0; i < TLB1_ENTRIES; i++) { tlb1_read_entry(&e, i); if (!(e.mas1 & MAS1_VALID)) continue; if (wimge != (e.mas2 & (MAS2_WIMGE_MASK & ~_TLB_ENTRY_SHARED))) continue; if (tmppa >= e.phys && tmppa < e.phys + e.size) { va = e.virt + (pa - e.phys); tmppa = e.phys + e.size; sz -= MIN(sz, e.size - (pa - e.phys)); while (sz > 0 && (i = tlb1_find_pa(tmppa, &e)) != -1) { if (wimge != (e.mas2 & (MAS2_WIMGE_MASK & ~_TLB_ENTRY_SHARED))) break; sz -= MIN(sz, e.size); tmppa = e.phys + e.size; } if (sz != 0) break; return ((void *)va); } } size = roundup(size, PAGE_SIZE); #ifdef __powerpc64__ KASSERT(pa < VM_MAPDEV_PA_MAX, ("Unsupported physical address! %lx", pa)); va = VM_MAPDEV_BASE + pa; retva = va; #ifdef POW2_MAPPINGS /* * Align the mapping to a power of 2 size, taking into account that we * may need to increase the size multiple times to satisfy the size and * alignment requirements. * * This works in the general case because it's very rare (near never?) * to have different access properties (WIMG) within a single * power-of-two region. If a design does call for that, POW2_MAPPINGS * can be undefined, and exact mappings will be used instead. */ sz = size; size = roundup2(size, 1 << ilog2(size)); while (rounddown2(va, size) + size < va + sz) size <<= 1; va = rounddown2(va, size); pa = rounddown2(pa, size); #endif #else /* * The device mapping area is between VM_MAXUSER_ADDRESS and * VM_MIN_KERNEL_ADDRESS. This gives 1GB of device addressing. */ #ifdef SPARSE_MAPDEV /* * With a sparse mapdev, align to the largest starting region. This * could feasibly be optimized for a 'best-fit' alignment, but that * calculation could be very costly. * Align to the smaller of: * - first set bit in overlap of (pa & size mask) * - largest size envelope * * It's possible the device mapping may start at a PA that's not larger * than the size mask, so we need to offset in to maximize the TLB entry * range and minimize the number of used TLB entries. */ do { tmpva = tlb1_map_base; sz = ffsl((~((1 << flsl(size-1)) - 1)) & pa); sz = sz ? min(roundup(sz + 3, 4), flsl(size) - 1) : flsl(size) - 1; va = roundup(tlb1_map_base, 1 << sz) | (((1 << sz) - 1) & pa); } while (!atomic_cmpset_int(&tlb1_map_base, tmpva, va + size)); #endif va = atomic_fetchadd_int(&tlb1_map_base, size); retva = va; #endif if (tlb1_mapin_region(va, pa, size, tlb_calc_wimg(pa, ma)) != size) return (NULL); return ((void *)retva); } /* * 'Unmap' a range mapped by mmu_booke_mapdev(). */ static void mmu_booke_unmapdev(mmu_t mmu, vm_offset_t va, vm_size_t size) { #ifdef SUPPORTS_SHRINKING_TLB1 vm_offset_t base, offset; /* * Unmap only if this is inside kernel virtual space. */ if ((va >= VM_MIN_KERNEL_ADDRESS) && (va <= VM_MAX_KERNEL_ADDRESS)) { base = trunc_page(va); offset = va & PAGE_MASK; size = roundup(offset + size, PAGE_SIZE); kva_free(base, size); } #endif } /* * mmu_booke_object_init_pt preloads the ptes for a given object into the * specified pmap. This eliminates the blast of soft faults on process startup * and immediately after an mmap. */ static void mmu_booke_object_init_pt(mmu_t mmu, pmap_t pmap, vm_offset_t addr, vm_object_t object, vm_pindex_t pindex, vm_size_t size) { VM_OBJECT_ASSERT_WLOCKED(object); KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, ("mmu_booke_object_init_pt: non-device object")); } /* * Perform the pmap work for mincore. */ static int mmu_booke_mincore(mmu_t mmu, pmap_t pmap, vm_offset_t addr, vm_paddr_t *pap) { /* XXX: this should be implemented at some point */ return (0); } static int mmu_booke_change_attr(mmu_t mmu, vm_offset_t addr, vm_size_t sz, vm_memattr_t mode) { vm_offset_t va; pte_t *pte; int i, j; tlb_entry_t e; addr = trunc_page(addr); /* Only allow changes to mapped kernel addresses. This includes: * - KVA * - DMAP (powerpc64) * - Device mappings */ if (addr <= VM_MAXUSER_ADDRESS || #ifdef __powerpc64__ (addr >= tlb1_map_base && addr < DMAP_BASE_ADDRESS) || (addr > DMAP_MAX_ADDRESS && addr < VM_MIN_KERNEL_ADDRESS) || #else (addr >= tlb1_map_base && addr < VM_MIN_KERNEL_ADDRESS) || #endif (addr > VM_MAX_KERNEL_ADDRESS)) return (EINVAL); /* Check TLB1 mappings */ for (i = 0; i < TLB1_ENTRIES; i++) { tlb1_read_entry(&e, i); if (!(e.mas1 & MAS1_VALID)) continue; if (addr >= e.virt && addr < e.virt + e.size) break; } if (i < TLB1_ENTRIES) { /* Only allow full mappings to be modified for now. */ /* Validate the range. */ for (j = i, va = addr; va < addr + sz; va += e.size, j++) { tlb1_read_entry(&e, j); if (va != e.virt || (sz - (va - addr) < e.size)) return (EINVAL); } for (va = addr; va < addr + sz; va += e.size, i++) { tlb1_read_entry(&e, i); e.mas2 &= ~MAS2_WIMGE_MASK; e.mas2 |= tlb_calc_wimg(e.phys, mode); /* * Write it out to the TLB. Should really re-sync with other * cores. */ tlb1_write_entry(&e, i); } return (0); } /* Not in TLB1, try through pmap */ /* First validate the range. */ for (va = addr; va < addr + sz; va += PAGE_SIZE) { pte = pte_find(mmu, kernel_pmap, va); if (pte == NULL || !PTE_ISVALID(pte)) return (EINVAL); } mtx_lock_spin(&tlbivax_mutex); tlb_miss_lock(); for (va = addr; va < addr + sz; va += PAGE_SIZE) { pte = pte_find(mmu, kernel_pmap, va); *pte &= ~(PTE_MAS2_MASK << PTE_MAS2_SHIFT); *pte |= tlb_calc_wimg(PTE_PA(pte), mode) << PTE_MAS2_SHIFT; tlb0_flush_entry(va); } tlb_miss_unlock(); mtx_unlock_spin(&tlbivax_mutex); return (0); } static void mmu_booke_page_array_startup(mmu_t mmu, long pages) { vm_page_array_size = pages; } /**************************************************************************/ /* TID handling */ /**************************************************************************/ /* * Allocate a TID. If necessary, steal one from someone else. * The new TID is flushed from the TLB before returning. */ static tlbtid_t tid_alloc(pmap_t pmap) { tlbtid_t tid; int thiscpu; KASSERT((pmap != kernel_pmap), ("tid_alloc: kernel pmap")); CTR2(KTR_PMAP, "%s: s (pmap = %p)", __func__, pmap); thiscpu = PCPU_GET(cpuid); tid = PCPU_GET(booke.tid_next); if (tid > TID_MAX) tid = TID_MIN; PCPU_SET(booke.tid_next, tid + 1); /* If we are stealing TID then clear the relevant pmap's field */ if (tidbusy[thiscpu][tid] != NULL) { CTR2(KTR_PMAP, "%s: warning: stealing tid %d", __func__, tid); tidbusy[thiscpu][tid]->pm_tid[thiscpu] = TID_NONE; /* Flush all entries from TLB0 matching this TID. */ tid_flush(tid); } tidbusy[thiscpu][tid] = pmap; pmap->pm_tid[thiscpu] = tid; __asm __volatile("msync; isync"); CTR3(KTR_PMAP, "%s: e (%02d next = %02d)", __func__, tid, PCPU_GET(booke.tid_next)); return (tid); } /**************************************************************************/ /* TLB0 handling */ /**************************************************************************/ /* Convert TLB0 va and way number to tlb0[] table index. */ static inline unsigned int tlb0_tableidx(vm_offset_t va, unsigned int way) { unsigned int idx; idx = (way * TLB0_ENTRIES_PER_WAY); idx += (va & MAS2_TLB0_ENTRY_IDX_MASK) >> MAS2_TLB0_ENTRY_IDX_SHIFT; return (idx); } /* * Invalidate TLB0 entry. */ static inline void tlb0_flush_entry(vm_offset_t va) { CTR2(KTR_PMAP, "%s: s va=0x%08x", __func__, va); mtx_assert(&tlbivax_mutex, MA_OWNED); __asm __volatile("tlbivax 0, %0" :: "r"(va & MAS2_EPN_MASK)); __asm __volatile("isync; msync"); __asm __volatile("tlbsync; msync"); CTR1(KTR_PMAP, "%s: e", __func__); } /**************************************************************************/ /* TLB1 handling */ /**************************************************************************/ /* * TLB1 mapping notes: * * TLB1[0] Kernel text and data. * TLB1[1-15] Additional kernel text and data mappings (if required), PCI * windows, other devices mappings. */ /* * Read an entry from given TLB1 slot. */ void tlb1_read_entry(tlb_entry_t *entry, unsigned int slot) { register_t msr; uint32_t mas0; KASSERT((entry != NULL), ("%s(): Entry is NULL!", __func__)); msr = mfmsr(); __asm __volatile("wrteei 0"); mas0 = MAS0_TLBSEL(1) | MAS0_ESEL(slot); mtspr(SPR_MAS0, mas0); __asm __volatile("isync; tlbre"); entry->mas1 = mfspr(SPR_MAS1); entry->mas2 = mfspr(SPR_MAS2); entry->mas3 = mfspr(SPR_MAS3); switch ((mfpvr() >> 16) & 0xFFFF) { case FSL_E500v2: case FSL_E500mc: case FSL_E5500: case FSL_E6500: entry->mas7 = mfspr(SPR_MAS7); break; default: entry->mas7 = 0; break; } __asm __volatile("wrtee %0" :: "r"(msr)); entry->virt = entry->mas2 & MAS2_EPN_MASK; entry->phys = ((vm_paddr_t)(entry->mas7 & MAS7_RPN) << 32) | (entry->mas3 & MAS3_RPN); entry->size = tsize2size((entry->mas1 & MAS1_TSIZE_MASK) >> MAS1_TSIZE_SHIFT); } struct tlbwrite_args { tlb_entry_t *e; unsigned int idx; }; static uint32_t tlb1_find_free(void) { tlb_entry_t e; int i; for (i = 0; i < TLB1_ENTRIES; i++) { tlb1_read_entry(&e, i); if ((e.mas1 & MAS1_VALID) == 0) return (i); } return (-1); } static void tlb1_purge_va_range(vm_offset_t va, vm_size_t size) { tlb_entry_t e; int i; for (i = 0; i < TLB1_ENTRIES; i++) { tlb1_read_entry(&e, i); if ((e.mas1 & MAS1_VALID) == 0) continue; if ((e.mas2 & MAS2_EPN_MASK) >= va && (e.mas2 & MAS2_EPN_MASK) < va + size) { mtspr(SPR_MAS1, e.mas1 & ~MAS1_VALID); __asm __volatile("isync; tlbwe; isync; msync"); } } } static void tlb1_write_entry_int(void *arg) { struct tlbwrite_args *args = arg; uint32_t idx, mas0; idx = args->idx; if (idx == -1) { tlb1_purge_va_range(args->e->virt, args->e->size); idx = tlb1_find_free(); if (idx == -1) panic("No free TLB1 entries!\n"); } /* Select entry */ mas0 = MAS0_TLBSEL(1) | MAS0_ESEL(idx); mtspr(SPR_MAS0, mas0); mtspr(SPR_MAS1, args->e->mas1); mtspr(SPR_MAS2, args->e->mas2); mtspr(SPR_MAS3, args->e->mas3); switch ((mfpvr() >> 16) & 0xFFFF) { case FSL_E500mc: case FSL_E5500: case FSL_E6500: mtspr(SPR_MAS8, 0); /* FALLTHROUGH */ case FSL_E500v2: mtspr(SPR_MAS7, args->e->mas7); break; default: break; } __asm __volatile("isync; tlbwe; isync; msync"); } static void tlb1_write_entry_sync(void *arg) { /* Empty synchronization point for smp_rendezvous(). */ } /* * Write given entry to TLB1 hardware. */ static void tlb1_write_entry(tlb_entry_t *e, unsigned int idx) { struct tlbwrite_args args; args.e = e; args.idx = idx; #ifdef SMP if ((e->mas2 & _TLB_ENTRY_SHARED) && smp_started) { mb(); smp_rendezvous(tlb1_write_entry_sync, tlb1_write_entry_int, tlb1_write_entry_sync, &args); } else #endif { register_t msr; msr = mfmsr(); __asm __volatile("wrteei 0"); tlb1_write_entry_int(&args); __asm __volatile("wrtee %0" :: "r"(msr)); } } /* * Convert TLB TSIZE value to mapped region size. */ static vm_size_t tsize2size(unsigned int tsize) { /* * size = 4^tsize KB * size = 4^tsize * 2^10 = 2^(2 * tsize - 10) */ return ((1 << (2 * tsize)) * 1024); } /* * Convert region size (must be power of 4) to TLB TSIZE value. */ static unsigned int size2tsize(vm_size_t size) { return (ilog2(size) / 2 - 5); } /* * Register permanent kernel mapping in TLB1. * * Entries are created starting from index 0 (current free entry is * kept in tlb1_idx) and are not supposed to be invalidated. */ int tlb1_set_entry(vm_offset_t va, vm_paddr_t pa, vm_size_t size, uint32_t flags) { tlb_entry_t e; uint32_t ts, tid; int tsize, index; /* First try to update an existing entry. */ for (index = 0; index < TLB1_ENTRIES; index++) { tlb1_read_entry(&e, index); /* Check if we're just updating the flags, and update them. */ if (e.phys == pa && e.virt == va && e.size == size) { e.mas2 = (va & MAS2_EPN_MASK) | flags; tlb1_write_entry(&e, index); return (0); } } /* Convert size to TSIZE */ tsize = size2tsize(size); tid = (TID_KERNEL << MAS1_TID_SHIFT) & MAS1_TID_MASK; /* XXX TS is hard coded to 0 for now as we only use single address space */ ts = (0 << MAS1_TS_SHIFT) & MAS1_TS_MASK; e.phys = pa; e.virt = va; e.size = size; e.mas1 = MAS1_VALID | MAS1_IPROT | ts | tid; e.mas1 |= ((tsize << MAS1_TSIZE_SHIFT) & MAS1_TSIZE_MASK); e.mas2 = (va & MAS2_EPN_MASK) | flags; /* Set supervisor RWX permission bits */ e.mas3 = (pa & MAS3_RPN) | MAS3_SR | MAS3_SW | MAS3_SX; e.mas7 = (pa >> 32) & MAS7_RPN; tlb1_write_entry(&e, -1); return (0); } /* * Map in contiguous RAM region into the TLB1. */ static vm_size_t tlb1_mapin_region(vm_offset_t va, vm_paddr_t pa, vm_size_t size, int wimge) { vm_offset_t base; vm_size_t mapped, sz, ssize; mapped = 0; base = va; ssize = size; while (size > 0) { sz = 1UL << (ilog2(size) & ~1); /* Align size to PA */ if (pa % sz != 0) { do { sz >>= 2; } while (pa % sz != 0); } /* Now align from there to VA */ if (va % sz != 0) { do { sz >>= 2; } while (va % sz != 0); } #ifdef __powerpc64__ /* * Clamp TLB1 entries to 4G. * * While the e6500 supports up to 1TB mappings, the e5500 * only supports up to 4G mappings. (0b1011) * * If any e6500 machines capable of supporting a very * large amount of memory appear in the future, we can * revisit this. * * For now, though, since we have plenty of space in TLB1, * always avoid creating entries larger than 4GB. */ sz = MIN(sz, 1UL << 32); #endif if (bootverbose) printf("Wiring VA=%p to PA=%jx (size=%lx)\n", (void *)va, (uintmax_t)pa, (long)sz); if (tlb1_set_entry(va, pa, sz, _TLB_ENTRY_SHARED | wimge) < 0) return (mapped); size -= sz; pa += sz; va += sz; } mapped = (va - base); if (bootverbose) printf("mapped size 0x%"PRIxPTR" (wasted space 0x%"PRIxPTR")\n", mapped, mapped - ssize); return (mapped); } /* * TLB1 initialization routine, to be called after the very first * assembler level setup done in locore.S. */ void tlb1_init() { vm_offset_t mas2; uint32_t mas0, mas1, mas3, mas7; uint32_t tsz; tlb1_get_tlbconf(); mas0 = MAS0_TLBSEL(1) | MAS0_ESEL(0); mtspr(SPR_MAS0, mas0); __asm __volatile("isync; tlbre"); mas1 = mfspr(SPR_MAS1); mas2 = mfspr(SPR_MAS2); mas3 = mfspr(SPR_MAS3); mas7 = mfspr(SPR_MAS7); kernload = ((vm_paddr_t)(mas7 & MAS7_RPN) << 32) | (mas3 & MAS3_RPN); tsz = (mas1 & MAS1_TSIZE_MASK) >> MAS1_TSIZE_SHIFT; kernsize += (tsz > 0) ? tsize2size(tsz) : 0; kernstart = trunc_page(mas2); /* Setup TLB miss defaults */ set_mas4_defaults(); } /* * pmap_early_io_unmap() should be used in short conjunction with * pmap_early_io_map(), as in the following snippet: * * x = pmap_early_io_map(...); * * pmap_early_io_unmap(x, size); * * And avoiding more allocations between. */ void pmap_early_io_unmap(vm_offset_t va, vm_size_t size) { int i; tlb_entry_t e; vm_size_t isize; size = roundup(size, PAGE_SIZE); isize = size; for (i = 0; i < TLB1_ENTRIES && size > 0; i++) { tlb1_read_entry(&e, i); if (!(e.mas1 & MAS1_VALID)) continue; if (va <= e.virt && (va + isize) >= (e.virt + e.size)) { size -= e.size; e.mas1 &= ~MAS1_VALID; tlb1_write_entry(&e, i); } } if (tlb1_map_base == va + isize) tlb1_map_base -= isize; } vm_offset_t pmap_early_io_map(vm_paddr_t pa, vm_size_t size) { vm_paddr_t pa_base; vm_offset_t va, sz; int i; tlb_entry_t e; KASSERT(!pmap_bootstrapped, ("Do not use after PMAP is up!")); for (i = 0; i < TLB1_ENTRIES; i++) { tlb1_read_entry(&e, i); if (!(e.mas1 & MAS1_VALID)) continue; if (pa >= e.phys && (pa + size) <= (e.phys + e.size)) return (e.virt + (pa - e.phys)); } pa_base = rounddown(pa, PAGE_SIZE); size = roundup(size + (pa - pa_base), PAGE_SIZE); tlb1_map_base = roundup2(tlb1_map_base, 1 << (ilog2(size) & ~1)); va = tlb1_map_base + (pa - pa_base); do { sz = 1 << (ilog2(size) & ~1); tlb1_set_entry(tlb1_map_base, pa_base, sz, _TLB_ENTRY_SHARED | _TLB_ENTRY_IO); size -= sz; pa_base += sz; tlb1_map_base += sz; } while (size > 0); return (va); } void pmap_track_page(pmap_t pmap, vm_offset_t va) { vm_paddr_t pa; vm_page_t page; struct pv_entry *pve; va = trunc_page(va); pa = pmap_kextract(va); page = PHYS_TO_VM_PAGE(pa); rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); TAILQ_FOREACH(pve, &page->md.pv_list, pv_link) { if ((pmap == pve->pv_pmap) && (va == pve->pv_va)) { goto out; } } page->md.pv_tracked = true; pv_insert(pmap, va, page); out: PMAP_UNLOCK(pmap); rw_wunlock(&pvh_global_lock); } /* * Setup MAS4 defaults. * These values are loaded to MAS0-2 on a TLB miss. */ static void set_mas4_defaults(void) { uint32_t mas4; /* Defaults: TLB0, PID0, TSIZED=4K */ mas4 = MAS4_TLBSELD0; mas4 |= (TLB_SIZE_4K << MAS4_TSIZED_SHIFT) & MAS4_TSIZED_MASK; #ifdef SMP mas4 |= MAS4_MD; #endif mtspr(SPR_MAS4, mas4); __asm __volatile("isync"); } /* * Return 0 if the physical IO range is encompassed by one of the * the TLB1 entries, otherwise return related error code. */ static int tlb1_iomapped(int i, vm_paddr_t pa, vm_size_t size, vm_offset_t *va) { uint32_t prot; vm_paddr_t pa_start; vm_paddr_t pa_end; unsigned int entry_tsize; vm_size_t entry_size; tlb_entry_t e; *va = (vm_offset_t)NULL; tlb1_read_entry(&e, i); /* Skip invalid entries */ if (!(e.mas1 & MAS1_VALID)) return (EINVAL); /* * The entry must be cache-inhibited, guarded, and r/w * so it can function as an i/o page */ prot = e.mas2 & (MAS2_I | MAS2_G); if (prot != (MAS2_I | MAS2_G)) return (EPERM); prot = e.mas3 & (MAS3_SR | MAS3_SW); if (prot != (MAS3_SR | MAS3_SW)) return (EPERM); /* The address should be within the entry range. */ entry_tsize = (e.mas1 & MAS1_TSIZE_MASK) >> MAS1_TSIZE_SHIFT; KASSERT((entry_tsize), ("tlb1_iomapped: invalid entry tsize")); entry_size = tsize2size(entry_tsize); pa_start = (((vm_paddr_t)e.mas7 & MAS7_RPN) << 32) | (e.mas3 & MAS3_RPN); pa_end = pa_start + entry_size; if ((pa < pa_start) || ((pa + size) > pa_end)) return (ERANGE); /* Return virtual address of this mapping. */ *va = (e.mas2 & MAS2_EPN_MASK) + (pa - pa_start); return (0); } #ifdef DDB /* Print out contents of the MAS registers for each TLB0 entry */ static void #ifdef __powerpc64__ tlb_print_entry(int i, uint32_t mas1, uint64_t mas2, uint32_t mas3, #else tlb_print_entry(int i, uint32_t mas1, uint32_t mas2, uint32_t mas3, #endif uint32_t mas7) { int as; char desc[3]; tlbtid_t tid; vm_size_t size; unsigned int tsize; desc[2] = '\0'; if (mas1 & MAS1_VALID) desc[0] = 'V'; else desc[0] = ' '; if (mas1 & MAS1_IPROT) desc[1] = 'P'; else desc[1] = ' '; as = (mas1 & MAS1_TS_MASK) ? 1 : 0; tid = MAS1_GETTID(mas1); tsize = (mas1 & MAS1_TSIZE_MASK) >> MAS1_TSIZE_SHIFT; size = 0; if (tsize) size = tsize2size(tsize); printf("%3d: (%s) [AS=%d] " "sz = 0x%jx tsz = %d tid = %d mas1 = 0x%08x " "mas2(va) = 0x%"PRI0ptrX" mas3(pa) = 0x%08x mas7 = 0x%08x\n", i, desc, as, (uintmax_t)size, tsize, tid, mas1, mas2, mas3, mas7); } DB_SHOW_COMMAND(tlb0, tlb0_print_tlbentries) { uint32_t mas0, mas1, mas3, mas7; #ifdef __powerpc64__ uint64_t mas2; #else uint32_t mas2; #endif int entryidx, way, idx; printf("TLB0 entries:\n"); for (way = 0; way < TLB0_WAYS; way ++) for (entryidx = 0; entryidx < TLB0_ENTRIES_PER_WAY; entryidx++) { mas0 = MAS0_TLBSEL(0) | MAS0_ESEL(way); mtspr(SPR_MAS0, mas0); mas2 = entryidx << MAS2_TLB0_ENTRY_IDX_SHIFT; mtspr(SPR_MAS2, mas2); __asm __volatile("isync; tlbre"); mas1 = mfspr(SPR_MAS1); mas2 = mfspr(SPR_MAS2); mas3 = mfspr(SPR_MAS3); mas7 = mfspr(SPR_MAS7); idx = tlb0_tableidx(mas2, way); tlb_print_entry(idx, mas1, mas2, mas3, mas7); } } /* * Print out contents of the MAS registers for each TLB1 entry */ DB_SHOW_COMMAND(tlb1, tlb1_print_tlbentries) { uint32_t mas0, mas1, mas3, mas7; #ifdef __powerpc64__ uint64_t mas2; #else uint32_t mas2; #endif int i; printf("TLB1 entries:\n"); for (i = 0; i < TLB1_ENTRIES; i++) { mas0 = MAS0_TLBSEL(1) | MAS0_ESEL(i); mtspr(SPR_MAS0, mas0); __asm __volatile("isync; tlbre"); mas1 = mfspr(SPR_MAS1); mas2 = mfspr(SPR_MAS2); mas3 = mfspr(SPR_MAS3); mas7 = mfspr(SPR_MAS7); tlb_print_entry(i, mas1, mas2, mas3, mas7); } } #endif Index: head/sys/powerpc/include/cpufunc.h =================================================================== --- head/sys/powerpc/include/cpufunc.h (revision 360886) +++ head/sys/powerpc/include/cpufunc.h (revision 360887) @@ -1,268 +1,296 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 1998 Doug Rabson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _MACHINE_CPUFUNC_H_ #define _MACHINE_CPUFUNC_H_ #ifdef _KERNEL #include #include #include struct thread; #ifdef KDB void breakpoint(void); #else static __inline void breakpoint(void) { return; } #endif /* CPU register mangling inlines */ static __inline void mtmsr(register_t value) { __asm __volatile ("mtmsr %0; isync" :: "r"(value)); } #ifdef __powerpc64__ static __inline void mtmsrd(register_t value) { __asm __volatile ("mtmsrd %0; isync" :: "r"(value)); } #endif static __inline register_t mfmsr(void) { register_t value; __asm __volatile ("mfmsr %0" : "=r"(value)); return (value); } #ifndef __powerpc64__ static __inline void mtsrin(vm_offset_t va, register_t value) { __asm __volatile ("mtsrin %0,%1; isync" :: "r"(value), "r"(va)); } static __inline register_t mfsrin(vm_offset_t va) { register_t value; __asm __volatile ("mfsrin %0,%1" : "=r"(value) : "r"(va)); return (value); } #endif static __inline register_t mfctrl(void) { register_t value; __asm __volatile ("mfspr %0,136" : "=r"(value)); return (value); } static __inline void mtdec(register_t value) { __asm __volatile ("mtdec %0" :: "r"(value)); } static __inline register_t mfdec(void) { register_t value; __asm __volatile ("mfdec %0" : "=r"(value)); return (value); } static __inline register_t mfpvr(void) { register_t value; __asm __volatile ("mfpvr %0" : "=r"(value)); return (value); } static __inline u_quad_t mftb(void) { u_quad_t tb; #ifdef __powerpc64__ __asm __volatile ("mftb %0" : "=r"(tb)); #else uint32_t *tbup = (uint32_t *)&tb; uint32_t *tblp = tbup + 1; do { *tbup = mfspr(TBR_TBU); *tblp = mfspr(TBR_TBL); } while (*tbup != mfspr(TBR_TBU)); #endif return (tb); } static __inline void mttb(u_quad_t time) { mtspr(TBR_TBWL, 0); mtspr(TBR_TBWU, (uint32_t)(time >> 32)); mtspr(TBR_TBWL, (uint32_t)(time & 0xffffffff)); } static __inline void eieio(void) { __asm __volatile ("eieio" : : : "memory"); } static __inline void isync(void) { __asm __volatile ("isync" : : : "memory"); } static __inline void powerpc_sync(void) { __asm __volatile ("sync" : : : "memory"); } +static __inline int +cntlzd(uint64_t word) +{ + uint64_t result; + /* cntlzd %0, %1 */ + __asm __volatile(".long 0x7c000074 | (%1 << 21) | (%0 << 16)" : + "=r"(result) : "r"(word)); + + return (int)result; +} + +static __inline int +cnttzd(uint64_t word) +{ + uint64_t result; + /* cnttzd %0, %1 */ + __asm __volatile(".long 0x7c000474 | (%1 << 21) | (%0 << 16)" : + "=r"(result) : "r"(word)); + + return (int)result; +} + +static __inline void +ptesync(void) +{ + __asm __volatile("ptesync"); +} + static __inline register_t intr_disable(void) { register_t msr; msr = mfmsr(); mtmsr(msr & ~PSL_EE); return (msr); } static __inline void intr_restore(register_t msr) { mtmsr(msr); } static __inline struct pcpu * get_pcpu(void) { struct pcpu *ret; __asm __volatile("mfsprg %0, 0" : "=r"(ret)); return (ret); } #define HAVE_INLINE_FLS static __inline __pure2 int fls(int mask) { return (mask ? 32 - __builtin_clz(mask) : 0); } #define HAVE_INLINE_FLSL static __inline __pure2 int flsl(long mask) { return (mask ? (8 * sizeof(long) - __builtin_clzl(mask)) : 0); } /* "NOP" operations to signify priorities to the kernel. */ static __inline void nop_prio_vlow(void) { __asm __volatile("or 31,31,31"); } static __inline void nop_prio_low(void) { __asm __volatile("or 1,1,1"); } static __inline void nop_prio_mlow(void) { __asm __volatile("or 6,6,6"); } static __inline void nop_prio_medium(void) { __asm __volatile("or 2,2,2"); } static __inline void nop_prio_mhigh(void) { __asm __volatile("or 5,5,5"); } static __inline void nop_prio_high(void) { __asm __volatile("or 3,3,3"); } #endif /* _KERNEL */ #endif /* !_MACHINE_CPUFUNC_H_ */ Index: head/sys/powerpc/include/mmuvar.h =================================================================== --- head/sys/powerpc/include/mmuvar.h (revision 360886) +++ head/sys/powerpc/include/mmuvar.h (revision 360887) @@ -1,121 +1,122 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2005 Peter Grehan * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _MACHINE_MMUVAR_H_ #define _MACHINE_MMUVAR_H_ /* * A PowerPC MMU implementation is declared with a kernel object and * an associated method table. The MMU_DEF macro is used to declare * the class, and also links it to the global MMU class list. * * e.g. * * static mmu_method_t ppc8xx_methods[] = { * MMUMETHOD(mmu_change_wiring, ppc8xx_mmu_change_wiring), * MMUMETHOD(mmu_clear_modify, ppc8xx_mmu_clear_modify), * MMUMETHOD(mmu_clear_reference, ppc8xx_mmu_clear_reference), * ... * MMUMETHOD(mmu_dev_direct_mapped, ppc8xx_mmu_dev_direct_mapped), * { 0, 0 } * }; * * MMU_DEF(ppc8xx, MMU_TYPE_8xx, ppc8xx_methods, sizeof(ppc8xx_mmu_softc)); * * A single level of inheritance is supported in a similar fashion to * kobj inheritance e.g. * * MMU_DEF_1(ppc860c, MMU_TYPE_860c, ppc860c_methods, 0, ppc8xx); */ #include struct mmu_kobj { /* * An MMU instance is a kernel object */ KOBJ_FIELDS; /* * Utility elements that an instance may use */ struct mtx mmu_mtx; /* available for instance use */ void *mmu_iptr; /* instance data pointer */ /* * Opaque data that can be overlaid with an instance-private * structure. MMU code can test that this is large enough at * compile time with a sizeof() test againt it's softc. There * is also a run-time test when the MMU kernel object is * registered. */ #define MMU_OPAQUESZ 64 u_int mmu_opaque[MMU_OPAQUESZ]; }; typedef struct mmu_kobj *mmu_t; typedef struct kobj_class mmu_def_t; #define mmu_method_t kobj_method_t #define MMUMETHOD KOBJMETHOD #define MMU_DEF(name, ident, methods, size) \ \ mmu_def_t name = { \ ident, methods, size, NULL \ }; \ DATA_SET(mmu_set, name) #define MMU_DEF_INHERIT(name, ident, methods, size, base1) \ \ static kobj_class_t name ## _baseclasses[] = \ { &base1, NULL }; \ mmu_def_t name = { \ ident, methods, size, name ## _baseclasses \ }; \ DATA_SET(mmu_set, name) #if 0 mmu_def_t name = { \ ident, methods, size, name ## _baseclasses \ }; DATA_SET(mmu_set, name) #endif /* * Known MMU names */ #define MMU_TYPE_BOOKE "mmu_booke" /* Book-E MMU specification */ #define MMU_TYPE_OEA "mmu_oea" /* 32-bit OEA */ #define MMU_TYPE_G5 "mmu_g5" /* 64-bit bridge (ibm 970) */ +#define MMU_TYPE_RADIX "mmu_radix" /* 64-bit native ISA 3.0 (POWER9) radix */ #define MMU_TYPE_8xx "mmu_8xx" /* 8xx quicc TLB */ #endif /* _MACHINE_MMUVAR_H_ */ Index: head/sys/powerpc/include/param.h =================================================================== --- head/sys/powerpc/include/param.h (revision 360886) +++ head/sys/powerpc/include/param.h (revision 360887) @@ -1,140 +1,156 @@ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (c) 2001 David E. O'Brien * Copyright (c) 1990 The Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)param.h 5.8 (Berkeley) 6/28/91 * $FreeBSD$ */ #ifndef _POWERPC_INCLUDE_PARAM_H_ #define _POWERPC_INCLUDE_PARAM_H_ /* * Machine dependent constants for PowerPC */ #include /* Needed to display interrupts on OFW PCI */ #define __PCI_REROUTE_INTERRUPT #ifndef MACHINE #define MACHINE "powerpc" #endif #ifndef MACHINE_ARCH #ifdef __powerpc64__ #define MACHINE_ARCH "powerpc64" #else #ifdef __SPE__ #define MACHINE_ARCH "powerpcspe" #else #define MACHINE_ARCH "powerpc" #endif #endif #endif #define MID_MACHINE MID_POWERPC #ifdef __powerpc64__ #ifndef MACHINE_ARCH32 #define MACHINE_ARCH32 "powerpc" #endif #endif #ifdef SMP #ifndef MAXCPU #define MAXCPU 256 #endif #else #define MAXCPU 1 #endif #ifndef MAXMEMDOM #define MAXMEMDOM 8 #endif #define ALIGNBYTES _ALIGNBYTES #define ALIGN(p) _ALIGN(p) /* * ALIGNED_POINTER is a boolean macro that checks whether an address * is valid to fetch data elements of type t from on this architecture. * This does not reflect the optimal alignment, just the possibility * (within reasonable limits). */ #define ALIGNED_POINTER(p, t) ((((uintptr_t)(p)) & (sizeof (t) - 1)) == 0) /* * CACHE_LINE_SIZE is the compile-time maximum cache line size for an * architecture. It should be used with appropriate caution. */ #define CACHE_LINE_SHIFT 7 #define CACHE_LINE_SIZE (1 << CACHE_LINE_SHIFT) #define PAGE_SHIFT 12 #define PAGE_SIZE (1 << PAGE_SHIFT) /* Page size */ #define PAGE_MASK (PAGE_SIZE - 1) #define NPTEPG (PAGE_SIZE/(sizeof (pt_entry_t))) +#define NPDEPG (PAGE_SIZE/(sizeof (pt_entry_t))) -#define MAXPAGESIZES 1 /* maximum number of supported page sizes */ +#define L1_PAGE_SIZE_SHIFT 39 +#define L1_PAGE_SIZE (1UL<> PAGE_SHIFT) #define ptoa(x) ((x) << PAGE_SHIFT) #define powerpc_btop(x) ((x) >> PAGE_SHIFT) #define powerpc_ptob(x) ((x) << PAGE_SHIFT) #define pgtok(x) ((x) * (PAGE_SIZE / 1024UL)) #define btoc(x) ((vm_offset_t)(((x)+PAGE_MASK)>>PAGE_SHIFT)) #endif /* !_POWERPC_INCLUDE_PARAM_H_ */ Index: head/sys/powerpc/include/pmap.h =================================================================== --- head/sys/powerpc/include/pmap.h (revision 360886) +++ head/sys/powerpc/include/pmap.h (revision 360887) @@ -1,298 +1,354 @@ /*- * SPDX-License-Identifier: BSD-3-Clause AND BSD-4-Clause * * Copyright (C) 2006 Semihalf, Marian Balakowicz * All rights reserved. * * Adapted for Freescale's e500 core CPUs. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN * NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ /*- * Copyright (C) 1995, 1996 Wolfgang Solfrank. * Copyright (C) 1995, 1996 TooLs GmbH. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by TooLs GmbH. * 4. The name of TooLs GmbH may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY TOOLS GMBH ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * from: $NetBSD: pmap.h,v 1.17 2000/03/30 16:18:24 jdolecek Exp $ */ #ifndef _MACHINE_PMAP_H_ #define _MACHINE_PMAP_H_ #include #include #include #include #include #include #include #include #include #include +#ifdef __powerpc64__ +#include +#endif + +/* + * The radix page table structure is described by levels 1-4. + * See Fig 33. on p. 1002 of Power ISA v3.0B + * + * Page directories and tables must be size aligned. + */ + +/* Root page directory - 64k -- each entry covers 512GB */ +typedef uint64_t pml1_entry_t; +/* l2 page directory - 4k -- each entry covers 1GB */ +typedef uint64_t pml2_entry_t; +/* l3 page directory - 4k -- each entry covers 2MB */ +typedef uint64_t pml3_entry_t; +/* l4 page directory - 256B/4k -- each entry covers 64k/4k */ +typedef uint64_t pml4_entry_t; + +typedef uint64_t pt_entry_t; + struct pmap; typedef struct pmap *pmap_t; #define PMAP_ENTER_QUICK_LOCKED 0x10000000 #if !defined(NPMAPS) #define NPMAPS 32768 #endif /* !defined(NPMAPS) */ struct slbtnode; struct pvo_entry { LIST_ENTRY(pvo_entry) pvo_vlink; /* Link to common virt page */ #ifndef __powerpc64__ LIST_ENTRY(pvo_entry) pvo_olink; /* Link to overflow entry */ #endif union { RB_ENTRY(pvo_entry) pvo_plink; /* Link to pmap entries */ SLIST_ENTRY(pvo_entry) pvo_dlink; /* Link to delete enty */ }; struct { #ifndef __powerpc64__ /* 32-bit fields */ pte_t pte; #endif /* 64-bit fields */ uintptr_t slot; vm_paddr_t pa; vm_prot_t prot; } pvo_pte; pmap_t pvo_pmap; /* Owning pmap */ vm_offset_t pvo_vaddr; /* VA of entry */ uint64_t pvo_vpn; /* Virtual page number */ }; LIST_HEAD(pvo_head, pvo_entry); SLIST_HEAD(pvo_dlist, pvo_entry); RB_HEAD(pvo_tree, pvo_entry); int pvo_vaddr_compare(struct pvo_entry *, struct pvo_entry *); RB_PROTOTYPE(pvo_tree, pvo_entry, pvo_plink, pvo_vaddr_compare); /* Used by 32-bit PMAP */ #define PVO_PTEGIDX_MASK 0x007UL /* which PTEG slot */ #define PVO_PTEGIDX_VALID 0x008UL /* slot is valid */ /* Used by 64-bit PMAP */ #define PVO_HID 0x008UL /* PVO entry in alternate hash*/ /* Used by both */ #define PVO_WIRED 0x010UL /* PVO entry is wired */ #define PVO_MANAGED 0x020UL /* PVO entry is managed */ #define PVO_BOOTSTRAP 0x080UL /* PVO entry allocated during bootstrap */ #define PVO_DEAD 0x100UL /* waiting to be deleted */ #define PVO_LARGE 0x200UL /* large page */ #define PVO_VADDR(pvo) ((pvo)->pvo_vaddr & ~ADDR_POFF) #define PVO_PTEGIDX_GET(pvo) ((pvo)->pvo_vaddr & PVO_PTEGIDX_MASK) #define PVO_PTEGIDX_ISSET(pvo) ((pvo)->pvo_vaddr & PVO_PTEGIDX_VALID) #define PVO_PTEGIDX_CLR(pvo) \ ((void)((pvo)->pvo_vaddr &= ~(PVO_PTEGIDX_VALID|PVO_PTEGIDX_MASK))) #define PVO_PTEGIDX_SET(pvo, i) \ ((void)((pvo)->pvo_vaddr |= (i)|PVO_PTEGIDX_VALID)) #define PVO_VSID(pvo) ((pvo)->pvo_vpn >> 16) struct pmap { struct pmap_statistics pm_stats; struct mtx pm_mtx; cpuset_t pm_active; union { struct { - #ifdef __powerpc64__ struct slbtnode *pm_slb_tree_root; struct slb **pm_slb; int pm_slb_len; #else register_t pm_sr[16]; #endif struct pmap *pmap_phys; struct pvo_tree pmap_pvo; }; +#ifdef __powerpc64__ + /* Radix support */ struct { + pml1_entry_t *pm_pml1; /* KVA of root page directory */ + struct vm_radix pm_radix; /* spare page table pages */ + TAILQ_HEAD(,pv_chunk) pm_pvchunk; /* list of mappings in pmap */ + uint64_t pm_pid; /* PIDR value */ + int pm_flags; + }; +#endif + struct { /* TID to identify this pmap entries in TLB */ - tlbtid_t pm_tid[MAXCPU]; + tlbtid_t pm_tid[MAXCPU]; #ifdef __powerpc64__ /* * Page table directory, * array of pointers to page directories. */ pte_t ****pm_root; #else /* * Page table directory, * array of pointers to page tables. */ pte_t **pm_pdir; /* List of allocated ptbl bufs (ptbl kva regions). */ TAILQ_HEAD(, ptbl_buf) pm_ptbl_list; #endif }; - }; + } __aligned(CACHE_LINE_SIZE); }; +/* + * pv_entries are allocated in chunks per-process. This avoids the + * need to track per-pmap assignments. + */ +#define _NPCM 2 +#define _NPCPV 126 +#define PV_CHUNK_HEADER \ + pmap_t pc_pmap; \ + TAILQ_ENTRY(pv_chunk) pc_list; \ + uint64_t pc_map[_NPCM]; /* bitmap; 1 = free */ \ + TAILQ_ENTRY(pv_chunk) pc_lru; + struct pv_entry { pmap_t pv_pmap; vm_offset_t pv_va; TAILQ_ENTRY(pv_entry) pv_link; }; typedef struct pv_entry *pv_entry_t; +struct pv_chunk_header { + PV_CHUNK_HEADER +}; +struct pv_chunk { + PV_CHUNK_HEADER + uint64_t reserved; + struct pv_entry pc_pventry[_NPCPV]; +}; + struct md_page { union { struct { volatile int32_t mdpg_attrs; vm_memattr_t mdpg_cache_attrs; struct pvo_head mdpg_pvoh; + int pv_gen; /* (p) */ }; struct { - TAILQ_HEAD(, pv_entry) pv_list; int pv_tracked; }; }; + TAILQ_HEAD(, pv_entry) pv_list; /* (p) */ }; #ifdef AIM #define pmap_page_get_memattr(m) ((m)->md.mdpg_cache_attrs) -#define pmap_page_is_mapped(m) (!LIST_EMPTY(&(m)->md.mdpg_pvoh)) #else #define pmap_page_get_memattr(m) VM_MEMATTR_DEFAULT -#define pmap_page_is_mapped(m) (!TAILQ_EMPTY(&(m)->md.pv_list)) -#endif +#endif /* AIM */ /* * Return the VSID corresponding to a given virtual address. * If no VSID is currently defined, it will allocate one, and add * it to a free slot if available. * * NB: The PMAP MUST be locked already. */ uint64_t va_to_vsid(pmap_t pm, vm_offset_t va); /* Lock-free, non-allocating lookup routines */ uint64_t kernel_va_to_slbv(vm_offset_t va); struct slb *user_va_to_slb_entry(pmap_t pm, vm_offset_t va); uint64_t allocate_user_vsid(pmap_t pm, uint64_t esid, int large); void free_vsid(pmap_t pm, uint64_t esid, int large); void slb_insert_user(pmap_t pm, struct slb *slb); void slb_insert_kernel(uint64_t slbe, uint64_t slbv); struct slbtnode *slb_alloc_tree(void); void slb_free_tree(pmap_t pm); struct slb **slb_alloc_user_cache(void); void slb_free_user_cache(struct slb **); extern struct pmap kernel_pmap_store; #define kernel_pmap (&kernel_pmap_store) #ifdef _KERNEL #define PMAP_LOCK(pmap) mtx_lock(&(pmap)->pm_mtx) #define PMAP_LOCK_ASSERT(pmap, type) \ mtx_assert(&(pmap)->pm_mtx, (type)) #define PMAP_LOCK_DESTROY(pmap) mtx_destroy(&(pmap)->pm_mtx) #define PMAP_LOCK_INIT(pmap) mtx_init(&(pmap)->pm_mtx, \ (pmap == kernel_pmap) ? "kernelpmap" : \ - "pmap", NULL, MTX_DEF) + "pmap", NULL, MTX_DEF | MTX_DUPOK) #define PMAP_LOCKED(pmap) mtx_owned(&(pmap)->pm_mtx) #define PMAP_MTX(pmap) (&(pmap)->pm_mtx) #define PMAP_TRYLOCK(pmap) mtx_trylock(&(pmap)->pm_mtx) #define PMAP_UNLOCK(pmap) mtx_unlock(&(pmap)->pm_mtx) #define pmap_page_is_write_mapped(m) (((m)->a.flags & PGA_WRITEABLE) != 0) void pmap_bootstrap(vm_offset_t, vm_offset_t); void pmap_kenter(vm_offset_t va, vm_paddr_t pa); void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, vm_memattr_t); void pmap_kremove(vm_offset_t); void *pmap_mapdev(vm_paddr_t, vm_size_t); void *pmap_mapdev_attr(vm_paddr_t, vm_size_t, vm_memattr_t); void pmap_unmapdev(vm_offset_t, vm_size_t); void pmap_page_set_memattr(vm_page_t, vm_memattr_t); int pmap_change_attr(vm_offset_t, vm_size_t, vm_memattr_t); int pmap_map_user_ptr(pmap_t pm, volatile const void *uaddr, void **kaddr, size_t ulen, size_t *klen); int pmap_decode_kernel_ptr(vm_offset_t addr, int *is_user, vm_offset_t *decoded_addr); void pmap_deactivate(struct thread *); vm_paddr_t pmap_kextract(vm_offset_t); int pmap_dev_direct_mapped(vm_paddr_t, vm_size_t); boolean_t pmap_mmu_install(char *name, int prio); const char *pmap_mmu_name(void); +bool pmap_ps_enabled(pmap_t pmap); +int pmap_nofault(pmap_t pmap, vm_offset_t va, vm_prot_t flags); +boolean_t pmap_page_is_mapped(vm_page_t m); void pmap_page_array_startup(long count); #define vtophys(va) pmap_kextract((vm_offset_t)(va)) extern vm_offset_t virtual_avail; extern vm_offset_t virtual_end; extern caddr_t crashdumpmap; extern vm_offset_t msgbuf_phys; extern int pmap_bootstrapped; +extern int radix_mmu; vm_offset_t pmap_early_io_map(vm_paddr_t pa, vm_size_t size); void pmap_early_io_unmap(vm_offset_t va, vm_size_t size); void pmap_track_page(pmap_t pmap, vm_offset_t va); +void pmap_page_print_mappings(vm_page_t m); static inline int pmap_vmspace_copy(pmap_t dst_pmap __unused, pmap_t src_pmap __unused) { return (0); } #endif #endif /* !_MACHINE_PMAP_H_ */ Index: head/sys/powerpc/include/proc.h =================================================================== --- head/sys/powerpc/include/proc.h (revision 360886) +++ head/sys/powerpc/include/proc.h (revision 360887) @@ -1,83 +1,89 @@ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (C) 1995, 1996 Wolfgang Solfrank. * Copyright (C) 1995, 1996 TooLs GmbH. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by TooLs GmbH. * 4. The name of TooLs GmbH may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY TOOLS GMBH ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $NetBSD: proc.h,v 1.2 1997/04/16 22:57:48 thorpej Exp $ * $FreeBSD$ */ #ifndef _MACHINE_PROC_H_ #define _MACHINE_PROC_H_ +struct pmap_invl_gen { + u_long gen; /* (k) */ + LIST_ENTRY(pmap_invl_gen) link; /* (pp) */ +}; + /* * Machine-dependent part of the proc structure */ struct mdthread { int md_spinlock_count; /* (k) */ register_t md_saved_msr; /* (k) */ + struct pmap_invl_gen md_invl_gen; }; struct mdproc { /* * Avoid empty structs because they are undefined behavior. */ long md_spare; }; #ifdef __powerpc64__ #define KINFO_PROC_SIZE 1088 #define KINFO_PROC32_SIZE 816 #else #define KINFO_PROC_SIZE 816 #endif struct syscall_args { u_int code; struct sysent *callp; register_t args[10]; int narg; }; #ifdef _KERNEL #include /* Get the current kernel thread stack usage. */ #define GET_STACK_USAGE(total, used) do { \ struct thread *td = curthread; \ (total) = td->td_kstack_pages * PAGE_SIZE - sizeof(struct pcb); \ (used) = (char *)td->td_kstack + \ td->td_kstack_pages * PAGE_SIZE - \ (char *)&td; \ } while (0) #endif #endif /* !_MACHINE_PROC_H_ */ Index: head/sys/powerpc/include/pte.h =================================================================== --- head/sys/powerpc/include/pte.h (revision 360886) +++ head/sys/powerpc/include/pte.h (revision 360887) @@ -1,409 +1,421 @@ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (C) 1995, 1996 Wolfgang Solfrank. * Copyright (C) 1995, 1996 TooLs GmbH. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by TooLs GmbH. * 4. The name of TooLs GmbH may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY TOOLS GMBH ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $NetBSD: pte.h,v 1.2 1998/08/31 14:43:40 tsubai Exp $ * $FreeBSD$ */ #ifndef _MACHINE_PTE_H_ #define _MACHINE_PTE_H_ #if defined(AIM) /* * Page Table Entries */ #ifndef LOCORE /* 32-bit PTE */ struct pte { u_int32_t pte_hi; u_int32_t pte_lo; }; struct pteg { struct pte pt[8]; }; /* 64-bit (long) PTE */ struct lpte { u_int64_t pte_hi; u_int64_t pte_lo; }; struct lpteg { struct lpte pt[8]; }; /* Partition table entry */ struct pate { u_int64_t pagetab; u_int64_t proctab; }; +/* Process table entry */ +struct prte { + u_int64_t proctab0; + u_int64_t proctab1; +}; + typedef struct pte pte_t; typedef struct lpte lpte_t; #endif /* LOCORE */ /* 32-bit PTE definitions */ /* High word: */ #define PTE_VALID 0x80000000 #define PTE_VSID_SHFT 7 #define PTE_HID 0x00000040 #define PTE_API 0x0000003f /* Low word: */ #define PTE_RPGN 0xfffff000 #define PTE_REF 0x00000100 #define PTE_CHG 0x00000080 #define PTE_WIMG 0x00000078 #define PTE_W 0x00000040 #define PTE_I 0x00000020 #define PTE_M 0x00000010 #define PTE_G 0x00000008 #define PTE_PP 0x00000003 #define PTE_SO 0x00000000 /* Super. Only (U: XX, S: RW) */ #define PTE_SW 0x00000001 /* Super. Write-Only (U: RO, S: RW) */ #define PTE_BW 0x00000002 /* Supervisor (U: RW, S: RW) */ #define PTE_BR 0x00000003 /* Both Read Only (U: RO, S: RO) */ #define PTE_RW PTE_BW #define PTE_RO PTE_BR #define PTE_EXEC 0x00000200 /* pseudo bit in attrs; page is exec */ /* 64-bit PTE definitions */ /* High quadword: */ #define LPTE_VSID_SHIFT 12 #define LPTE_AVPN_MASK 0xFFFFFFFFFFFFFF80ULL #define LPTE_API 0x0000000000000F80ULL #define LPTE_SWBITS 0x0000000000000078ULL #define LPTE_WIRED 0x0000000000000010ULL #define LPTE_LOCKED 0x0000000000000008ULL #define LPTE_BIG 0x0000000000000004ULL /* 4kb/16Mb page */ #define LPTE_HID 0x0000000000000002ULL #define LPTE_VALID 0x0000000000000001ULL /* Low quadword: */ #define EXTEND_PTE(x) UINT64_C(x) /* make constants 64-bit */ #define LPTE_RPGN 0xfffffffffffff000ULL #define LPTE_REF EXTEND_PTE( PTE_REF ) #define LPTE_CHG EXTEND_PTE( PTE_CHG ) #define LPTE_WIMG EXTEND_PTE( PTE_WIMG ) #define LPTE_W EXTEND_PTE( PTE_W ) #define LPTE_I EXTEND_PTE( PTE_I ) #define LPTE_M EXTEND_PTE( PTE_M ) #define LPTE_G EXTEND_PTE( PTE_G ) #define LPTE_NOEXEC 0x0000000000000004ULL #define LPTE_PP EXTEND_PTE( PTE_PP ) #define LPTE_SO EXTEND_PTE( PTE_SO ) /* Super. Only */ #define LPTE_SW EXTEND_PTE( PTE_SW ) /* Super. Write-Only */ #define LPTE_BW EXTEND_PTE( PTE_BW ) /* Supervisor */ #define LPTE_BR EXTEND_PTE( PTE_BR ) /* Both Read Only */ #define LPTE_RW LPTE_BW #define LPTE_RO LPTE_BR /* POWER ISA 3.0 Radix Table Definitions */ #define RPTE_VALID 0x8000000000000000ULL #define RPTE_LEAF 0x4000000000000000ULL /* is a PTE: always 1 */ #define RPTE_SW0 0x2000000000000000ULL #define RPTE_RPN_MASK 0x00FFFFFFFFFFF000ULL #define RPTE_RPN_SHIFT 12 #define RPTE_SW1 0x0000000000000800ULL #define RPTE_SW2 0x0000000000000400ULL #define RPTE_SW3 0x0000000000000200ULL #define RPTE_R 0x0000000000000100ULL #define RPTE_C 0x0000000000000080ULL +#define RPTE_MANAGED RPTE_SW1 +#define RPTE_WIRED RPTE_SW2 +#define RPTE_PROMOTED RPTE_SW3 + #define RPTE_ATTR_MASK 0x0000000000000030ULL #define RPTE_ATTR_MEM 0x0000000000000000ULL /* PTE M */ #define RPTE_ATTR_SAO 0x0000000000000010ULL /* PTE WIM */ #define RPTE_ATTR_GUARDEDIO 0x0000000000000020ULL /* PTE IMG */ #define RPTE_ATTR_UNGUARDEDIO 0x0000000000000030ULL /* PTE IM */ #define RPTE_EAA_MASK 0x000000000000000FULL #define RPTE_EAA_P 0x0000000000000008ULL /* Supervisor only */ #define RPTE_EAA_R 0x0000000000000004ULL /* Read allowed */ #define RPTE_EAA_W 0x0000000000000002ULL /* Write (+read) */ #define RPTE_EAA_X 0x0000000000000001ULL /* Execute allowed */ #define RPDE_VALID RPTE_VALID #define RPDE_LEAF RPTE_LEAF /* is a PTE: always 0 */ -#define RPDE_NLB_MASK 0x0FFFFFFFFFFFFF00ULL +#define RPDE_NLB_MASK 0x00FFFFFFFFFFFF00ULL #define RPDE_NLB_SHIFT 8 #define RPDE_NLS_MASK 0x000000000000001FULL +#define PG_FRAME (0x000ffffffffff000ul) +#define PG_PS_FRAME (0x000fffffffe00000ul) /* * Extract bits from address */ #define ADDR_SR_SHFT 28 #define ADDR_PIDX 0x0ffff000UL #define ADDR_PIDX_SHFT 12 #define ADDR_API_SHFT 22 #define ADDR_API_SHFT64 16 #define ADDR_POFF 0x00000fffUL /* * Bits in DSISR: */ #define DSISR_DIRECT 0x80000000 #define DSISR_NOTFOUND 0x40000000 #define DSISR_PROTECT 0x08000000 #define DSISR_INVRX 0x04000000 #define DSISR_STORE 0x02000000 #define DSISR_DABR 0x00400000 #define DSISR_SEGMENT 0x00200000 #define DSISR_EAR 0x00100000 /* * Bits in SRR1 on ISI: */ #define ISSRR1_NOTFOUND 0x40000000 #define ISSRR1_DIRECT 0x10000000 #define ISSRR1_PROTECT 0x08000000 #define ISSRR1_SEGMENT 0x00200000 #else /* BOOKE */ #include /* * Flags for pte_remove() routine. */ #define PTBL_HOLD 0x00000001 /* do not unhold ptbl pages */ #define PTBL_UNHOLD 0x00000002 /* unhold and attempt to free ptbl pages */ #define PTBL_HOLD_FLAG(pmap) (((pmap) == kernel_pmap) ? PTBL_HOLD : PTBL_UNHOLD) /* * Page Table Entry definitions and macros. * * RPN need only be 32-bit because Book-E has 36-bit addresses, and the smallest * page size is 4k (12-bit mask), so RPN can really fit into 24 bits. */ #ifndef LOCORE typedef uint64_t pte_t; #endif /* RPN mask, TLB0 4K pages */ #define PTE_PA_MASK PAGE_MASK #if defined(BOOKE_E500) /* PTE bits assigned to MAS2, MAS3 flags */ #define PTE_MAS2_SHIFT 19 #define PTE_W (MAS2_W << PTE_MAS2_SHIFT) #define PTE_I (MAS2_I << PTE_MAS2_SHIFT) #define PTE_M (MAS2_M << PTE_MAS2_SHIFT) #define PTE_G (MAS2_G << PTE_MAS2_SHIFT) #define PTE_MAS2_MASK (MAS2_G | MAS2_M | MAS2_I | MAS2_W) #define PTE_MAS3_SHIFT 2 #define PTE_UX (MAS3_UX << PTE_MAS3_SHIFT) #define PTE_SX (MAS3_SX << PTE_MAS3_SHIFT) #define PTE_UW (MAS3_UW << PTE_MAS3_SHIFT) #define PTE_SW (MAS3_SW << PTE_MAS3_SHIFT) #define PTE_UR (MAS3_UR << PTE_MAS3_SHIFT) #define PTE_SR (MAS3_SR << PTE_MAS3_SHIFT) #define PTE_MAS3_MASK ((MAS3_UX | MAS3_SX | MAS3_UW \ | MAS3_SW | MAS3_UR | MAS3_SR) << PTE_MAS3_SHIFT) #define PTE_PS_SHIFT 8 #define PTE_PS_4KB (2 << PTE_PS_SHIFT) #endif /* Other PTE flags */ #define PTE_VALID 0x00000001 /* Valid */ #define PTE_MODIFIED 0x00001000 /* Modified */ #define PTE_WIRED 0x00002000 /* Wired */ #define PTE_MANAGED 0x00000002 /* Managed */ #define PTE_REFERENCED 0x00040000 /* Referenced */ /* * Page Table Entry definitions and macros. * * We use the hardware page table entry format: * * 63 24 23 19 18 17 14 13 12 11 8 7 6 5 4 3 2 1 0 * --------------------------------------------------------------- * ARPN(12:51) WIMGE R U0:U3 SW0 C PSIZE UX SX UW SW UR SR SW1 V * --------------------------------------------------------------- */ /* PTE fields. */ #define PTE_TSIZE_SHIFT (63-54) #define PTE_TSIZE_MASK 0x7 #define PTE_TSIZE_SHIFT_DIRECT (63-55) #define PTE_TSIZE_MASK_DIRECT 0xf #define PTE_PS_DIRECT(ps) (ps<> PTE_TSIZE_SHIFT) & PTE_TSIZE_MASK) #define PTE_TSIZE_DIRECT(pte) (int)((*pte >> PTE_TSIZE_SHIFT_DIRECT) & PTE_TSIZE_MASK_DIRECT) /* Macro argument must of pte_t type. */ #define PTE_ARPN_SHIFT 12 #define PTE_FLAGS_MASK 0x00ffffff #define PTE_RPN_FROM_PA(pa) (((pa) & ~PAGE_MASK) << PTE_ARPN_SHIFT) #define PTE_PA(pte) ((vm_paddr_t)(*pte >> PTE_ARPN_SHIFT) & ~PAGE_MASK) #define PTE_ISVALID(pte) ((*pte) & PTE_VALID) #define PTE_ISWIRED(pte) ((*pte) & PTE_WIRED) #define PTE_ISMANAGED(pte) ((*pte) & PTE_MANAGED) #define PTE_ISMODIFIED(pte) ((*pte) & PTE_MODIFIED) #define PTE_ISREFERENCED(pte) ((*pte) & PTE_REFERENCED) #endif /* BOOKE */ /* Book-E page table format, broken out for the generic pmap.h. */ #ifdef __powerpc64__ #include /* * The virtual address is: * * 4K page size * +-----+-----------+-------+-------------+-------------+----------------+ * | - | pg_root |pdir_l1| dir# | pte# | off in 4K page | * +-----+-----------+-------+-------------+-------------+----------------+ * 63 52 51 39 38 30 29 ^ 21 20 ^ 12 11 0 * | | * index in 1 page of pointers * * 1st level - Root page table * * pp2d consists of PG_ROOT_NENTRIES entries, each being a pointer to * second level entity, i.e. the page table directory (pdir). */ #define PG_ROOT_H 51 #define PG_ROOT_L 39 #define PG_ROOT_SIZE (1UL << PG_ROOT_L) /* va range mapped by pp2d */ #define PG_ROOT_SHIFT PG_ROOT_L #define PG_ROOT_NUM (PG_ROOT_H - PG_ROOT_L + 1) #define PG_ROOT_MASK ((1 << PG_ROOT_NUM) - 1) #define PG_ROOT_IDX(va) ((va >> PG_ROOT_SHIFT) & PG_ROOT_MASK) #define PG_ROOT_NENTRIES (1 << PG_ROOT_NUM) #define PG_ROOT_ENTRY_SHIFT 3 /* log2 (sizeof(struct pte_entry **)) */ /* * 2nd level - page directory directory (pdir l1) * * pdir consists of PDIR_NENTRIES entries, each being a pointer to * second level entity, i.e. the actual page table (ptbl). */ #define PDIR_L1_H (PG_ROOT_L-1) #define PDIR_L1_L 30 #define PDIR_L1_NUM (PDIR_L1_H-PDIR_L1_L+1) #define PDIR_L1_SIZE (1 << PDIR_L1_L) /* va range mapped by pdir */ #define PDIR_L1_MASK ((1<> PDIR_L1_SHIFT) & PDIR_L1_MASK) #define PDIR_L1_ENTRY_SHIFT 3 /* log2 (sizeof(struct pte_entry *)) */ #define PDIR_L1_PAGES ((PDIR_L1_NENTRIES * (1<> PDIR_SHIFT) & PDIR_MASK) #define PDIR_ENTRY_SHIFT 3 /* log2 (sizeof(struct pte_entry *)) */ #define PDIR_PAGES ((PDIR_NENTRIES * (1<> PTBL_SHIFT) & PTBL_MASK) #define PTBL_ENTRY_SHIFT 3 /* log2 (sizeof (struct pte_entry)) */ #define PTBL_PAGES ((PTBL_NENTRIES * (1<> PDIR_SHIFT) #define PDIR_ENTRY_SHIFT 2 /* entry size is 2^2 = 4 bytes */ /* * 2nd level - page table (ptbl) * * Page table covers 1024 page table entries. Page * table entry (pte) is 32 bit wide and defines mapping * for a single page. */ #define PTBL_SHIFT PAGE_SHIFT #define PTBL_SIZE PAGE_SIZE /* va range mapped by ptbl entry */ #define PTBL_MASK ((PDIR_SIZE - 1) & ~((1 << PAGE_SHIFT) - 1)) #define PTBL_NENTRIES 1024 /* number of pages mapped by ptbl */ /* Returns ptbl entry number for given va */ #define PTBL_IDX(va) (((va) & PTBL_MASK) >> PTBL_SHIFT) /* Size of ptbl in pages, 1024 entries, each sizeof(struct pte_entry). */ #define PTBL_PAGES 2 #define PTBL_ENTRY_SHIFT 3 /* entry size is 2^3 = 8 bytes */ #endif #endif /* _MACHINE_PTE_H_ */ Index: head/sys/powerpc/include/spr.h =================================================================== --- head/sys/powerpc/include/spr.h (revision 360886) +++ head/sys/powerpc/include/spr.h (revision 360887) @@ -1,890 +1,901 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2001 The NetBSD Foundation, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * * $NetBSD: spr.h,v 1.25 2002/08/14 15:38:40 matt Exp $ * $FreeBSD$ */ #ifndef _POWERPC_SPR_H_ #define _POWERPC_SPR_H_ #ifndef _LOCORE #define mtspr(reg, val) \ __asm __volatile("mtspr %0,%1" : : "K"(reg), "r"(val)) #define mfspr(reg) \ ( { register_t val; \ __asm __volatile("mfspr %0,%1" : "=r"(val) : "K"(reg)); \ val; } ) #ifndef __powerpc64__ /* The following routines allow manipulation of the full 64-bit width * of SPRs on 64 bit CPUs in bridge mode */ #define mtspr64(reg,valhi,vallo,scratch) \ __asm __volatile(" \ mfmsr %0; \ insrdi %0,%5,1,0; \ mtmsrd %0; \ isync; \ \ sld %1,%1,%4; \ or %1,%1,%2; \ mtspr %3,%1; \ srd %1,%1,%4; \ \ clrldi %0,%0,1; \ mtmsrd %0; \ isync;" \ : "=r"(scratch), "=r"(valhi) : "r"(vallo), "K"(reg), "r"(32), "r"(1)) #define mfspr64upper(reg,scratch) \ ( { register_t val; \ __asm __volatile(" \ mfmsr %0; \ insrdi %0,%4,1,0; \ mtmsrd %0; \ isync; \ \ mfspr %1,%2; \ srd %1,%1,%3; \ \ clrldi %0,%0,1; \ mtmsrd %0; \ isync;" \ : "=r"(scratch), "=r"(val) : "K"(reg), "r"(32), "r"(1)); \ val; } ) #endif #endif /* _LOCORE */ /* * Special Purpose Register declarations. * * The first column in the comments indicates which PowerPC * architectures the SPR is valid on - 4 for 4xx series, * 6 for 6xx/7xx series and 8 for 8xx and 8xxx series. */ #define SPR_MQ 0x000 /* .6. 601 MQ register */ #define SPR_XER 0x001 /* 468 Fixed Point Exception Register */ #define SPR_DSCR 0x003 /* .6. Data Stream Control Register (Unprivileged) */ #define SPR_RTCU_R 0x004 /* .6. 601 RTC Upper - Read */ #define SPR_RTCL_R 0x005 /* .6. 601 RTC Lower - Read */ #define SPR_LR 0x008 /* 468 Link Register */ #define SPR_CTR 0x009 /* 468 Count Register */ #define SPR_DSCRP 0x011 /* Data Stream Control Register (Privileged) */ #define SPR_DSISR 0x012 /* .68 DSI exception source */ #define DSISR_DIRECT 0x80000000 /* Direct-store error exception */ #define DSISR_NOTFOUND 0x40000000 /* Translation not found */ #define DSISR_PROTECT 0x08000000 /* Memory access not permitted */ #define DSISR_INVRX 0x04000000 /* Reserve-indexed insn direct-store access */ #define DSISR_STORE 0x02000000 /* Store operation */ #define DSISR_DABR 0x00400000 /* DABR match */ #define DSISR_SEGMENT 0x00200000 /* XXX; not in 6xx PEM */ #define DSISR_EAR 0x00100000 /* eciwx/ecowx && EAR[E] == 0 */ #define DSISR_MC_UE_DEFERRED 0x00008000 /* UE deferred error */ #define DSISR_MC_UE_TABLEWALK 0x00004000 /* UE deferred error during tablewalk */ #define DSISR_MC_DERAT_MULTIHIT 0x00000800 /* D-ERAT multi-hit */ #define DSISR_MC_TLB_MULTIHIT 0x00000400 /* TLB multi-hit */ #define DSISR_MC_TLBIE_ERR 0x00000200 /* TLBIE or TLBIEL programming error */ #define DSISR_MC_SLB_PARITY 0x00000100 /* SLB parity error */ #define DSISR_MC_SLB_MULTIHIT 0x00000080 /* SLB Multi-hit detected (D-side) */ #define DSISR_MC_BAD_REAL_LD 0x00000040 /* Bad real address for load. */ #define DSISR_MC_BAD_ADDR 0x00000020 /* Bad address for load or store tablewalk */ #define SPR_DAR 0x013 /* .68 Data Address Register */ #define SPR_RTCU_W 0x014 /* .6. 601 RTC Upper - Write */ #define SPR_RTCL_W 0x015 /* .6. 601 RTC Lower - Write */ #define SPR_DEC 0x016 /* .68 DECrementer register */ #define SPR_SDR1 0x019 /* .68 Page table base address register */ #define SPR_SRR0 0x01a /* 468 Save/Restore Register 0 */ #define SPR_SRR1 0x01b /* 468 Save/Restore Register 1 */ #define SRR1_ISI_PFAULT 0x40000000 /* ISI page not found */ #define SRR1_ISI_NOEXECUTE 0x10000000 /* Memory marked no-execute */ #define SRR1_ISI_PP 0x08000000 /* PP bits forbid access */ #define SRR1_MCHK_DATA 0x00200000 /* Machine check data in DSISR */ #define SRR1_MCHK_IFETCH_M 0x081c0000 /* Machine check instr fetch mask */ #define SRR1_MCHK_IFETCH_SLBMH 0x000c0000 /* SLB multihit */ +#define SPR_CFAR 0x01c /* Come From Address Register */ +#define SPR_AMR 0x01d /* Authority Mask Register */ + +#define SPR_PID 0x030 /* 4.. Process ID */ + #define SPR_DECAR 0x036 /* ..8 Decrementer auto reload */ +#define SPR_IAMR 0x03d /* Instr. Authority Mask Reg */ + #define SPR_EIE 0x050 /* ..8 Exception Interrupt ??? */ #define SPR_EID 0x051 /* ..8 Exception Interrupt ??? */ #define SPR_NRI 0x052 /* ..8 Exception Interrupt ??? */ #define SPR_FSCR 0x099 /* Facility Status and Control Register */ #define FSCR_IC_MASK 0xFF00000000000000ULL /* FSCR[0:7] is Interrupt Cause */ #define FSCR_IC_FP 0x0000000000000000ULL /* FP unavailable */ #define FSCR_IC_VSX 0x0100000000000000ULL /* VSX unavailable */ #define FSCR_IC_DSCR 0x0200000000000000ULL /* Access to the DSCR at SPRs 3 or 17 */ #define FSCR_IC_PM 0x0300000000000000ULL /* Read or write access of a Performance Monitor SPR in group A */ #define FSCR_IC_BHRB 0x0400000000000000ULL /* Execution of a BHRB Instruction */ #define FSCR_IC_HTM 0x0500000000000000ULL /* Access to a Transactional Memory */ /* Reserved 0x0600000000000000ULL */ #define FSCR_IC_EBB 0x0700000000000000ULL /* Access to Event-Based Branch */ #define FSCR_IC_TAR 0x0800000000000000ULL /* Access to Target Address Register */ #define FSCR_IC_STOP 0x0900000000000000ULL /* Access to the 'stop' instruction in privileged non-hypervisor state */ #define FSCR_IC_MSG 0x0A00000000000000ULL /* Access to 'msgsndp' or 'msgclrp' instructions */ #define FSCR_IC_LM 0x0A00000000000000ULL /* Access to load monitored facility */ #define FSCR_IC_SCV 0x0C00000000000000ULL /* Execution of a 'scv' instruction */ #define FSCR_SCV 0x0000000000001000 /* scv instruction available */ #define FSCR_LM 0x0000000000000800 /* Load monitored facilities available */ #define FSCR_MSGP 0x0000000000000400 /* msgsndp and SPRs available */ #define FSCR_TAR 0x0000000000000100 /* TAR register available */ #define FSCR_EBB 0x0000000000000080 /* Event-based branch available */ #define FSCR_DSCR 0x0000000000000004 /* DSCR available in PR state */ +#define SPR_UAMOR 0x09d /* User Authority Mask Override Register */ #define SPR_DPDES 0x0b0 /* .6. Directed Privileged Doorbell Exception State Register */ #define SPR_USPRG0 0x100 /* 4.8 User SPR General 0 */ #define SPR_VRSAVE 0x100 /* .6. AltiVec VRSAVE */ #define SPR_SPRG0 0x110 /* 468 SPR General 0 */ #define SPR_SPRG1 0x111 /* 468 SPR General 1 */ #define SPR_SPRG2 0x112 /* 468 SPR General 2 */ #define SPR_SPRG3 0x113 /* 468 SPR General 3 */ #define SPR_SPRG4 0x114 /* 4.8 SPR General 4 */ #define SPR_SPRG5 0x115 /* 4.8 SPR General 5 */ #define SPR_SPRG6 0x116 /* 4.8 SPR General 6 */ #define SPR_SPRG7 0x117 /* 4.8 SPR General 7 */ #define SPR_SCOMC 0x114 /* ... SCOM Address Register (970) */ #define SPR_SCOMD 0x115 /* ... SCOM Data Register (970) */ #define SPR_ASR 0x118 /* ... Address Space Register (PPC64) */ #define SPR_EAR 0x11a /* .68 External Access Register */ #define SPR_PVR 0x11f /* 468 Processor Version Register */ #define MPC601 0x0001 #define MPC603 0x0003 #define MPC604 0x0004 #define MPC602 0x0005 #define MPC603e 0x0006 #define MPC603ev 0x0007 #define MPC750 0x0008 #define MPC750CL 0x7000 /* Nintendo Wii's Broadway */ #define MPC604ev 0x0009 #define MPC7400 0x000c #define MPC620 0x0014 #define IBM403 0x0020 #define IBM401A1 0x0021 #define IBM401B2 0x0022 #define IBM401C2 0x0023 #define IBM401D2 0x0024 #define IBM401E2 0x0025 #define IBM401F2 0x0026 #define IBM401G2 0x0027 #define IBMRS64II 0x0033 #define IBMRS64III 0x0034 #define IBMPOWER4 0x0035 #define IBMRS64III_2 0x0036 #define IBMRS64IV 0x0037 #define IBMPOWER4PLUS 0x0038 #define IBM970 0x0039 #define IBMPOWER5 0x003a #define IBMPOWER5PLUS 0x003b #define IBM970FX 0x003c #define IBMPOWER6 0x003e #define IBMPOWER7 0x003f #define IBMPOWER3 0x0040 #define IBMPOWER3PLUS 0x0041 #define IBM970MP 0x0044 #define IBM970GX 0x0045 #define IBMPOWERPCA2 0x0049 #define IBMPOWER7PLUS 0x004a #define IBMPOWER8E 0x004b #define IBMPOWER8NVL 0x004c #define IBMPOWER8 0x004d #define IBMPOWER9 0x004e #define MPC860 0x0050 #define IBMCELLBE 0x0070 #define MPC8240 0x0081 #define PA6T 0x0090 #define IBM405GP 0x4011 #define IBM405L 0x4161 #define IBM750FX 0x7000 #define MPC745X_P(v) ((v & 0xFFF8) == 0x8000) #define MPC7450 0x8000 #define MPC7455 0x8001 #define MPC7457 0x8002 #define MPC7447A 0x8003 #define MPC7448 0x8004 #define MPC7410 0x800c #define MPC8245 0x8081 #define FSL_E500v1 0x8020 #define FSL_E500v2 0x8021 #define FSL_E500mc 0x8023 #define FSL_E5500 0x8024 #define FSL_E6500 0x8040 #define FSL_E300C1 0x8083 #define FSL_E300C2 0x8084 #define FSL_E300C3 0x8085 #define FSL_E300C4 0x8086 #define LPCR_PECE_WAKESET (LPCR_PECE_EXT | LPCR_PECE_DECR | LPCR_PECE_ME) #define SPR_DBSR 0x130 /* ..8 Debug Status Register */ #define DBSR_IDE 0x80000000 /* Imprecise debug event. */ #define DBSR_UDE 0x40000000 /* Unconditional debug event. */ #define DBSR_MRR 0x30000000 /* Most recent Reset (mask). */ #define DBSR_ICMP 0x08000000 /* Instr. complete debug event. */ #define DBSR_BRT 0x04000000 /* Branch taken debug event. */ #define DBSR_IRPT 0x02000000 /* Interrupt taken debug event. */ #define DBSR_TRAP 0x01000000 /* Trap instr. debug event. */ #define DBSR_IAC1 0x00800000 /* Instr. address compare #1. */ #define DBSR_IAC2 0x00400000 /* Instr. address compare #2. */ #define DBSR_IAC3 0x00200000 /* Instr. address compare #3. */ #define DBSR_IAC4 0x00100000 /* Instr. address compare #4. */ #define DBSR_DAC1R 0x00080000 /* Data addr. read compare #1. */ #define DBSR_DAC1W 0x00040000 /* Data addr. write compare #1. */ #define DBSR_DAC2R 0x00020000 /* Data addr. read compare #2. */ #define DBSR_DAC2W 0x00010000 /* Data addr. write compare #2. */ #define DBSR_RET 0x00008000 /* Return debug event. */ #define SPR_EPCR 0x133 #define EPCR_EXTGS 0x80000000 #define EPCR_DTLBGS 0x40000000 #define EPCR_ITLBGS 0x20000000 #define EPCR_DSIGS 0x10000000 #define EPCR_ISIGS 0x08000000 #define EPCR_DUVGS 0x04000000 #define EPCR_ICM 0x02000000 #define EPCR_GICMGS 0x01000000 #define EPCR_DGTMI 0x00800000 #define EPCR_DMIUH 0x00400000 #define EPCR_PMGS 0x00200000 #define SPR_DBCR0 0x134 /* ..8 Debug Control Register 0 */ #define SPR_DBCR1 0x135 /* ..8 Debug Control Register 1 */ #define SPR_IAC1 0x138 /* ..8 Instruction Address Compare 1 */ #define SPR_IAC2 0x139 /* ..8 Instruction Address Compare 2 */ #define SPR_IAC3 0x13a /* ..8 Instruction Address Compare 3 */ #define SPR_IAC4 0x13b /* ..8 Instruction Address Compare 4 */ #define SPR_HSRR0 0x13a #define SPR_HSRR1 0x13b #define SPR_DAC1 0x13c /* ..8 Data Address Compare 1 */ #define SPR_DAC2 0x13d /* ..8 Data Address Compare 2 */ #define SPR_DVC1 0x13e /* ..8 Data Value Compare 1 */ #define SPR_DVC2 0x13f /* ..8 Data Value Compare 2 */ #define SPR_LPCR 0x13e /* .6. Logical Partitioning Control */ #define LPCR_LPES 0x008 /* Bit 60 */ #define LPCR_HVICE 0x002 /* Hypervisor Virtualization Interrupt (Arch 3.0) */ +#define LPCR_UPRT (1ULL << 22) /* Use Process Table (ISA 3) */ +#define LPCR_HR (1ULL << 20) /* Host Radix mode */ #define LPCR_PECE_DRBL (1ULL << 16) /* Directed Privileged Doorbell */ #define LPCR_PECE_HDRBL (1ULL << 15) /* Directed Hypervisor Doorbell */ #define LPCR_PECE_EXT (1ULL << 14) /* External exceptions */ #define LPCR_PECE_DECR (1ULL << 13) /* Decrementer exceptions */ #define LPCR_PECE_ME (1ULL << 12) /* Machine Check and Hypervisor */ /* Maintenance exceptions */ #define SPR_LPID 0x13f /* .6. Logical Partitioning Control */ #define SPR_HMER 0x150 /* Hypervisor Maintenance Exception Register */ #define SPR_HMEER 0x151 /* Hypervisor Maintenance Exception Enable Register */ +#define SPR_AMOR 0x15d /* Authority Mask Override Register */ #define SPR_TIR 0x1be /* .6. Thread Identification Register */ #define SPR_PTCR 0x1d0 /* Partition Table Control Register */ #define SPR_SPEFSCR 0x200 /* ..8 Signal Processing Engine FSCR. */ #define SPEFSCR_SOVH 0x80000000 #define SPEFSCR_OVH 0x40000000 #define SPEFSCR_FGH 0x20000000 #define SPEFSCR_FXH 0x10000000 #define SPEFSCR_FINVH 0x08000000 #define SPEFSCR_FDBZH 0x04000000 #define SPEFSCR_FUNFH 0x02000000 #define SPEFSCR_FOVFH 0x01000000 #define SPEFSCR_FINXS 0x00200000 #define SPEFSCR_FINVS 0x00100000 #define SPEFSCR_FDBZS 0x00080000 #define SPEFSCR_FUNFS 0x00040000 #define SPEFSCR_FOVFS 0x00020000 #define SPEFSCR_SOV 0x00008000 #define SPEFSCR_OV 0x00004000 #define SPEFSCR_FG 0x00002000 #define SPEFSCR_FX 0x00001000 #define SPEFSCR_FINV 0x00000800 #define SPEFSCR_FDBZ 0x00000400 #define SPEFSCR_FUNF 0x00000200 #define SPEFSCR_FOVF 0x00000100 #define SPEFSCR_FINXE 0x00000040 #define SPEFSCR_FINVE 0x00000020 #define SPEFSCR_FDBZE 0x00000010 #define SPEFSCR_FUNFE 0x00000008 #define SPEFSCR_FOVFE 0x00000004 #define SPEFSCR_FRMC_M 0x00000003 #define SPR_IBAT0U 0x210 /* .6. Instruction BAT Reg 0 Upper */ #define SPR_IBAT0L 0x211 /* .6. Instruction BAT Reg 0 Lower */ #define SPR_IBAT1U 0x212 /* .6. Instruction BAT Reg 1 Upper */ #define SPR_IBAT1L 0x213 /* .6. Instruction BAT Reg 1 Lower */ #define SPR_IBAT2U 0x214 /* .6. Instruction BAT Reg 2 Upper */ #define SPR_IBAT2L 0x215 /* .6. Instruction BAT Reg 2 Lower */ #define SPR_IBAT3U 0x216 /* .6. Instruction BAT Reg 3 Upper */ #define SPR_IBAT3L 0x217 /* .6. Instruction BAT Reg 3 Lower */ #define SPR_DBAT0U 0x218 /* .6. Data BAT Reg 0 Upper */ #define SPR_DBAT0L 0x219 /* .6. Data BAT Reg 0 Lower */ #define SPR_DBAT1U 0x21a /* .6. Data BAT Reg 1 Upper */ #define SPR_DBAT1L 0x21b /* .6. Data BAT Reg 1 Lower */ #define SPR_DBAT2U 0x21c /* .6. Data BAT Reg 2 Upper */ #define SPR_DBAT2L 0x21d /* .6. Data BAT Reg 2 Lower */ #define SPR_DBAT3U 0x21e /* .6. Data BAT Reg 3 Upper */ #define SPR_DBAT3L 0x21f /* .6. Data BAT Reg 3 Lower */ #define SPR_IC_CST 0x230 /* ..8 Instruction Cache CSR */ #define IC_CST_IEN 0x80000000 /* I cache is ENabled (RO) */ #define IC_CST_CMD_INVALL 0x0c000000 /* I cache invalidate all */ #define IC_CST_CMD_UNLOCKALL 0x0a000000 /* I cache unlock all */ #define IC_CST_CMD_UNLOCK 0x08000000 /* I cache unlock block */ #define IC_CST_CMD_LOADLOCK 0x06000000 /* I cache load & lock block */ #define IC_CST_CMD_DISABLE 0x04000000 /* I cache disable */ #define IC_CST_CMD_ENABLE 0x02000000 /* I cache enable */ #define IC_CST_CCER1 0x00200000 /* I cache error type 1 (RO) */ #define IC_CST_CCER2 0x00100000 /* I cache error type 2 (RO) */ #define IC_CST_CCER3 0x00080000 /* I cache error type 3 (RO) */ #define SPR_IBAT4U 0x230 /* .6. Instruction BAT Reg 4 Upper */ #define SPR_IC_ADR 0x231 /* ..8 Instruction Cache Address */ #define SPR_IBAT4L 0x231 /* .6. Instruction BAT Reg 4 Lower */ #define SPR_IC_DAT 0x232 /* ..8 Instruction Cache Data */ #define SPR_IBAT5U 0x232 /* .6. Instruction BAT Reg 5 Upper */ #define SPR_IBAT5L 0x233 /* .6. Instruction BAT Reg 5 Lower */ #define SPR_IBAT6U 0x234 /* .6. Instruction BAT Reg 6 Upper */ #define SPR_IBAT6L 0x235 /* .6. Instruction BAT Reg 6 Lower */ #define SPR_IBAT7U 0x236 /* .6. Instruction BAT Reg 7 Upper */ #define SPR_IBAT7L 0x237 /* .6. Instruction BAT Reg 7 Lower */ #define SPR_DC_CST 0x230 /* ..8 Data Cache CSR */ #define DC_CST_DEN 0x80000000 /* D cache ENabled (RO) */ #define DC_CST_DFWT 0x40000000 /* D cache Force Write-Thru (RO) */ #define DC_CST_LES 0x20000000 /* D cache Little Endian Swap (RO) */ #define DC_CST_CMD_FLUSH 0x0e000000 /* D cache invalidate all */ #define DC_CST_CMD_INVALL 0x0c000000 /* D cache invalidate all */ #define DC_CST_CMD_UNLOCKALL 0x0a000000 /* D cache unlock all */ #define DC_CST_CMD_UNLOCK 0x08000000 /* D cache unlock block */ #define DC_CST_CMD_CLRLESWAP 0x07000000 /* D cache clr little-endian swap */ #define DC_CST_CMD_LOADLOCK 0x06000000 /* D cache load & lock block */ #define DC_CST_CMD_SETLESWAP 0x05000000 /* D cache set little-endian swap */ #define DC_CST_CMD_DISABLE 0x04000000 /* D cache disable */ #define DC_CST_CMD_CLRFWT 0x03000000 /* D cache clear forced write-thru */ #define DC_CST_CMD_ENABLE 0x02000000 /* D cache enable */ #define DC_CST_CMD_SETFWT 0x01000000 /* D cache set forced write-thru */ #define DC_CST_CCER1 0x00200000 /* D cache error type 1 (RO) */ #define DC_CST_CCER2 0x00100000 /* D cache error type 2 (RO) */ #define DC_CST_CCER3 0x00080000 /* D cache error type 3 (RO) */ #define SPR_DBAT4U 0x238 /* .6. Data BAT Reg 4 Upper */ #define SPR_DC_ADR 0x231 /* ..8 Data Cache Address */ #define SPR_DBAT4L 0x239 /* .6. Data BAT Reg 4 Lower */ #define SPR_DC_DAT 0x232 /* ..8 Data Cache Data */ #define SPR_DBAT5U 0x23a /* .6. Data BAT Reg 5 Upper */ #define SPR_DBAT5L 0x23b /* .6. Data BAT Reg 5 Lower */ #define SPR_DBAT6U 0x23c /* .6. Data BAT Reg 6 Upper */ #define SPR_DBAT6L 0x23d /* .6. Data BAT Reg 6 Lower */ #define SPR_DBAT7U 0x23e /* .6. Data BAT Reg 7 Upper */ #define SPR_DBAT7L 0x23f /* .6. Data BAT Reg 7 Lower */ #define SPR_SPRG8 0x25c /* ..8 SPR General 8 */ #define SPR_MI_CTR 0x310 /* ..8 IMMU control */ #define Mx_CTR_GPM 0x80000000 /* Group Protection Mode */ #define Mx_CTR_PPM 0x40000000 /* Page Protection Mode */ #define Mx_CTR_CIDEF 0x20000000 /* Cache-Inhibit DEFault */ #define MD_CTR_WTDEF 0x20000000 /* Write-Through DEFault */ #define Mx_CTR_RSV4 0x08000000 /* Reserve 4 TLB entries */ #define MD_CTR_TWAM 0x04000000 /* TableWalk Assist Mode */ #define Mx_CTR_PPCS 0x02000000 /* Priv/user state compare mode */ #define Mx_CTR_TLB_INDX 0x000001f0 /* TLB index mask */ #define Mx_CTR_TLB_INDX_BITPOS 8 /* TLB index shift */ #define SPR_MI_AP 0x312 /* ..8 IMMU access protection */ #define Mx_GP_SUPER(n) (0 << (2*(15-(n)))) /* access is supervisor */ #define Mx_GP_PAGE (1 << (2*(15-(n)))) /* access is page protect */ #define Mx_GP_SWAPPED (2 << (2*(15-(n)))) /* access is swapped */ #define Mx_GP_USER (3 << (2*(15-(n)))) /* access is user */ #define SPR_MI_EPN 0x313 /* ..8 IMMU effective number */ #define Mx_EPN_EPN 0xfffff000 /* Effective Page Number mask */ #define Mx_EPN_EV 0x00000020 /* Entry Valid */ #define Mx_EPN_ASID 0x0000000f /* Address Space ID */ #define SPR_MI_TWC 0x315 /* ..8 IMMU tablewalk control */ #define MD_TWC_L2TB 0xfffff000 /* Level-2 Tablewalk Base */ #define Mx_TWC_APG 0x000001e0 /* Access Protection Group */ #define Mx_TWC_G 0x00000010 /* Guarded memory */ #define Mx_TWC_PS 0x0000000c /* Page Size (L1) */ #define MD_TWC_WT 0x00000002 /* Write-Through */ #define Mx_TWC_V 0x00000001 /* Entry Valid */ #define SPR_MI_RPN 0x316 /* ..8 IMMU real (phys) page number */ #define Mx_RPN_RPN 0xfffff000 /* Real Page Number */ #define Mx_RPN_PP 0x00000ff0 /* Page Protection */ #define Mx_RPN_SPS 0x00000008 /* Small Page Size */ #define Mx_RPN_SH 0x00000004 /* SHared page */ #define Mx_RPN_CI 0x00000002 /* Cache Inhibit */ #define Mx_RPN_V 0x00000001 /* Valid */ #define SPR_MD_CTR 0x318 /* ..8 DMMU control */ #define SPR_M_CASID 0x319 /* ..8 CASID */ #define M_CASID 0x0000000f /* Current AS Id */ #define SPR_MD_AP 0x31a /* ..8 DMMU access protection */ #define SPR_MD_EPN 0x31b /* ..8 DMMU effective number */ #define SPR_970MMCR0 0x31b /* ... Monitor Mode Control Register 0 (PPC 970) */ #define SPR_970MMCR0_PMC1SEL(x) ((x) << 8) /* PMC1 selector (970) */ #define SPR_970MMCR0_PMC2SEL(x) ((x) << 1) /* PMC2 selector (970) */ #define SPR_970MMCR1 0x31e /* ... Monitor Mode Control Register 1 (PPC 970) */ #define SPR_970MMCR1_PMC3SEL(x) (((x) & 0x1f) << 27) /* PMC 3 selector */ #define SPR_970MMCR1_PMC4SEL(x) (((x) & 0x1f) << 22) /* PMC 4 selector */ #define SPR_970MMCR1_PMC5SEL(x) (((x) & 0x1f) << 17) /* PMC 5 selector */ #define SPR_970MMCR1_PMC6SEL(x) (((x) & 0x1f) << 12) /* PMC 6 selector */ #define SPR_970MMCR1_PMC7SEL(x) (((x) & 0x1f) << 7) /* PMC 7 selector */ #define SPR_970MMCR1_PMC8SEL(x) (((x) & 0x1f) << 2) /* PMC 8 selector */ #define SPR_970MMCRA 0x312 /* ... Monitor Mode Control Register 2 (PPC 970) */ #define SPR_970PMC1 0x313 /* ... PMC 1 */ #define SPR_970PMC2 0x314 /* ... PMC 2 */ #define SPR_970PMC3 0x315 /* ... PMC 3 */ #define SPR_970PMC4 0x316 /* ... PMC 4 */ #define SPR_970PMC5 0x317 /* ... PMC 5 */ #define SPR_970PMC6 0x318 /* ... PMC 6 */ #define SPR_970PMC7 0x319 /* ... PMC 7 */ #define SPR_970PMC8 0x31a /* ... PMC 8 */ #define SPR_M_TWB 0x31c /* ..8 MMU tablewalk base */ #define M_TWB_L1TB 0xfffff000 /* level-1 translation base */ #define M_TWB_L1INDX 0x00000ffc /* level-1 index */ #define SPR_MD_TWC 0x31d /* ..8 DMMU tablewalk control */ #define SPR_MD_RPN 0x31e /* ..8 DMMU real (phys) page number */ #define SPR_MD_TW 0x31f /* ..8 MMU tablewalk scratch */ #define SPR_BESCRS 0x320 /* .6. Branch Event Status and Control Set Register */ #define SPR_BESCRSU 0x321 /* .6. Branch Event Status and Control Set Register (upper 32-bit) */ #define SPR_BESCRR 0x322 /* .6. Branch Event Status and Control Reset Register */ #define SPR_BESCRRU 0x323 /* .6. Branch Event Status and Control Register (upper 32-bit) */ #define SPR_EBBHR 0x324 /* .6. Event-based Branch Handler Register */ #define SPR_EBBRR 0x325 /* .6. Event-based Branch Return Register */ #define SPR_BESCR 0x326 /* .6. Branch Event Status and Control Register */ #define SPR_LMRR 0x32d /* .6. Load Monitored Region Register */ #define SPR_LMSER 0x32e /* .6. Load Monitored Section Enable Register */ #define SPR_TAR 0x32f /* .6. Branch Target Address Register */ #define SPR_MI_CAM 0x330 /* ..8 IMMU CAM entry read */ #define SPR_MI_RAM0 0x331 /* ..8 IMMU RAM entry read reg 0 */ #define SPR_MI_RAM1 0x332 /* ..8 IMMU RAM entry read reg 1 */ #define SPR_MD_CAM 0x338 /* ..8 IMMU CAM entry read */ #define SPR_MD_RAM0 0x339 /* ..8 IMMU RAM entry read reg 0 */ #define SPR_MD_RAM1 0x33a /* ..8 IMMU RAM entry read reg 1 */ #define SPR_PSSCR 0x357 /* Processor Stop Status and Control Register (ISA 3.0) */ #define PSSCR_PLS_S 60 #define PSSCR_PLS_M (0xf << PSSCR_PLS_S) #define PSSCR_SD (1 << 22) #define PSSCR_ESL (1 << 21) #define PSSCR_EC (1 << 20) #define PSSCR_PSLL_S 16 #define PSSCR_PSLL_M (0xf << PSSCR_PSLL_S) #define PSSCR_TR_S 8 #define PSSCR_TR_M (0x3 << PSSCR_TR_S) #define PSSCR_MTL_S 4 #define PSSCR_MTL_M (0xf << PSSCR_MTL_S) #define PSSCR_RL_S 0 #define PSSCR_RL_M (0xf << PSSCR_RL_S) #define SPR_PMCR 0x374 /* Processor Management Control Register */ #define SPR_UMMCR2 0x3a0 /* .6. User Monitor Mode Control Register 2 */ #define SPR_UMMCR0 0x3a8 /* .6. User Monitor Mode Control Register 0 */ #define SPR_USIA 0x3ab /* .6. User Sampled Instruction Address */ #define SPR_UMMCR1 0x3ac /* .6. User Monitor Mode Control Register 1 */ #define SPR_MMCR2 0x3b0 /* .6. Monitor Mode Control Register 2 */ #define SPR_MMCR2_THRESHMULT_32 0x80000000 /* Multiply MMCR0 threshold by 32 */ #define SPR_MMCR2_THRESHMULT_2 0x00000000 /* Multiply MMCR0 threshold by 2 */ #define SPR_PMC5 0x3b1 /* .6. Performance Counter Register 5 */ #define SPR_PMC6 0x3b2 /* .6. Performance Counter Register 6 */ #define SPR_MMCR0 0x3b8 /* .6. Monitor Mode Control Register 0 */ #define SPR_MMCR0_FC 0x80000000 /* Freeze counters */ #define SPR_MMCR0_FCS 0x40000000 /* Freeze counters in supervisor mode */ #define SPR_MMCR0_FCP 0x20000000 /* Freeze counters in user mode */ #define SPR_MMCR0_FCM1 0x10000000 /* Freeze counters when mark=1 */ #define SPR_MMCR0_FCM0 0x08000000 /* Freeze counters when mark=0 */ #define SPR_MMCR0_PMXE 0x04000000 /* Enable PM interrupt */ #define SPR_MMCR0_FCECE 0x02000000 /* Freeze counters after event */ #define SPR_MMCR0_TBSEL_15 0x01800000 /* Count bit 15 of TBL */ #define SPR_MMCR0_TBSEL_19 0x01000000 /* Count bit 19 of TBL */ #define SPR_MMCR0_TBSEL_23 0x00800000 /* Count bit 23 of TBL */ #define SPR_MMCR0_TBSEL_31 0x00000000 /* Count bit 31 of TBL */ #define SPR_MMCR0_TBEE 0x00400000 /* Time-base event enable */ #define SPR_MMCRO_THRESHOLD(x) ((x) << 16) /* Threshold value */ #define SPR_MMCR0_PMC1CE 0x00008000 /* PMC1 condition enable */ #define SPR_MMCR0_PMCNCE 0x00004000 /* PMCn condition enable */ #define SPR_MMCR0_TRIGGER 0x00002000 /* Trigger */ #define SPR_MMCR0_PMC1SEL(x) (((x) & 0x3f) << 6) /* PMC1 selector */ #define SPR_MMCR0_PMC2SEL(x) (((x) & 0x3f) << 0) /* PMC2 selector */ #define SPR_PMC1 0x3b9 /* .6. Performance Counter Register 1 */ #define SPR_PMC2 0x3ba /* .6. Performance Counter Register 2 */ #define SPR_SIA 0x3bb /* .6. Sampled Instruction Address */ #define SPR_MMCR1 0x3bc /* .6. Monitor Mode Control Register 2 */ #define SPR_MMCR1_PMC3SEL(x) (((x) & 0x1f) << 27) /* PMC 3 selector */ #define SPR_MMCR1_PMC4SEL(x) (((x) & 0x1f) << 22) /* PMC 4 selector */ #define SPR_MMCR1_PMC5SEL(x) (((x) & 0x1f) << 17) /* PMC 5 selector */ #define SPR_MMCR1_PMC6SEL(x) (((x) & 0x3f) << 11) /* PMC 6 selector */ #define SPR_PMC3 0x3bd /* .6. Performance Counter Register 3 */ #define SPR_PMC4 0x3be /* .6. Performance Counter Register 4 */ #define SPR_DMISS 0x3d0 /* .68 Data TLB Miss Address Register */ #define SPR_DCMP 0x3d1 /* .68 Data TLB Compare Register */ #define SPR_HASH1 0x3d2 /* .68 Primary Hash Address Register */ #define SPR_HASH2 0x3d3 /* .68 Secondary Hash Address Register */ #define SPR_IMISS 0x3d4 /* .68 Instruction TLB Miss Address Register */ #define SPR_TLBMISS 0x3d4 /* .6. TLB Miss Address Register */ #define SPR_DEAR 0x03d /* ..8 Data Exception Address Register */ #define SPR_ICMP 0x3d5 /* .68 Instruction TLB Compare Register */ #define SPR_PTEHI 0x3d5 /* .6. Instruction TLB Compare Register */ #define SPR_RPA 0x3d6 /* .68 Required Physical Address Register */ #define SPR_PTELO 0x3d6 /* .6. Required Physical Address Register */ #define SPR_TSR 0x150 /* ..8 Timer Status Register */ #define SPR_TCR 0x154 /* ..8 Timer Control Register */ #define TSR_ENW 0x80000000 /* Enable Next Watchdog */ #define TSR_WIS 0x40000000 /* Watchdog Interrupt Status */ #define TSR_WRS_MASK 0x30000000 /* Watchdog Reset Status */ #define TSR_WRS_NONE 0x00000000 /* No watchdog reset has occurred */ #define TSR_WRS_CORE 0x10000000 /* Core reset was forced by the watchdog */ #define TSR_WRS_CHIP 0x20000000 /* Chip reset was forced by the watchdog */ #define TSR_WRS_SYSTEM 0x30000000 /* System reset was forced by the watchdog */ #define TSR_PIS 0x08000000 /* PIT Interrupt Status */ #define TSR_DIS 0x08000000 /* Decrementer Interrupt Status */ #define TSR_FIS 0x04000000 /* FIT Interrupt Status */ #define TCR_WP_MASK 0xc0000000 /* Watchdog Period mask */ #define TCR_WP_2_17 0x00000000 /* 2**17 clocks */ #define TCR_WP_2_21 0x40000000 /* 2**21 clocks */ #define TCR_WP_2_25 0x80000000 /* 2**25 clocks */ #define TCR_WP_2_29 0xc0000000 /* 2**29 clocks */ #define TCR_WRC_MASK 0x30000000 /* Watchdog Reset Control mask */ #define TCR_WRC_NONE 0x00000000 /* No watchdog reset */ #define TCR_WRC_CORE 0x10000000 /* Core reset */ #define TCR_WRC_CHIP 0x20000000 /* Chip reset */ #define TCR_WRC_SYSTEM 0x30000000 /* System reset */ #define TCR_WIE 0x08000000 /* Watchdog Interrupt Enable */ #define TCR_PIE 0x04000000 /* PIT Interrupt Enable */ #define TCR_DIE 0x04000000 /* Pecrementer Interrupt Enable */ #define TCR_FP_MASK 0x03000000 /* FIT Period */ #define TCR_FP_2_9 0x00000000 /* 2**9 clocks */ #define TCR_FP_2_13 0x01000000 /* 2**13 clocks */ #define TCR_FP_2_17 0x02000000 /* 2**17 clocks */ #define TCR_FP_2_21 0x03000000 /* 2**21 clocks */ #define TCR_FIE 0x00800000 /* FIT Interrupt Enable */ #define TCR_ARE 0x00400000 /* Auto Reload Enable */ #define SPR_HID0 0x3f0 /* ..8 Hardware Implementation Register 0 */ #define SPR_HID1 0x3f1 /* ..8 Hardware Implementation Register 1 */ #define SPR_HID2 0x3f3 /* ..8 Hardware Implementation Register 2 */ #define SPR_HID4 0x3f4 /* ..8 Hardware Implementation Register 4 */ #define SPR_HID5 0x3f6 /* ..8 Hardware Implementation Register 5 */ #define SPR_HID6 0x3f9 /* ..8 Hardware Implementation Register 6 */ #define SPR_CELL_TSRL 0x380 /* ... Cell BE Thread Status Register */ #define SPR_CELL_TSCR 0x399 /* ... Cell BE Thread Switch Register */ #if defined(AIM) #define SPR_PIR 0x3ff /* .6. Processor Identification Register */ #elif defined(BOOKE) #define SPR_PIR 0x11e /* ..8 Processor Identification Register */ #endif #define DBCR0_EDM 0x80000000 /* External Debug Mode */ #define DBCR0_IDM 0x40000000 /* Internal Debug Mode */ #define DBCR0_RST_MASK 0x30000000 /* ReSeT */ #define DBCR0_RST_NONE 0x00000000 /* No action */ #define DBCR0_RST_CORE 0x10000000 /* Core reset */ #define DBCR0_RST_CHIP 0x20000000 /* Chip reset */ #define DBCR0_RST_SYSTEM 0x30000000 /* System reset */ #define DBCR0_IC 0x08000000 /* Instruction Completion debug event */ #define DBCR0_BT 0x04000000 /* Branch Taken debug event */ #define DBCR0_EDE 0x02000000 /* Exception Debug Event */ #define DBCR0_TDE 0x01000000 /* Trap Debug Event */ #define DBCR0_IA1 0x00800000 /* IAC (Instruction Address Compare) 1 debug event */ #define DBCR0_IA2 0x00400000 /* IAC 2 debug event */ #define DBCR0_IA12 0x00200000 /* Instruction Address Range Compare 1-2 */ #define DBCR0_IA12X 0x00100000 /* IA12 eXclusive */ #define DBCR0_IA3 0x00080000 /* IAC 3 debug event */ #define DBCR0_IA4 0x00040000 /* IAC 4 debug event */ #define DBCR0_IA34 0x00020000 /* Instruction Address Range Compare 3-4 */ #define DBCR0_IA34X 0x00010000 /* IA34 eXclusive */ #define DBCR0_IA12T 0x00008000 /* Instruction Address Range Compare 1-2 range Toggle */ #define DBCR0_IA34T 0x00004000 /* Instruction Address Range Compare 3-4 range Toggle */ #define DBCR0_FT 0x00000001 /* Freeze Timers on debug event */ #define SPR_IABR 0x3f2 /* ..8 Instruction Address Breakpoint Register 0 */ #define SPR_DABR 0x3f5 /* .6. Data Address Breakpoint Register */ #define SPR_MSSCR0 0x3f6 /* .6. Memory SubSystem Control Register */ #define MSSCR0_SHDEN 0x80000000 /* 0: Shared-state enable */ #define MSSCR0_SHDPEN3 0x40000000 /* 1: ~SHD[01] signal enable in MEI mode */ #define MSSCR0_L1INTVEN 0x38000000 /* 2-4: L1 data cache ~HIT intervention enable */ #define MSSCR0_L2INTVEN 0x07000000 /* 5-7: L2 data cache ~HIT intervention enable*/ #define MSSCR0_DL1HWF 0x00800000 /* 8: L1 data cache hardware flush */ #define MSSCR0_MBO 0x00400000 /* 9: must be one */ #define MSSCR0_EMODE 0x00200000 /* 10: MPX bus mode (read-only) */ #define MSSCR0_ABD 0x00100000 /* 11: address bus driven (read-only) */ #define MSSCR0_MBZ 0x000fffff /* 12-31: must be zero */ #define MSSCR0_L2PFE 0x00000003 /* 30-31: L2 prefetch enable */ #define SPR_MSSSR0 0x3f7 /* .6. Memory Subsystem Status Register (MPC745x) */ #define MSSSR0_L2TAG 0x00040000 /* 13: L2 tag parity error */ #define MSSSR0_L2DAT 0x00020000 /* 14: L2 data parity error */ #define MSSSR0_L3TAG 0x00010000 /* 15: L3 tag parity error */ #define MSSSR0_L3DAT 0x00008000 /* 16: L3 data parity error */ #define MSSSR0_APE 0x00004000 /* 17: Address parity error */ #define MSSSR0_DPE 0x00002000 /* 18: Data parity error */ #define MSSSR0_TEA 0x00001000 /* 19: Bus transfer error acknowledge */ #define SPR_LDSTCR 0x3f8 /* .6. Load/Store Control Register */ #define SPR_L2PM 0x3f8 /* .6. L2 Private Memory Control Register */ #define SPR_L2CR 0x3f9 /* .6. L2 Control Register */ #define L2CR_L2E 0x80000000 /* 0: L2 enable */ #define L2CR_L2PE 0x40000000 /* 1: L2 data parity enable */ #define L2CR_L2SIZ 0x30000000 /* 2-3: L2 size */ #define L2SIZ_2M 0x00000000 #define L2SIZ_256K 0x10000000 #define L2SIZ_512K 0x20000000 #define L2SIZ_1M 0x30000000 #define L2CR_L2CLK 0x0e000000 /* 4-6: L2 clock ratio */ #define L2CLK_DIS 0x00000000 /* disable L2 clock */ #define L2CLK_10 0x02000000 /* core clock / 1 */ #define L2CLK_15 0x04000000 /* / 1.5 */ #define L2CLK_20 0x08000000 /* / 2 */ #define L2CLK_25 0x0a000000 /* / 2.5 */ #define L2CLK_30 0x0c000000 /* / 3 */ #define L2CR_L2RAM 0x01800000 /* 7-8: L2 RAM type */ #define L2RAM_FLOWTHRU_BURST 0x00000000 #define L2RAM_PIPELINE_BURST 0x01000000 #define L2RAM_PIPELINE_LATE 0x01800000 #define L2CR_L2DO 0x00400000 /* 9: L2 data-only. Setting this bit disables instruction caching. */ #define L2CR_L2I 0x00200000 /* 10: L2 global invalidate. */ #define L2CR_L2IO_7450 0x00010000 /* 11: L2 instruction-only (MPC745x). */ #define L2CR_L2CTL 0x00100000 /* 11: L2 RAM control (ZZ enable). Enables automatic operation of the L2ZZ (low-power mode) signal. */ #define L2CR_L2WT 0x00080000 /* 12: L2 write-through. */ #define L2CR_L2TS 0x00040000 /* 13: L2 test support. */ #define L2CR_L2OH 0x00030000 /* 14-15: L2 output hold. */ #define L2CR_L2DO_7450 0x00010000 /* 15: L2 data-only (MPC745x). */ #define L2CR_L2SL 0x00008000 /* 16: L2 DLL slow. */ #define L2CR_L2DF 0x00004000 /* 17: L2 differential clock. */ #define L2CR_L2BYP 0x00002000 /* 18: L2 DLL bypass. */ #define L2CR_L2FA 0x00001000 /* 19: L2 flush assist (for software flush). */ #define L2CR_L2HWF 0x00000800 /* 20: L2 hardware flush. */ #define L2CR_L2IO 0x00000400 /* 21: L2 instruction-only. */ #define L2CR_L2CLKSTP 0x00000200 /* 22: L2 clock stop. */ #define L2CR_L2DRO 0x00000100 /* 23: L2DLL rollover checkstop enable. */ #define L2CR_L2IP 0x00000001 /* 31: L2 global invalidate in */ /* progress (read only). */ #define SPR_L3CR 0x3fa /* .6. L3 Control Register */ #define L3CR_L3E 0x80000000 /* 0: L3 enable */ #define L3CR_L3PE 0x40000000 /* 1: L3 data parity enable */ #define L3CR_L3APE 0x20000000 #define L3CR_L3SIZ 0x10000000 /* 3: L3 size (0=1MB, 1=2MB) */ #define L3CR_L3CLKEN 0x08000000 /* 4: Enables L3_CLK[0:1] */ #define L3CR_L3CLK 0x03800000 #define L3CR_L3IO 0x00400000 #define L3CR_L3CLKEXT 0x00200000 #define L3CR_L3CKSPEXT 0x00100000 #define L3CR_L3OH1 0x00080000 #define L3CR_L3SPO 0x00040000 #define L3CR_L3CKSP 0x00030000 #define L3CR_L3PSP 0x0000e000 #define L3CR_L3REP 0x00001000 #define L3CR_L3HWF 0x00000800 #define L3CR_L3I 0x00000400 /* 21: L3 global invalidate */ #define L3CR_L3RT 0x00000300 #define L3CR_L3NIRCA 0x00000080 #define L3CR_L3DO 0x00000040 #define L3CR_PMEN 0x00000004 #define L3CR_PMSIZ 0x00000003 #define SPR_THRM1 0x3fc /* .6. Thermal Management Register */ #define SPR_THRM2 0x3fd /* .6. Thermal Management Register */ #define SPR_THRM_TIN 0x80000000 /* Thermal interrupt bit (RO) */ #define SPR_THRM_TIV 0x40000000 /* Thermal interrupt valid (RO) */ #define SPR_THRM_THRESHOLD(x) ((x) << 23) /* Thermal sensor threshold */ #define SPR_THRM_TID 0x00000004 /* Thermal interrupt direction */ #define SPR_THRM_TIE 0x00000002 /* Thermal interrupt enable */ #define SPR_THRM_VALID 0x00000001 /* Valid bit */ #define SPR_THRM3 0x3fe /* .6. Thermal Management Register */ #define SPR_THRM_TIMER(x) ((x) << 1) /* Sampling interval timer */ #define SPR_THRM_ENABLE 0x00000001 /* TAU Enable */ #define SPR_FPECR 0x3fe /* .6. Floating-Point Exception Cause Register */ /* Time Base Register declarations */ #define TBR_TBL 0x10c /* 468 Time Base Lower - read */ #define TBR_TBU 0x10d /* 468 Time Base Upper - read */ #define TBR_TBWL 0x11c /* 468 Time Base Lower - supervisor, write */ #define TBR_TBWU 0x11d /* 468 Time Base Upper - supervisor, write */ /* Performance counter declarations */ #define PMC_OVERFLOW 0x80000000 /* Counter has overflowed */ /* The first five countable [non-]events are common to many PMC's */ #define PMCN_NONE 0 /* Count nothing */ #define PMCN_CYCLES 1 /* Processor cycles */ #define PMCN_ICOMP 2 /* Instructions completed */ #define PMCN_TBLTRANS 3 /* TBL bit transitions */ #define PCMN_IDISPATCH 4 /* Instructions dispatched */ /* Similar things for the 970 PMC direct counters */ #define PMC970N_NONE 0x8 /* Count nothing */ #define PMC970N_CYCLES 0xf /* Processor cycles */ #define PMC970N_ICOMP 0x9 /* Instructions completed */ #if defined(BOOKE) #define SPR_MCARU 0x239 /* ..8 Machine Check Address register upper bits */ #define SPR_MCSR 0x23c /* ..8 Machine Check Syndrome register */ #define MCSR_MCP 0x80000000 /* Machine check input signal to core */ #define MCSR_L2MMU_MHIT 0x08000000 /* L2 MMU simultaneous hit */ #define MCSR_NMI 0x00100000 /* Non-maskable interrupt */ #define MCSR_MAV 0x00080000 /* MCAR address valid */ #define MCSR_MEA 0x00040000 /* MCAR effective address */ #define MCSR_IF 0x00010000 /* Instruction fetch error report */ #define MCSR_LD 0x00008000 /* Load instruction error report */ #define MCSR_ST 0x00004000 /* Store instruction error report */ #define MCSR_LDG 0x00002000 /* Guarded load instruction error report */ #define MCSR_TLBSYNC 0x00000002 /* Simultaneous TLBSYNC detected */ #define SPR_MCAR 0x23d /* ..8 Machine Check Address register */ #define SPR_ESR 0x003e /* ..8 Exception Syndrome Register */ #define ESR_PIL 0x08000000 /* Program interrupt - illegal */ #define ESR_PPR 0x04000000 /* Program interrupt - privileged */ #define ESR_PTR 0x02000000 /* Program interrupt - trap */ #define ESR_ST 0x00800000 /* Store operation */ #define ESR_DLK 0x00200000 /* Data storage, D cache locking */ #define ESR_ILK 0x00100000 /* Data storage, I cache locking */ #define ESR_BO 0x00020000 /* Data/instruction storage, byte ordering */ #define ESR_SPE 0x00000080 /* SPE exception bit */ #define SPR_CSRR0 0x03a /* ..8 58 Critical SRR0 */ #define SPR_CSRR1 0x03b /* ..8 59 Critical SRR1 */ #define SPR_MCSRR0 0x23a /* ..8 570 Machine check SRR0 */ #define SPR_MCSRR1 0x23b /* ..8 571 Machine check SRR1 */ #define SPR_DSRR0 0x23e /* ..8 574 Debug SRR0 */ #define SPR_DSRR1 0x23f /* ..8 575 Debug SRR1 */ #define SPR_MMUCSR0 0x3f4 /* ..8 1012 MMU Control and Status Register 0 */ #define MMUCSR0_L2TLB0_FI 0x04 /* TLB0 flash invalidate */ #define MMUCSR0_L2TLB1_FI 0x02 /* TLB1 flash invalidate */ #define SPR_SVR 0x3ff /* ..8 1023 System Version Register */ #define SVR_MPC8533 0x8034 #define SVR_MPC8533E 0x803c #define SVR_MPC8541 0x8072 #define SVR_MPC8541E 0x807a #define SVR_MPC8548 0x8031 #define SVR_MPC8548E 0x8039 #define SVR_MPC8555 0x8071 #define SVR_MPC8555E 0x8079 #define SVR_MPC8572 0x80e0 #define SVR_MPC8572E 0x80e8 #define SVR_P1011 0x80e5 #define SVR_P1011E 0x80ed #define SVR_P1013 0x80e7 #define SVR_P1013E 0x80ef #define SVR_P1020 0x80e4 #define SVR_P1020E 0x80ec #define SVR_P1022 0x80e6 #define SVR_P1022E 0x80ee #define SVR_P2010 0x80e3 #define SVR_P2010E 0x80eb #define SVR_P2020 0x80e2 #define SVR_P2020E 0x80ea #define SVR_P2041 0x8210 #define SVR_P2041E 0x8218 #define SVR_P3041 0x8211 #define SVR_P3041E 0x8219 #define SVR_P4040 0x8200 #define SVR_P4040E 0x8208 #define SVR_P4080 0x8201 #define SVR_P4080E 0x8209 #define SVR_P5010 0x8221 #define SVR_P5010E 0x8229 #define SVR_P5020 0x8220 #define SVR_P5020E 0x8228 #define SVR_P5021 0x8205 #define SVR_P5021E 0x820d #define SVR_P5040 0x8204 #define SVR_P5040E 0x820c #define SVR_VER(svr) (((svr) >> 16) & 0xffff) #define SPR_PID0 0x030 /* ..8 Process ID Register 0 */ #define SPR_PID1 0x279 /* ..8 Process ID Register 1 */ #define SPR_PID2 0x27a /* ..8 Process ID Register 2 */ #define SPR_TLB0CFG 0x2B0 /* ..8 TLB 0 Config Register */ #define SPR_TLB1CFG 0x2B1 /* ..8 TLB 1 Config Register */ #define TLBCFG_ASSOC_MASK 0xff000000 /* Associativity of TLB */ #define TLBCFG_ASSOC_SHIFT 24 #define TLBCFG_NENTRY_MASK 0x00000fff /* Number of entries in TLB */ #define SPR_IVPR 0x03f /* ..8 Interrupt Vector Prefix Register */ #define SPR_IVOR0 0x190 /* ..8 Critical input */ #define SPR_IVOR1 0x191 /* ..8 Machine check */ #define SPR_IVOR2 0x192 #define SPR_IVOR3 0x193 #define SPR_IVOR4 0x194 #define SPR_IVOR5 0x195 #define SPR_IVOR6 0x196 #define SPR_IVOR7 0x197 #define SPR_IVOR8 0x198 #define SPR_IVOR9 0x199 #define SPR_IVOR10 0x19a #define SPR_IVOR11 0x19b #define SPR_IVOR12 0x19c #define SPR_IVOR13 0x19d #define SPR_IVOR14 0x19e #define SPR_IVOR15 0x19f #define SPR_IVOR32 0x210 #define SPR_IVOR33 0x211 #define SPR_IVOR34 0x212 #define SPR_IVOR35 0x213 #define SPR_MAS0 0x270 /* ..8 MMU Assist Register 0 Book-E/e500 */ #define SPR_MAS1 0x271 /* ..8 MMU Assist Register 1 Book-E/e500 */ #define SPR_MAS2 0x272 /* ..8 MMU Assist Register 2 Book-E/e500 */ #define SPR_MAS3 0x273 /* ..8 MMU Assist Register 3 Book-E/e500 */ #define SPR_MAS4 0x274 /* ..8 MMU Assist Register 4 Book-E/e500 */ #define SPR_MAS5 0x275 /* ..8 MMU Assist Register 5 Book-E */ #define SPR_MAS6 0x276 /* ..8 MMU Assist Register 6 Book-E/e500 */ #define SPR_MAS7 0x3B0 /* ..8 MMU Assist Register 7 Book-E/e500 */ #define SPR_MAS8 0x155 /* ..8 MMU Assist Register 8 Book-E/e500 */ #define SPR_L1CFG0 0x203 /* ..8 L1 cache configuration register 0 */ #define SPR_L1CFG1 0x204 /* ..8 L1 cache configuration register 1 */ #define SPR_CCR1 0x378 #define CCR1_L2COBE 0x00000040 #define DCR_L2DCDCRAI 0x0000 /* L2 D-Cache DCR Address Pointer */ #define DCR_L2DCDCRDI 0x0001 /* L2 D-Cache DCR Data Indirect */ #define DCR_L2CR0 0x00 /* L2 Cache Configuration Register 0 */ #define L2CR0_AS 0x30000000 #define SPR_L1CSR0 0x3F2 /* ..8 L1 Cache Control and Status Register 0 */ #define L1CSR0_DCPE 0x00010000 /* Data Cache Parity Enable */ #define L1CSR0_DCLFR 0x00000100 /* Data Cache Lock Bits Flash Reset */ #define L1CSR0_DCFI 0x00000002 /* Data Cache Flash Invalidate */ #define L1CSR0_DCE 0x00000001 /* Data Cache Enable */ #define SPR_L1CSR1 0x3F3 /* ..8 L1 Cache Control and Status Register 1 */ #define L1CSR1_ICPE 0x00010000 /* Instruction Cache Parity Enable */ #define L1CSR1_ICUL 0x00000400 /* Instr Cache Unable to Lock */ #define L1CSR1_ICLFR 0x00000100 /* Instruction Cache Lock Bits Flash Reset */ #define L1CSR1_ICFI 0x00000002 /* Instruction Cache Flash Invalidate */ #define L1CSR1_ICE 0x00000001 /* Instruction Cache Enable */ #define SPR_L2CSR0 0x3F9 /* ..8 L2 Cache Control and Status Register 0 */ #define L2CSR0_L2E 0x80000000 /* L2 Cache Enable */ #define L2CSR0_L2PE 0x40000000 /* L2 Cache Parity Enable */ #define L2CSR0_L2FI 0x00200000 /* L2 Cache Flash Invalidate */ #define L2CSR0_L2LFC 0x00000400 /* L2 Cache Lock Flags Clear */ #define SPR_BUCSR 0x3F5 /* ..8 Branch Unit Control and Status Register */ #define BUCSR_BPEN 0x00000001 /* Branch Prediction Enable */ #define BUCSR_BBFI 0x00000200 /* Branch Buffer Flash Invalidate */ #endif /* BOOKE */ #endif /* !_POWERPC_SPR_H_ */ Index: head/sys/powerpc/include/sr.h =================================================================== --- head/sys/powerpc/include/sr.h (revision 360886) +++ head/sys/powerpc/include/sr.h (revision 360887) @@ -1,64 +1,64 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (C) 2002 Benno Rice. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY Benno Rice ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _MACHINE_SR_H_ #define _MACHINE_SR_H_ /* * Bit definitions for segment registers. * * PowerPC Microprocessor Family: The Programming Environments for 32-bit * Microprocessors, section 2.3.5 */ #define SR_TYPE 0x80000000 /* Type selector */ #define SR_KS 0x40000000 /* Supervisor-state protection key */ #define SR_KP 0x20000000 /* User-state protection key */ #define SR_N 0x10000000 /* No-execute protection */ #define SR_VSID_MASK 0x00ffffff /* Virtual Segment ID mask */ /* Kernel segment register usage */ #define USER_SR 12 #define KERNEL_SR 13 #define KERNEL2_SR 14 #define KERNEL3_SR 15 #define KERNEL_VSIDBITS 0xfffffUL #define KERNEL_SEGMENT (0xfffff0 + KERNEL_SR) #define KERNEL2_SEGMENT (0xfffff0 + KERNEL2_SR) #define EMPTY_SEGMENT 0xfffff0 #ifdef __powerpc64__ -#define USER_ADDR 0xeffffffff0000000UL +#define USER_ADDR 0xc00ffffff0000000UL #else #define USER_ADDR ((uintptr_t)USER_SR << ADDR_SR_SHFT) #endif #define SEGMENT_LENGTH 0x10000000UL #define SEGMENT_INVMASK 0x0fffffffUL #define SEGMENT_MASK ~SEGMENT_INVMASK #endif /* !_MACHINE_SR_H_ */ Index: head/sys/powerpc/include/vmparam.h =================================================================== --- head/sys/powerpc/include/vmparam.h (revision 360886) +++ head/sys/powerpc/include/vmparam.h (revision 360887) @@ -1,272 +1,318 @@ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (C) 1995, 1996 Wolfgang Solfrank. * Copyright (C) 1995, 1996 TooLs GmbH. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by TooLs GmbH. * 4. The name of TooLs GmbH may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY TOOLS GMBH ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $NetBSD: vmparam.h,v 1.11 2000/02/11 19:25:16 thorpej Exp $ * $FreeBSD$ */ #ifndef _MACHINE_VMPARAM_H_ #define _MACHINE_VMPARAM_H_ #ifndef LOCORE #include #endif #define USRSTACK SHAREDPAGE #ifndef MAXTSIZ #define MAXTSIZ (1*1024*1024*1024) /* max text size */ #endif #ifndef DFLDSIZ #define DFLDSIZ (128*1024*1024) /* default data size */ #endif #ifndef MAXDSIZ #ifdef __powerpc64__ #define MAXDSIZ (32UL*1024*1024*1024) /* max data size */ #else #define MAXDSIZ (1*1024*1024*1024) /* max data size */ #endif #endif #ifndef DFLSSIZ #define DFLSSIZ (8*1024*1024) /* default stack size */ #endif #ifndef MAXSSIZ #ifdef __powerpc64__ #define MAXSSIZ (512*1024*1024) /* max stack size */ #else #define MAXSSIZ (64*1024*1024) /* max stack size */ #endif #endif #ifdef AIM #define VM_MAXUSER_ADDRESS32 0xfffff000 #else #define VM_MAXUSER_ADDRESS32 0x7ffff000 #endif /* * Would like to have MAX addresses = 0, but this doesn't (currently) work */ #ifdef __powerpc64__ +/* + * Virtual addresses of things. Derived from the page directory and + * page table indexes from pmap.h for precision. + * + * kernel map should be able to start at 0xc008000000000000 - + * but at least the functional simulator doesn't like it + * + * 0x0000000000000000 - 0x000fffffffffffff user map + * 0xc000000000000000 - 0xc007ffffffffffff direct map + * 0xc008000000000000 - 0xc00fffffffffffff kernel map + * + */ #define VM_MIN_ADDRESS 0x0000000000000000 -#ifdef BOOKE -#define VM_MAXUSER_ADDRESS 0x000ffffffffff000 -#else -#define VM_MAXUSER_ADDRESS 0x3ffffffffffff000 -#endif -#define VM_MAX_ADDRESS 0xffffffffffffffff -#define VM_MIN_KERNEL_ADDRESS 0xe000000000000000 -#define VM_MAX_KERNEL_ADDRESS 0xe0000007ffffffff +#define VM_MAXUSER_ADDRESS 0x000fffffc0000000 +#define VM_MAX_ADDRESS 0xc00fffffffffffff +#define VM_MIN_KERNEL_ADDRESS 0xc008000000000000 +#define VM_MAX_KERNEL_ADDRESS 0xc0080007ffffffff #define VM_MAX_SAFE_KERNEL_ADDRESS VM_MAX_KERNEL_ADDRESS #else #define VM_MIN_ADDRESS 0 #define VM_MAXUSER_ADDRESS VM_MAXUSER_ADDRESS32 #define VM_MAX_ADDRESS 0xffffffff #endif #define SHAREDPAGE (VM_MAXUSER_ADDRESS - PAGE_SIZE) #define FREEBSD32_SHAREDPAGE (VM_MAXUSER_ADDRESS32 - PAGE_SIZE) #define FREEBSD32_USRSTACK FREEBSD32_SHAREDPAGE #define KERNBASE 0x00100100 /* start of kernel virtual */ #ifdef AIM #ifndef __powerpc64__ #define VM_MIN_KERNEL_ADDRESS ((vm_offset_t)KERNEL_SR << ADDR_SR_SHFT) #define VM_MAX_SAFE_KERNEL_ADDRESS (VM_MIN_KERNEL_ADDRESS + 2*SEGMENT_LENGTH -1) #define VM_MAX_KERNEL_ADDRESS (VM_MIN_KERNEL_ADDRESS + 3*SEGMENT_LENGTH - 1) #endif /* * Use the direct-mapped BAT registers for UMA small allocs. This * takes pressure off the small amount of available KVA. */ #define UMA_MD_SMALL_ALLOC #else /* Book-E */ /* Use the direct map for UMA small allocs on powerpc64. */ #ifdef __powerpc64__ #define UMA_MD_SMALL_ALLOC #else #define VM_MIN_KERNEL_ADDRESS 0xc0000000 #define VM_MAX_KERNEL_ADDRESS 0xffffefff #define VM_MAX_SAFE_KERNEL_ADDRESS VM_MAX_KERNEL_ADDRESS #endif #endif /* AIM/E500 */ #if !defined(LOCORE) struct pmap_physseg { struct pv_entry *pvent; char *attrs; }; #endif -#define VM_PHYSSEG_MAX 16 +#ifdef __powerpc64__ +#define VM_PHYSSEG_MAX 63 /* 1? */ +#else +#define VM_PHYSSEG_MAX 16 /* 1? */ +#endif #define PHYS_AVAIL_SZ 256 /* Allows up to 16GB Ram on pSeries with * logical memory block size of 64MB. * For more Ram increase the lmb or this value. */ /* XXX This is non-sensical. Phys avail should hold contiguous regions. */ #define PHYS_AVAIL_ENTRIES PHYS_AVAIL_SZ /* * The physical address space is densely populated on 32-bit systems, * but may not be on 64-bit ones. */ #ifdef __powerpc64__ #define VM_PHYSSEG_SPARSE #else #define VM_PHYSSEG_DENSE #endif /* * Create two free page pools: VM_FREEPOOL_DEFAULT is the default pool * from which physical pages are allocated and VM_FREEPOOL_DIRECT is * the pool from which physical pages for small UMA objects are * allocated. */ #define VM_NFREEPOOL 2 #define VM_FREEPOOL_DEFAULT 0 #define VM_FREEPOOL_DIRECT 1 /* * Create one free page list. */ #define VM_NFREELIST 1 #define VM_FREELIST_DEFAULT 0 /* * The largest allocation size is 4MB. */ +#ifdef __powerpc64__ +#define VM_NFREEORDER 13 +#else #define VM_NFREEORDER 11 +#endif +#ifndef VM_NRESERVLEVEL +#ifdef __powerpc64__ +#define VM_NRESERVLEVEL 1 +#else /* * Disable superpage reservations. */ -#ifndef VM_NRESERVLEVEL #define VM_NRESERVLEVEL 0 #endif +#endif +/* + * Level 0 reservations consist of 512 pages. + */ +#ifndef VM_LEVEL_0_ORDER +#define VM_LEVEL_0_ORDER 9 +#endif + +#ifdef __powerpc64__ +#ifdef SMP +#define PA_LOCK_COUNT 256 +#endif +#endif + #ifndef VM_INITIAL_PAGEIN #define VM_INITIAL_PAGEIN 16 #endif #ifndef SGROWSIZ #define SGROWSIZ (128UL*1024) /* amount to grow stack */ #endif /* * How many physical pages per kmem arena virtual page. */ #ifndef VM_KMEM_SIZE_SCALE #define VM_KMEM_SIZE_SCALE (3) #endif /* * Optional floor (in bytes) on the size of the kmem arena. */ #ifndef VM_KMEM_SIZE_MIN #define VM_KMEM_SIZE_MIN (12 * 1024 * 1024) #endif /* * Optional ceiling (in bytes) on the size of the kmem arena: 40% of the * usable KVA space. */ #ifndef VM_KMEM_SIZE_MAX #define VM_KMEM_SIZE_MAX ((VM_MAX_SAFE_KERNEL_ADDRESS - \ VM_MIN_KERNEL_ADDRESS + 1) * 2 / 5) #endif +#ifdef __powerpc64__ +#define ZERO_REGION_SIZE (2 * 1024 * 1024) /* 2MB */ +#else #define ZERO_REGION_SIZE (64 * 1024) /* 64KB */ +#endif /* + * Use a fairly large batch size since we expect ppc64 systems to have lots of + * memory. + */ +#ifdef __powerpc64__ +#define VM_BATCHQUEUE_SIZE 31 +#endif + +/* * On 32-bit OEA, the only purpose for which sf_buf is used is to implement * an opaque pointer required by the machine-independent parts of the kernel. * That pointer references the vm_page that is "mapped" by the sf_buf. The * actual mapping is provided by the direct virtual-to-physical mapping. * * On OEA64 and Book-E, we need to do something a little more complicated. Use * the runtime-detected hw_direct_map to pick between the two cases. Our * friends in vm_machdep.c will do the same to ensure nothing gets confused. */ #define SFBUF #define SFBUF_NOMD /* * We (usually) have a direct map of all physical memory, so provide * a macro to use to get the kernel VA address for a given PA. Check the * value of PMAP_HAS_PMAP before using. */ #ifndef LOCORE #ifdef __powerpc64__ #define DMAP_BASE_ADDRESS 0xc000000000000000UL -#define DMAP_MAX_ADDRESS 0xcfffffffffffffffUL +#define DMAP_MIN_ADDRESS DMAP_BASE_ADDRESS +#define DMAP_MAX_ADDRESS 0xc007ffffffffffffUL #else #define DMAP_BASE_ADDRESS 0x00000000UL #define DMAP_MAX_ADDRESS 0xbfffffffUL #endif #endif #if defined(__powerpc64__) || defined(BOOKE) /* * powerpc64 and Book-E will provide their own page array allocators. * * On AIM, this will allocate a single virtual array, with pages from the * correct memory domains. * On Book-E this will let us put the array in TLB1, removing the need for TLB * thrashing. * * VM_MIN_KERNEL_ADDRESS is just a dummy. It will get set by the MMU driver. */ #define PA_MIN_ADDRESS VM_MIN_KERNEL_ADDRESS #define PMAP_HAS_PAGE_ARRAY 1 #endif #define PMAP_HAS_DMAP (hw_direct_map) #define PHYS_TO_DMAP(x) ({ \ KASSERT(hw_direct_map, ("Direct map not provided by PMAP")); \ (x) | DMAP_BASE_ADDRESS; }) #define DMAP_TO_PHYS(x) ({ \ KASSERT(hw_direct_map, ("Direct map not provided by PMAP")); \ (x) &~ DMAP_BASE_ADDRESS; }) #endif /* _MACHINE_VMPARAM_H_ */ Index: head/sys/powerpc/powerpc/machdep.c =================================================================== --- head/sys/powerpc/powerpc/machdep.c (revision 360886) +++ head/sys/powerpc/powerpc/machdep.c (revision 360887) @@ -1,884 +1,884 @@ /*- * Copyright (C) 1995, 1996 Wolfgang Solfrank. * Copyright (C) 1995, 1996 TooLs GmbH. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by TooLs GmbH. * 4. The name of TooLs GmbH may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY TOOLS GMBH ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /*- * Copyright (C) 2001 Benno Rice * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY Benno Rice ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * $NetBSD: machdep.c,v 1.74.2.1 2000/11/01 16:13:48 tv Exp $ */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_kstack_pages.h" #include "opt_platform.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifndef __powerpc64__ #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include int cold = 1; #ifdef __powerpc64__ int cacheline_size = 128; #else int cacheline_size = 32; #endif int hw_direct_map = 1; #ifdef BOOKE extern vm_paddr_t kernload; #endif extern void *ap_pcpu; -struct pcpu __pcpu[MAXCPU]; +struct pcpu __pcpu[MAXCPU] __aligned(PAGE_SIZE); static char init_kenv[2048]; static struct trapframe frame0; char machine[] = "powerpc"; SYSCTL_STRING(_hw, HW_MACHINE, machine, CTLFLAG_RD, machine, 0, ""); static void cpu_startup(void *); SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL); SYSCTL_INT(_machdep, CPU_CACHELINE, cacheline_size, CTLFLAG_RD, &cacheline_size, 0, ""); uintptr_t powerpc_init(vm_offset_t, vm_offset_t, vm_offset_t, void *, uint32_t); static void fake_preload_metadata(void); long Maxmem = 0; long realmem = 0; /* Default MSR values set in the AIM/Book-E early startup code */ register_t psl_kernset; register_t psl_userset; register_t psl_userstatic; #ifdef __powerpc64__ register_t psl_userset32; #endif struct kva_md_info kmi; static void cpu_startup(void *dummy) { /* * Initialise the decrementer-based clock. */ decr_init(); /* * Good {morning,afternoon,evening,night}. */ cpu_setup(PCPU_GET(cpuid)); #ifdef PERFMON perfmon_init(); #endif printf("real memory = %ju (%ju MB)\n", ptoa((uintmax_t)physmem), ptoa((uintmax_t)physmem) / 1048576); realmem = physmem; if (bootverbose) printf("available KVA = %zu (%zu MB)\n", virtual_end - virtual_avail, (virtual_end - virtual_avail) / 1048576); /* * Display any holes after the first chunk of extended memory. */ if (bootverbose) { int indx; printf("Physical memory chunk(s):\n"); for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) { vm_paddr_t size1 = phys_avail[indx + 1] - phys_avail[indx]; #ifdef __powerpc64__ printf("0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n", #else printf("0x%09jx - 0x%09jx, %ju bytes (%ju pages)\n", #endif (uintmax_t)phys_avail[indx], (uintmax_t)phys_avail[indx + 1] - 1, (uintmax_t)size1, (uintmax_t)size1 / PAGE_SIZE); } } vm_ksubmap_init(&kmi); printf("avail memory = %ju (%ju MB)\n", ptoa((uintmax_t)vm_free_count()), ptoa((uintmax_t)vm_free_count()) / 1048576); /* * Set up buffers, so they can be used to read disk labels. */ bufinit(); vm_pager_bufferinit(); } extern vm_offset_t __startkernel, __endkernel; extern unsigned char __bss_start[]; extern unsigned char __sbss_start[]; extern unsigned char __sbss_end[]; extern unsigned char _end[]; void aim_early_init(vm_offset_t fdt, vm_offset_t toc, vm_offset_t ofentry, void *mdp, uint32_t mdp_cookie); void aim_cpu_init(vm_offset_t toc); void booke_cpu_init(void); #ifdef DDB static void load_external_symtab(void); static void displace_symbol_table(vm_offset_t, vm_offset_t, vm_offset_t); #endif uintptr_t powerpc_init(vm_offset_t fdt, vm_offset_t toc, vm_offset_t ofentry, void *mdp, uint32_t mdp_cookie) { struct pcpu *pc; struct cpuref bsp; vm_offset_t startkernel, endkernel; char *env; void *kmdp = NULL; bool ofw_bootargs = false; bool symbols_provided = false; #ifdef DDB vm_offset_t ksym_start; vm_offset_t ksym_end; vm_offset_t ksym_sz; #endif /* First guess at start/end kernel positions */ startkernel = __startkernel; endkernel = __endkernel; /* * If the metadata pointer cookie is not set to the magic value, * the number in mdp should be treated as nonsense. */ if (mdp_cookie != 0xfb5d104d) mdp = NULL; #if !defined(BOOKE) /* * On BOOKE the BSS is already cleared and some variables * initialized. Do not wipe them out. */ bzero(__sbss_start, __sbss_end - __sbss_start); bzero(__bss_start, _end - __bss_start); #endif cpu_feature_setup(); #ifdef AIM aim_early_init(fdt, toc, ofentry, mdp, mdp_cookie); #endif /* * At this point, we are executing in our correct memory space. * Book-E started there, and AIM has done an rfi and restarted * execution from _start. * * We may still be in real mode, however. If we are running out of * the direct map on 64 bit, this is possible to do. */ /* * Parse metadata if present and fetch parameters. Must be done * before console is inited so cninit gets the right value of * boothowto. */ if (mdp != NULL) { /* * Starting up from loader. * * Full metadata has been provided, but we need to figure * out the correct address to relocate it to. */ char *envp = NULL; uintptr_t md_offset = 0; vm_paddr_t kernelstartphys, kernelendphys; #ifdef AIM if ((uintptr_t)&powerpc_init > DMAP_BASE_ADDRESS) md_offset = DMAP_BASE_ADDRESS; #else /* BOOKE */ md_offset = VM_MIN_KERNEL_ADDRESS - kernload; #endif preload_metadata = mdp; if (md_offset > 0) { /* Translate phys offset into DMAP offset. */ preload_metadata += md_offset; preload_bootstrap_relocate(md_offset); } kmdp = preload_search_by_type("elf kernel"); if (kmdp != NULL) { boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int); envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *); if (envp != NULL) envp += md_offset; init_static_kenv(envp, 0); if (fdt == 0) { fdt = MD_FETCH(kmdp, MODINFOMD_DTBP, uintptr_t); if (fdt != 0) fdt += md_offset; } kernelstartphys = MD_FETCH(kmdp, MODINFO_ADDR, vm_offset_t); /* kernelstartphys is already relocated. */ kernelendphys = MD_FETCH(kmdp, MODINFOMD_KERNEND, vm_offset_t); if (kernelendphys != 0) kernelendphys += md_offset; endkernel = ulmax(endkernel, kernelendphys); #ifdef DDB ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t); ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t); ksym_sz = *(Elf_Size*)ksym_start; /* * Loader already handled displacing to the load * address, but we still need to displace it to the * DMAP. */ displace_symbol_table( (vm_offset_t)(ksym_start + sizeof(Elf_Size)), ksym_sz, md_offset); db_fetch_ksymtab(ksym_start, ksym_end); symbols_provided = true; #endif } } else { /* * Self-loading kernel, we have to fake up metadata. * * Since we are creating the metadata from the final * memory space, we don't need to call * preload_boostrap_relocate(). */ fake_preload_metadata(); kmdp = preload_search_by_type("elf kernel"); init_static_kenv(init_kenv, sizeof(init_kenv)); ofw_bootargs = true; } /* Store boot environment state */ OF_initial_setup((void *)fdt, NULL, (int (*)(void *))ofentry); /* * Init params/tunables that can be overridden by the loader */ init_param1(); /* * Start initializing proc0 and thread0. */ proc_linkup0(&proc0, &thread0); thread0.td_frame = &frame0; #ifdef __powerpc64__ __asm __volatile("mr 13,%0" :: "r"(&thread0)); #else __asm __volatile("mr 2,%0" :: "r"(&thread0)); #endif /* * Init mutexes, which we use heavily in PMAP */ mutex_init(); /* * Install the OF client interface */ OF_bootstrap(); #ifdef DDB if (!symbols_provided && hw_direct_map) load_external_symtab(); #endif if (ofw_bootargs) ofw_parse_bootargs(); /* * Initialize the console before printing anything. */ cninit(); #ifdef AIM aim_cpu_init(toc); #else /* BOOKE */ booke_cpu_init(); /* Make sure the kernel icache is valid before we go too much further */ __syncicache((caddr_t)startkernel, endkernel - startkernel); #endif /* * Choose a platform module so we can get the physical memory map. */ platform_probe_and_attach(); /* * Set up per-cpu data for the BSP now that the platform can tell * us which that is. */ if (platform_smp_get_bsp(&bsp) != 0) bsp.cr_cpuid = 0; pc = &__pcpu[bsp.cr_cpuid]; __asm __volatile("mtsprg 0, %0" :: "r"(pc)); pcpu_init(pc, bsp.cr_cpuid, sizeof(struct pcpu)); pc->pc_curthread = &thread0; thread0.td_oncpu = bsp.cr_cpuid; pc->pc_cpuid = bsp.cr_cpuid; pc->pc_hwref = bsp.cr_hwref; /* * Init KDB */ kdb_init(); /* * Bring up MMU */ pmap_bootstrap(startkernel, endkernel); mtmsr(psl_kernset & ~PSL_EE); link_elf_ireloc(kmdp); /* * Initialize params/tunables that are derived from memsize */ init_param2(physmem); /* * Grab booted kernel's name */ env = kern_getenv("kernelname"); if (env != NULL) { strlcpy(kernelname, env, sizeof(kernelname)); freeenv(env); } /* * Finish setting up thread0. */ thread0.td_pcb = (struct pcb *) ((thread0.td_kstack + thread0.td_kstack_pages * PAGE_SIZE - sizeof(struct pcb)) & ~15UL); bzero((void *)thread0.td_pcb, sizeof(struct pcb)); pc->pc_curpcb = thread0.td_pcb; /* Initialise the message buffer. */ msgbufinit(msgbufp, msgbufsize); #ifdef KDB if (boothowto & RB_KDB) kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger"); #endif return (((uintptr_t)thread0.td_pcb - (sizeof(struct callframe) - 3*sizeof(register_t))) & ~15UL); } #ifdef DDB /* * XXX Figure out where to move this. */ static void displace_symbol_table(vm_offset_t ksym_start, vm_offset_t ksym_sz, vm_offset_t displacement) { Elf_Sym *sym; /* * Relocate the symbol table to our final load address. */ for (sym = (Elf_Sym *)ksym_start; (vm_paddr_t)sym < (ksym_start + ksym_sz); sym++) { if (sym->st_name == 0 || sym->st_shndx == SHN_UNDEF || sym->st_value == 0) continue; if (ELF_ST_TYPE(sym->st_info) != STT_OBJECT && ELF_ST_TYPE(sym->st_info) != STT_FUNC && ELF_ST_TYPE(sym->st_info) != STT_NOTYPE) continue; /* Skip relocating any implausible symbols */ if (sym->st_value > KERNBASE) sym->st_value += displacement; } } /* * On powernv, we might not have symbols loaded via loader. However, if the * user passed the kernel in as the initrd as well, we can manually load it * via reinterpreting the initrd copy of the kernel. */ static void load_external_symtab(void) { phandle_t chosen; vm_paddr_t start, end; pcell_t cell[2]; ssize_t size; u_char *kernelimg; int i; Elf_Ehdr *ehdr; Elf_Phdr *phdr; Elf_Shdr *shdr; vm_offset_t ksym_start, ksym_sz, kstr_start, kstr_sz; if (!hw_direct_map) return; chosen = OF_finddevice("/chosen"); if (chosen <= 0) return; if (!OF_hasprop(chosen, "linux,initrd-start") || !OF_hasprop(chosen, "linux,initrd-end")) return; size = OF_getencprop(chosen, "linux,initrd-start", cell, sizeof(cell)); if (size == 4) start = cell[0]; else if (size == 8) start = (uint64_t)cell[0] << 32 | cell[1]; else return; size = OF_getencprop(chosen, "linux,initrd-end", cell, sizeof(cell)); if (size == 4) end = cell[0]; else if (size == 8) end = (uint64_t)cell[0] << 32 | cell[1]; else return; if (!(end - start > 0)) return; kernelimg = (u_char *) PHYS_TO_DMAP(start); ehdr = (Elf_Ehdr *)kernelimg; if (!IS_ELF(*ehdr)) return; phdr = (Elf_Phdr *)(kernelimg + ehdr->e_phoff); shdr = (Elf_Shdr *)(kernelimg + ehdr->e_shoff); ksym_start = 0; ksym_sz = 0; kstr_start = 0; kstr_sz = 0; for (i = 0; i < ehdr->e_shnum; i++) { if (shdr[i].sh_type == SHT_SYMTAB) { ksym_start = (vm_offset_t)(kernelimg + shdr[i].sh_offset); ksym_sz = (vm_offset_t)(shdr[i].sh_size); kstr_start = (vm_offset_t)(kernelimg + shdr[shdr[i].sh_link].sh_offset); kstr_sz = (vm_offset_t) (shdr[shdr[i].sh_link].sh_size); } } if (ksym_start != 0 && kstr_start != 0 && ksym_sz != 0 && kstr_sz != 0 && ksym_start < kstr_start) { displace_symbol_table(ksym_start, ksym_sz, (__startkernel - KERNBASE)); ksymtab = ksym_start; ksymtab_size = ksym_sz; kstrtab = kstr_start; } }; #endif /* * When not being loaded from loader, we need to create our own metadata * so we can interact with the kernel linker. */ static void fake_preload_metadata(void) { /* We depend on dword alignment here. */ static uint32_t fake_preload[36] __aligned(8); int i = 0; fake_preload[i++] = MODINFO_NAME; fake_preload[i++] = strlen("kernel") + 1; strcpy((char*)&fake_preload[i], "kernel"); /* ['k' 'e' 'r' 'n'] ['e' 'l' '\0' ..] */ i += 2; fake_preload[i++] = MODINFO_TYPE; fake_preload[i++] = strlen("elf kernel") + 1; strcpy((char*)&fake_preload[i], "elf kernel"); /* ['e' 'l' 'f' ' '] ['k' 'e' 'r' 'n'] ['e' 'l' '\0' ..] */ i += 3; #ifdef __powerpc64__ /* Padding -- Fields start on u_long boundaries */ fake_preload[i++] = 0; #endif fake_preload[i++] = MODINFO_ADDR; fake_preload[i++] = sizeof(vm_offset_t); *(vm_offset_t *)&fake_preload[i] = (vm_offset_t)(__startkernel); i += (sizeof(vm_offset_t) / 4); fake_preload[i++] = MODINFO_SIZE; fake_preload[i++] = sizeof(vm_offset_t); *(vm_offset_t *)&fake_preload[i] = (vm_offset_t)(__endkernel) - (vm_offset_t)(__startkernel); i += (sizeof(vm_offset_t) / 4); /* * MODINFOMD_SSYM and MODINFOMD_ESYM cannot be provided here, * as the memory comes from outside the loaded ELF sections. * * If the symbols are being provided by other means (MFS), the * tables will be loaded into the debugger directly. */ /* Null field at end to mark end of data. */ fake_preload[i++] = 0; fake_preload[i] = 0; preload_metadata = (void*)fake_preload; } /* * Flush the D-cache for non-DMA I/O so that the I-cache can * be made coherent later. */ void cpu_flush_dcache(void *ptr, size_t len) { register_t addr, off; /* * Align the address to a cacheline and adjust the length * accordingly. Then round the length to a multiple of the * cacheline for easy looping. */ addr = (uintptr_t)ptr; off = addr & (cacheline_size - 1); addr -= off; len = roundup2(len + off, cacheline_size); while (len > 0) { __asm __volatile ("dcbf 0,%0" :: "r"(addr)); __asm __volatile ("sync"); addr += cacheline_size; len -= cacheline_size; } } int ptrace_set_pc(struct thread *td, unsigned long addr) { struct trapframe *tf; tf = td->td_frame; tf->srr0 = (register_t)addr; return (0); } void spinlock_enter(void) { struct thread *td; register_t msr; td = curthread; if (td->td_md.md_spinlock_count == 0) { nop_prio_mhigh(); msr = intr_disable(); td->td_md.md_spinlock_count = 1; td->td_md.md_saved_msr = msr; critical_enter(); } else td->td_md.md_spinlock_count++; } void spinlock_exit(void) { struct thread *td; register_t msr; td = curthread; msr = td->td_md.md_saved_msr; td->td_md.md_spinlock_count--; if (td->td_md.md_spinlock_count == 0) { critical_exit(); intr_restore(msr); nop_prio_medium(); } } /* * Simple ddb(4) command/hack to view any SPR on the running CPU. * Uses a trivial asm function to perform the mfspr, and rewrites the mfspr * instruction each time. * XXX: Since it uses code modification, it won't work if the kernel code pages * are marked RO. */ extern register_t get_spr(int); #ifdef DDB DB_SHOW_COMMAND(spr, db_show_spr) { register_t spr; volatile uint32_t *p; int sprno, saved_sprno; if (!have_addr) return; saved_sprno = sprno = (intptr_t) addr; sprno = ((sprno & 0x3e0) >> 5) | ((sprno & 0x1f) << 5); p = (uint32_t *)(void *)&get_spr; #ifdef __powerpc64__ #if defined(_CALL_ELF) && _CALL_ELF == 2 /* Account for ELFv2 function prologue. */ p += 2; #else p = *(volatile uint32_t * volatile *)p; #endif #endif *p = (*p & ~0x001ff800) | (sprno << 11); __syncicache(__DEVOLATILE(uint32_t *, p), cacheline_size); spr = get_spr(sprno); db_printf("SPR %d(%x): %lx\n", saved_sprno, saved_sprno, (unsigned long)spr); } DB_SHOW_COMMAND(frame, db_show_frame) { struct trapframe *tf; long reg; int i; tf = have_addr ? (struct trapframe *)addr : curthread->td_frame; /* * Everything casts through long to simplify the printing. * 'long' is native register size anyway. */ db_printf("trap frame %p\n", tf); for (i = 0; i < nitems(tf->fixreg); i++) { reg = tf->fixreg[i]; db_printf(" r%d:\t%#lx (%ld)\n", i, reg, reg); } reg = tf->lr; db_printf(" lr:\t%#lx\n", reg); reg = tf->cr; db_printf(" cr:\t%#lx\n", reg); reg = tf->xer; db_printf(" xer:\t%#lx\n", reg); reg = tf->ctr; db_printf(" ctr:\t%#lx (%ld)\n", reg, reg); reg = tf->srr0; db_printf(" srr0:\t%#lx\n", reg); reg = tf->srr1; db_printf(" srr1:\t%#lx\n", reg); reg = tf->exc; db_printf(" exc:\t%#lx\n", reg); reg = tf->dar; db_printf(" dar:\t%#lx\n", reg); #ifdef AIM reg = tf->cpu.aim.dsisr; db_printf(" dsisr:\t%#lx\n", reg); #else reg = tf->cpu.booke.esr; db_printf(" esr:\t%#lx\n", reg); reg = tf->cpu.booke.dbcr0; db_printf(" dbcr0:\t%#lx\n", reg); #endif } #endif #undef bzero void bzero(void *buf, size_t len) { caddr_t p; p = buf; while (((vm_offset_t) p & (sizeof(u_long) - 1)) && len) { *p++ = 0; len--; } while (len >= sizeof(u_long) * 8) { *(u_long*) p = 0; *((u_long*) p + 1) = 0; *((u_long*) p + 2) = 0; *((u_long*) p + 3) = 0; len -= sizeof(u_long) * 8; *((u_long*) p + 4) = 0; *((u_long*) p + 5) = 0; *((u_long*) p + 6) = 0; *((u_long*) p + 7) = 0; p += sizeof(u_long) * 8; } while (len >= sizeof(u_long)) { *(u_long*) p = 0; len -= sizeof(u_long); p += sizeof(u_long); } while (len) { *p++ = 0; len--; } } /* __stack_chk_fail_local() is called in secure-plt (32-bit). */ #if !defined(__powerpc64__) extern void __stack_chk_fail(void); void __stack_chk_fail_local(void); void __stack_chk_fail_local(void) { __stack_chk_fail(); } #endif Index: head/sys/powerpc/powerpc/mmu_if.m =================================================================== --- head/sys/powerpc/powerpc/mmu_if.m (revision 360886) +++ head/sys/powerpc/powerpc/mmu_if.m (revision 360887) @@ -1,1087 +1,1102 @@ #- # Copyright (c) 2005 Peter Grehan # All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # 1. Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE # ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS # OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY # OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF # SUCH DAMAGE. # # $FreeBSD$ # #include #include #include #include #include #include #include /** * @defgroup MMU mmu - KObj methods for PowerPC MMU implementations * @brief A set of methods required by all MMU implementations. These * are basically direct call-thru's from the pmap machine-dependent * code. * Thanks to Bruce M Simpson's pmap man pages for routine descriptions. *@{ */ INTERFACE mmu; SINGLETON; # # Default implementations of some methods # CODE { static void mmu_null_copy(mmu_t mmu, pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, vm_offset_t src_addr) { return; } static void mmu_null_growkernel(mmu_t mmu, vm_offset_t addr) { return; } static void mmu_null_init(mmu_t mmu) { return; } static boolean_t mmu_null_is_prefaultable(mmu_t mmu, pmap_t pmap, vm_offset_t va) { return (FALSE); } static void mmu_null_object_init_pt(mmu_t mmu, pmap_t pmap, vm_offset_t addr, vm_object_t object, vm_pindex_t index, vm_size_t size) { return; } static void mmu_null_page_init(mmu_t mmu, vm_page_t m) { return; } static void mmu_null_remove_pages(mmu_t mmu, pmap_t pmap) { return; } static int mmu_null_mincore(mmu_t mmu, pmap_t pmap, vm_offset_t addr, vm_paddr_t *pap) { return (0); } static void mmu_null_deactivate(struct thread *td) { return; } static void mmu_null_align_superpage(mmu_t mmu, vm_object_t object, vm_ooffset_t offset, vm_offset_t *addr, vm_size_t size) { return; } static void *mmu_null_mapdev_attr(mmu_t mmu, vm_paddr_t pa, vm_size_t size, vm_memattr_t ma) { return MMU_MAPDEV(mmu, pa, size); } static void mmu_null_kenter_attr(mmu_t mmu, vm_offset_t va, vm_paddr_t pa, vm_memattr_t ma) { MMU_KENTER(mmu, va, pa); } static void mmu_null_page_set_memattr(mmu_t mmu, vm_page_t m, vm_memattr_t ma) { return; } static int mmu_null_change_attr(mmu_t mmu, vm_offset_t va, vm_size_t sz, vm_memattr_t mode) { return (0); } static size_t mmu_null_scan_pmap(mmu_t mmu) { return (0); } static void *mmu_null_dump_pmap_init(mmu_t mmu, unsigned blkpgs) { return (NULL); } static void * mmu_null_dump_pmap(mmu_t mmu, void *ctx, void *buf, u_long *nbytes) { return (NULL); } + + static boolean_t mmu_null_ps_enabled(mmu_t mmu) + { + return (FALSE); + } }; /** * @brief Apply the given advice to the specified range of addresses within * the given pmap. Depending on the advice, clear the referenced and/or * modified flags in each mapping and set the mapped page's dirty field. * * @param _pmap physical map * @param _start virtual range start * @param _end virtual range end * @param _advice advice to apply */ METHOD void advise { mmu_t _mmu; pmap_t _pmap; vm_offset_t _start; vm_offset_t _end; int _advice; }; /** * @brief Clear the 'modified' bit on the given physical page * * @param _pg physical page */ METHOD void clear_modify { mmu_t _mmu; vm_page_t _pg; }; /** * @brief Clear the write and modified bits in each of the given * physical page's mappings * * @param _pg physical page */ METHOD void remove_write { mmu_t _mmu; vm_page_t _pg; }; /** * @brief Copy the address range given by the source physical map, virtual * address and length to the destination physical map and virtual address. * This routine is optional (xxx default null implementation ?) * * @param _dst_pmap destination physical map * @param _src_pmap source physical map * @param _dst_addr destination virtual address * @param _len size of range * @param _src_addr source virtual address */ METHOD void copy { mmu_t _mmu; pmap_t _dst_pmap; pmap_t _src_pmap; vm_offset_t _dst_addr; vm_size_t _len; vm_offset_t _src_addr; } DEFAULT mmu_null_copy; /** * @brief Copy the source physical page to the destination physical page * * @param _src source physical page * @param _dst destination physical page */ METHOD void copy_page { mmu_t _mmu; vm_page_t _src; vm_page_t _dst; }; METHOD void copy_pages { mmu_t _mmu; vm_page_t *_ma; vm_offset_t _a_offset; vm_page_t *_mb; vm_offset_t _b_offset; int _xfersize; }; /** * @brief Create a mapping between a virtual/physical address pair in the * passed physical map with the specified protection and wiring * * @param _pmap physical map * @param _va mapping virtual address * @param _p mapping physical page * @param _prot mapping page protection * @param _flags pmap_enter flags * @param _psind superpage size index */ METHOD int enter { mmu_t _mmu; pmap_t _pmap; vm_offset_t _va; vm_page_t _p; vm_prot_t _prot; u_int _flags; int8_t _psind; }; /** * @brief Maps a sequence of resident pages belonging to the same object. * * @param _pmap physical map * @param _start virtual range start * @param _end virtual range end * @param _m_start physical page mapped at start * @param _prot mapping page protection */ METHOD void enter_object { mmu_t _mmu; pmap_t _pmap; vm_offset_t _start; vm_offset_t _end; vm_page_t _m_start; vm_prot_t _prot; }; /** * @brief A faster entry point for page mapping where it is possible * to short-circuit some of the tests in pmap_enter. * * @param _pmap physical map (and also currently active pmap) * @param _va mapping virtual address * @param _pg mapping physical page * @param _prot new page protection - used to see if page is exec. */ METHOD void enter_quick { mmu_t _mmu; pmap_t _pmap; vm_offset_t _va; vm_page_t _pg; vm_prot_t _prot; }; /** * @brief Reverse map the given virtual address, returning the physical * page associated with the address if a mapping exists. * * @param _pmap physical map * @param _va mapping virtual address * * @retval 0 No mapping found * @retval addr The mapping physical address */ METHOD vm_paddr_t extract { mmu_t _mmu; pmap_t _pmap; vm_offset_t _va; }; /** * @brief Reverse map the given virtual address, returning the * physical page if found. The page must be held (by calling * vm_page_hold) if the page protection matches the given protection * * @param _pmap physical map * @param _va mapping virtual address * @param _prot protection used to determine if physical page * should be locked * * @retval NULL No mapping found * @retval page Pointer to physical page. Held if protections match */ METHOD vm_page_t extract_and_hold { mmu_t _mmu; pmap_t _pmap; vm_offset_t _va; vm_prot_t _prot; }; /** * @brief Increase kernel virtual address space to the given virtual address. * Not really required for PowerPC, so optional unless the MMU implementation * can use it. * * @param _va new upper limit for kernel virtual address space */ METHOD void growkernel { mmu_t _mmu; vm_offset_t _va; } DEFAULT mmu_null_growkernel; /** * @brief Called from vm_mem_init. Zone allocation is available at * this stage so a convenient time to create zones. This routine is * for MMU-implementation convenience and is optional. */ METHOD void init { mmu_t _mmu; } DEFAULT mmu_null_init; /** * @brief Return if the page has been marked by MMU hardware to have been * modified * * @param _pg physical page to test * * @retval boolean TRUE if page has been modified */ METHOD boolean_t is_modified { mmu_t _mmu; vm_page_t _pg; }; /** * @brief Return whether the specified virtual address is a candidate to be * prefaulted in. This routine is optional. * * @param _pmap physical map * @param _va virtual address to test * * @retval boolean TRUE if the address is a candidate. */ METHOD boolean_t is_prefaultable { mmu_t _mmu; pmap_t _pmap; vm_offset_t _va; } DEFAULT mmu_null_is_prefaultable; /** * @brief Return whether or not the specified physical page was referenced * in any physical maps. * * @params _pg physical page * * @retval boolean TRUE if page has been referenced */ METHOD boolean_t is_referenced { mmu_t _mmu; vm_page_t _pg; }; /** * @brief Return a count of referenced bits for a page, clearing those bits. * Not all referenced bits need to be cleared, but it is necessary that 0 * only be returned when there are none set. * * @params _m physical page * * @retval int count of referenced bits */ METHOD int ts_referenced { mmu_t _mmu; vm_page_t _pg; }; /** * @brief Map the requested physical address range into kernel virtual * address space. The value in _virt is taken as a hint. The virtual * address of the range is returned, or NULL if the mapping could not * be created. The range can be direct-mapped if that is supported. * * @param *_virt Hint for start virtual address, and also return * value * @param _start physical address range start * @param _end physical address range end * @param _prot protection of range (currently ignored) * * @retval NULL could not map the area * @retval addr, *_virt mapping start virtual address */ METHOD vm_offset_t map { mmu_t _mmu; vm_offset_t *_virt; vm_paddr_t _start; vm_paddr_t _end; int _prot; }; /** * @brief Used to create a contiguous set of read-only mappings for a * given object to try and eliminate a cascade of on-demand faults as * the object is accessed sequentially. This routine is optional. * * @param _pmap physical map * @param _addr mapping start virtual address * @param _object device-backed V.M. object to be mapped * @param _pindex page-index within object of mapping start * @param _size size in bytes of mapping */ METHOD void object_init_pt { mmu_t _mmu; pmap_t _pmap; vm_offset_t _addr; vm_object_t _object; vm_pindex_t _pindex; vm_size_t _size; } DEFAULT mmu_null_object_init_pt; /** * @brief Used to determine if the specified page has a mapping for the * given physical map, by scanning the list of reverse-mappings from the * page. The list is scanned to a maximum of 16 entries. * * @param _pmap physical map * @param _pg physical page * * @retval bool TRUE if the physical map was found in the first 16 * reverse-map list entries off the physical page. */ METHOD boolean_t page_exists_quick { mmu_t _mmu; pmap_t _pmap; vm_page_t _pg; }; /** * @brief Initialise the machine-dependent section of the physical page * data structure. This routine is optional. * * @param _pg physical page */ METHOD void page_init { mmu_t _mmu; vm_page_t _pg; } DEFAULT mmu_null_page_init; /** * @brief Count the number of managed mappings to the given physical * page that are wired. * * @param _pg physical page * * @retval int the number of wired, managed mappings to the * given physical page */ METHOD int page_wired_mappings { mmu_t _mmu; vm_page_t _pg; }; /** * @brief Initialise a physical map data structure * * @param _pmap physical map */ METHOD void pinit { mmu_t _mmu; pmap_t _pmap; }; /** * @brief Initialise the physical map for process 0, the initial process * in the system. * XXX default to pinit ? * * @param _pmap physical map */ METHOD void pinit0 { mmu_t _mmu; pmap_t _pmap; }; /** * @brief Set the protection for physical pages in the given virtual address * range to the given value. * * @param _pmap physical map * @param _start virtual range start * @param _end virtual range end * @param _prot new page protection */ METHOD void protect { mmu_t _mmu; pmap_t _pmap; vm_offset_t _start; vm_offset_t _end; vm_prot_t _prot; }; /** * @brief Create a mapping in kernel virtual address space for the given array * of wired physical pages. * * @param _start mapping virtual address start * @param *_m array of physical page pointers * @param _count array elements */ METHOD void qenter { mmu_t _mmu; vm_offset_t _start; vm_page_t *_pg; int _count; }; /** * @brief Remove the temporary mappings created by qenter. * * @param _start mapping virtual address start * @param _count number of pages in mapping */ METHOD void qremove { mmu_t _mmu; vm_offset_t _start; int _count; }; /** * @brief Release per-pmap resources, e.g. mutexes, allocated memory etc. There * should be no existing mappings for the physical map at this point * * @param _pmap physical map */ METHOD void release { mmu_t _mmu; pmap_t _pmap; }; /** * @brief Remove all mappings in the given physical map for the start/end * virtual address range. The range will be page-aligned. * * @param _pmap physical map * @param _start mapping virtual address start * @param _end mapping virtual address end */ METHOD void remove { mmu_t _mmu; pmap_t _pmap; vm_offset_t _start; vm_offset_t _end; }; /** * @brief Traverse the reverse-map list off the given physical page and * remove all mappings. Clear the PGA_WRITEABLE attribute from the page. * * @param _pg physical page */ METHOD void remove_all { mmu_t _mmu; vm_page_t _pg; }; /** * @brief Remove all mappings in the given start/end virtual address range * for the given physical map. Similar to the remove method, but it used * when tearing down all mappings in an address space. This method is * optional, since pmap_remove will be called for each valid vm_map in * the address space later. * * @param _pmap physical map * @param _start mapping virtual address start * @param _end mapping virtual address end */ METHOD void remove_pages { mmu_t _mmu; pmap_t _pmap; } DEFAULT mmu_null_remove_pages; /** * @brief Clear the wired attribute from the mappings for the specified range * of addresses in the given pmap. * * @param _pmap physical map * @param _start virtual range start * @param _end virtual range end */ METHOD void unwire { mmu_t _mmu; pmap_t _pmap; vm_offset_t _start; vm_offset_t _end; }; /** * @brief Zero a physical page. It is not assumed that the page is mapped, * so a temporary (or direct) mapping may need to be used. * * @param _pg physical page */ METHOD void zero_page { mmu_t _mmu; vm_page_t _pg; }; /** * @brief Zero a portion of a physical page, starting at a given offset and * for a given size (multiples of 512 bytes for 4k pages). * * @param _pg physical page * @param _off byte offset from start of page * @param _size size of area to zero */ METHOD void zero_page_area { mmu_t _mmu; vm_page_t _pg; int _off; int _size; }; /** * @brief Extract mincore(2) information from a mapping. * * @param _pmap physical map * @param _addr page virtual address * @param _pa page physical address * * @retval 0 no result * @retval non-zero mincore(2) flag values */ METHOD int mincore { mmu_t _mmu; pmap_t _pmap; vm_offset_t _addr; vm_paddr_t *_pap; } DEFAULT mmu_null_mincore; /** * @brief Perform any operations required to allow a physical map to be used * before it's address space is accessed. * * @param _td thread associated with physical map */ METHOD void activate { mmu_t _mmu; struct thread *_td; }; /** * @brief Perform any operations required to deactivate a physical map, * for instance as it is context-switched out. * * @param _td thread associated with physical map */ METHOD void deactivate { mmu_t _mmu; struct thread *_td; } DEFAULT mmu_null_deactivate; /** * @brief Return a hint for the best virtual address to map a tentative * virtual address range in a given VM object. The default is to just * return the given tentative start address. * * @param _obj VM backing object * @param _offset starting offset with the VM object * @param _addr initial guess at virtual address * @param _size size of virtual address range */ METHOD void align_superpage { mmu_t _mmu; vm_object_t _obj; vm_ooffset_t _offset; vm_offset_t *_addr; vm_size_t _size; } DEFAULT mmu_null_align_superpage; /** * INTERNAL INTERFACES */ /** * @brief Bootstrap the VM system. At the completion of this routine, the * kernel will be running in its own address space with full control over * paging. * * @param _start start of reserved memory (obsolete ???) * @param _end end of reserved memory (obsolete ???) * XXX I think the intent of these was to allow * the memory used by kernel text+data+bss and * loader variables/load-time kld's to be carved out * of available physical mem. * */ METHOD void bootstrap { mmu_t _mmu; vm_offset_t _start; vm_offset_t _end; }; /** * @brief Set up the MMU on the current CPU. Only called by the PMAP layer * for alternate CPUs on SMP systems. * * @param _ap Set to 1 if the CPU being set up is an AP * */ METHOD void cpu_bootstrap { mmu_t _mmu; int _ap; }; /** * @brief Create a kernel mapping for a given physical address range. * Called by bus code on behalf of device drivers. The mapping does not * have to be a virtual address: it can be a direct-mapped physical address * if that is supported by the MMU. * * @param _pa start physical address * @param _size size in bytes of mapping * * @retval addr address of mapping. */ METHOD void * mapdev { mmu_t _mmu; vm_paddr_t _pa; vm_size_t _size; }; /** * @brief Create a kernel mapping for a given physical address range. * Called by bus code on behalf of device drivers. The mapping does not * have to be a virtual address: it can be a direct-mapped physical address * if that is supported by the MMU. * * @param _pa start physical address * @param _size size in bytes of mapping * @param _attr cache attributes * * @retval addr address of mapping. */ METHOD void * mapdev_attr { mmu_t _mmu; vm_paddr_t _pa; vm_size_t _size; vm_memattr_t _attr; } DEFAULT mmu_null_mapdev_attr; /** * @brief Change cache control attributes for a page. Should modify all * mappings for that page. * * @param _m page to modify * @param _ma new cache control attributes */ METHOD void page_set_memattr { mmu_t _mmu; vm_page_t _pg; vm_memattr_t _ma; } DEFAULT mmu_null_page_set_memattr; /** * @brief Remove the mapping created by mapdev. Called when a driver * is unloaded. * * @param _va Mapping address returned from mapdev * @param _size size in bytes of mapping */ METHOD void unmapdev { mmu_t _mmu; vm_offset_t _va; vm_size_t _size; }; /** * @brief Provide a kernel-space pointer that can be used to access the * given userland address. The kernel accessible length returned in klen * may be less than the requested length of the userland buffer (ulen). If * so, retry with a higher address to get access to the later parts of the * buffer. Returns EFAULT if no mapping can be made, else zero. * * @param _pm PMAP for the user pointer. * @param _uaddr Userland address to map. * @param _kaddr Corresponding kernel address. * @param _ulen Length of user buffer. * @param _klen Available subset of ulen with _kaddr. */ METHOD int map_user_ptr { mmu_t _mmu; pmap_t _pm; volatile const void *_uaddr; void **_kaddr; size_t _ulen; size_t *_klen; }; /** * @brief Decode a kernel pointer, as visible to the current thread, * by setting whether it corresponds to a user or kernel address and * the address in the respective memory maps to which the address as * seen in the kernel corresponds. This is essentially the inverse of * MMU_MAP_USER_PTR() above and is used in kernel-space fault handling. * Returns 0 on success or EFAULT if the address could not be mapped. */ METHOD int decode_kernel_ptr { mmu_t _mmu; vm_offset_t addr; int *is_user; vm_offset_t *decoded_addr; }; /** * @brief Reverse-map a kernel virtual address * * @param _va kernel virtual address to reverse-map * * @retval pa physical address corresponding to mapping */ METHOD vm_paddr_t kextract { mmu_t _mmu; vm_offset_t _va; }; /** * @brief Map a wired page into kernel virtual address space * * @param _va mapping virtual address * @param _pa mapping physical address */ METHOD void kenter { mmu_t _mmu; vm_offset_t _va; vm_paddr_t _pa; }; /** * @brief Map a wired page into kernel virtual address space * * @param _va mapping virtual address * @param _pa mapping physical address * @param _ma mapping cache control attributes */ METHOD void kenter_attr { mmu_t _mmu; vm_offset_t _va; vm_paddr_t _pa; vm_memattr_t _ma; } DEFAULT mmu_null_kenter_attr; /** * @brief Unmap a wired page from kernel virtual address space * * @param _va mapped virtual address */ METHOD void kremove { mmu_t _mmu; vm_offset_t _va; }; /** * @brief Determine if the given physical address range has been direct-mapped. * * @param _pa physical address start * @param _size physical address range size * * @retval bool TRUE if the range is direct-mapped. */ METHOD boolean_t dev_direct_mapped { mmu_t _mmu; vm_paddr_t _pa; vm_size_t _size; }; /** * @brief Enforce instruction cache coherency. Typically called after a * region of memory has been modified and before execution of or within * that region is attempted. Setting breakpoints in a process through * ptrace(2) is one example of when the instruction cache needs to be * made coherent. * * @param _pm the physical map of the virtual address * @param _va the virtual address of the modified region * @param _sz the size of the modified region */ METHOD void sync_icache { mmu_t _mmu; pmap_t _pm; vm_offset_t _va; vm_size_t _sz; }; /** * @brief Create temporary memory mapping for use by dumpsys(). * * @param _pa The physical page to map. * @param _sz The requested size of the mapping. * @param _va The virtual address of the mapping. */ METHOD void dumpsys_map { mmu_t _mmu; vm_paddr_t _pa; size_t _sz; void **_va; }; /** * @brief Remove temporary dumpsys() mapping. * * @param _pa The physical page to map. * @param _sz The requested size of the mapping. * @param _va The virtual address of the mapping. */ METHOD void dumpsys_unmap { mmu_t _mmu; vm_paddr_t _pa; size_t _sz; void *_va; }; /** * @brief Initialize memory chunks for dumpsys. */ METHOD void scan_init { mmu_t _mmu; }; /** * @brief Scan kernel PMAP, adding mapped physical pages to dump. * * @retval pmap_size Number of bytes used by all PTE entries. */ METHOD size_t scan_pmap { mmu_t _mmu; } DEFAULT mmu_null_scan_pmap; /** * @brief Initialize a PMAP dump. * * @param _blkpgs Size of a dump block, in pages. * * @retval ctx Dump context, used by dump_pmap. */ METHOD void * dump_pmap_init { mmu_t _mmu; unsigned _blkpgs; } DEFAULT mmu_null_dump_pmap_init; /** * @brief Dump a block of PTEs. * The size of the dump block is specified in dump_pmap_init and * the 'buf' argument must be big enough to hold a full block. * If the page table resides in regular memory, then the 'buf' * argument is ignored and a pointer to the specified dump block * is returned instead, avoiding memory copy. Else, the buffer is * filled with PTEs and the own buffer pointer is returned. * In the end, the cursor in 'ctx' is adjusted to point to the next block. * * @param _ctx Dump context, retrieved from dump_pmap_init. * @param _buf Buffer to hold the dump block contents. * @param _nbytes Number of bytes dumped. * * @retval NULL No more blocks to dump. * @retval buf Pointer to dumped data (may be different than _buf). */ METHOD void * dump_pmap { mmu_t _mmu; void *_ctx; void *_buf; u_long *_nbytes; } DEFAULT mmu_null_dump_pmap; /** * @brief Create a temporary thread-local KVA mapping of a single page. * * @param _pg The physical page to map * * @retval addr The temporary KVA */ METHOD vm_offset_t quick_enter_page { mmu_t _mmu; vm_page_t _pg; }; /** * @brief Undo a mapping created by quick_enter_page * * @param _va The mapped KVA */ METHOD void quick_remove_page { mmu_t _mmu; vm_offset_t _va; }; /** * @brief Change the specified virtual address range's memory type. * * @param _va The virtual base address to change * * @param _sz Size of the region to change * * @param _mode New mode to set on the VA range * * @retval error 0 on success, EINVAL or ENOMEM on error. */ METHOD int change_attr { mmu_t _mmu; vm_offset_t _va; vm_size_t _sz; vm_memattr_t _mode; } DEFAULT mmu_null_change_attr; /** * @brief Initialize the page array. * * @param _pages The number of pages to be accounted by the array. */ METHOD void page_array_startup { mmu_t _mmu; long _pages; }; + +METHOD boolean_t page_is_mapped { + mmu_t _mmu; + vm_page_t _pg; +} DEFAULT; + +METHOD boolean_t ps_enabled { + mmu_t _mmu; + pmap_t _pmap; +} DEFAULT mmu_null_ps_enabled; Index: head/sys/powerpc/powerpc/pmap_dispatch.c =================================================================== --- head/sys/powerpc/powerpc/pmap_dispatch.c (revision 360886) +++ head/sys/powerpc/powerpc/pmap_dispatch.c (revision 360887) @@ -1,674 +1,688 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2005 Peter Grehan * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ #include __FBSDID("$FreeBSD$"); /* * Dispatch MI pmap calls to the appropriate MMU implementation * through a previously registered kernel object. * * Before pmap_bootstrap() can be called, a CPU module must have * called pmap_mmu_install(). This may be called multiple times: * the highest priority call will be installed as the default * MMU handler when pmap_bootstrap() is called. * * It is required that mutex_init() be called before pmap_bootstrap(), * as the PMAP layer makes extensive use of mutexes. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "mmu_if.h" static mmu_def_t *mmu_def_impl; static mmu_t mmu_obj; static struct mmu_kobj mmu_kernel_obj; static struct kobj_ops mmu_kernel_kops; /* * pmap globals */ struct pmap kernel_pmap_store; vm_offset_t msgbuf_phys; vm_offset_t kernel_vm_end; vm_offset_t virtual_avail; vm_offset_t virtual_end; caddr_t crashdumpmap; int pmap_bootstrapped; #ifdef AIM int pvo_vaddr_compare(struct pvo_entry *a, struct pvo_entry *b) { if (PVO_VADDR(a) < PVO_VADDR(b)) return (-1); else if (PVO_VADDR(a) > PVO_VADDR(b)) return (1); return (0); } RB_GENERATE(pvo_tree, pvo_entry, pvo_plink, pvo_vaddr_compare); #endif void pmap_advise(pmap_t pmap, vm_offset_t start, vm_offset_t end, int advice) { CTR5(KTR_PMAP, "%s(%p, %#x, %#x, %d)", __func__, pmap, start, end, advice); MMU_ADVISE(mmu_obj, pmap, start, end, advice); } void pmap_clear_modify(vm_page_t m) { CTR2(KTR_PMAP, "%s(%p)", __func__, m); MMU_CLEAR_MODIFY(mmu_obj, m); } void pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, vm_offset_t src_addr) { CTR6(KTR_PMAP, "%s(%p, %p, %#x, %#x, %#x)", __func__, dst_pmap, src_pmap, dst_addr, len, src_addr); MMU_COPY(mmu_obj, dst_pmap, src_pmap, dst_addr, len, src_addr); } void pmap_copy_page(vm_page_t src, vm_page_t dst) { CTR3(KTR_PMAP, "%s(%p, %p)", __func__, src, dst); MMU_COPY_PAGE(mmu_obj, src, dst); } void pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], vm_offset_t b_offset, int xfersize) { CTR6(KTR_PMAP, "%s(%p, %#x, %p, %#x, %#x)", __func__, ma, a_offset, mb, b_offset, xfersize); MMU_COPY_PAGES(mmu_obj, ma, a_offset, mb, b_offset, xfersize); } int pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t p, vm_prot_t prot, u_int flags, int8_t psind) { CTR6(KTR_PMAP, "pmap_enter(%p, %#x, %p, %#x, %#x, %d)", pmap, va, p, prot, flags, psind); return (MMU_ENTER(mmu_obj, pmap, va, p, prot, flags, psind)); } void pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, vm_page_t m_start, vm_prot_t prot) { CTR6(KTR_PMAP, "%s(%p, %#x, %#x, %p, %#x)", __func__, pmap, start, end, m_start, prot); MMU_ENTER_OBJECT(mmu_obj, pmap, start, end, m_start, prot); } void pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) { CTR5(KTR_PMAP, "%s(%p, %#x, %p, %#x)", __func__, pmap, va, m, prot); MMU_ENTER_QUICK(mmu_obj, pmap, va, m, prot); } vm_paddr_t pmap_extract(pmap_t pmap, vm_offset_t va) { CTR3(KTR_PMAP, "%s(%p, %#x)", __func__, pmap, va); return (MMU_EXTRACT(mmu_obj, pmap, va)); } vm_page_t pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) { CTR4(KTR_PMAP, "%s(%p, %#x, %#x)", __func__, pmap, va, prot); return (MMU_EXTRACT_AND_HOLD(mmu_obj, pmap, va, prot)); } void pmap_growkernel(vm_offset_t va) { CTR2(KTR_PMAP, "%s(%#x)", __func__, va); MMU_GROWKERNEL(mmu_obj, va); } void pmap_init(void) { CTR1(KTR_PMAP, "%s()", __func__); MMU_INIT(mmu_obj); } boolean_t pmap_is_modified(vm_page_t m) { CTR2(KTR_PMAP, "%s(%p)", __func__, m); return (MMU_IS_MODIFIED(mmu_obj, m)); } boolean_t pmap_is_prefaultable(pmap_t pmap, vm_offset_t va) { CTR3(KTR_PMAP, "%s(%p, %#x)", __func__, pmap, va); return (MMU_IS_PREFAULTABLE(mmu_obj, pmap, va)); } boolean_t pmap_is_referenced(vm_page_t m) { CTR2(KTR_PMAP, "%s(%p)", __func__, m); return (MMU_IS_REFERENCED(mmu_obj, m)); } boolean_t pmap_ts_referenced(vm_page_t m) { CTR2(KTR_PMAP, "%s(%p)", __func__, m); return (MMU_TS_REFERENCED(mmu_obj, m)); } vm_offset_t pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) { CTR5(KTR_PMAP, "%s(%p, %#x, %#x, %#x)", __func__, virt, start, end, prot); return (MMU_MAP(mmu_obj, virt, start, end, prot)); } void pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, vm_pindex_t pindex, vm_size_t size) { CTR6(KTR_PMAP, "%s(%p, %#x, %p, %u, %#x)", __func__, pmap, addr, object, pindex, size); MMU_OBJECT_INIT_PT(mmu_obj, pmap, addr, object, pindex, size); } boolean_t pmap_page_exists_quick(pmap_t pmap, vm_page_t m) { CTR3(KTR_PMAP, "%s(%p, %p)", __func__, pmap, m); return (MMU_PAGE_EXISTS_QUICK(mmu_obj, pmap, m)); } void pmap_page_init(vm_page_t m) { CTR2(KTR_PMAP, "%s(%p)", __func__, m); MMU_PAGE_INIT(mmu_obj, m); } int pmap_page_wired_mappings(vm_page_t m) { CTR2(KTR_PMAP, "%s(%p)", __func__, m); return (MMU_PAGE_WIRED_MAPPINGS(mmu_obj, m)); } int pmap_pinit(pmap_t pmap) { CTR2(KTR_PMAP, "%s(%p)", __func__, pmap); MMU_PINIT(mmu_obj, pmap); return (1); } void pmap_pinit0(pmap_t pmap) { CTR2(KTR_PMAP, "%s(%p)", __func__, pmap); MMU_PINIT0(mmu_obj, pmap); } void pmap_protect(pmap_t pmap, vm_offset_t start, vm_offset_t end, vm_prot_t prot) { CTR5(KTR_PMAP, "%s(%p, %#x, %#x, %#x)", __func__, pmap, start, end, prot); MMU_PROTECT(mmu_obj, pmap, start, end, prot); } void pmap_qenter(vm_offset_t start, vm_page_t *m, int count) { CTR4(KTR_PMAP, "%s(%#x, %p, %d)", __func__, start, m, count); MMU_QENTER(mmu_obj, start, m, count); } void pmap_qremove(vm_offset_t start, int count) { CTR3(KTR_PMAP, "%s(%#x, %d)", __func__, start, count); MMU_QREMOVE(mmu_obj, start, count); } void pmap_release(pmap_t pmap) { CTR2(KTR_PMAP, "%s(%p)", __func__, pmap); MMU_RELEASE(mmu_obj, pmap); } void pmap_remove(pmap_t pmap, vm_offset_t start, vm_offset_t end) { CTR4(KTR_PMAP, "%s(%p, %#x, %#x)", __func__, pmap, start, end); MMU_REMOVE(mmu_obj, pmap, start, end); } void pmap_remove_all(vm_page_t m) { CTR2(KTR_PMAP, "%s(%p)", __func__, m); MMU_REMOVE_ALL(mmu_obj, m); } void pmap_remove_pages(pmap_t pmap) { CTR2(KTR_PMAP, "%s(%p)", __func__, pmap); MMU_REMOVE_PAGES(mmu_obj, pmap); } void pmap_remove_write(vm_page_t m) { CTR2(KTR_PMAP, "%s(%p)", __func__, m); MMU_REMOVE_WRITE(mmu_obj, m); } void pmap_unwire(pmap_t pmap, vm_offset_t start, vm_offset_t end) { CTR4(KTR_PMAP, "%s(%p, %#x, %#x)", __func__, pmap, start, end); MMU_UNWIRE(mmu_obj, pmap, start, end); } void pmap_zero_page(vm_page_t m) { CTR2(KTR_PMAP, "%s(%p)", __func__, m); MMU_ZERO_PAGE(mmu_obj, m); } void pmap_zero_page_area(vm_page_t m, int off, int size) { CTR4(KTR_PMAP, "%s(%p, %d, %d)", __func__, m, off, size); MMU_ZERO_PAGE_AREA(mmu_obj, m, off, size); } int pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *pap) { CTR3(KTR_PMAP, "%s(%p, %#x)", __func__, pmap, addr); return (MMU_MINCORE(mmu_obj, pmap, addr, pap)); } void pmap_activate(struct thread *td) { CTR2(KTR_PMAP, "%s(%p)", __func__, td); MMU_ACTIVATE(mmu_obj, td); } void pmap_deactivate(struct thread *td) { CTR2(KTR_PMAP, "%s(%p)", __func__, td); MMU_DEACTIVATE(mmu_obj, td); } /* * Increase the starting virtual address of the given mapping if a * different alignment might result in more superpage mappings. */ void pmap_align_superpage(vm_object_t object, vm_ooffset_t offset, vm_offset_t *addr, vm_size_t size) { CTR5(KTR_PMAP, "%s(%p, %#x, %p, %#x)", __func__, object, offset, addr, size); MMU_ALIGN_SUPERPAGE(mmu_obj, object, offset, addr, size); } /* * Routines used in machine-dependent code */ void pmap_bootstrap(vm_offset_t start, vm_offset_t end) { mmu_obj = &mmu_kernel_obj; /* * Take care of compiling the selected class, and * then statically initialise the MMU object */ kobj_class_compile_static(mmu_def_impl, &mmu_kernel_kops); kobj_init_static((kobj_t)mmu_obj, mmu_def_impl); MMU_BOOTSTRAP(mmu_obj, start, end); } void pmap_cpu_bootstrap(int ap) { /* * No KTR here because our console probably doesn't work yet */ return (MMU_CPU_BOOTSTRAP(mmu_obj, ap)); } void * pmap_mapdev(vm_paddr_t pa, vm_size_t size) { CTR3(KTR_PMAP, "%s(%#x, %#x)", __func__, pa, size); return (MMU_MAPDEV(mmu_obj, pa, size)); } void * pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, vm_memattr_t attr) { CTR4(KTR_PMAP, "%s(%#x, %#x, %#x)", __func__, pa, size, attr); return (MMU_MAPDEV_ATTR(mmu_obj, pa, size, attr)); } void pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) { CTR3(KTR_PMAP, "%s(%p, %#x)", __func__, m, ma); return (MMU_PAGE_SET_MEMATTR(mmu_obj, m, ma)); } void pmap_unmapdev(vm_offset_t va, vm_size_t size) { CTR3(KTR_PMAP, "%s(%#x, %#x)", __func__, va, size); MMU_UNMAPDEV(mmu_obj, va, size); } vm_paddr_t pmap_kextract(vm_offset_t va) { CTR2(KTR_PMAP, "%s(%#x)", __func__, va); return (MMU_KEXTRACT(mmu_obj, va)); } void pmap_kenter(vm_offset_t va, vm_paddr_t pa) { CTR3(KTR_PMAP, "%s(%#x, %#x)", __func__, va, pa); MMU_KENTER(mmu_obj, va, pa); } void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, vm_memattr_t ma) { CTR4(KTR_PMAP, "%s(%#x, %#x, %#x)", __func__, va, pa, ma); MMU_KENTER_ATTR(mmu_obj, va, pa, ma); } void pmap_kremove(vm_offset_t va) { CTR2(KTR_PMAP, "%s(%#x)", __func__, va); return (MMU_KREMOVE(mmu_obj, va)); } int pmap_map_user_ptr(pmap_t pm, volatile const void *uaddr, void **kaddr, size_t ulen, size_t *klen) { CTR2(KTR_PMAP, "%s(%p)", __func__, uaddr); return (MMU_MAP_USER_PTR(mmu_obj, pm, uaddr, kaddr, ulen, klen)); } int pmap_decode_kernel_ptr(vm_offset_t addr, int *is_user, vm_offset_t *decoded) { CTR2(KTR_PMAP, "%s(%#jx)", __func__, (uintmax_t)addr); return (MMU_DECODE_KERNEL_PTR(mmu_obj, addr, is_user, decoded)); } boolean_t pmap_dev_direct_mapped(vm_paddr_t pa, vm_size_t size) { CTR3(KTR_PMAP, "%s(%#x, %#x)", __func__, pa, size); return (MMU_DEV_DIRECT_MAPPED(mmu_obj, pa, size)); } void pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz) { CTR4(KTR_PMAP, "%s(%p, %#x, %#x)", __func__, pm, va, sz); return (MMU_SYNC_ICACHE(mmu_obj, pm, va, sz)); } void dumpsys_map_chunk(vm_paddr_t pa, size_t sz, void **va) { CTR4(KTR_PMAP, "%s(%#jx, %#zx, %p)", __func__, (uintmax_t)pa, sz, va); return (MMU_DUMPSYS_MAP(mmu_obj, pa, sz, va)); } void dumpsys_unmap_chunk(vm_paddr_t pa, size_t sz, void *va) { CTR4(KTR_PMAP, "%s(%#jx, %#zx, %p)", __func__, (uintmax_t)pa, sz, va); return (MMU_DUMPSYS_UNMAP(mmu_obj, pa, sz, va)); } void dumpsys_pa_init(void) { CTR1(KTR_PMAP, "%s()", __func__); return (MMU_SCAN_INIT(mmu_obj)); } size_t dumpsys_scan_pmap(void) { CTR1(KTR_PMAP, "%s()", __func__); return (MMU_SCAN_PMAP(mmu_obj)); } void * dumpsys_dump_pmap_init(unsigned blkpgs) { CTR1(KTR_PMAP, "%s()", __func__); return (MMU_DUMP_PMAP_INIT(mmu_obj, blkpgs)); } void * dumpsys_dump_pmap(void *ctx, void *buf, u_long *nbytes) { CTR1(KTR_PMAP, "%s()", __func__); return (MMU_DUMP_PMAP(mmu_obj, ctx, buf, nbytes)); } vm_offset_t pmap_quick_enter_page(vm_page_t m) { CTR2(KTR_PMAP, "%s(%p)", __func__, m); return (MMU_QUICK_ENTER_PAGE(mmu_obj, m)); } void pmap_quick_remove_page(vm_offset_t addr) { CTR2(KTR_PMAP, "%s(%#x)", __func__, addr); MMU_QUICK_REMOVE_PAGE(mmu_obj, addr); } int pmap_change_attr(vm_offset_t addr, vm_size_t size, vm_memattr_t mode) { CTR4(KTR_PMAP, "%s(%#x, %#zx, %d)", __func__, addr, size, mode); return (MMU_CHANGE_ATTR(mmu_obj, addr, size, mode)); } void pmap_page_array_startup(long pages) { CTR2(KTR_PMAP, "%s(%ld)", __func__, pages); MMU_PAGE_ARRAY_STARTUP(mmu_obj, pages); } +boolean_t +pmap_page_is_mapped(vm_page_t m) +{ + CTR2(KTR_PMAP, "%s(%p)", __func__, m); + return (MMU_PAGE_IS_MAPPED(mmu_obj, m)); +} + +bool +pmap_ps_enabled(pmap_t pmap) +{ + CTR2(KTR_PMAP, "%s(%p)", __func__, pmap); + return (MMU_PS_ENABLED(mmu_obj, pmap)); +} + /* * MMU install routines. Highest priority wins, equal priority also * overrides allowing last-set to win. */ SET_DECLARE(mmu_set, mmu_def_t); boolean_t pmap_mmu_install(char *name, int prio) { mmu_def_t **mmupp, *mmup; static int curr_prio = 0; /* * Try and locate the MMU kobj corresponding to the name */ SET_FOREACH(mmupp, mmu_set) { mmup = *mmupp; if (mmup->name && !strcmp(mmup->name, name) && (prio >= curr_prio || mmu_def_impl == NULL)) { curr_prio = prio; mmu_def_impl = mmup; return (TRUE); } } return (FALSE); } const char * pmap_mmu_name(void) { return (mmu_obj->ops->cls->name); } int unmapped_buf_allowed; boolean_t pmap_is_valid_memattr(pmap_t pmap __unused, vm_memattr_t mode) { switch (mode) { case VM_MEMATTR_DEFAULT: case VM_MEMATTR_UNCACHEABLE: case VM_MEMATTR_CACHEABLE: case VM_MEMATTR_WRITE_COMBINING: case VM_MEMATTR_WRITE_BACK: case VM_MEMATTR_WRITE_THROUGH: case VM_MEMATTR_PREFETCHABLE: return (TRUE); default: return (FALSE); } } Index: head/sys/powerpc/powerpc/trap.c =================================================================== --- head/sys/powerpc/powerpc/trap.c (revision 360886) +++ head/sys/powerpc/powerpc/trap.c (revision 360887) @@ -1,937 +1,968 @@ /*- * Copyright (C) 1995, 1996 Wolfgang Solfrank. * Copyright (C) 1995, 1996 TooLs GmbH. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by TooLs GmbH. * 4. The name of TooLs GmbH may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY TOOLS GMBH ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $NetBSD: trap.c,v 1.58 2002/03/04 04:07:35 dbj Exp $ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* Below matches setjmp.S */ #define FAULTBUF_LR 21 #define FAULTBUF_R1 1 #define FAULTBUF_R2 2 #define FAULTBUF_CR 22 #define FAULTBUF_R14 3 #define MOREARGS(sp) ((caddr_t)((uintptr_t)(sp) + \ sizeof(struct callframe) - 3*sizeof(register_t))) /* more args go here */ static void trap_fatal(struct trapframe *frame); static void printtrap(u_int vector, struct trapframe *frame, int isfatal, int user); static bool trap_pfault(struct trapframe *frame, bool user, int *signo, int *ucode); static int fix_unaligned(struct thread *td, struct trapframe *frame); static int handle_onfault(struct trapframe *frame); static void syscall(struct trapframe *frame); #if defined(__powerpc64__) && defined(AIM) static void normalize_inputs(void); #endif extern vm_offset_t __startkernel; #ifdef KDB int db_trap_glue(struct trapframe *); /* Called from trap_subr.S */ #endif struct powerpc_exception { u_int vector; char *name; }; #ifdef KDTRACE_HOOKS #include int (*dtrace_invop_jump_addr)(struct trapframe *); #endif static struct powerpc_exception powerpc_exceptions[] = { { EXC_CRIT, "critical input" }, { EXC_RST, "system reset" }, { EXC_MCHK, "machine check" }, { EXC_DSI, "data storage interrupt" }, { EXC_DSE, "data segment exception" }, { EXC_ISI, "instruction storage interrupt" }, { EXC_ISE, "instruction segment exception" }, { EXC_EXI, "external interrupt" }, { EXC_ALI, "alignment" }, { EXC_PGM, "program" }, { EXC_HEA, "hypervisor emulation assistance" }, { EXC_FPU, "floating-point unavailable" }, { EXC_APU, "auxiliary proc unavailable" }, { EXC_DECR, "decrementer" }, { EXC_FIT, "fixed-interval timer" }, { EXC_WDOG, "watchdog timer" }, { EXC_SC, "system call" }, { EXC_TRC, "trace" }, { EXC_FPA, "floating-point assist" }, { EXC_DEBUG, "debug" }, { EXC_PERF, "performance monitoring" }, { EXC_VEC, "altivec unavailable" }, { EXC_VSX, "vsx unavailable" }, { EXC_FAC, "facility unavailable" }, { EXC_ITMISS, "instruction tlb miss" }, { EXC_DLMISS, "data load tlb miss" }, { EXC_DSMISS, "data store tlb miss" }, { EXC_BPT, "instruction breakpoint" }, { EXC_SMI, "system management" }, { EXC_VECAST_G4, "altivec assist" }, { EXC_THRM, "thermal management" }, { EXC_RUNMODETRC, "run mode/trace" }, { EXC_SOFT_PATCH, "soft patch exception" }, { EXC_LAST, NULL } }; #define ESR_BITMASK \ "\20" \ "\040b0\037b1\036b2\035b3\034PIL\033PRR\032PTR\031FP" \ "\030ST\027b9\026DLK\025ILK\024b12\023b13\022BO\021PIE" \ "\020b16\017b17\016b18\015b19\014b20\013b21\012b22\011b23" \ "\010SPE\007EPID\006b26\005b27\004b28\003b29\002b30\001b31" #define MCSR_BITMASK \ "\20" \ "\040MCP\037ICERR\036DCERR\035TLBPERR\034L2MMU_MHIT\033b5\032b6\031b7" \ "\030b8\027b9\026b10\025NMI\024MAV\023MEA\022b14\021IF" \ "\020LD\017ST\016LDG\015b19\014b20\013b21\012b22\011b23" \ "\010b24\007b25\006b26\005b27\004b28\003b29\002TLBSYNC\001BSL2_ERR" #define MSSSR_BITMASK \ "\20" \ "\040b0\037b1\036b2\035b3\034b4\033b5\032b6\031b7" \ "\030b8\027b9\026b10\025b11\024b12\023L2TAG\022L2DAT\021L3TAG" \ "\020L3DAT\017APE\016DPE\015TEA\014b20\013b21\012b22\011b23" \ "\010b24\007b25\006b26\005b27\004b28\003b29\002b30\001b31" static const char * trapname(u_int vector) { struct powerpc_exception *pe; for (pe = powerpc_exceptions; pe->vector != EXC_LAST; pe++) { if (pe->vector == vector) return (pe->name); } return ("unknown"); } static inline bool frame_is_trap_inst(struct trapframe *frame) { #ifdef AIM return (frame->exc == EXC_PGM && frame->srr1 & EXC_PGM_TRAP); #else return ((frame->cpu.booke.esr & ESR_PTR) != 0); #endif } void trap(struct trapframe *frame) { struct thread *td; struct proc *p; #ifdef KDTRACE_HOOKS uint32_t inst; #endif int sig, type, user; u_int ucode; ksiginfo_t ksi; register_t fscr; VM_CNT_INC(v_trap); #ifdef KDB if (kdb_active) { kdb_reenter(); return; } #endif td = curthread; p = td->td_proc; type = ucode = frame->exc; sig = 0; user = frame->srr1 & PSL_PR; CTR3(KTR_TRAP, "trap: %s type=%s (%s)", td->td_name, trapname(type), user ? "user" : "kernel"); #ifdef KDTRACE_HOOKS /* * A trap can occur while DTrace executes a probe. Before * executing the probe, DTrace blocks re-scheduling and sets * a flag in its per-cpu flags to indicate that it doesn't * want to fault. On returning from the probe, the no-fault * flag is cleared and finally re-scheduling is enabled. * * If the DTrace kernel module has registered a trap handler, * call it and if it returns non-zero, assume that it has * handled the trap and modified the trap frame so that this * function can return normally. */ if (dtrace_trap_func != NULL && (*dtrace_trap_func)(frame, type) != 0) return; #endif if (user) { td->td_pticks = 0; td->td_frame = frame; if (td->td_cowgen != p->p_cowgen) thread_cow_update(td); /* User Mode Traps */ switch (type) { case EXC_RUNMODETRC: case EXC_TRC: frame->srr1 &= ~PSL_SE; sig = SIGTRAP; ucode = TRAP_TRACE; break; #if defined(__powerpc64__) && defined(AIM) case EXC_ISE: case EXC_DSE: - if (handle_user_slb_spill(&p->p_vmspace->vm_pmap, + /* DSE/ISE are automatically fatal with radix pmap. */ + if (radix_mmu || + handle_user_slb_spill(&p->p_vmspace->vm_pmap, (type == EXC_ISE) ? frame->srr0 : frame->dar) != 0){ sig = SIGSEGV; ucode = SEGV_MAPERR; } break; #endif case EXC_DSI: case EXC_ISI: if (trap_pfault(frame, true, &sig, &ucode)) sig = 0; break; case EXC_SC: syscall(frame); break; case EXC_FPU: KASSERT((td->td_pcb->pcb_flags & PCB_FPU) != PCB_FPU, ("FPU already enabled for thread")); enable_fpu(td); break; case EXC_VEC: KASSERT((td->td_pcb->pcb_flags & PCB_VEC) != PCB_VEC, ("Altivec already enabled for thread")); enable_vec(td); break; case EXC_VSX: KASSERT((td->td_pcb->pcb_flags & PCB_VSX) != PCB_VSX, ("VSX already enabled for thread")); if (!(td->td_pcb->pcb_flags & PCB_VEC)) enable_vec(td); if (td->td_pcb->pcb_flags & PCB_FPU) save_fpu(td); td->td_pcb->pcb_flags |= PCB_VSX; enable_fpu(td); break; case EXC_FAC: fscr = mfspr(SPR_FSCR); switch (fscr & FSCR_IC_MASK) { case FSCR_IC_HTM: CTR0(KTR_TRAP, "Hardware Transactional Memory subsystem disabled"); sig = SIGILL; ucode = ILL_ILLOPC; break; case FSCR_IC_DSCR: td->td_pcb->pcb_flags |= PCB_CFSCR | PCB_CDSCR; fscr |= FSCR_DSCR; mtspr(SPR_DSCR, 0); break; case FSCR_IC_EBB: td->td_pcb->pcb_flags |= PCB_CFSCR; fscr |= FSCR_EBB; mtspr(SPR_EBBHR, 0); mtspr(SPR_EBBRR, 0); mtspr(SPR_BESCR, 0); break; case FSCR_IC_TAR: td->td_pcb->pcb_flags |= PCB_CFSCR; fscr |= FSCR_TAR; mtspr(SPR_TAR, 0); break; case FSCR_IC_LM: td->td_pcb->pcb_flags |= PCB_CFSCR; fscr |= FSCR_LM; mtspr(SPR_LMRR, 0); mtspr(SPR_LMSER, 0); break; default: sig = SIGILL; ucode = ILL_ILLOPC; } mtspr(SPR_FSCR, fscr & ~FSCR_IC_MASK); break; case EXC_HEA: sig = SIGILL; ucode = ILL_ILLOPC; break; case EXC_VECAST_E: case EXC_VECAST_G4: case EXC_VECAST_G5: /* * We get a VPU assist exception for IEEE mode * vector operations on denormalized floats. * Emulating this is a giant pain, so for now, * just switch off IEEE mode and treat them as * zero. */ save_vec(td); td->td_pcb->pcb_vec.vscr |= ALTIVEC_VSCR_NJ; enable_vec(td); break; case EXC_ALI: if (fix_unaligned(td, frame) != 0) { sig = SIGBUS; ucode = BUS_ADRALN; } else frame->srr0 += 4; break; case EXC_DEBUG: /* Single stepping */ mtspr(SPR_DBSR, mfspr(SPR_DBSR)); frame->srr1 &= ~PSL_DE; frame->cpu.booke.dbcr0 &= ~(DBCR0_IDM | DBCR0_IC); sig = SIGTRAP; ucode = TRAP_TRACE; break; case EXC_PGM: /* Identify the trap reason */ if (frame_is_trap_inst(frame)) { #ifdef KDTRACE_HOOKS inst = fuword32((const void *)frame->srr0); if (inst == 0x0FFFDDDD && dtrace_pid_probe_ptr != NULL) { (*dtrace_pid_probe_ptr)(frame); break; } #endif sig = SIGTRAP; ucode = TRAP_BRKPT; } else { sig = ppc_instr_emulate(frame, td); if (sig == SIGILL) { if (frame->srr1 & EXC_PGM_PRIV) ucode = ILL_PRVOPC; else if (frame->srr1 & EXC_PGM_ILLEGAL) ucode = ILL_ILLOPC; } else if (sig == SIGFPE) ucode = FPE_FLTINV; /* Punt for now, invalid operation. */ } break; case EXC_MCHK: sig = cpu_machine_check(td, frame, &ucode); printtrap(frame->exc, frame, 0, (frame->srr1 & PSL_PR)); break; #if defined(__powerpc64__) && defined(AIM) case EXC_SOFT_PATCH: /* * Point to the instruction that generated the exception to execute it again, * and normalize the register values. */ frame->srr0 -= 4; normalize_inputs(); break; #endif default: trap_fatal(frame); } } else { /* Kernel Mode Traps */ KASSERT(cold || td->td_ucred != NULL, ("kernel trap doesn't have ucred")); switch (type) { case EXC_PGM: #ifdef KDTRACE_HOOKS if (frame_is_trap_inst(frame)) { if (*(uint32_t *)frame->srr0 == EXC_DTRACE) { if (dtrace_invop_jump_addr != NULL) { dtrace_invop_jump_addr(frame); return; } } } #endif #ifdef KDB if (db_trap_glue(frame)) return; #endif break; #if defined(__powerpc64__) && defined(AIM) case EXC_DSE: + /* DSE on radix mmu is automatically fatal. */ + if (radix_mmu) + break; if (td->td_pcb->pcb_cpu.aim.usr_vsid != 0 && (frame->dar & SEGMENT_MASK) == USER_ADDR) { __asm __volatile ("slbmte %0, %1" :: "r"(td->td_pcb->pcb_cpu.aim.usr_vsid), "r"(USER_SLB_SLBE)); return; } break; #endif case EXC_DSI: if (trap_pfault(frame, false, NULL, NULL)) return; break; case EXC_MCHK: if (handle_onfault(frame)) return; break; default: break; } trap_fatal(frame); } if (sig != 0) { if (p->p_sysent->sv_transtrap != NULL) sig = (p->p_sysent->sv_transtrap)(sig, type); ksiginfo_init_trap(&ksi); ksi.ksi_signo = sig; ksi.ksi_code = (int) ucode; /* XXX, not POSIX */ ksi.ksi_addr = (void *)frame->srr0; ksi.ksi_trapno = type; trapsignal(td, &ksi); } userret(td, frame); } static void trap_fatal(struct trapframe *frame) { #ifdef KDB bool handled; #endif printtrap(frame->exc, frame, 1, (frame->srr1 & PSL_PR)); #ifdef KDB if (debugger_on_trap) { kdb_why = KDB_WHY_TRAP; handled = kdb_trap(frame->exc, 0, frame); kdb_why = KDB_WHY_UNSET; if (handled) return; } #endif panic("%s trap", trapname(frame->exc)); } static void cpu_printtrap(u_int vector, struct trapframe *frame, int isfatal, int user) { #ifdef AIM uint16_t ver; switch (vector) { case EXC_MCHK: ver = mfpvr() >> 16; if (MPC745X_P(ver)) printf(" msssr0 = 0x%b\n", (int)mfspr(SPR_MSSSR0), MSSSR_BITMASK); case EXC_DSE: case EXC_DSI: case EXC_DTMISS: printf(" dsisr = 0x%lx\n", (u_long)frame->cpu.aim.dsisr); break; } #elif defined(BOOKE) vm_paddr_t pa; switch (vector) { case EXC_MCHK: pa = mfspr(SPR_MCARU); pa = (pa << 32) | (u_register_t)mfspr(SPR_MCAR); printf(" mcsr = 0x%b\n", (int)mfspr(SPR_MCSR), MCSR_BITMASK); printf(" mcar = 0x%jx\n", (uintmax_t)pa); } printf(" esr = 0x%b\n", (int)frame->cpu.booke.esr, ESR_BITMASK); #endif } static void printtrap(u_int vector, struct trapframe *frame, int isfatal, int user) { printf("\n"); printf("%s %s trap:\n", isfatal ? "fatal" : "handled", user ? "user" : "kernel"); printf("\n"); printf(" exception = 0x%x (%s)\n", vector, trapname(vector)); switch (vector) { case EXC_DSE: case EXC_DSI: case EXC_DTMISS: case EXC_ALI: case EXC_MCHK: printf(" virtual address = 0x%" PRIxPTR "\n", frame->dar); break; case EXC_ISE: case EXC_ISI: case EXC_ITMISS: printf(" virtual address = 0x%" PRIxPTR "\n", frame->srr0); break; } cpu_printtrap(vector, frame, isfatal, user); printf(" srr0 = 0x%" PRIxPTR " (0x%" PRIxPTR ")\n", frame->srr0, frame->srr0 - (register_t)(__startkernel - KERNBASE)); printf(" srr1 = 0x%lx\n", (u_long)frame->srr1); printf(" current msr = 0x%" PRIxPTR "\n", mfmsr()); printf(" lr = 0x%" PRIxPTR " (0x%" PRIxPTR ")\n", frame->lr, frame->lr - (register_t)(__startkernel - KERNBASE)); printf(" frame = %p\n", frame); printf(" curthread = %p\n", curthread); if (curthread != NULL) printf(" pid = %d, comm = %s\n", curthread->td_proc->p_pid, curthread->td_name); printf("\n"); } /* * Handles a fatal fault when we have onfault state to recover. Returns * non-zero if there was onfault recovery state available. */ static int handle_onfault(struct trapframe *frame) { struct thread *td; jmp_buf *fb; td = curthread; fb = td->td_pcb->pcb_onfault; if (fb != NULL) { frame->srr0 = (*fb)->_jb[FAULTBUF_LR]; frame->fixreg[1] = (*fb)->_jb[FAULTBUF_R1]; frame->fixreg[2] = (*fb)->_jb[FAULTBUF_R2]; frame->fixreg[3] = 1; frame->cr = (*fb)->_jb[FAULTBUF_CR]; bcopy(&(*fb)->_jb[FAULTBUF_R14], &frame->fixreg[14], 18 * sizeof(register_t)); td->td_pcb->pcb_onfault = NULL; /* Returns twice, not thrice */ return (1); } return (0); } int cpu_fetch_syscall_args(struct thread *td) { struct proc *p; struct trapframe *frame; struct syscall_args *sa; caddr_t params; size_t argsz; int error, n, i; p = td->td_proc; frame = td->td_frame; sa = &td->td_sa; sa->code = frame->fixreg[0]; params = (caddr_t)(frame->fixreg + FIRSTARG); n = NARGREG; if (sa->code == SYS_syscall) { /* * code is first argument, * followed by actual args. */ sa->code = *(register_t *) params; params += sizeof(register_t); n -= 1; } else if (sa->code == SYS___syscall) { /* * Like syscall, but code is a quad, * so as to maintain quad alignment * for the rest of the args. */ if (SV_PROC_FLAG(p, SV_ILP32)) { params += sizeof(register_t); sa->code = *(register_t *) params; params += sizeof(register_t); n -= 2; } else { sa->code = *(register_t *) params; params += sizeof(register_t); n -= 1; } } if (sa->code >= p->p_sysent->sv_size) sa->callp = &p->p_sysent->sv_table[0]; else sa->callp = &p->p_sysent->sv_table[sa->code]; sa->narg = sa->callp->sy_narg; if (SV_PROC_FLAG(p, SV_ILP32)) { argsz = sizeof(uint32_t); for (i = 0; i < n; i++) sa->args[i] = ((u_register_t *)(params))[i] & 0xffffffff; } else { argsz = sizeof(uint64_t); for (i = 0; i < n; i++) sa->args[i] = ((u_register_t *)(params))[i]; } if (sa->narg > n) error = copyin(MOREARGS(frame->fixreg[1]), sa->args + n, (sa->narg - n) * argsz); else error = 0; #ifdef __powerpc64__ if (SV_PROC_FLAG(p, SV_ILP32) && sa->narg > n) { /* Expand the size of arguments copied from the stack */ for (i = sa->narg; i >= n; i--) sa->args[i] = ((uint32_t *)(&sa->args[n]))[i-n]; } #endif if (error == 0) { td->td_retval[0] = 0; td->td_retval[1] = frame->fixreg[FIRSTARG + 1]; } return (error); } #include "../../kern/subr_syscall.c" void syscall(struct trapframe *frame) { struct thread *td; td = curthread; td->td_frame = frame; #if defined(__powerpc64__) && defined(AIM) /* * Speculatively restore last user SLB segment, which we know is * invalid already, since we are likely to do copyin()/copyout(). */ if (td->td_pcb->pcb_cpu.aim.usr_vsid != 0) __asm __volatile ("slbmte %0, %1; isync" :: "r"(td->td_pcb->pcb_cpu.aim.usr_vsid), "r"(USER_SLB_SLBE)); #endif syscallenter(td); syscallret(td); } static bool trap_pfault(struct trapframe *frame, bool user, int *signo, int *ucode) { vm_offset_t eva; struct thread *td; struct proc *p; vm_map_t map; vm_prot_t ftype; int rv, is_user; td = curthread; p = td->td_proc; if (frame->exc == EXC_ISI) { eva = frame->srr0; ftype = VM_PROT_EXECUTE; if (frame->srr1 & SRR1_ISI_PFAULT) ftype |= VM_PROT_READ; } else { eva = frame->dar; #ifdef BOOKE if (frame->cpu.booke.esr & ESR_ST) #else if (frame->cpu.aim.dsisr & DSISR_STORE) #endif ftype = VM_PROT_WRITE; else ftype = VM_PROT_READ; } +#if defined(__powerpc64__) && defined(AIM) + if (radix_mmu && pmap_nofault(&p->p_vmspace->vm_pmap, eva, ftype) == 0) + return (true); +#endif + if (__predict_false((td->td_pflags & TDP_NOFAULTING) == 0)) { + /* + * If we get a page fault while in a critical section, then + * it is most likely a fatal kernel page fault. The kernel + * is already going to panic trying to get a sleep lock to + * do the VM lookup, so just consider it a fatal trap so the + * kernel can print out a useful trap message and even get + * to the debugger. + * + * If we get a page fault while holding a non-sleepable + * lock, then it is most likely a fatal kernel page fault. + * If WITNESS is enabled, then it's going to whine about + * bogus LORs with various VM locks, so just skip to the + * fatal trap handling directly. + */ + if (td->td_critnest != 0 || + WITNESS_CHECK(WARN_SLEEPOK | WARN_GIANTOK, NULL, + "Kernel page fault") != 0) { + trap_fatal(frame); + return (false); + } + } if (user) { KASSERT(p->p_vmspace != NULL, ("trap_pfault: vmspace NULL")); map = &p->p_vmspace->vm_map; } else { rv = pmap_decode_kernel_ptr(eva, &is_user, &eva); if (rv != 0) return (false); if (is_user) map = &p->p_vmspace->vm_map; else map = kernel_map; } /* Fault in the page. */ rv = vm_fault_trap(map, eva, ftype, VM_FAULT_NORMAL, signo, ucode); /* * XXXDTRACE: add dtrace_doubletrap_func here? */ if (rv == KERN_SUCCESS) return (true); if (!user && handle_onfault(frame)) return (true); return (false); } /* * For now, this only deals with the particular unaligned access case * that gcc tends to generate. Eventually it should handle all of the * possibilities that can happen on a 32-bit PowerPC in big-endian mode. */ static int fix_unaligned(struct thread *td, struct trapframe *frame) { struct thread *fputhread; #ifdef BOOKE uint32_t inst; #endif int indicator, reg; double *fpr; #ifdef __SPE__ indicator = (frame->cpu.booke.esr & (ESR_ST|ESR_SPE)); if (indicator & ESR_SPE) { if (copyin((void *)frame->srr0, &inst, sizeof(inst)) != 0) return (-1); reg = EXC_ALI_INST_RST(inst); fpr = (double *)td->td_pcb->pcb_vec.vr[reg]; fputhread = PCPU_GET(vecthread); /* Juggle the SPE to ensure that we've initialized * the registers, and that their current state is in * the PCB. */ if (fputhread != td) { if (fputhread) save_vec(fputhread); enable_vec(td); } save_vec(td); if (!(indicator & ESR_ST)) { if (copyin((void *)frame->dar, fpr, sizeof(double)) != 0) return (-1); frame->fixreg[reg] = td->td_pcb->pcb_vec.vr[reg][1]; enable_vec(td); } else { td->td_pcb->pcb_vec.vr[reg][1] = frame->fixreg[reg]; if (copyout(fpr, (void *)frame->dar, sizeof(double)) != 0) return (-1); } return (0); } #else #ifdef BOOKE indicator = (frame->cpu.booke.esr & ESR_ST) ? EXC_ALI_STFD : EXC_ALI_LFD; #else indicator = EXC_ALI_OPCODE_INDICATOR(frame->cpu.aim.dsisr); #endif switch (indicator) { case EXC_ALI_LFD: case EXC_ALI_STFD: #ifdef BOOKE if (copyin((void *)frame->srr0, &inst, sizeof(inst)) != 0) return (-1); reg = EXC_ALI_INST_RST(inst); #else reg = EXC_ALI_RST(frame->cpu.aim.dsisr); #endif fpr = &td->td_pcb->pcb_fpu.fpr[reg].fpr; fputhread = PCPU_GET(fputhread); /* Juggle the FPU to ensure that we've initialized * the FPRs, and that their current state is in * the PCB. */ if (fputhread != td) { if (fputhread) save_fpu(fputhread); enable_fpu(td); } save_fpu(td); if (indicator == EXC_ALI_LFD) { if (copyin((void *)frame->dar, fpr, sizeof(double)) != 0) return (-1); enable_fpu(td); } else { if (copyout(fpr, (void *)frame->dar, sizeof(double)) != 0) return (-1); } return (0); break; } #endif return (-1); } #if defined(__powerpc64__) && defined(AIM) #define MSKNSHL(x, m, n) "(((" #x ") & " #m ") << " #n ")" #define MSKNSHR(x, m, n) "(((" #x ") & " #m ") >> " #n ")" /* xvcpsgndp instruction, built in opcode format. * This can be changed to use mnemonic after a toolchain update. */ #define XVCPSGNDP(xt, xa, xb) \ __asm __volatile(".long (" \ MSKNSHL(60, 0x3f, 26) " | " \ MSKNSHL(xt, 0x1f, 21) " | " \ MSKNSHL(xa, 0x1f, 16) " | " \ MSKNSHL(xb, 0x1f, 11) " | " \ MSKNSHL(240, 0xff, 3) " | " \ MSKNSHR(xa, 0x20, 3) " | " \ MSKNSHR(xa, 0x20, 4) " | " \ MSKNSHR(xa, 0x20, 5) ")") /* Macros to normalize 1 or 10 VSX registers */ #define NORM(x) XVCPSGNDP(x, x, x) #define NORM10(x) \ NORM(x ## 0); NORM(x ## 1); NORM(x ## 2); NORM(x ## 3); NORM(x ## 4); \ NORM(x ## 5); NORM(x ## 6); NORM(x ## 7); NORM(x ## 8); NORM(x ## 9) static void normalize_inputs(void) { unsigned long msr; /* enable VSX */ msr = mfmsr(); mtmsr(msr | PSL_VSX); NORM(0); NORM(1); NORM(2); NORM(3); NORM(4); NORM(5); NORM(6); NORM(7); NORM(8); NORM(9); NORM10(1); NORM10(2); NORM10(3); NORM10(4); NORM10(5); NORM(60); NORM(61); NORM(62); NORM(63); /* restore MSR */ mtmsr(msr); } #endif #ifdef KDB int db_trap_glue(struct trapframe *frame) { if (!(frame->srr1 & PSL_PR) && (frame->exc == EXC_TRC || frame->exc == EXC_RUNMODETRC || frame_is_trap_inst(frame) || frame->exc == EXC_BPT || frame->exc == EXC_DEBUG || frame->exc == EXC_DSI)) { int type = frame->exc; /* Ignore DTrace traps. */ if (*(uint32_t *)frame->srr0 == EXC_DTRACE) return (0); if (frame_is_trap_inst(frame)) { type = T_BREAKPOINT; } return (kdb_trap(type, 0, frame)); } return (0); } #endif Index: head/sys/vm/vm_fault.c =================================================================== --- head/sys/vm/vm_fault.c (revision 360886) +++ head/sys/vm/vm_fault.c (revision 360887) @@ -1,2014 +1,2014 @@ /*- * SPDX-License-Identifier: (BSD-4-Clause AND MIT-CMU) * * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. * Copyright (c) 1994 David Greenman * All rights reserved. * * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_fault.c 8.4 (Berkeley) 1/12/94 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ /* * Page fault handling module. */ #include __FBSDID("$FreeBSD$"); #include "opt_ktrace.h" #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #include #include #include #include #include #include #include #include #include #include #include #define PFBAK 4 #define PFFOR 4 #define VM_FAULT_READ_DEFAULT (1 + VM_FAULT_READ_AHEAD_INIT) #define VM_FAULT_READ_MAX (1 + VM_FAULT_READ_AHEAD_MAX) #define VM_FAULT_DONTNEED_MIN 1048576 struct faultstate { /* Fault parameters. */ vm_offset_t vaddr; vm_page_t *m_hold; vm_prot_t fault_type; vm_prot_t prot; int fault_flags; int oom; boolean_t wired; /* Page reference for cow. */ vm_page_t m_cow; /* Current object. */ vm_object_t object; vm_pindex_t pindex; vm_page_t m; /* Top-level map object. */ vm_object_t first_object; vm_pindex_t first_pindex; vm_page_t first_m; /* Map state. */ vm_map_t map; vm_map_entry_t entry; int map_generation; bool lookup_still_valid; /* Vnode if locked. */ struct vnode *vp; }; static void vm_fault_dontneed(const struct faultstate *fs, vm_offset_t vaddr, int ahead); static void vm_fault_prefault(const struct faultstate *fs, vm_offset_t addra, int backward, int forward, bool obj_locked); static int vm_pfault_oom_attempts = 3; SYSCTL_INT(_vm, OID_AUTO, pfault_oom_attempts, CTLFLAG_RWTUN, &vm_pfault_oom_attempts, 0, "Number of page allocation attempts in page fault handler before it " "triggers OOM handling"); static int vm_pfault_oom_wait = 10; SYSCTL_INT(_vm, OID_AUTO, pfault_oom_wait, CTLFLAG_RWTUN, &vm_pfault_oom_wait, 0, "Number of seconds to wait for free pages before retrying " "the page fault handler"); static inline void fault_page_release(vm_page_t *mp) { vm_page_t m; m = *mp; if (m != NULL) { /* * We are likely to loop around again and attempt to busy * this page. Deactivating it leaves it available for * pageout while optimizing fault restarts. */ vm_page_deactivate(m); vm_page_xunbusy(m); *mp = NULL; } } static inline void fault_page_free(vm_page_t *mp) { vm_page_t m; m = *mp; if (m != NULL) { VM_OBJECT_ASSERT_WLOCKED(m->object); if (!vm_page_wired(m)) vm_page_free(m); else vm_page_xunbusy(m); *mp = NULL; } } static inline void unlock_map(struct faultstate *fs) { if (fs->lookup_still_valid) { vm_map_lookup_done(fs->map, fs->entry); fs->lookup_still_valid = false; } } static void unlock_vp(struct faultstate *fs) { if (fs->vp != NULL) { vput(fs->vp); fs->vp = NULL; } } static void fault_deallocate(struct faultstate *fs) { fault_page_release(&fs->m_cow); fault_page_release(&fs->m); vm_object_pip_wakeup(fs->object); if (fs->object != fs->first_object) { VM_OBJECT_WLOCK(fs->first_object); fault_page_free(&fs->first_m); VM_OBJECT_WUNLOCK(fs->first_object); vm_object_pip_wakeup(fs->first_object); } vm_object_deallocate(fs->first_object); unlock_map(fs); unlock_vp(fs); } static void unlock_and_deallocate(struct faultstate *fs) { VM_OBJECT_WUNLOCK(fs->object); fault_deallocate(fs); } static void vm_fault_dirty(struct faultstate *fs, vm_page_t m) { bool need_dirty; if (((fs->prot & VM_PROT_WRITE) == 0 && (fs->fault_flags & VM_FAULT_DIRTY) == 0) || (m->oflags & VPO_UNMANAGED) != 0) return; VM_PAGE_OBJECT_BUSY_ASSERT(m); need_dirty = ((fs->fault_type & VM_PROT_WRITE) != 0 && (fs->fault_flags & VM_FAULT_WIRE) == 0) || (fs->fault_flags & VM_FAULT_DIRTY) != 0; vm_object_set_writeable_dirty(m->object); /* * If the fault is a write, we know that this page is being * written NOW so dirty it explicitly to save on * pmap_is_modified() calls later. * * Also, since the page is now dirty, we can possibly tell * the pager to release any swap backing the page. */ if (need_dirty && vm_page_set_dirty(m) == 0) { /* * If this is a NOSYNC mmap we do not want to set PGA_NOSYNC * if the page is already dirty to prevent data written with * the expectation of being synced from not being synced. * Likewise if this entry does not request NOSYNC then make * sure the page isn't marked NOSYNC. Applications sharing * data should use the same flags to avoid ping ponging. */ if ((fs->entry->eflags & MAP_ENTRY_NOSYNC) != 0) vm_page_aflag_set(m, PGA_NOSYNC); else vm_page_aflag_clear(m, PGA_NOSYNC); } } /* * Unlocks fs.first_object and fs.map on success. */ static int vm_fault_soft_fast(struct faultstate *fs) { vm_page_t m, m_map; -#if (defined(__aarch64__) || defined(__amd64__) || (defined(__arm__) && \ +#if (defined(__aarch64__) || defined(__amd64__) || defined(__powerpc64__) || (defined(__arm__) && \ __ARM_ARCH >= 6) || defined(__i386__) || defined(__riscv)) && \ VM_NRESERVLEVEL > 0 vm_page_t m_super; int flags; #endif int psind, rv; vm_offset_t vaddr; MPASS(fs->vp == NULL); vaddr = fs->vaddr; vm_object_busy(fs->first_object); m = vm_page_lookup(fs->first_object, fs->first_pindex); /* A busy page can be mapped for read|execute access. */ if (m == NULL || ((fs->prot & VM_PROT_WRITE) != 0 && vm_page_busied(m)) || !vm_page_all_valid(m)) { rv = KERN_FAILURE; goto out; } m_map = m; psind = 0; -#if (defined(__aarch64__) || defined(__amd64__) || (defined(__arm__) && \ +#if (defined(__aarch64__) || defined(__amd64__) || defined(__powerpc64__) || (defined(__arm__) && \ __ARM_ARCH >= 6) || defined(__i386__) || defined(__riscv)) && \ VM_NRESERVLEVEL > 0 if ((m->flags & PG_FICTITIOUS) == 0 && (m_super = vm_reserv_to_superpage(m)) != NULL && rounddown2(vaddr, pagesizes[m_super->psind]) >= fs->entry->start && roundup2(vaddr + 1, pagesizes[m_super->psind]) <= fs->entry->end && (vaddr & (pagesizes[m_super->psind] - 1)) == (VM_PAGE_TO_PHYS(m) & (pagesizes[m_super->psind] - 1)) && !fs->wired && pmap_ps_enabled(fs->map->pmap)) { flags = PS_ALL_VALID; if ((fs->prot & VM_PROT_WRITE) != 0) { /* * Create a superpage mapping allowing write access * only if none of the constituent pages are busy and * all of them are already dirty (except possibly for * the page that was faulted on). */ flags |= PS_NONE_BUSY; if ((fs->first_object->flags & OBJ_UNMANAGED) == 0) flags |= PS_ALL_DIRTY; } if (vm_page_ps_test(m_super, flags, m)) { m_map = m_super; psind = m_super->psind; vaddr = rounddown2(vaddr, pagesizes[psind]); /* Preset the modified bit for dirty superpages. */ if ((flags & PS_ALL_DIRTY) != 0) fs->fault_type |= VM_PROT_WRITE; } } #endif rv = pmap_enter(fs->map->pmap, vaddr, m_map, fs->prot, fs->fault_type | PMAP_ENTER_NOSLEEP | (fs->wired ? PMAP_ENTER_WIRED : 0), psind); if (rv != KERN_SUCCESS) goto out; if (fs->m_hold != NULL) { (*fs->m_hold) = m; vm_page_wire(m); } if (psind == 0 && !fs->wired) vm_fault_prefault(fs, vaddr, PFBAK, PFFOR, true); VM_OBJECT_RUNLOCK(fs->first_object); vm_fault_dirty(fs, m); vm_map_lookup_done(fs->map, fs->entry); curthread->td_ru.ru_minflt++; out: vm_object_unbusy(fs->first_object); return (rv); } static void vm_fault_restore_map_lock(struct faultstate *fs) { VM_OBJECT_ASSERT_WLOCKED(fs->first_object); MPASS(blockcount_read(&fs->first_object->paging_in_progress) > 0); if (!vm_map_trylock_read(fs->map)) { VM_OBJECT_WUNLOCK(fs->first_object); vm_map_lock_read(fs->map); VM_OBJECT_WLOCK(fs->first_object); } fs->lookup_still_valid = true; } static void vm_fault_populate_check_page(vm_page_t m) { /* * Check each page to ensure that the pager is obeying the * interface: the page must be installed in the object, fully * valid, and exclusively busied. */ MPASS(m != NULL); MPASS(vm_page_all_valid(m)); MPASS(vm_page_xbusied(m)); } static void vm_fault_populate_cleanup(vm_object_t object, vm_pindex_t first, vm_pindex_t last) { vm_page_t m; vm_pindex_t pidx; VM_OBJECT_ASSERT_WLOCKED(object); MPASS(first <= last); for (pidx = first, m = vm_page_lookup(object, pidx); pidx <= last; pidx++, m = vm_page_next(m)) { vm_fault_populate_check_page(m); vm_page_deactivate(m); vm_page_xunbusy(m); } } static int vm_fault_populate(struct faultstate *fs) { vm_offset_t vaddr; vm_page_t m; vm_pindex_t map_first, map_last, pager_first, pager_last, pidx; int i, npages, psind, rv; MPASS(fs->object == fs->first_object); VM_OBJECT_ASSERT_WLOCKED(fs->first_object); MPASS(blockcount_read(&fs->first_object->paging_in_progress) > 0); MPASS(fs->first_object->backing_object == NULL); MPASS(fs->lookup_still_valid); pager_first = OFF_TO_IDX(fs->entry->offset); pager_last = pager_first + atop(fs->entry->end - fs->entry->start) - 1; unlock_map(fs); unlock_vp(fs); /* * Call the pager (driver) populate() method. * * There is no guarantee that the method will be called again * if the current fault is for read, and a future fault is * for write. Report the entry's maximum allowed protection * to the driver. */ rv = vm_pager_populate(fs->first_object, fs->first_pindex, fs->fault_type, fs->entry->max_protection, &pager_first, &pager_last); VM_OBJECT_ASSERT_WLOCKED(fs->first_object); if (rv == VM_PAGER_BAD) { /* * VM_PAGER_BAD is the backdoor for a pager to request * normal fault handling. */ vm_fault_restore_map_lock(fs); if (fs->map->timestamp != fs->map_generation) return (KERN_RESTART); return (KERN_NOT_RECEIVER); } if (rv != VM_PAGER_OK) return (KERN_FAILURE); /* AKA SIGSEGV */ /* Ensure that the driver is obeying the interface. */ MPASS(pager_first <= pager_last); MPASS(fs->first_pindex <= pager_last); MPASS(fs->first_pindex >= pager_first); MPASS(pager_last < fs->first_object->size); vm_fault_restore_map_lock(fs); if (fs->map->timestamp != fs->map_generation) { vm_fault_populate_cleanup(fs->first_object, pager_first, pager_last); return (KERN_RESTART); } /* * The map is unchanged after our last unlock. Process the fault. * * The range [pager_first, pager_last] that is given to the * pager is only a hint. The pager may populate any range * within the object that includes the requested page index. * In case the pager expanded the range, clip it to fit into * the map entry. */ map_first = OFF_TO_IDX(fs->entry->offset); if (map_first > pager_first) { vm_fault_populate_cleanup(fs->first_object, pager_first, map_first - 1); pager_first = map_first; } map_last = map_first + atop(fs->entry->end - fs->entry->start) - 1; if (map_last < pager_last) { vm_fault_populate_cleanup(fs->first_object, map_last + 1, pager_last); pager_last = map_last; } for (pidx = pager_first, m = vm_page_lookup(fs->first_object, pidx); pidx <= pager_last; pidx += npages, m = vm_page_next(&m[npages - 1])) { vaddr = fs->entry->start + IDX_TO_OFF(pidx) - fs->entry->offset; #if defined(__aarch64__) || defined(__amd64__) || (defined(__arm__) && \ __ARM_ARCH >= 6) || defined(__i386__) || defined(__riscv) psind = m->psind; if (psind > 0 && ((vaddr & (pagesizes[psind] - 1)) != 0 || pidx + OFF_TO_IDX(pagesizes[psind]) - 1 > pager_last || !pmap_ps_enabled(fs->map->pmap) || fs->wired)) psind = 0; #else psind = 0; #endif npages = atop(pagesizes[psind]); for (i = 0; i < npages; i++) { vm_fault_populate_check_page(&m[i]); vm_fault_dirty(fs, &m[i]); } VM_OBJECT_WUNLOCK(fs->first_object); rv = pmap_enter(fs->map->pmap, vaddr, m, fs->prot, fs->fault_type | (fs->wired ? PMAP_ENTER_WIRED : 0), psind); #if defined(__amd64__) if (psind > 0 && rv == KERN_FAILURE) { for (i = 0; i < npages; i++) { rv = pmap_enter(fs->map->pmap, vaddr + ptoa(i), &m[i], fs->prot, fs->fault_type | (fs->wired ? PMAP_ENTER_WIRED : 0), 0); MPASS(rv == KERN_SUCCESS); } } #else MPASS(rv == KERN_SUCCESS); #endif VM_OBJECT_WLOCK(fs->first_object); for (i = 0; i < npages; i++) { if ((fs->fault_flags & VM_FAULT_WIRE) != 0) vm_page_wire(&m[i]); else vm_page_activate(&m[i]); if (fs->m_hold != NULL && m[i].pindex == fs->first_pindex) { (*fs->m_hold) = &m[i]; vm_page_wire(&m[i]); } vm_page_xunbusy(&m[i]); } } curthread->td_ru.ru_majflt++; return (KERN_SUCCESS); } static int prot_fault_translation; SYSCTL_INT(_machdep, OID_AUTO, prot_fault_translation, CTLFLAG_RWTUN, &prot_fault_translation, 0, "Control signal to deliver on protection fault"); /* compat definition to keep common code for signal translation */ #define UCODE_PAGEFLT 12 #ifdef T_PAGEFLT _Static_assert(UCODE_PAGEFLT == T_PAGEFLT, "T_PAGEFLT"); #endif /* * vm_fault_trap: * * Handle a page fault occurring at the given address, * requiring the given permissions, in the map specified. * If successful, the page is inserted into the * associated physical map. * * NOTE: the given address should be truncated to the * proper page address. * * KERN_SUCCESS is returned if the page fault is handled; otherwise, * a standard error specifying why the fault is fatal is returned. * * The map in question must be referenced, and remains so. * Caller may hold no locks. */ int vm_fault_trap(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type, int fault_flags, int *signo, int *ucode) { int result; MPASS(signo == NULL || ucode != NULL); #ifdef KTRACE if (map != kernel_map && KTRPOINT(curthread, KTR_FAULT)) ktrfault(vaddr, fault_type); #endif result = vm_fault(map, trunc_page(vaddr), fault_type, fault_flags, NULL); KASSERT(result == KERN_SUCCESS || result == KERN_FAILURE || result == KERN_INVALID_ADDRESS || result == KERN_RESOURCE_SHORTAGE || result == KERN_PROTECTION_FAILURE || result == KERN_OUT_OF_BOUNDS, ("Unexpected Mach error %d from vm_fault()", result)); #ifdef KTRACE if (map != kernel_map && KTRPOINT(curthread, KTR_FAULTEND)) ktrfaultend(result); #endif if (result != KERN_SUCCESS && signo != NULL) { switch (result) { case KERN_FAILURE: case KERN_INVALID_ADDRESS: *signo = SIGSEGV; *ucode = SEGV_MAPERR; break; case KERN_RESOURCE_SHORTAGE: *signo = SIGBUS; *ucode = BUS_OOMERR; break; case KERN_OUT_OF_BOUNDS: *signo = SIGBUS; *ucode = BUS_OBJERR; break; case KERN_PROTECTION_FAILURE: if (prot_fault_translation == 0) { /* * Autodetect. This check also covers * the images without the ABI-tag ELF * note. */ if (SV_CURPROC_ABI() == SV_ABI_FREEBSD && curproc->p_osrel >= P_OSREL_SIGSEGV) { *signo = SIGSEGV; *ucode = SEGV_ACCERR; } else { *signo = SIGBUS; *ucode = UCODE_PAGEFLT; } } else if (prot_fault_translation == 1) { /* Always compat mode. */ *signo = SIGBUS; *ucode = UCODE_PAGEFLT; } else { /* Always SIGSEGV mode. */ *signo = SIGSEGV; *ucode = SEGV_ACCERR; } break; default: KASSERT(0, ("Unexpected Mach error %d from vm_fault()", result)); break; } } return (result); } static int vm_fault_lock_vnode(struct faultstate *fs, bool objlocked) { struct vnode *vp; int error, locked; if (fs->object->type != OBJT_VNODE) return (KERN_SUCCESS); vp = fs->object->handle; if (vp == fs->vp) { ASSERT_VOP_LOCKED(vp, "saved vnode is not locked"); return (KERN_SUCCESS); } /* * Perform an unlock in case the desired vnode changed while * the map was unlocked during a retry. */ unlock_vp(fs); locked = VOP_ISLOCKED(vp); if (locked != LK_EXCLUSIVE) locked = LK_SHARED; /* * We must not sleep acquiring the vnode lock while we have * the page exclusive busied or the object's * paging-in-progress count incremented. Otherwise, we could * deadlock. */ error = vget(vp, locked | LK_CANRECURSE | LK_NOWAIT, curthread); if (error == 0) { fs->vp = vp; return (KERN_SUCCESS); } vhold(vp); if (objlocked) unlock_and_deallocate(fs); else fault_deallocate(fs); error = vget(vp, locked | LK_RETRY | LK_CANRECURSE, curthread); vdrop(vp); fs->vp = vp; KASSERT(error == 0, ("vm_fault: vget failed %d", error)); return (KERN_RESOURCE_SHORTAGE); } /* * Calculate the desired readahead. Handle drop-behind. * * Returns the number of readahead blocks to pass to the pager. */ static int vm_fault_readahead(struct faultstate *fs) { int era, nera; u_char behavior; KASSERT(fs->lookup_still_valid, ("map unlocked")); era = fs->entry->read_ahead; behavior = vm_map_entry_behavior(fs->entry); if (behavior == MAP_ENTRY_BEHAV_RANDOM) { nera = 0; } else if (behavior == MAP_ENTRY_BEHAV_SEQUENTIAL) { nera = VM_FAULT_READ_AHEAD_MAX; if (fs->vaddr == fs->entry->next_read) vm_fault_dontneed(fs, fs->vaddr, nera); } else if (fs->vaddr == fs->entry->next_read) { /* * This is a sequential fault. Arithmetically * increase the requested number of pages in * the read-ahead window. The requested * number of pages is "# of sequential faults * x (read ahead min + 1) + read ahead min" */ nera = VM_FAULT_READ_AHEAD_MIN; if (era > 0) { nera += era + 1; if (nera > VM_FAULT_READ_AHEAD_MAX) nera = VM_FAULT_READ_AHEAD_MAX; } if (era == VM_FAULT_READ_AHEAD_MAX) vm_fault_dontneed(fs, fs->vaddr, nera); } else { /* * This is a non-sequential fault. */ nera = 0; } if (era != nera) { /* * A read lock on the map suffices to update * the read ahead count safely. */ fs->entry->read_ahead = nera; } return (nera); } static int vm_fault_lookup(struct faultstate *fs) { int result; KASSERT(!fs->lookup_still_valid, ("vm_fault_lookup: Map already locked.")); result = vm_map_lookup(&fs->map, fs->vaddr, fs->fault_type | VM_PROT_FAULT_LOOKUP, &fs->entry, &fs->first_object, &fs->first_pindex, &fs->prot, &fs->wired); if (result != KERN_SUCCESS) { unlock_vp(fs); return (result); } fs->map_generation = fs->map->timestamp; if (fs->entry->eflags & MAP_ENTRY_NOFAULT) { panic("%s: fault on nofault entry, addr: %#lx", __func__, (u_long)fs->vaddr); } if (fs->entry->eflags & MAP_ENTRY_IN_TRANSITION && fs->entry->wiring_thread != curthread) { vm_map_unlock_read(fs->map); vm_map_lock(fs->map); if (vm_map_lookup_entry(fs->map, fs->vaddr, &fs->entry) && (fs->entry->eflags & MAP_ENTRY_IN_TRANSITION)) { unlock_vp(fs); fs->entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP; vm_map_unlock_and_wait(fs->map, 0); } else vm_map_unlock(fs->map); return (KERN_RESOURCE_SHORTAGE); } MPASS((fs->entry->eflags & MAP_ENTRY_GUARD) == 0); if (fs->wired) fs->fault_type = fs->prot | (fs->fault_type & VM_PROT_COPY); else KASSERT((fs->fault_flags & VM_FAULT_WIRE) == 0, ("!fs->wired && VM_FAULT_WIRE")); fs->lookup_still_valid = true; return (KERN_SUCCESS); } static int vm_fault_relookup(struct faultstate *fs) { vm_object_t retry_object; vm_pindex_t retry_pindex; vm_prot_t retry_prot; int result; if (!vm_map_trylock_read(fs->map)) return (KERN_RESTART); fs->lookup_still_valid = true; if (fs->map->timestamp == fs->map_generation) return (KERN_SUCCESS); result = vm_map_lookup_locked(&fs->map, fs->vaddr, fs->fault_type, &fs->entry, &retry_object, &retry_pindex, &retry_prot, &fs->wired); if (result != KERN_SUCCESS) { /* * If retry of map lookup would have blocked then * retry fault from start. */ if (result == KERN_FAILURE) return (KERN_RESTART); return (result); } if (retry_object != fs->first_object || retry_pindex != fs->first_pindex) return (KERN_RESTART); /* * Check whether the protection has changed or the object has * been copied while we left the map unlocked. Changing from * read to write permission is OK - we leave the page * write-protected, and catch the write fault. Changing from * write to read permission means that we can't mark the page * write-enabled after all. */ fs->prot &= retry_prot; fs->fault_type &= retry_prot; if (fs->prot == 0) return (KERN_RESTART); /* Reassert because wired may have changed. */ KASSERT(fs->wired || (fs->fault_flags & VM_FAULT_WIRE) == 0, ("!wired && VM_FAULT_WIRE")); return (KERN_SUCCESS); } static void vm_fault_cow(struct faultstate *fs) { bool is_first_object_locked; /* * This allows pages to be virtually copied from a backing_object * into the first_object, where the backing object has no other * refs to it, and cannot gain any more refs. Instead of a bcopy, * we just move the page from the backing object to the first * object. Note that we must mark the page dirty in the first * object so that it will go out to swap when needed. */ is_first_object_locked = false; if ( /* * Only one shadow object and no other refs. */ fs->object->shadow_count == 1 && fs->object->ref_count == 1 && /* * No other ways to look the object up */ fs->object->handle == NULL && (fs->object->flags & OBJ_ANON) != 0 && /* * We don't chase down the shadow chain and we can acquire locks. */ (is_first_object_locked = VM_OBJECT_TRYWLOCK(fs->first_object)) && fs->object == fs->first_object->backing_object && VM_OBJECT_TRYWLOCK(fs->object)) { /* * Remove but keep xbusy for replace. fs->m is moved into * fs->first_object and left busy while fs->first_m is * conditionally freed. */ vm_page_remove_xbusy(fs->m); vm_page_replace(fs->m, fs->first_object, fs->first_pindex, fs->first_m); vm_page_dirty(fs->m); #if VM_NRESERVLEVEL > 0 /* * Rename the reservation. */ vm_reserv_rename(fs->m, fs->first_object, fs->object, OFF_TO_IDX(fs->first_object->backing_object_offset)); #endif VM_OBJECT_WUNLOCK(fs->object); VM_OBJECT_WUNLOCK(fs->first_object); fs->first_m = fs->m; fs->m = NULL; VM_CNT_INC(v_cow_optim); } else { if (is_first_object_locked) VM_OBJECT_WUNLOCK(fs->first_object); /* * Oh, well, lets copy it. */ pmap_copy_page(fs->m, fs->first_m); vm_page_valid(fs->first_m); if (fs->wired && (fs->fault_flags & VM_FAULT_WIRE) == 0) { vm_page_wire(fs->first_m); vm_page_unwire(fs->m, PQ_INACTIVE); } /* * Save the cow page to be released after * pmap_enter is complete. */ fs->m_cow = fs->m; fs->m = NULL; } /* * fs->object != fs->first_object due to above * conditional */ vm_object_pip_wakeup(fs->object); /* * Only use the new page below... */ fs->object = fs->first_object; fs->pindex = fs->first_pindex; fs->m = fs->first_m; VM_CNT_INC(v_cow_faults); curthread->td_cow++; } static bool vm_fault_next(struct faultstate *fs) { vm_object_t next_object; /* * The requested page does not exist at this object/ * offset. Remove the invalid page from the object, * waking up anyone waiting for it, and continue on to * the next object. However, if this is the top-level * object, we must leave the busy page in place to * prevent another process from rushing past us, and * inserting the page in that object at the same time * that we are. */ if (fs->object == fs->first_object) { fs->first_m = fs->m; fs->m = NULL; } else fault_page_free(&fs->m); /* * Move on to the next object. Lock the next object before * unlocking the current one. */ VM_OBJECT_ASSERT_WLOCKED(fs->object); next_object = fs->object->backing_object; if (next_object == NULL) return (false); MPASS(fs->first_m != NULL); KASSERT(fs->object != next_object, ("object loop %p", next_object)); VM_OBJECT_WLOCK(next_object); vm_object_pip_add(next_object, 1); if (fs->object != fs->first_object) vm_object_pip_wakeup(fs->object); fs->pindex += OFF_TO_IDX(fs->object->backing_object_offset); VM_OBJECT_WUNLOCK(fs->object); fs->object = next_object; return (true); } static void vm_fault_zerofill(struct faultstate *fs) { /* * If there's no object left, fill the page in the top * object with zeros. */ if (fs->object != fs->first_object) { vm_object_pip_wakeup(fs->object); fs->object = fs->first_object; fs->pindex = fs->first_pindex; } MPASS(fs->first_m != NULL); MPASS(fs->m == NULL); fs->m = fs->first_m; fs->first_m = NULL; /* * Zero the page if necessary and mark it valid. */ if ((fs->m->flags & PG_ZERO) == 0) { pmap_zero_page(fs->m); } else { VM_CNT_INC(v_ozfod); } VM_CNT_INC(v_zfod); vm_page_valid(fs->m); } /* * Allocate a page directly or via the object populate method. */ static int vm_fault_allocate(struct faultstate *fs) { struct domainset *dset; int alloc_req; int rv; if ((fs->object->flags & OBJ_SIZEVNLOCK) != 0) { rv = vm_fault_lock_vnode(fs, true); MPASS(rv == KERN_SUCCESS || rv == KERN_RESOURCE_SHORTAGE); if (rv == KERN_RESOURCE_SHORTAGE) return (rv); } if (fs->pindex >= fs->object->size) return (KERN_OUT_OF_BOUNDS); if (fs->object == fs->first_object && (fs->first_object->flags & OBJ_POPULATE) != 0 && fs->first_object->shadow_count == 0) { rv = vm_fault_populate(fs); switch (rv) { case KERN_SUCCESS: case KERN_FAILURE: case KERN_RESTART: return (rv); case KERN_NOT_RECEIVER: /* * Pager's populate() method * returned VM_PAGER_BAD. */ break; default: panic("inconsistent return codes"); } } /* * Allocate a new page for this object/offset pair. * * Unlocked read of the p_flag is harmless. At worst, the P_KILLED * might be not observed there, and allocation can fail, causing * restart and new reading of the p_flag. */ dset = fs->object->domain.dr_policy; if (dset == NULL) dset = curthread->td_domain.dr_policy; if (!vm_page_count_severe_set(&dset->ds_mask) || P_KILLED(curproc)) { #if VM_NRESERVLEVEL > 0 vm_object_color(fs->object, atop(fs->vaddr) - fs->pindex); #endif alloc_req = P_KILLED(curproc) ? VM_ALLOC_SYSTEM : VM_ALLOC_NORMAL; if (fs->object->type != OBJT_VNODE && fs->object->backing_object == NULL) alloc_req |= VM_ALLOC_ZERO; fs->m = vm_page_alloc(fs->object, fs->pindex, alloc_req); } if (fs->m == NULL) { unlock_and_deallocate(fs); if (vm_pfault_oom_attempts < 0 || fs->oom < vm_pfault_oom_attempts) { fs->oom++; vm_waitpfault(dset, vm_pfault_oom_wait * hz); } else { if (bootverbose) printf( "proc %d (%s) failed to alloc page on fault, starting OOM\n", curproc->p_pid, curproc->p_comm); vm_pageout_oom(VM_OOM_MEM_PF); fs->oom = 0; } return (KERN_RESOURCE_SHORTAGE); } fs->oom = 0; return (KERN_NOT_RECEIVER); } /* * Call the pager to retrieve the page if there is a chance * that the pager has it, and potentially retrieve additional * pages at the same time. */ static int vm_fault_getpages(struct faultstate *fs, int nera, int *behindp, int *aheadp) { vm_offset_t e_end, e_start; int ahead, behind, cluster_offset, rv; u_char behavior; /* * Prepare for unlocking the map. Save the map * entry's start and end addresses, which are used to * optimize the size of the pager operation below. * Even if the map entry's addresses change after * unlocking the map, using the saved addresses is * safe. */ e_start = fs->entry->start; e_end = fs->entry->end; behavior = vm_map_entry_behavior(fs->entry); /* * Release the map lock before locking the vnode or * sleeping in the pager. (If the current object has * a shadow, then an earlier iteration of this loop * may have already unlocked the map.) */ unlock_map(fs); rv = vm_fault_lock_vnode(fs, false); MPASS(rv == KERN_SUCCESS || rv == KERN_RESOURCE_SHORTAGE); if (rv == KERN_RESOURCE_SHORTAGE) return (rv); KASSERT(fs->vp == NULL || !fs->map->system_map, ("vm_fault: vnode-backed object mapped by system map")); /* * Page in the requested page and hint the pager, * that it may bring up surrounding pages. */ if (nera == -1 || behavior == MAP_ENTRY_BEHAV_RANDOM || P_KILLED(curproc)) { behind = 0; ahead = 0; } else { /* Is this a sequential fault? */ if (nera > 0) { behind = 0; ahead = nera; } else { /* * Request a cluster of pages that is * aligned to a VM_FAULT_READ_DEFAULT * page offset boundary within the * object. Alignment to a page offset * boundary is more likely to coincide * with the underlying file system * block than alignment to a virtual * address boundary. */ cluster_offset = fs->pindex % VM_FAULT_READ_DEFAULT; behind = ulmin(cluster_offset, atop(fs->vaddr - e_start)); ahead = VM_FAULT_READ_DEFAULT - 1 - cluster_offset; } ahead = ulmin(ahead, atop(e_end - fs->vaddr) - 1); } *behindp = behind; *aheadp = ahead; rv = vm_pager_get_pages(fs->object, &fs->m, 1, behindp, aheadp); if (rv == VM_PAGER_OK) return (KERN_SUCCESS); if (rv == VM_PAGER_ERROR) printf("vm_fault: pager read error, pid %d (%s)\n", curproc->p_pid, curproc->p_comm); /* * If an I/O error occurred or the requested page was * outside the range of the pager, clean up and return * an error. */ if (rv == VM_PAGER_ERROR || rv == VM_PAGER_BAD) return (KERN_OUT_OF_BOUNDS); return (KERN_NOT_RECEIVER); } /* * Wait/Retry if the page is busy. We have to do this if the page is * either exclusive or shared busy because the vm_pager may be using * read busy for pageouts (and even pageins if it is the vnode pager), * and we could end up trying to pagein and pageout the same page * simultaneously. * * We can theoretically allow the busy case on a read fault if the page * is marked valid, but since such pages are typically already pmap'd, * putting that special case in might be more effort then it is worth. * We cannot under any circumstances mess around with a shared busied * page except, perhaps, to pmap it. */ static void vm_fault_busy_sleep(struct faultstate *fs) { /* * Reference the page before unlocking and * sleeping so that the page daemon is less * likely to reclaim it. */ vm_page_aflag_set(fs->m, PGA_REFERENCED); if (fs->object != fs->first_object) { fault_page_release(&fs->first_m); vm_object_pip_wakeup(fs->first_object); } vm_object_pip_wakeup(fs->object); unlock_map(fs); if (fs->m == vm_page_lookup(fs->object, fs->pindex)) vm_page_busy_sleep(fs->m, "vmpfw", false); else VM_OBJECT_WUNLOCK(fs->object); VM_CNT_INC(v_intrans); vm_object_deallocate(fs->first_object); } int vm_fault(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type, int fault_flags, vm_page_t *m_hold) { struct faultstate fs; int ahead, behind, faultcount; int nera, result, rv; bool dead, hardfault; VM_CNT_INC(v_vm_faults); if ((curthread->td_pflags & TDP_NOFAULTING) != 0) return (KERN_PROTECTION_FAILURE); fs.vp = NULL; fs.vaddr = vaddr; fs.m_hold = m_hold; fs.fault_flags = fault_flags; fs.map = map; fs.lookup_still_valid = false; fs.oom = 0; faultcount = 0; nera = -1; hardfault = false; RetryFault: fs.fault_type = fault_type; /* * Find the backing store object and offset into it to begin the * search. */ result = vm_fault_lookup(&fs); if (result != KERN_SUCCESS) { if (result == KERN_RESOURCE_SHORTAGE) goto RetryFault; return (result); } /* * Try to avoid lock contention on the top-level object through * special-case handling of some types of page faults, specifically, * those that are mapping an existing page from the top-level object. * Under this condition, a read lock on the object suffices, allowing * multiple page faults of a similar type to run in parallel. */ if (fs.vp == NULL /* avoid locked vnode leak */ && (fs.fault_flags & (VM_FAULT_WIRE | VM_FAULT_DIRTY)) == 0) { VM_OBJECT_RLOCK(fs.first_object); rv = vm_fault_soft_fast(&fs); if (rv == KERN_SUCCESS) return (rv); if (!VM_OBJECT_TRYUPGRADE(fs.first_object)) { VM_OBJECT_RUNLOCK(fs.first_object); VM_OBJECT_WLOCK(fs.first_object); } } else { VM_OBJECT_WLOCK(fs.first_object); } /* * Make a reference to this object to prevent its disposal while we * are messing with it. Once we have the reference, the map is free * to be diddled. Since objects reference their shadows (and copies), * they will stay around as well. * * Bump the paging-in-progress count to prevent size changes (e.g. * truncation operations) during I/O. */ vm_object_reference_locked(fs.first_object); vm_object_pip_add(fs.first_object, 1); fs.m_cow = fs.m = fs.first_m = NULL; /* * Search for the page at object/offset. */ fs.object = fs.first_object; fs.pindex = fs.first_pindex; while (TRUE) { KASSERT(fs.m == NULL, ("page still set %p at loop start", fs.m)); /* * If the object is marked for imminent termination, * we retry here, since the collapse pass has raced * with us. Otherwise, if we see terminally dead * object, return fail. */ if ((fs.object->flags & OBJ_DEAD) != 0) { dead = fs.object->type == OBJT_DEAD; unlock_and_deallocate(&fs); if (dead) return (KERN_PROTECTION_FAILURE); pause("vmf_de", 1); goto RetryFault; } /* * See if page is resident */ fs.m = vm_page_lookup(fs.object, fs.pindex); if (fs.m != NULL) { if (vm_page_tryxbusy(fs.m) == 0) { vm_fault_busy_sleep(&fs); goto RetryFault; } /* * The page is marked busy for other processes and the * pagedaemon. If it still is completely valid we * are done. */ if (vm_page_all_valid(fs.m)) { VM_OBJECT_WUNLOCK(fs.object); break; /* break to PAGE HAS BEEN FOUND. */ } } VM_OBJECT_ASSERT_WLOCKED(fs.object); /* * Page is not resident. If the pager might contain the page * or this is the beginning of the search, allocate a new * page. (Default objects are zero-fill, so there is no real * pager for them.) */ if (fs.m == NULL && (fs.object->type != OBJT_DEFAULT || fs.object == fs.first_object)) { rv = vm_fault_allocate(&fs); switch (rv) { case KERN_RESTART: unlock_and_deallocate(&fs); /* FALLTHROUGH */ case KERN_RESOURCE_SHORTAGE: goto RetryFault; case KERN_SUCCESS: case KERN_FAILURE: case KERN_OUT_OF_BOUNDS: unlock_and_deallocate(&fs); return (rv); case KERN_NOT_RECEIVER: break; default: panic("vm_fault: Unhandled rv %d", rv); } } /* * Default objects have no pager so no exclusive busy exists * to protect this page in the chain. Skip to the next * object without dropping the lock to preserve atomicity of * shadow faults. */ if (fs.object->type != OBJT_DEFAULT) { /* * At this point, we have either allocated a new page * or found an existing page that is only partially * valid. * * We hold a reference on the current object and the * page is exclusive busied. The exclusive busy * prevents simultaneous faults and collapses while * the object lock is dropped. */ VM_OBJECT_WUNLOCK(fs.object); /* * If the pager for the current object might have * the page, then determine the number of additional * pages to read and potentially reprioritize * previously read pages for earlier reclamation. * These operations should only be performed once per * page fault. Even if the current pager doesn't * have the page, the number of additional pages to * read will apply to subsequent objects in the * shadow chain. */ if (nera == -1 && !P_KILLED(curproc)) nera = vm_fault_readahead(&fs); rv = vm_fault_getpages(&fs, nera, &behind, &ahead); if (rv == KERN_SUCCESS) { faultcount = behind + 1 + ahead; hardfault = true; break; /* break to PAGE HAS BEEN FOUND. */ } if (rv == KERN_RESOURCE_SHORTAGE) goto RetryFault; VM_OBJECT_WLOCK(fs.object); if (rv == KERN_OUT_OF_BOUNDS) { fault_page_free(&fs.m); unlock_and_deallocate(&fs); return (rv); } } /* * The page was not found in the current object. Try to * traverse into a backing object or zero fill if none is * found. */ if (vm_fault_next(&fs)) continue; VM_OBJECT_WUNLOCK(fs.object); vm_fault_zerofill(&fs); /* Don't try to prefault neighboring pages. */ faultcount = 1; break; /* break to PAGE HAS BEEN FOUND. */ } /* * PAGE HAS BEEN FOUND. A valid page has been found and exclusively * busied. The object lock must no longer be held. */ vm_page_assert_xbusied(fs.m); VM_OBJECT_ASSERT_UNLOCKED(fs.object); /* * If the page is being written, but isn't already owned by the * top-level object, we have to copy it into a new page owned by the * top-level object. */ if (fs.object != fs.first_object) { /* * We only really need to copy if we want to write it. */ if ((fs.fault_type & (VM_PROT_COPY | VM_PROT_WRITE)) != 0) { vm_fault_cow(&fs); /* * We only try to prefault read-only mappings to the * neighboring pages when this copy-on-write fault is * a hard fault. In other cases, trying to prefault * is typically wasted effort. */ if (faultcount == 0) faultcount = 1; } else { fs.prot &= ~VM_PROT_WRITE; } } /* * We must verify that the maps have not changed since our last * lookup. */ if (!fs.lookup_still_valid) { result = vm_fault_relookup(&fs); if (result != KERN_SUCCESS) { fault_deallocate(&fs); if (result == KERN_RESTART) goto RetryFault; return (result); } } VM_OBJECT_ASSERT_UNLOCKED(fs.object); /* * If the page was filled by a pager, save the virtual address that * should be faulted on next under a sequential access pattern to the * map entry. A read lock on the map suffices to update this address * safely. */ if (hardfault) fs.entry->next_read = vaddr + ptoa(ahead) + PAGE_SIZE; /* * Page must be completely valid or it is not fit to * map into user space. vm_pager_get_pages() ensures this. */ vm_page_assert_xbusied(fs.m); KASSERT(vm_page_all_valid(fs.m), ("vm_fault: page %p partially invalid", fs.m)); vm_fault_dirty(&fs, fs.m); /* * Put this page into the physical map. We had to do the unlock above * because pmap_enter() may sleep. We don't put the page * back on the active queue until later so that the pageout daemon * won't find it (yet). */ pmap_enter(fs.map->pmap, vaddr, fs.m, fs.prot, fs.fault_type | (fs.wired ? PMAP_ENTER_WIRED : 0), 0); if (faultcount != 1 && (fs.fault_flags & VM_FAULT_WIRE) == 0 && fs.wired == 0) vm_fault_prefault(&fs, vaddr, faultcount > 0 ? behind : PFBAK, faultcount > 0 ? ahead : PFFOR, false); /* * If the page is not wired down, then put it where the pageout daemon * can find it. */ if ((fs.fault_flags & VM_FAULT_WIRE) != 0) vm_page_wire(fs.m); else vm_page_activate(fs.m); if (fs.m_hold != NULL) { (*fs.m_hold) = fs.m; vm_page_wire(fs.m); } vm_page_xunbusy(fs.m); fs.m = NULL; /* * Unlock everything, and return */ fault_deallocate(&fs); if (hardfault) { VM_CNT_INC(v_io_faults); curthread->td_ru.ru_majflt++; #ifdef RACCT if (racct_enable && fs.object->type == OBJT_VNODE) { PROC_LOCK(curproc); if ((fs.fault_type & (VM_PROT_COPY | VM_PROT_WRITE)) != 0) { racct_add_force(curproc, RACCT_WRITEBPS, PAGE_SIZE + behind * PAGE_SIZE); racct_add_force(curproc, RACCT_WRITEIOPS, 1); } else { racct_add_force(curproc, RACCT_READBPS, PAGE_SIZE + ahead * PAGE_SIZE); racct_add_force(curproc, RACCT_READIOPS, 1); } PROC_UNLOCK(curproc); } #endif } else curthread->td_ru.ru_minflt++; return (KERN_SUCCESS); } /* * Speed up the reclamation of pages that precede the faulting pindex within * the first object of the shadow chain. Essentially, perform the equivalent * to madvise(..., MADV_DONTNEED) on a large cluster of pages that precedes * the faulting pindex by the cluster size when the pages read by vm_fault() * cross a cluster-size boundary. The cluster size is the greater of the * smallest superpage size and VM_FAULT_DONTNEED_MIN. * * When "fs->first_object" is a shadow object, the pages in the backing object * that precede the faulting pindex are deactivated by vm_fault(). So, this * function must only be concerned with pages in the first object. */ static void vm_fault_dontneed(const struct faultstate *fs, vm_offset_t vaddr, int ahead) { vm_map_entry_t entry; vm_object_t first_object, object; vm_offset_t end, start; vm_page_t m, m_next; vm_pindex_t pend, pstart; vm_size_t size; object = fs->object; VM_OBJECT_ASSERT_UNLOCKED(object); first_object = fs->first_object; /* Neither fictitious nor unmanaged pages can be reclaimed. */ if ((first_object->flags & (OBJ_FICTITIOUS | OBJ_UNMANAGED)) == 0) { VM_OBJECT_RLOCK(first_object); size = VM_FAULT_DONTNEED_MIN; if (MAXPAGESIZES > 1 && size < pagesizes[1]) size = pagesizes[1]; end = rounddown2(vaddr, size); if (vaddr - end >= size - PAGE_SIZE - ptoa(ahead) && (entry = fs->entry)->start < end) { if (end - entry->start < size) start = entry->start; else start = end - size; pmap_advise(fs->map->pmap, start, end, MADV_DONTNEED); pstart = OFF_TO_IDX(entry->offset) + atop(start - entry->start); m_next = vm_page_find_least(first_object, pstart); pend = OFF_TO_IDX(entry->offset) + atop(end - entry->start); while ((m = m_next) != NULL && m->pindex < pend) { m_next = TAILQ_NEXT(m, listq); if (!vm_page_all_valid(m) || vm_page_busied(m)) continue; /* * Don't clear PGA_REFERENCED, since it would * likely represent a reference by a different * process. * * Typically, at this point, prefetched pages * are still in the inactive queue. Only * pages that triggered page faults are in the * active queue. The test for whether the page * is in the inactive queue is racy; in the * worst case we will requeue the page * unnecessarily. */ if (!vm_page_inactive(m)) vm_page_deactivate(m); } } VM_OBJECT_RUNLOCK(first_object); } } /* * vm_fault_prefault provides a quick way of clustering * pagefaults into a processes address space. It is a "cousin" * of vm_map_pmap_enter, except it runs at page fault time instead * of mmap time. */ static void vm_fault_prefault(const struct faultstate *fs, vm_offset_t addra, int backward, int forward, bool obj_locked) { pmap_t pmap; vm_map_entry_t entry; vm_object_t backing_object, lobject; vm_offset_t addr, starta; vm_pindex_t pindex; vm_page_t m; int i; pmap = fs->map->pmap; if (pmap != vmspace_pmap(curthread->td_proc->p_vmspace)) return; entry = fs->entry; if (addra < backward * PAGE_SIZE) { starta = entry->start; } else { starta = addra - backward * PAGE_SIZE; if (starta < entry->start) starta = entry->start; } /* * Generate the sequence of virtual addresses that are candidates for * prefaulting in an outward spiral from the faulting virtual address, * "addra". Specifically, the sequence is "addra - PAGE_SIZE", "addra * + PAGE_SIZE", "addra - 2 * PAGE_SIZE", "addra + 2 * PAGE_SIZE", ... * If the candidate address doesn't have a backing physical page, then * the loop immediately terminates. */ for (i = 0; i < 2 * imax(backward, forward); i++) { addr = addra + ((i >> 1) + 1) * ((i & 1) == 0 ? -PAGE_SIZE : PAGE_SIZE); if (addr > addra + forward * PAGE_SIZE) addr = 0; if (addr < starta || addr >= entry->end) continue; if (!pmap_is_prefaultable(pmap, addr)) continue; pindex = ((addr - entry->start) + entry->offset) >> PAGE_SHIFT; lobject = entry->object.vm_object; if (!obj_locked) VM_OBJECT_RLOCK(lobject); while ((m = vm_page_lookup(lobject, pindex)) == NULL && lobject->type == OBJT_DEFAULT && (backing_object = lobject->backing_object) != NULL) { KASSERT((lobject->backing_object_offset & PAGE_MASK) == 0, ("vm_fault_prefault: unaligned object offset")); pindex += lobject->backing_object_offset >> PAGE_SHIFT; VM_OBJECT_RLOCK(backing_object); if (!obj_locked || lobject != entry->object.vm_object) VM_OBJECT_RUNLOCK(lobject); lobject = backing_object; } if (m == NULL) { if (!obj_locked || lobject != entry->object.vm_object) VM_OBJECT_RUNLOCK(lobject); break; } if (vm_page_all_valid(m) && (m->flags & PG_FICTITIOUS) == 0) pmap_enter_quick(pmap, addr, m, entry->protection); if (!obj_locked || lobject != entry->object.vm_object) VM_OBJECT_RUNLOCK(lobject); } } /* * Hold each of the physical pages that are mapped by the specified range of * virtual addresses, ["addr", "addr" + "len"), if those mappings are valid * and allow the specified types of access, "prot". If all of the implied * pages are successfully held, then the number of held pages is returned * together with pointers to those pages in the array "ma". However, if any * of the pages cannot be held, -1 is returned. */ int vm_fault_quick_hold_pages(vm_map_t map, vm_offset_t addr, vm_size_t len, vm_prot_t prot, vm_page_t *ma, int max_count) { vm_offset_t end, va; vm_page_t *mp; int count; boolean_t pmap_failed; if (len == 0) return (0); end = round_page(addr + len); addr = trunc_page(addr); /* * Check for illegal addresses. */ if (addr < vm_map_min(map) || addr > end || end > vm_map_max(map)) return (-1); if (atop(end - addr) > max_count) panic("vm_fault_quick_hold_pages: count > max_count"); count = atop(end - addr); /* * Most likely, the physical pages are resident in the pmap, so it is * faster to try pmap_extract_and_hold() first. */ pmap_failed = FALSE; for (mp = ma, va = addr; va < end; mp++, va += PAGE_SIZE) { *mp = pmap_extract_and_hold(map->pmap, va, prot); if (*mp == NULL) pmap_failed = TRUE; else if ((prot & VM_PROT_WRITE) != 0 && (*mp)->dirty != VM_PAGE_BITS_ALL) { /* * Explicitly dirty the physical page. Otherwise, the * caller's changes may go unnoticed because they are * performed through an unmanaged mapping or by a DMA * operation. * * The object lock is not held here. * See vm_page_clear_dirty_mask(). */ vm_page_dirty(*mp); } } if (pmap_failed) { /* * One or more pages could not be held by the pmap. Either no * page was mapped at the specified virtual address or that * mapping had insufficient permissions. Attempt to fault in * and hold these pages. * * If vm_fault_disable_pagefaults() was called, * i.e., TDP_NOFAULTING is set, we must not sleep nor * acquire MD VM locks, which means we must not call * vm_fault(). Some (out of tree) callers mark * too wide a code area with vm_fault_disable_pagefaults() * already, use the VM_PROT_QUICK_NOFAULT flag to request * the proper behaviour explicitly. */ if ((prot & VM_PROT_QUICK_NOFAULT) != 0 && (curthread->td_pflags & TDP_NOFAULTING) != 0) goto error; for (mp = ma, va = addr; va < end; mp++, va += PAGE_SIZE) if (*mp == NULL && vm_fault(map, va, prot, VM_FAULT_NORMAL, mp) != KERN_SUCCESS) goto error; } return (count); error: for (mp = ma; mp < ma + count; mp++) if (*mp != NULL) vm_page_unwire(*mp, PQ_INACTIVE); return (-1); } /* * Routine: * vm_fault_copy_entry * Function: * Create new shadow object backing dst_entry with private copy of * all underlying pages. When src_entry is equal to dst_entry, * function implements COW for wired-down map entry. Otherwise, * it forks wired entry into dst_map. * * In/out conditions: * The source and destination maps must be locked for write. * The source map entry must be wired down (or be a sharing map * entry corresponding to a main map entry that is wired down). */ void vm_fault_copy_entry(vm_map_t dst_map, vm_map_t src_map, vm_map_entry_t dst_entry, vm_map_entry_t src_entry, vm_ooffset_t *fork_charge) { vm_object_t backing_object, dst_object, object, src_object; vm_pindex_t dst_pindex, pindex, src_pindex; vm_prot_t access, prot; vm_offset_t vaddr; vm_page_t dst_m; vm_page_t src_m; boolean_t upgrade; #ifdef lint src_map++; #endif /* lint */ upgrade = src_entry == dst_entry; access = prot = dst_entry->protection; src_object = src_entry->object.vm_object; src_pindex = OFF_TO_IDX(src_entry->offset); if (upgrade && (dst_entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0) { dst_object = src_object; vm_object_reference(dst_object); } else { /* * Create the top-level object for the destination entry. * Doesn't actually shadow anything - we copy the pages * directly. */ dst_object = vm_object_allocate_anon(atop(dst_entry->end - dst_entry->start), NULL, NULL, 0); #if VM_NRESERVLEVEL > 0 dst_object->flags |= OBJ_COLORED; dst_object->pg_color = atop(dst_entry->start); #endif dst_object->domain = src_object->domain; dst_object->charge = dst_entry->end - dst_entry->start; } VM_OBJECT_WLOCK(dst_object); KASSERT(upgrade || dst_entry->object.vm_object == NULL, ("vm_fault_copy_entry: vm_object not NULL")); if (src_object != dst_object) { dst_entry->object.vm_object = dst_object; dst_entry->offset = 0; dst_entry->eflags &= ~MAP_ENTRY_VN_EXEC; } if (fork_charge != NULL) { KASSERT(dst_entry->cred == NULL, ("vm_fault_copy_entry: leaked swp charge")); dst_object->cred = curthread->td_ucred; crhold(dst_object->cred); *fork_charge += dst_object->charge; } else if ((dst_object->type == OBJT_DEFAULT || dst_object->type == OBJT_SWAP) && dst_object->cred == NULL) { KASSERT(dst_entry->cred != NULL, ("no cred for entry %p", dst_entry)); dst_object->cred = dst_entry->cred; dst_entry->cred = NULL; } /* * If not an upgrade, then enter the mappings in the pmap as * read and/or execute accesses. Otherwise, enter them as * write accesses. * * A writeable large page mapping is only created if all of * the constituent small page mappings are modified. Marking * PTEs as modified on inception allows promotion to happen * without taking potentially large number of soft faults. */ if (!upgrade) access &= ~VM_PROT_WRITE; /* * Loop through all of the virtual pages within the entry's * range, copying each page from the source object to the * destination object. Since the source is wired, those pages * must exist. In contrast, the destination is pageable. * Since the destination object doesn't share any backing storage * with the source object, all of its pages must be dirtied, * regardless of whether they can be written. */ for (vaddr = dst_entry->start, dst_pindex = 0; vaddr < dst_entry->end; vaddr += PAGE_SIZE, dst_pindex++) { again: /* * Find the page in the source object, and copy it in. * Because the source is wired down, the page will be * in memory. */ if (src_object != dst_object) VM_OBJECT_RLOCK(src_object); object = src_object; pindex = src_pindex + dst_pindex; while ((src_m = vm_page_lookup(object, pindex)) == NULL && (backing_object = object->backing_object) != NULL) { /* * Unless the source mapping is read-only or * it is presently being upgraded from * read-only, the first object in the shadow * chain should provide all of the pages. In * other words, this loop body should never be * executed when the source mapping is already * read/write. */ KASSERT((src_entry->protection & VM_PROT_WRITE) == 0 || upgrade, ("vm_fault_copy_entry: main object missing page")); VM_OBJECT_RLOCK(backing_object); pindex += OFF_TO_IDX(object->backing_object_offset); if (object != dst_object) VM_OBJECT_RUNLOCK(object); object = backing_object; } KASSERT(src_m != NULL, ("vm_fault_copy_entry: page missing")); if (object != dst_object) { /* * Allocate a page in the destination object. */ dst_m = vm_page_alloc(dst_object, (src_object == dst_object ? src_pindex : 0) + dst_pindex, VM_ALLOC_NORMAL); if (dst_m == NULL) { VM_OBJECT_WUNLOCK(dst_object); VM_OBJECT_RUNLOCK(object); vm_wait(dst_object); VM_OBJECT_WLOCK(dst_object); goto again; } pmap_copy_page(src_m, dst_m); VM_OBJECT_RUNLOCK(object); dst_m->dirty = dst_m->valid = src_m->valid; } else { dst_m = src_m; if (vm_page_busy_acquire(dst_m, VM_ALLOC_WAITFAIL) == 0) goto again; if (dst_m->pindex >= dst_object->size) { /* * We are upgrading. Index can occur * out of bounds if the object type is * vnode and the file was truncated. */ vm_page_xunbusy(dst_m); break; } } VM_OBJECT_WUNLOCK(dst_object); /* * Enter it in the pmap. If a wired, copy-on-write * mapping is being replaced by a write-enabled * mapping, then wire that new mapping. * * The page can be invalid if the user called * msync(MS_INVALIDATE) or truncated the backing vnode * or shared memory object. In this case, do not * insert it into pmap, but still do the copy so that * all copies of the wired map entry have similar * backing pages. */ if (vm_page_all_valid(dst_m)) { pmap_enter(dst_map->pmap, vaddr, dst_m, prot, access | (upgrade ? PMAP_ENTER_WIRED : 0), 0); } /* * Mark it no longer busy, and put it on the active list. */ VM_OBJECT_WLOCK(dst_object); if (upgrade) { if (src_m != dst_m) { vm_page_unwire(src_m, PQ_INACTIVE); vm_page_wire(dst_m); } else { KASSERT(vm_page_wired(dst_m), ("dst_m %p is not wired", dst_m)); } } else { vm_page_activate(dst_m); } vm_page_xunbusy(dst_m); } VM_OBJECT_WUNLOCK(dst_object); if (upgrade) { dst_entry->eflags &= ~(MAP_ENTRY_COW | MAP_ENTRY_NEEDS_COPY); vm_object_deallocate(src_object); } } /* * Block entry into the machine-independent layer's page fault handler by * the calling thread. Subsequent calls to vm_fault() by that thread will * return KERN_PROTECTION_FAILURE. Enable machine-dependent handling of * spurious page faults. */ int vm_fault_disable_pagefaults(void) { return (curthread_pflags_set(TDP_NOFAULTING | TDP_RESETSPUR)); } void vm_fault_enable_pagefaults(int save) { curthread_pflags_restore(save); }