Index: head/sys/amd64/amd64/machdep.c =================================================================== --- head/sys/amd64/amd64/machdep.c (revision 366710) +++ head/sys/amd64/amd64/machdep.c (revision 366711) @@ -1,2825 +1,2826 @@ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (c) 2003 Peter Wemm. * Copyright (c) 1992 Terrence R. Lambert. * Copyright (c) 1982, 1987, 1990 The Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91 */ #include __FBSDID("$FreeBSD$"); #include "opt_atpic.h" #include "opt_cpu.h" #include "opt_ddb.h" #include "opt_inet.h" #include "opt_isa.h" #include "opt_kstack_pages.h" #include "opt_maxmem.h" #include "opt_mp_watchdog.h" #include "opt_pci.h" #include "opt_platform.h" #include "opt_sched.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SMP #include #endif #include #include #include #include #include #include #include +#include #include #include #include #include #include #include -#include #include +#include #ifdef DDB #ifndef KDB #error KDB must be enabled in order for DDB to work! #endif #include #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SMP #include #endif #ifdef FDT #include #endif #ifdef DEV_ATPIC #include #else #include #endif #include #include #include /* Sanity check for __curthread() */ CTASSERT(offsetof(struct pcpu, pc_curthread) == 0); /* * The PTI trampoline stack needs enough space for a hardware trapframe and a * couple of scratch registers, as well as the trapframe left behind after an * iret fault. */ CTASSERT(PC_PTI_STACK_SZ * sizeof(register_t) >= 2 * sizeof(struct pti_frame) - offsetof(struct pti_frame, pti_rip)); extern u_int64_t hammer_time(u_int64_t, u_int64_t); #define CS_SECURE(cs) (ISPL(cs) == SEL_UPL) #define EFL_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0) static void cpu_startup(void *); static void get_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpusave, size_t xfpusave_len); static int set_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpustate, size_t xfpustate_len); SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL); /* Preload data parse function */ static caddr_t native_parse_preload_data(u_int64_t); /* Native function to fetch and parse the e820 map */ static void native_parse_memmap(caddr_t, vm_paddr_t *, int *); /* Default init_ops implementation. */ struct init_ops init_ops = { .parse_preload_data = native_parse_preload_data, .early_clock_source_init = i8254_init, .early_delay = i8254_delay, .parse_memmap = native_parse_memmap, #ifdef SMP .mp_bootaddress = mp_bootaddress, .start_all_aps = native_start_all_aps, #endif #ifdef DEV_PCI .msi_init = msi_init, #endif }; /* * Physical address of the EFI System Table. Stashed from the metadata hints * passed into the kernel and used by the EFI code to call runtime services. */ vm_paddr_t efi_systbl_phys; /* Intel ICH registers */ #define ICH_PMBASE 0x400 #define ICH_SMI_EN ICH_PMBASE + 0x30 int _udatasel, _ucodesel, _ucode32sel, _ufssel, _ugssel; int cold = 1; long Maxmem = 0; long realmem = 0; struct kva_md_info kmi; static struct trapframe proc0_tf; struct region_descriptor r_idt; struct pcpu *__pcpu; struct pcpu temp_bsp_pcpu; struct mtx icu_lock; struct mem_range_softc mem_range_softc; struct mtx dt_lock; /* lock for GDT and LDT */ void (*vmm_resume_p)(void); static void cpu_startup(dummy) void *dummy; { uintmax_t memsize; char *sysenv; /* * On MacBooks, we need to disallow the legacy USB circuit to * generate an SMI# because this can cause several problems, * namely: incorrect CPU frequency detection and failure to * start the APs. * We do this by disabling a bit in the SMI_EN (SMI Control and * Enable register) of the Intel ICH LPC Interface Bridge. */ sysenv = kern_getenv("smbios.system.product"); if (sysenv != NULL) { if (strncmp(sysenv, "MacBook1,1", 10) == 0 || strncmp(sysenv, "MacBook3,1", 10) == 0 || strncmp(sysenv, "MacBook4,1", 10) == 0 || strncmp(sysenv, "MacBookPro1,1", 13) == 0 || strncmp(sysenv, "MacBookPro1,2", 13) == 0 || strncmp(sysenv, "MacBookPro3,1", 13) == 0 || strncmp(sysenv, "MacBookPro4,1", 13) == 0 || strncmp(sysenv, "Macmini1,1", 10) == 0) { if (bootverbose) printf("Disabling LEGACY_USB_EN bit on " "Intel ICH.\n"); outl(ICH_SMI_EN, inl(ICH_SMI_EN) & ~0x8); } freeenv(sysenv); } /* * Good {morning,afternoon,evening,night}. */ startrtclock(); printcpuinfo(); /* * Display physical memory if SMBIOS reports reasonable amount. */ memsize = 0; sysenv = kern_getenv("smbios.memory.enabled"); if (sysenv != NULL) { memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10; freeenv(sysenv); } if (memsize < ptoa((uintmax_t)vm_free_count())) memsize = ptoa((uintmax_t)Maxmem); printf("real memory = %ju (%ju MB)\n", memsize, memsize >> 20); realmem = atop(memsize); /* * Display any holes after the first chunk of extended memory. */ if (bootverbose) { int indx; printf("Physical memory chunk(s):\n"); for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) { vm_paddr_t size; size = phys_avail[indx + 1] - phys_avail[indx]; printf( "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n", (uintmax_t)phys_avail[indx], (uintmax_t)phys_avail[indx + 1] - 1, (uintmax_t)size, (uintmax_t)size / PAGE_SIZE); } } vm_ksubmap_init(&kmi); printf("avail memory = %ju (%ju MB)\n", ptoa((uintmax_t)vm_free_count()), ptoa((uintmax_t)vm_free_count()) / 1048576); #ifdef DEV_PCI if (bootverbose && intel_graphics_stolen_base != 0) printf("intel stolen mem: base %#jx size %ju MB\n", (uintmax_t)intel_graphics_stolen_base, (uintmax_t)intel_graphics_stolen_size / 1024 / 1024); #endif /* * Set up buffers, so they can be used to read disk labels. */ bufinit(); vm_pager_bufferinit(); cpu_setregs(); } static void late_ifunc_resolve(void *dummy __unused) { link_elf_late_ireloc(); } SYSINIT(late_ifunc_resolve, SI_SUB_CPU, SI_ORDER_ANY, late_ifunc_resolve, NULL); /* * Send an interrupt to process. * * Stack is set up to allow sigcode stored * at top to call routine, followed by call * to sigreturn routine below. After sigreturn * resets the signal mask, the stack, and the * frame pointer, it returns to the user * specified pc, psl. */ void sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) { struct sigframe sf, *sfp; struct pcb *pcb; struct proc *p; struct thread *td; struct sigacts *psp; char *sp; struct trapframe *regs; char *xfpusave; size_t xfpusave_len; int sig; int oonstack; td = curthread; pcb = td->td_pcb; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); sig = ksi->ksi_signo; psp = p->p_sigacts; mtx_assert(&psp->ps_mtx, MA_OWNED); regs = td->td_frame; oonstack = sigonstack(regs->tf_rsp); if (cpu_max_ext_state_size > sizeof(struct savefpu) && use_xsave) { xfpusave_len = cpu_max_ext_state_size - sizeof(struct savefpu); xfpusave = __builtin_alloca(xfpusave_len); } else { xfpusave_len = 0; xfpusave = NULL; } /* Save user context. */ bzero(&sf, sizeof(sf)); sf.sf_uc.uc_sigmask = *mask; sf.sf_uc.uc_stack = td->td_sigstk; sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE; sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0; bcopy(regs, &sf.sf_uc.uc_mcontext.mc_rdi, sizeof(*regs)); sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); /* magic */ get_fpcontext(td, &sf.sf_uc.uc_mcontext, xfpusave, xfpusave_len); fpstate_drop(td); update_pcb_bases(pcb); sf.sf_uc.uc_mcontext.mc_fsbase = pcb->pcb_fsbase; sf.sf_uc.uc_mcontext.mc_gsbase = pcb->pcb_gsbase; bzero(sf.sf_uc.uc_mcontext.mc_spare, sizeof(sf.sf_uc.uc_mcontext.mc_spare)); /* Allocate space for the signal handler context. */ if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { sp = (char *)td->td_sigstk.ss_sp + td->td_sigstk.ss_size; #if defined(COMPAT_43) td->td_sigstk.ss_flags |= SS_ONSTACK; #endif } else sp = (char *)regs->tf_rsp - 128; if (xfpusave != NULL) { sp -= xfpusave_len; sp = (char *)((unsigned long)sp & ~0x3Ful); sf.sf_uc.uc_mcontext.mc_xfpustate = (register_t)sp; } sp -= sizeof(struct sigframe); /* Align to 16 bytes. */ sfp = (struct sigframe *)((unsigned long)sp & ~0xFul); /* Build the argument list for the signal handler. */ regs->tf_rdi = sig; /* arg 1 in %rdi */ regs->tf_rdx = (register_t)&sfp->sf_uc; /* arg 3 in %rdx */ bzero(&sf.sf_si, sizeof(sf.sf_si)); if (SIGISMEMBER(psp->ps_siginfo, sig)) { /* Signal handler installed with SA_SIGINFO. */ regs->tf_rsi = (register_t)&sfp->sf_si; /* arg 2 in %rsi */ sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher; /* Fill in POSIX parts */ sf.sf_si = ksi->ksi_info; sf.sf_si.si_signo = sig; /* maybe a translated signal */ regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */ } else { /* Old FreeBSD-style arguments. */ regs->tf_rsi = ksi->ksi_code; /* arg 2 in %rsi */ regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */ sf.sf_ahu.sf_handler = catcher; } mtx_unlock(&psp->ps_mtx); PROC_UNLOCK(p); /* * Copy the sigframe out to the user's stack. */ if (copyout(&sf, sfp, sizeof(*sfp)) != 0 || (xfpusave != NULL && copyout(xfpusave, (void *)sf.sf_uc.uc_mcontext.mc_xfpustate, xfpusave_len) != 0)) { #ifdef DEBUG printf("process %ld has trashed its stack\n", (long)p->p_pid); #endif PROC_LOCK(p); sigexit(td, SIGILL); } regs->tf_rsp = (long)sfp; regs->tf_rip = p->p_sysent->sv_sigcode_base; regs->tf_rflags &= ~(PSL_T | PSL_D); regs->tf_cs = _ucodesel; regs->tf_ds = _udatasel; regs->tf_ss = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _ufssel; regs->tf_gs = _ugssel; regs->tf_flags = TF_HASSEGS; PROC_LOCK(p); mtx_lock(&psp->ps_mtx); } /* * System call to cleanup state after a signal * has been taken. Reset signal mask and * stack state from context left by sendsig (above). * Return to previous pc and psl as specified by * context left by sendsig. Check carefully to * make sure that the user has not modified the * state to gain improper privileges. * * MPSAFE */ int sys_sigreturn(td, uap) struct thread *td; struct sigreturn_args /* { const struct __ucontext *sigcntxp; } */ *uap; { ucontext_t uc; struct pcb *pcb; struct proc *p; struct trapframe *regs; ucontext_t *ucp; char *xfpustate; size_t xfpustate_len; long rflags; int cs, error, ret; ksiginfo_t ksi; pcb = td->td_pcb; p = td->td_proc; error = copyin(uap->sigcntxp, &uc, sizeof(uc)); if (error != 0) { uprintf("pid %d (%s): sigreturn copyin failed\n", p->p_pid, td->td_name); return (error); } ucp = &uc; if ((ucp->uc_mcontext.mc_flags & ~_MC_FLAG_MASK) != 0) { uprintf("pid %d (%s): sigreturn mc_flags %x\n", p->p_pid, td->td_name, ucp->uc_mcontext.mc_flags); return (EINVAL); } regs = td->td_frame; rflags = ucp->uc_mcontext.mc_rflags; /* * Don't allow users to change privileged or reserved flags. */ if (!EFL_SECURE(rflags, regs->tf_rflags)) { uprintf("pid %d (%s): sigreturn rflags = 0x%lx\n", p->p_pid, td->td_name, rflags); return (EINVAL); } /* * Don't allow users to load a valid privileged %cs. Let the * hardware check for invalid selectors, excess privilege in * other selectors, invalid %eip's and invalid %esp's. */ cs = ucp->uc_mcontext.mc_cs; if (!CS_SECURE(cs)) { uprintf("pid %d (%s): sigreturn cs = 0x%x\n", p->p_pid, td->td_name, cs); ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; ksi.ksi_trapno = T_PROTFLT; ksi.ksi_addr = (void *)regs->tf_rip; trapsignal(td, &ksi); return (EINVAL); } if ((uc.uc_mcontext.mc_flags & _MC_HASFPXSTATE) != 0) { xfpustate_len = uc.uc_mcontext.mc_xfpustate_len; if (xfpustate_len > cpu_max_ext_state_size - sizeof(struct savefpu)) { uprintf("pid %d (%s): sigreturn xfpusave_len = 0x%zx\n", p->p_pid, td->td_name, xfpustate_len); return (EINVAL); } xfpustate = __builtin_alloca(xfpustate_len); error = copyin((const void *)uc.uc_mcontext.mc_xfpustate, xfpustate, xfpustate_len); if (error != 0) { uprintf( "pid %d (%s): sigreturn copying xfpustate failed\n", p->p_pid, td->td_name); return (error); } } else { xfpustate = NULL; xfpustate_len = 0; } ret = set_fpcontext(td, &ucp->uc_mcontext, xfpustate, xfpustate_len); if (ret != 0) { uprintf("pid %d (%s): sigreturn set_fpcontext err %d\n", p->p_pid, td->td_name, ret); return (ret); } bcopy(&ucp->uc_mcontext.mc_rdi, regs, sizeof(*regs)); update_pcb_bases(pcb); pcb->pcb_fsbase = ucp->uc_mcontext.mc_fsbase; pcb->pcb_gsbase = ucp->uc_mcontext.mc_gsbase; #if defined(COMPAT_43) if (ucp->uc_mcontext.mc_onstack & 1) td->td_sigstk.ss_flags |= SS_ONSTACK; else td->td_sigstk.ss_flags &= ~SS_ONSTACK; #endif kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0); return (EJUSTRETURN); } #ifdef COMPAT_FREEBSD4 int freebsd4_sigreturn(struct thread *td, struct freebsd4_sigreturn_args *uap) { return sys_sigreturn(td, (struct sigreturn_args *)uap); } #endif /* * Reset registers to default values on exec. */ void exec_setregs(struct thread *td, struct image_params *imgp, uintptr_t stack) { struct trapframe *regs; struct pcb *pcb; register_t saved_rflags; regs = td->td_frame; pcb = td->td_pcb; if (td->td_proc->p_md.md_ldt != NULL) user_ldt_free(td); update_pcb_bases(pcb); pcb->pcb_fsbase = 0; pcb->pcb_gsbase = 0; clear_pcb_flags(pcb, PCB_32BIT); pcb->pcb_initial_fpucw = __INITIAL_FPUCW__; saved_rflags = regs->tf_rflags & PSL_T; bzero((char *)regs, sizeof(struct trapframe)); regs->tf_rip = imgp->entry_addr; regs->tf_rsp = ((stack - 8) & ~0xFul) + 8; regs->tf_rdi = stack; /* argv */ regs->tf_rflags = PSL_USER | saved_rflags; regs->tf_ss = _udatasel; regs->tf_cs = _ucodesel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _ufssel; regs->tf_gs = _ugssel; regs->tf_flags = TF_HASSEGS; /* * Reset the hardware debug registers if they were in use. * They won't have any meaning for the newly exec'd process. */ if (pcb->pcb_flags & PCB_DBREGS) { pcb->pcb_dr0 = 0; pcb->pcb_dr1 = 0; pcb->pcb_dr2 = 0; pcb->pcb_dr3 = 0; pcb->pcb_dr6 = 0; pcb->pcb_dr7 = 0; if (pcb == curpcb) { /* * Clear the debug registers on the running * CPU, otherwise they will end up affecting * the next process we switch to. */ reset_dbregs(); } clear_pcb_flags(pcb, PCB_DBREGS); } /* * Drop the FP state if we hold it, so that the process gets a * clean FP state if it uses the FPU again. */ fpstate_drop(td); } void cpu_setregs(void) { register_t cr0; cr0 = rcr0(); /* * CR0_MP, CR0_NE and CR0_TS are also set by npx_probe() for the * BSP. See the comments there about why we set them. */ cr0 |= CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM; load_cr0(cr0); } /* * Initialize amd64 and configure to run kernel */ /* * Initialize segments & interrupt table */ static struct gate_descriptor idt0[NIDT]; struct gate_descriptor *idt = &idt0[0]; /* interrupt descriptor table */ static char dblfault_stack[PAGE_SIZE] __aligned(16); static char mce0_stack[PAGE_SIZE] __aligned(16); static char nmi0_stack[PAGE_SIZE] __aligned(16); static char dbg0_stack[PAGE_SIZE] __aligned(16); CTASSERT(sizeof(struct nmi_pcpu) == 16); /* * Software prototypes -- in more palatable form. * * Keep GUFS32, GUGS32, GUCODE32 and GUDATA at the same * slots as corresponding segments for i386 kernel. */ struct soft_segment_descriptor gdt_segs[] = { /* GNULL_SEL 0 Null Descriptor */ { .ssd_base = 0x0, .ssd_limit = 0x0, .ssd_type = 0, .ssd_dpl = 0, .ssd_p = 0, .ssd_long = 0, .ssd_def32 = 0, .ssd_gran = 0 }, /* GNULL2_SEL 1 Null Descriptor */ { .ssd_base = 0x0, .ssd_limit = 0x0, .ssd_type = 0, .ssd_dpl = 0, .ssd_p = 0, .ssd_long = 0, .ssd_def32 = 0, .ssd_gran = 0 }, /* GUFS32_SEL 2 32 bit %gs Descriptor for user */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMRWA, .ssd_dpl = SEL_UPL, .ssd_p = 1, .ssd_long = 0, .ssd_def32 = 1, .ssd_gran = 1 }, /* GUGS32_SEL 3 32 bit %fs Descriptor for user */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMRWA, .ssd_dpl = SEL_UPL, .ssd_p = 1, .ssd_long = 0, .ssd_def32 = 1, .ssd_gran = 1 }, /* GCODE_SEL 4 Code Descriptor for kernel */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMERA, .ssd_dpl = SEL_KPL, .ssd_p = 1, .ssd_long = 1, .ssd_def32 = 0, .ssd_gran = 1 }, /* GDATA_SEL 5 Data Descriptor for kernel */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMRWA, .ssd_dpl = SEL_KPL, .ssd_p = 1, .ssd_long = 1, .ssd_def32 = 0, .ssd_gran = 1 }, /* GUCODE32_SEL 6 32 bit Code Descriptor for user */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMERA, .ssd_dpl = SEL_UPL, .ssd_p = 1, .ssd_long = 0, .ssd_def32 = 1, .ssd_gran = 1 }, /* GUDATA_SEL 7 32/64 bit Data Descriptor for user */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMRWA, .ssd_dpl = SEL_UPL, .ssd_p = 1, .ssd_long = 0, .ssd_def32 = 1, .ssd_gran = 1 }, /* GUCODE_SEL 8 64 bit Code Descriptor for user */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMERA, .ssd_dpl = SEL_UPL, .ssd_p = 1, .ssd_long = 1, .ssd_def32 = 0, .ssd_gran = 1 }, /* GPROC0_SEL 9 Proc 0 Tss Descriptor */ { .ssd_base = 0x0, .ssd_limit = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE - 1, .ssd_type = SDT_SYSTSS, .ssd_dpl = SEL_KPL, .ssd_p = 1, .ssd_long = 0, .ssd_def32 = 0, .ssd_gran = 0 }, /* Actually, the TSS is a system descriptor which is double size */ { .ssd_base = 0x0, .ssd_limit = 0x0, .ssd_type = 0, .ssd_dpl = 0, .ssd_p = 0, .ssd_long = 0, .ssd_def32 = 0, .ssd_gran = 0 }, /* GUSERLDT_SEL 11 LDT Descriptor */ { .ssd_base = 0x0, .ssd_limit = 0x0, .ssd_type = 0, .ssd_dpl = 0, .ssd_p = 0, .ssd_long = 0, .ssd_def32 = 0, .ssd_gran = 0 }, /* GUSERLDT_SEL 12 LDT Descriptor, double size */ { .ssd_base = 0x0, .ssd_limit = 0x0, .ssd_type = 0, .ssd_dpl = 0, .ssd_p = 0, .ssd_long = 0, .ssd_def32 = 0, .ssd_gran = 0 }, }; _Static_assert(nitems(gdt_segs) == NGDT, "Stale NGDT"); void setidt(int idx, inthand_t *func, int typ, int dpl, int ist) { struct gate_descriptor *ip; ip = idt + idx; ip->gd_looffset = (uintptr_t)func; ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL); ip->gd_ist = ist; ip->gd_xx = 0; ip->gd_type = typ; ip->gd_dpl = dpl; ip->gd_p = 1; ip->gd_hioffset = ((uintptr_t)func)>>16 ; } extern inthand_t IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl), IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm), IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot), IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align), IDTVEC(xmm), IDTVEC(dblfault), IDTVEC(div_pti), IDTVEC(bpt_pti), IDTVEC(ofl_pti), IDTVEC(bnd_pti), IDTVEC(ill_pti), IDTVEC(dna_pti), IDTVEC(fpusegm_pti), IDTVEC(tss_pti), IDTVEC(missing_pti), IDTVEC(stk_pti), IDTVEC(prot_pti), IDTVEC(page_pti), IDTVEC(rsvd_pti), IDTVEC(fpu_pti), IDTVEC(align_pti), IDTVEC(xmm_pti), #ifdef KDTRACE_HOOKS IDTVEC(dtrace_ret), IDTVEC(dtrace_ret_pti), #endif #ifdef XENHVM IDTVEC(xen_intr_upcall), IDTVEC(xen_intr_upcall_pti), #endif IDTVEC(fast_syscall), IDTVEC(fast_syscall32), IDTVEC(fast_syscall_pti); #ifdef DDB /* * Display the index and function name of any IDT entries that don't use * the default 'rsvd' entry point. */ DB_SHOW_COMMAND(idt, db_show_idt) { struct gate_descriptor *ip; int idx; uintptr_t func; ip = idt; for (idx = 0; idx < NIDT && !db_pager_quit; idx++) { func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset); if (func != (uintptr_t)&IDTVEC(rsvd)) { db_printf("%3d\t", idx); db_printsym(func, DB_STGY_PROC); db_printf("\n"); } ip++; } } /* Show privileged registers. */ DB_SHOW_COMMAND(sysregs, db_show_sysregs) { struct { uint16_t limit; uint64_t base; } __packed idtr, gdtr; uint16_t ldt, tr; __asm __volatile("sidt %0" : "=m" (idtr)); db_printf("idtr\t0x%016lx/%04x\n", (u_long)idtr.base, (u_int)idtr.limit); __asm __volatile("sgdt %0" : "=m" (gdtr)); db_printf("gdtr\t0x%016lx/%04x\n", (u_long)gdtr.base, (u_int)gdtr.limit); __asm __volatile("sldt %0" : "=r" (ldt)); db_printf("ldtr\t0x%04x\n", ldt); __asm __volatile("str %0" : "=r" (tr)); db_printf("tr\t0x%04x\n", tr); db_printf("cr0\t0x%016lx\n", rcr0()); db_printf("cr2\t0x%016lx\n", rcr2()); db_printf("cr3\t0x%016lx\n", rcr3()); db_printf("cr4\t0x%016lx\n", rcr4()); if (rcr4() & CR4_XSAVE) db_printf("xcr0\t0x%016lx\n", rxcr(0)); db_printf("EFER\t0x%016lx\n", rdmsr(MSR_EFER)); if (cpu_feature2 & (CPUID2_VMX | CPUID2_SMX)) db_printf("FEATURES_CTL\t%016lx\n", rdmsr(MSR_IA32_FEATURE_CONTROL)); db_printf("DEBUG_CTL\t0x%016lx\n", rdmsr(MSR_DEBUGCTLMSR)); db_printf("PAT\t0x%016lx\n", rdmsr(MSR_PAT)); db_printf("GSBASE\t0x%016lx\n", rdmsr(MSR_GSBASE)); } DB_SHOW_COMMAND(dbregs, db_show_dbregs) { db_printf("dr0\t0x%016lx\n", rdr0()); db_printf("dr1\t0x%016lx\n", rdr1()); db_printf("dr2\t0x%016lx\n", rdr2()); db_printf("dr3\t0x%016lx\n", rdr3()); db_printf("dr6\t0x%016lx\n", rdr6()); db_printf("dr7\t0x%016lx\n", rdr7()); } #endif void sdtossd(sd, ssd) struct user_segment_descriptor *sd; struct soft_segment_descriptor *ssd; { ssd->ssd_base = (sd->sd_hibase << 24) | sd->sd_lobase; ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit; ssd->ssd_type = sd->sd_type; ssd->ssd_dpl = sd->sd_dpl; ssd->ssd_p = sd->sd_p; ssd->ssd_long = sd->sd_long; ssd->ssd_def32 = sd->sd_def32; ssd->ssd_gran = sd->sd_gran; } void ssdtosd(ssd, sd) struct soft_segment_descriptor *ssd; struct user_segment_descriptor *sd; { sd->sd_lobase = (ssd->ssd_base) & 0xffffff; sd->sd_hibase = (ssd->ssd_base >> 24) & 0xff; sd->sd_lolimit = (ssd->ssd_limit) & 0xffff; sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf; sd->sd_type = ssd->ssd_type; sd->sd_dpl = ssd->ssd_dpl; sd->sd_p = ssd->ssd_p; sd->sd_long = ssd->ssd_long; sd->sd_def32 = ssd->ssd_def32; sd->sd_gran = ssd->ssd_gran; } void ssdtosyssd(ssd, sd) struct soft_segment_descriptor *ssd; struct system_segment_descriptor *sd; { sd->sd_lobase = (ssd->ssd_base) & 0xffffff; sd->sd_hibase = (ssd->ssd_base >> 24) & 0xfffffffffful; sd->sd_lolimit = (ssd->ssd_limit) & 0xffff; sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf; sd->sd_type = ssd->ssd_type; sd->sd_dpl = ssd->ssd_dpl; sd->sd_p = ssd->ssd_p; sd->sd_gran = ssd->ssd_gran; } #if !defined(DEV_ATPIC) && defined(DEV_ISA) #include #include /* * Return a bitmap of the current interrupt requests. This is 8259-specific * and is only suitable for use at probe time. * This is only here to pacify sio. It is NOT FATAL if this doesn't work. * It shouldn't be here. There should probably be an APIC centric * implementation in the apic driver code, if at all. */ intrmask_t isa_irq_pending(void) { u_char irr1; u_char irr2; irr1 = inb(IO_ICU1); irr2 = inb(IO_ICU2); return ((irr2 << 8) | irr1); } #endif u_int basemem; static int add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap, int *physmap_idxp) { int i, insert_idx, physmap_idx; physmap_idx = *physmap_idxp; if (length == 0) return (1); /* * Find insertion point while checking for overlap. Start off by * assuming the new entry will be added to the end. * * NB: physmap_idx points to the next free slot. */ insert_idx = physmap_idx; for (i = 0; i <= physmap_idx; i += 2) { if (base < physmap[i + 1]) { if (base + length <= physmap[i]) { insert_idx = i; break; } if (boothowto & RB_VERBOSE) printf( "Overlapping memory regions, ignoring second region\n"); return (1); } } /* See if we can prepend to the next entry. */ if (insert_idx <= physmap_idx && base + length == physmap[insert_idx]) { physmap[insert_idx] = base; return (1); } /* See if we can append to the previous entry. */ if (insert_idx > 0 && base == physmap[insert_idx - 1]) { physmap[insert_idx - 1] += length; return (1); } physmap_idx += 2; *physmap_idxp = physmap_idx; if (physmap_idx == PHYS_AVAIL_ENTRIES) { printf( "Too many segments in the physical address map, giving up\n"); return (0); } /* * Move the last 'N' entries down to make room for the new * entry if needed. */ for (i = (physmap_idx - 2); i > insert_idx; i -= 2) { physmap[i] = physmap[i - 2]; physmap[i + 1] = physmap[i - 1]; } /* Insert the new entry. */ physmap[insert_idx] = base; physmap[insert_idx + 1] = base + length; return (1); } void bios_add_smap_entries(struct bios_smap *smapbase, u_int32_t smapsize, vm_paddr_t *physmap, int *physmap_idx) { struct bios_smap *smap, *smapend; smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize); for (smap = smapbase; smap < smapend; smap++) { if (boothowto & RB_VERBOSE) printf("SMAP type=%02x base=%016lx len=%016lx\n", smap->type, smap->base, smap->length); if (smap->type != SMAP_TYPE_MEMORY) continue; if (!add_physmap_entry(smap->base, smap->length, physmap, physmap_idx)) break; } } static void add_efi_map_entries(struct efi_map_header *efihdr, vm_paddr_t *physmap, int *physmap_idx) { struct efi_md *map, *p; const char *type; size_t efisz; int ndesc, i; static const char *types[] = { "Reserved", "LoaderCode", "LoaderData", "BootServicesCode", "BootServicesData", "RuntimeServicesCode", "RuntimeServicesData", "ConventionalMemory", "UnusableMemory", "ACPIReclaimMemory", "ACPIMemoryNVS", "MemoryMappedIO", "MemoryMappedIOPortSpace", "PalCode", "PersistentMemory" }; /* * Memory map data provided by UEFI via the GetMemoryMap * Boot Services API. */ efisz = (sizeof(struct efi_map_header) + 0xf) & ~0xf; map = (struct efi_md *)((uint8_t *)efihdr + efisz); if (efihdr->descriptor_size == 0) return; ndesc = efihdr->memory_size / efihdr->descriptor_size; if (boothowto & RB_VERBOSE) printf("%23s %12s %12s %8s %4s\n", "Type", "Physical", "Virtual", "#Pages", "Attr"); for (i = 0, p = map; i < ndesc; i++, p = efi_next_descriptor(p, efihdr->descriptor_size)) { if (boothowto & RB_VERBOSE) { if (p->md_type < nitems(types)) type = types[p->md_type]; else type = ""; printf("%23s %012lx %12p %08lx ", type, p->md_phys, p->md_virt, p->md_pages); if (p->md_attr & EFI_MD_ATTR_UC) printf("UC "); if (p->md_attr & EFI_MD_ATTR_WC) printf("WC "); if (p->md_attr & EFI_MD_ATTR_WT) printf("WT "); if (p->md_attr & EFI_MD_ATTR_WB) printf("WB "); if (p->md_attr & EFI_MD_ATTR_UCE) printf("UCE "); if (p->md_attr & EFI_MD_ATTR_WP) printf("WP "); if (p->md_attr & EFI_MD_ATTR_RP) printf("RP "); if (p->md_attr & EFI_MD_ATTR_XP) printf("XP "); if (p->md_attr & EFI_MD_ATTR_NV) printf("NV "); if (p->md_attr & EFI_MD_ATTR_MORE_RELIABLE) printf("MORE_RELIABLE "); if (p->md_attr & EFI_MD_ATTR_RO) printf("RO "); if (p->md_attr & EFI_MD_ATTR_RT) printf("RUNTIME"); printf("\n"); } switch (p->md_type) { case EFI_MD_TYPE_CODE: case EFI_MD_TYPE_DATA: case EFI_MD_TYPE_BS_CODE: case EFI_MD_TYPE_BS_DATA: case EFI_MD_TYPE_FREE: /* * We're allowed to use any entry with these types. */ break; default: continue; } if (!add_physmap_entry(p->md_phys, (p->md_pages * PAGE_SIZE), physmap, physmap_idx)) break; } } static char bootmethod[16] = ""; SYSCTL_STRING(_machdep, OID_AUTO, bootmethod, CTLFLAG_RD, bootmethod, 0, "System firmware boot method"); static void native_parse_memmap(caddr_t kmdp, vm_paddr_t *physmap, int *physmap_idx) { struct bios_smap *smap; struct efi_map_header *efihdr; u_int32_t size; /* * Memory map from INT 15:E820. * * subr_module.c says: * "Consumer may safely assume that size value precedes data." * ie: an int32_t immediately precedes smap. */ efihdr = (struct efi_map_header *)preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_EFI_MAP); smap = (struct bios_smap *)preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_SMAP); if (efihdr == NULL && smap == NULL) panic("No BIOS smap or EFI map info from loader!"); if (efihdr != NULL) { add_efi_map_entries(efihdr, physmap, physmap_idx); strlcpy(bootmethod, "UEFI", sizeof(bootmethod)); } else { size = *((u_int32_t *)smap - 1); bios_add_smap_entries(smap, size, physmap, physmap_idx); strlcpy(bootmethod, "BIOS", sizeof(bootmethod)); } } #define PAGES_PER_GB (1024 * 1024 * 1024 / PAGE_SIZE) /* * Populate the (physmap) array with base/bound pairs describing the * available physical memory in the system, then test this memory and * build the phys_avail array describing the actually-available memory. * * Total memory size may be set by the kernel environment variable * hw.physmem or the compile-time define MAXMEM. * * XXX first should be vm_paddr_t. */ static void getmemsize(caddr_t kmdp, u_int64_t first) { int i, physmap_idx, pa_indx, da_indx; vm_paddr_t pa, physmap[PHYS_AVAIL_ENTRIES]; u_long physmem_start, physmem_tunable, memtest; pt_entry_t *pte; quad_t dcons_addr, dcons_size; int page_counter; /* * Tell the physical memory allocator about pages used to store * the kernel and preloaded data. See kmem_bootstrap_free(). */ vm_phys_early_add_seg((vm_paddr_t)kernphys, trunc_page(first)); bzero(physmap, sizeof(physmap)); physmap_idx = 0; init_ops.parse_memmap(kmdp, physmap, &physmap_idx); physmap_idx -= 2; /* * Find the 'base memory' segment for SMP */ basemem = 0; for (i = 0; i <= physmap_idx; i += 2) { if (physmap[i] <= 0xA0000) { basemem = physmap[i + 1] / 1024; break; } } if (basemem == 0 || basemem > 640) { if (bootverbose) printf( "Memory map doesn't contain a basemem segment, faking it"); basemem = 640; } /* * Maxmem isn't the "maximum memory", it's one larger than the * highest page of the physical address space. It should be * called something like "Maxphyspage". We may adjust this * based on ``hw.physmem'' and the results of the memory test. */ Maxmem = atop(physmap[physmap_idx + 1]); #ifdef MAXMEM Maxmem = MAXMEM / 4; #endif if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable)) Maxmem = atop(physmem_tunable); /* * The boot memory test is disabled by default, as it takes a * significant amount of time on large-memory systems, and is * unfriendly to virtual machines as it unnecessarily touches all * pages. * * A general name is used as the code may be extended to support * additional tests beyond the current "page present" test. */ memtest = 0; TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest); /* * Don't allow MAXMEM or hw.physmem to extend the amount of memory * in the system. */ if (Maxmem > atop(physmap[physmap_idx + 1])) Maxmem = atop(physmap[physmap_idx + 1]); if (atop(physmap[physmap_idx + 1]) != Maxmem && (boothowto & RB_VERBOSE)) printf("Physical memory use set to %ldK\n", Maxmem * 4); /* * Make hole for "AP -> long mode" bootstrap code. The * mp_bootaddress vector is only available when the kernel * is configured to support APs and APs for the system start * in real mode mode (e.g. SMP bare metal). */ if (init_ops.mp_bootaddress) init_ops.mp_bootaddress(physmap, &physmap_idx); /* call pmap initialization to make new kernel address space */ pmap_bootstrap(&first); /* * Size up each available chunk of physical memory. * * XXX Some BIOSes corrupt low 64KB between suspend and resume. * By default, mask off the first 16 pages unless we appear to be * running in a VM. */ physmem_start = (vm_guest > VM_GUEST_NO ? 1 : 16) << PAGE_SHIFT; TUNABLE_ULONG_FETCH("hw.physmem.start", &physmem_start); if (physmap[0] < physmem_start) { if (physmem_start < PAGE_SIZE) physmap[0] = PAGE_SIZE; else if (physmem_start >= physmap[1]) physmap[0] = round_page(physmap[1] - PAGE_SIZE); else physmap[0] = round_page(physmem_start); } pa_indx = 0; da_indx = 1; phys_avail[pa_indx++] = physmap[0]; phys_avail[pa_indx] = physmap[0]; dump_avail[da_indx] = physmap[0]; pte = CMAP1; /* * Get dcons buffer address */ if (getenv_quad("dcons.addr", &dcons_addr) == 0 || getenv_quad("dcons.size", &dcons_size) == 0) dcons_addr = 0; /* * physmap is in bytes, so when converting to page boundaries, * round up the start address and round down the end address. */ page_counter = 0; if (memtest != 0) printf("Testing system memory"); for (i = 0; i <= physmap_idx; i += 2) { vm_paddr_t end; end = ptoa((vm_paddr_t)Maxmem); if (physmap[i + 1] < end) end = trunc_page(physmap[i + 1]); for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) { int tmp, page_bad, full; int *ptr = (int *)CADDR1; full = FALSE; /* * block out kernel memory as not available. */ if (pa >= (vm_paddr_t)kernphys && pa < first) goto do_dump_avail; /* * block out dcons buffer */ if (dcons_addr > 0 && pa >= trunc_page(dcons_addr) && pa < dcons_addr + dcons_size) goto do_dump_avail; page_bad = FALSE; if (memtest == 0) goto skip_memtest; /* * Print a "." every GB to show we're making * progress. */ page_counter++; if ((page_counter % PAGES_PER_GB) == 0) printf("."); /* * map page into kernel: valid, read/write,non-cacheable */ *pte = pa | PG_V | PG_RW | PG_NC_PWT | PG_NC_PCD; invltlb(); tmp = *(int *)ptr; /* * Test for alternating 1's and 0's */ *(volatile int *)ptr = 0xaaaaaaaa; if (*(volatile int *)ptr != 0xaaaaaaaa) page_bad = TRUE; /* * Test for alternating 0's and 1's */ *(volatile int *)ptr = 0x55555555; if (*(volatile int *)ptr != 0x55555555) page_bad = TRUE; /* * Test for all 1's */ *(volatile int *)ptr = 0xffffffff; if (*(volatile int *)ptr != 0xffffffff) page_bad = TRUE; /* * Test for all 0's */ *(volatile int *)ptr = 0x0; if (*(volatile int *)ptr != 0x0) page_bad = TRUE; /* * Restore original value. */ *(int *)ptr = tmp; skip_memtest: /* * Adjust array of valid/good pages. */ if (page_bad == TRUE) continue; /* * If this good page is a continuation of the * previous set of good pages, then just increase * the end pointer. Otherwise start a new chunk. * Note that "end" points one higher than end, * making the range >= start and < end. * If we're also doing a speculative memory * test and we at or past the end, bump up Maxmem * so that we keep going. The first bad page * will terminate the loop. */ if (phys_avail[pa_indx] == pa) { phys_avail[pa_indx] += PAGE_SIZE; } else { pa_indx++; if (pa_indx == PHYS_AVAIL_ENTRIES) { printf( "Too many holes in the physical address space, giving up\n"); pa_indx--; full = TRUE; goto do_dump_avail; } phys_avail[pa_indx++] = pa; /* start */ phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */ } physmem++; do_dump_avail: if (dump_avail[da_indx] == pa) { dump_avail[da_indx] += PAGE_SIZE; } else { da_indx++; if (da_indx == PHYS_AVAIL_ENTRIES) { da_indx--; goto do_next; } dump_avail[da_indx++] = pa; /* start */ dump_avail[da_indx] = pa + PAGE_SIZE; /* end */ } do_next: if (full) break; } } *pte = 0; invltlb(); if (memtest != 0) printf("\n"); /* * XXX * The last chunk must contain at least one page plus the message * buffer to avoid complicating other code (message buffer address * calculation, etc.). */ while (phys_avail[pa_indx - 1] + PAGE_SIZE + round_page(msgbufsize) >= phys_avail[pa_indx]) { physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]); phys_avail[pa_indx--] = 0; phys_avail[pa_indx--] = 0; } Maxmem = atop(phys_avail[pa_indx]); /* Trim off space for the message buffer. */ phys_avail[pa_indx] -= round_page(msgbufsize); /* Map the message buffer. */ msgbufp = (struct msgbuf *)PHYS_TO_DMAP(phys_avail[pa_indx]); } static caddr_t native_parse_preload_data(u_int64_t modulep) { caddr_t kmdp; char *envp; #ifdef DDB vm_offset_t ksym_start; vm_offset_t ksym_end; #endif preload_metadata = (caddr_t)(uintptr_t)(modulep + KERNBASE); preload_bootstrap_relocate(KERNBASE); kmdp = preload_search_by_type("elf kernel"); if (kmdp == NULL) kmdp = preload_search_by_type("elf64 kernel"); boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int); envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *); if (envp != NULL) envp += KERNBASE; init_static_kenv(envp, 0); #ifdef DDB ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t); ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t); db_fetch_ksymtab(ksym_start, ksym_end, 0); #endif efi_systbl_phys = MD_FETCH(kmdp, MODINFOMD_FW_HANDLE, vm_paddr_t); return (kmdp); } static void amd64_kdb_init(void) { kdb_init(); #ifdef KDB if (boothowto & RB_KDB) kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger"); #endif } /* Set up the fast syscall stuff */ void amd64_conf_fast_syscall(void) { uint64_t msr; msr = rdmsr(MSR_EFER) | EFER_SCE; wrmsr(MSR_EFER, msr); wrmsr(MSR_LSTAR, pti ? (u_int64_t)IDTVEC(fast_syscall_pti) : (u_int64_t)IDTVEC(fast_syscall)); wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32)); msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) | ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48); wrmsr(MSR_STAR, msr); wrmsr(MSR_SF_MASK, PSL_NT | PSL_T | PSL_I | PSL_C | PSL_D | PSL_AC); } void amd64_bsp_pcpu_init1(struct pcpu *pc) { struct user_segment_descriptor *gdt; PCPU_SET(prvspace, pc); gdt = *PCPU_PTR(gdt); PCPU_SET(curthread, &thread0); PCPU_SET(tssp, PCPU_PTR(common_tss)); PCPU_SET(tss, (struct system_segment_descriptor *)&gdt[GPROC0_SEL]); PCPU_SET(ldt, (struct system_segment_descriptor *)&gdt[GUSERLDT_SEL]); PCPU_SET(fs32p, &gdt[GUFS32_SEL]); PCPU_SET(gs32p, &gdt[GUGS32_SEL]); PCPU_SET(ucr3_load_mask, PMAP_UCR3_NOMASK); PCPU_SET(smp_tlb_gen, 1); } void amd64_bsp_pcpu_init2(uint64_t rsp0) { PCPU_SET(rsp0, rsp0); PCPU_SET(pti_rsp0, ((vm_offset_t)PCPU_PTR(pti_stack) + PC_PTI_STACK_SZ * sizeof(uint64_t)) & ~0xful); PCPU_SET(curpcb, thread0.td_pcb); } void amd64_bsp_ist_init(struct pcpu *pc) { struct nmi_pcpu *np; struct amd64tss *tssp; tssp = &pc->pc_common_tss; /* doublefault stack space, runs on ist1 */ np = ((struct nmi_pcpu *)&dblfault_stack[sizeof(dblfault_stack)]) - 1; np->np_pcpu = (register_t)pc; tssp->tss_ist1 = (long)np; /* * NMI stack, runs on ist2. The pcpu pointer is stored just * above the start of the ist2 stack. */ np = ((struct nmi_pcpu *)&nmi0_stack[sizeof(nmi0_stack)]) - 1; np->np_pcpu = (register_t)pc; tssp->tss_ist2 = (long)np; /* * MC# stack, runs on ist3. The pcpu pointer is stored just * above the start of the ist3 stack. */ np = ((struct nmi_pcpu *)&mce0_stack[sizeof(mce0_stack)]) - 1; np->np_pcpu = (register_t)pc; tssp->tss_ist3 = (long)np; /* * DB# stack, runs on ist4. */ np = ((struct nmi_pcpu *)&dbg0_stack[sizeof(dbg0_stack)]) - 1; np->np_pcpu = (register_t)pc; tssp->tss_ist4 = (long)np; } u_int64_t hammer_time(u_int64_t modulep, u_int64_t physfree) { caddr_t kmdp; int gsel_tss, x; struct pcpu *pc; struct xstate_hdr *xhdr; u_int64_t rsp0; char *env; struct user_segment_descriptor *gdt; struct region_descriptor r_gdt; size_t kstack0_sz; int late_console; TSRAW(&thread0, TS_ENTER, __func__, NULL); kmdp = init_ops.parse_preload_data(modulep); physfree += ucode_load_bsp(physfree + KERNBASE); physfree = roundup2(physfree, PAGE_SIZE); identify_cpu1(); identify_hypervisor(); identify_cpu_fixup_bsp(); identify_cpu2(); initializecpucache(); /* * Check for pti, pcid, and invpcid before ifuncs are * resolved, to correctly select the implementation for * pmap_activate_sw_mode(). */ pti = pti_get_default(); TUNABLE_INT_FETCH("vm.pmap.pti", &pti); TUNABLE_INT_FETCH("vm.pmap.pcid_enabled", &pmap_pcid_enabled); if ((cpu_feature2 & CPUID2_PCID) != 0 && pmap_pcid_enabled) { invpcid_works = (cpu_stdext_feature & CPUID_STDEXT_INVPCID) != 0; } else { pmap_pcid_enabled = 0; } link_elf_ireloc(kmdp); /* * This may be done better later if it gets more high level * components in it. If so just link td->td_proc here. */ proc_linkup0(&proc0, &thread0); /* Init basic tunables, hz etc */ init_param1(); thread0.td_kstack = physfree + KERNBASE; thread0.td_kstack_pages = kstack_pages; kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE; bzero((void *)thread0.td_kstack, kstack0_sz); physfree += kstack0_sz; /* * Initialize enough of thread0 for delayed invalidation to * work very early. Rely on thread0.td_base_pri * zero-initialization, it is reset to PVM at proc0_init(). */ pmap_thread_init_invl_gen(&thread0); pc = &temp_bsp_pcpu; pcpu_init(pc, 0, sizeof(struct pcpu)); gdt = &temp_bsp_pcpu.pc_gdt[0]; /* * make gdt memory segments */ for (x = 0; x < NGDT; x++) { if (x != GPROC0_SEL && x != (GPROC0_SEL + 1) && x != GUSERLDT_SEL && x != (GUSERLDT_SEL) + 1) ssdtosd(&gdt_segs[x], &gdt[x]); } gdt_segs[GPROC0_SEL].ssd_base = (uintptr_t)&pc->pc_common_tss; ssdtosyssd(&gdt_segs[GPROC0_SEL], (struct system_segment_descriptor *)&gdt[GPROC0_SEL]); r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1; r_gdt.rd_base = (long)gdt; lgdt(&r_gdt); wrmsr(MSR_FSBASE, 0); /* User value */ wrmsr(MSR_GSBASE, (u_int64_t)pc); wrmsr(MSR_KGSBASE, 0); /* User value while in the kernel */ dpcpu_init((void *)(physfree + KERNBASE), 0); physfree += DPCPU_SIZE; amd64_bsp_pcpu_init1(pc); /* Non-late cninit() and printf() can be moved up to here. */ /* * Initialize mutexes. * * icu_lock: in order to allow an interrupt to occur in a critical * section, to set pcpu->ipending (etc...) properly, we * must be able to get the icu lock, so it can't be * under witness. */ mutex_init(); mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS); mtx_init(&dt_lock, "descriptor tables", NULL, MTX_DEF); /* exceptions */ for (x = 0; x < NIDT; x++) setidt(x, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_DE, pti ? &IDTVEC(div_pti) : &IDTVEC(div), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 4); setidt(IDT_NMI, &IDTVEC(nmi), SDT_SYSIGT, SEL_KPL, 2); setidt(IDT_BP, pti ? &IDTVEC(bpt_pti) : &IDTVEC(bpt), SDT_SYSIGT, SEL_UPL, 0); setidt(IDT_OF, pti ? &IDTVEC(ofl_pti) : &IDTVEC(ofl), SDT_SYSIGT, SEL_UPL, 0); setidt(IDT_BR, pti ? &IDTVEC(bnd_pti) : &IDTVEC(bnd), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_UD, pti ? &IDTVEC(ill_pti) : &IDTVEC(ill), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_NM, pti ? &IDTVEC(dna_pti) : &IDTVEC(dna), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1); setidt(IDT_FPUGP, pti ? &IDTVEC(fpusegm_pti) : &IDTVEC(fpusegm), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_TS, pti ? &IDTVEC(tss_pti) : &IDTVEC(tss), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_NP, pti ? &IDTVEC(missing_pti) : &IDTVEC(missing), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_SS, pti ? &IDTVEC(stk_pti) : &IDTVEC(stk), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_GP, pti ? &IDTVEC(prot_pti) : &IDTVEC(prot), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_PF, pti ? &IDTVEC(page_pti) : &IDTVEC(page), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_MF, pti ? &IDTVEC(fpu_pti) : &IDTVEC(fpu), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_AC, pti ? &IDTVEC(align_pti) : &IDTVEC(align), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_MC, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 3); setidt(IDT_XF, pti ? &IDTVEC(xmm_pti) : &IDTVEC(xmm), SDT_SYSIGT, SEL_KPL, 0); #ifdef KDTRACE_HOOKS setidt(IDT_DTRACE_RET, pti ? &IDTVEC(dtrace_ret_pti) : &IDTVEC(dtrace_ret), SDT_SYSIGT, SEL_UPL, 0); #endif #ifdef XENHVM setidt(IDT_EVTCHN, pti ? &IDTVEC(xen_intr_upcall_pti) : &IDTVEC(xen_intr_upcall), SDT_SYSIGT, SEL_KPL, 0); #endif r_idt.rd_limit = sizeof(idt0) - 1; r_idt.rd_base = (long) idt; lidt(&r_idt); /* * Initialize the clock before the console so that console * initialization can use DELAY(). */ clock_init(); /* * Use vt(4) by default for UEFI boot (during the sc(4)/vt(4) * transition). * Once bootblocks have updated, we can test directly for * efi_systbl != NULL here... */ if (preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_EFI_MAP) != NULL) vty_set_preferred(VTY_VT); TUNABLE_INT_FETCH("hw.ibrs_disable", &hw_ibrs_disable); TUNABLE_INT_FETCH("machdep.mitigations.ibrs.disable", &hw_ibrs_disable); TUNABLE_INT_FETCH("hw.spec_store_bypass_disable", &hw_ssb_disable); TUNABLE_INT_FETCH("machdep.mitigations.ssb.disable", &hw_ssb_disable); TUNABLE_INT_FETCH("machdep.syscall_ret_l1d_flush", &syscall_ret_l1d_flush_mode); TUNABLE_INT_FETCH("hw.mds_disable", &hw_mds_disable); TUNABLE_INT_FETCH("machdep.mitigations.mds.disable", &hw_mds_disable); TUNABLE_INT_FETCH("machdep.mitigations.taa.enable", &x86_taa_enable); TUNABLE_INT_FETCH("machdep.mitigations.rndgs.enable", &x86_rngds_mitg_enable); finishidentcpu(); /* Final stage of CPU initialization */ initializecpu(); /* Initialize CPU registers */ amd64_bsp_ist_init(pc); /* Set the IO permission bitmap (empty due to tss seg limit) */ pc->pc_common_tss.tss_iobase = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE; gsel_tss = GSEL(GPROC0_SEL, SEL_KPL); ltr(gsel_tss); amd64_conf_fast_syscall(); /* * We initialize the PCB pointer early so that exception * handlers will work. Also set up td_critnest to short-cut * the page fault handler. */ cpu_max_ext_state_size = sizeof(struct savefpu); set_top_of_stack_td(&thread0); thread0.td_pcb = get_pcb_td(&thread0); thread0.td_critnest = 1; /* * The console and kdb should be initialized even earlier than here, * but some console drivers don't work until after getmemsize(). * Default to late console initialization to support these drivers. * This loses mainly printf()s in getmemsize() and early debugging. */ late_console = 1; TUNABLE_INT_FETCH("debug.late_console", &late_console); if (!late_console) { cninit(); amd64_kdb_init(); } getmemsize(kmdp, physfree); init_param2(physmem); /* now running on new page tables, configured,and u/iom is accessible */ #ifdef DEV_PCI /* This call might adjust phys_avail[]. */ pci_early_quirks(); #endif if (late_console) cninit(); /* * Dump the boot metadata. We have to wait for cninit() since console * output is required. If it's grossly incorrect the kernel will never * make it this far. */ if ((boothowto & RB_VERBOSE) && getenv_is_true("debug.dump_modinfo_at_boot")) preload_dump(); #ifdef DEV_ISA #ifdef DEV_ATPIC elcr_probe(); atpic_startup(); #else /* Reset and mask the atpics and leave them shut down. */ atpic_reset(); /* * Point the ICU spurious interrupt vectors at the APIC spurious * interrupt handler. */ setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0); #endif #else #error "have you forgotten the isa device?" #endif if (late_console) amd64_kdb_init(); msgbufinit(msgbufp, msgbufsize); fpuinit(); /* * Reinitialize thread0's stack base now that the xsave area size is * known. Set up thread0's pcb save area after fpuinit calculated fpu * save area size. Zero out the extended state header in fpu save area. */ set_top_of_stack_td(&thread0); thread0.td_pcb->pcb_save = get_pcb_user_save_td(&thread0); bzero(thread0.td_pcb->pcb_save, cpu_max_ext_state_size); if (use_xsave) { xhdr = (struct xstate_hdr *)(get_pcb_user_save_td(&thread0) + 1); xhdr->xstate_bv = xsave_mask; } /* make an initial tss so cpu can get interrupt stack on syscall! */ rsp0 = thread0.td_md.md_stack_base; /* Ensure the stack is aligned to 16 bytes */ rsp0 &= ~0xFul; PCPU_PTR(common_tss)->tss_rsp0 = rsp0; amd64_bsp_pcpu_init2(rsp0); /* transfer to user mode */ _ucodesel = GSEL(GUCODE_SEL, SEL_UPL); _udatasel = GSEL(GUDATA_SEL, SEL_UPL); _ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL); _ufssel = GSEL(GUFS32_SEL, SEL_UPL); _ugssel = GSEL(GUGS32_SEL, SEL_UPL); load_ds(_udatasel); load_es(_udatasel); load_fs(_ufssel); /* setup proc 0's pcb */ thread0.td_pcb->pcb_flags = 0; thread0.td_frame = &proc0_tf; env = kern_getenv("kernelname"); if (env != NULL) strlcpy(kernelname, env, sizeof(kernelname)); cpu_probe_amdc1e(); kcsan_cpu_init(0); #ifdef FDT x86_init_fdt(); #endif thread0.td_critnest = 0; TSEXIT(); /* Location of kernel stack for locore */ return (thread0.td_md.md_stack_base); } void cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size) { pcpu->pc_acpi_id = 0xffffffff; } static int smap_sysctl_handler(SYSCTL_HANDLER_ARGS) { struct bios_smap *smapbase; struct bios_smap_xattr smap; caddr_t kmdp; uint32_t *smapattr; int count, error, i; /* Retrieve the system memory map from the loader. */ kmdp = preload_search_by_type("elf kernel"); if (kmdp == NULL) kmdp = preload_search_by_type("elf64 kernel"); smapbase = (struct bios_smap *)preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_SMAP); if (smapbase == NULL) return (0); smapattr = (uint32_t *)preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_SMAP_XATTR); count = *((uint32_t *)smapbase - 1) / sizeof(*smapbase); error = 0; for (i = 0; i < count; i++) { smap.base = smapbase[i].base; smap.length = smapbase[i].length; smap.type = smapbase[i].type; if (smapattr != NULL) smap.xattr = smapattr[i]; else smap.xattr = 0; error = SYSCTL_OUT(req, &smap, sizeof(smap)); } return (error); } SYSCTL_PROC(_machdep, OID_AUTO, smap, CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, smap_sysctl_handler, "S,bios_smap_xattr", "Raw BIOS SMAP data"); static int efi_map_sysctl_handler(SYSCTL_HANDLER_ARGS) { struct efi_map_header *efihdr; caddr_t kmdp; uint32_t efisize; kmdp = preload_search_by_type("elf kernel"); if (kmdp == NULL) kmdp = preload_search_by_type("elf64 kernel"); efihdr = (struct efi_map_header *)preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_EFI_MAP); if (efihdr == NULL) return (0); efisize = *((uint32_t *)efihdr - 1); return (SYSCTL_OUT(req, efihdr, efisize)); } SYSCTL_PROC(_machdep, OID_AUTO, efi_map, CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, efi_map_sysctl_handler, "S,efi_map_header", "Raw EFI Memory Map"); void spinlock_enter(void) { struct thread *td; register_t flags; td = curthread; if (td->td_md.md_spinlock_count == 0) { flags = intr_disable(); td->td_md.md_spinlock_count = 1; td->td_md.md_saved_flags = flags; critical_enter(); } else td->td_md.md_spinlock_count++; } void spinlock_exit(void) { struct thread *td; register_t flags; td = curthread; flags = td->td_md.md_saved_flags; td->td_md.md_spinlock_count--; if (td->td_md.md_spinlock_count == 0) { critical_exit(); intr_restore(flags); } } /* * Construct a PCB from a trapframe. This is called from kdb_trap() where * we want to start a backtrace from the function that caused us to enter * the debugger. We have the context in the trapframe, but base the trace * on the PCB. The PCB doesn't have to be perfect, as long as it contains * enough for a backtrace. */ void makectx(struct trapframe *tf, struct pcb *pcb) { pcb->pcb_r12 = tf->tf_r12; pcb->pcb_r13 = tf->tf_r13; pcb->pcb_r14 = tf->tf_r14; pcb->pcb_r15 = tf->tf_r15; pcb->pcb_rbp = tf->tf_rbp; pcb->pcb_rbx = tf->tf_rbx; pcb->pcb_rip = tf->tf_rip; pcb->pcb_rsp = tf->tf_rsp; } int ptrace_set_pc(struct thread *td, unsigned long addr) { td->td_frame->tf_rip = addr; set_pcb_flags(td->td_pcb, PCB_FULL_IRET); return (0); } int ptrace_single_step(struct thread *td) { PROC_LOCK_ASSERT(td->td_proc, MA_OWNED); if ((td->td_frame->tf_rflags & PSL_T) == 0) { td->td_frame->tf_rflags |= PSL_T; td->td_dbgflags |= TDB_STEP; } return (0); } int ptrace_clear_single_step(struct thread *td) { PROC_LOCK_ASSERT(td->td_proc, MA_OWNED); td->td_frame->tf_rflags &= ~PSL_T; td->td_dbgflags &= ~TDB_STEP; return (0); } int fill_regs(struct thread *td, struct reg *regs) { struct trapframe *tp; tp = td->td_frame; return (fill_frame_regs(tp, regs)); } int fill_frame_regs(struct trapframe *tp, struct reg *regs) { regs->r_r15 = tp->tf_r15; regs->r_r14 = tp->tf_r14; regs->r_r13 = tp->tf_r13; regs->r_r12 = tp->tf_r12; regs->r_r11 = tp->tf_r11; regs->r_r10 = tp->tf_r10; regs->r_r9 = tp->tf_r9; regs->r_r8 = tp->tf_r8; regs->r_rdi = tp->tf_rdi; regs->r_rsi = tp->tf_rsi; regs->r_rbp = tp->tf_rbp; regs->r_rbx = tp->tf_rbx; regs->r_rdx = tp->tf_rdx; regs->r_rcx = tp->tf_rcx; regs->r_rax = tp->tf_rax; regs->r_rip = tp->tf_rip; regs->r_cs = tp->tf_cs; regs->r_rflags = tp->tf_rflags; regs->r_rsp = tp->tf_rsp; regs->r_ss = tp->tf_ss; if (tp->tf_flags & TF_HASSEGS) { regs->r_ds = tp->tf_ds; regs->r_es = tp->tf_es; regs->r_fs = tp->tf_fs; regs->r_gs = tp->tf_gs; } else { regs->r_ds = 0; regs->r_es = 0; regs->r_fs = 0; regs->r_gs = 0; } regs->r_err = 0; regs->r_trapno = 0; return (0); } int set_regs(struct thread *td, struct reg *regs) { struct trapframe *tp; register_t rflags; tp = td->td_frame; rflags = regs->r_rflags & 0xffffffff; if (!EFL_SECURE(rflags, tp->tf_rflags) || !CS_SECURE(regs->r_cs)) return (EINVAL); tp->tf_r15 = regs->r_r15; tp->tf_r14 = regs->r_r14; tp->tf_r13 = regs->r_r13; tp->tf_r12 = regs->r_r12; tp->tf_r11 = regs->r_r11; tp->tf_r10 = regs->r_r10; tp->tf_r9 = regs->r_r9; tp->tf_r8 = regs->r_r8; tp->tf_rdi = regs->r_rdi; tp->tf_rsi = regs->r_rsi; tp->tf_rbp = regs->r_rbp; tp->tf_rbx = regs->r_rbx; tp->tf_rdx = regs->r_rdx; tp->tf_rcx = regs->r_rcx; tp->tf_rax = regs->r_rax; tp->tf_rip = regs->r_rip; tp->tf_cs = regs->r_cs; tp->tf_rflags = rflags; tp->tf_rsp = regs->r_rsp; tp->tf_ss = regs->r_ss; if (0) { /* XXXKIB */ tp->tf_ds = regs->r_ds; tp->tf_es = regs->r_es; tp->tf_fs = regs->r_fs; tp->tf_gs = regs->r_gs; tp->tf_flags = TF_HASSEGS; } set_pcb_flags(td->td_pcb, PCB_FULL_IRET); return (0); } /* XXX check all this stuff! */ /* externalize from sv_xmm */ static void fill_fpregs_xmm(struct savefpu *sv_xmm, struct fpreg *fpregs) { struct envxmm *penv_fpreg = (struct envxmm *)&fpregs->fpr_env; struct envxmm *penv_xmm = &sv_xmm->sv_env; int i; /* pcb -> fpregs */ bzero(fpregs, sizeof(*fpregs)); /* FPU control/status */ penv_fpreg->en_cw = penv_xmm->en_cw; penv_fpreg->en_sw = penv_xmm->en_sw; penv_fpreg->en_tw = penv_xmm->en_tw; penv_fpreg->en_opcode = penv_xmm->en_opcode; penv_fpreg->en_rip = penv_xmm->en_rip; penv_fpreg->en_rdp = penv_xmm->en_rdp; penv_fpreg->en_mxcsr = penv_xmm->en_mxcsr; penv_fpreg->en_mxcsr_mask = penv_xmm->en_mxcsr_mask; /* FPU registers */ for (i = 0; i < 8; ++i) bcopy(sv_xmm->sv_fp[i].fp_acc.fp_bytes, fpregs->fpr_acc[i], 10); /* SSE registers */ for (i = 0; i < 16; ++i) bcopy(sv_xmm->sv_xmm[i].xmm_bytes, fpregs->fpr_xacc[i], 16); } /* internalize from fpregs into sv_xmm */ static void set_fpregs_xmm(struct fpreg *fpregs, struct savefpu *sv_xmm) { struct envxmm *penv_xmm = &sv_xmm->sv_env; struct envxmm *penv_fpreg = (struct envxmm *)&fpregs->fpr_env; int i; /* fpregs -> pcb */ /* FPU control/status */ penv_xmm->en_cw = penv_fpreg->en_cw; penv_xmm->en_sw = penv_fpreg->en_sw; penv_xmm->en_tw = penv_fpreg->en_tw; penv_xmm->en_opcode = penv_fpreg->en_opcode; penv_xmm->en_rip = penv_fpreg->en_rip; penv_xmm->en_rdp = penv_fpreg->en_rdp; penv_xmm->en_mxcsr = penv_fpreg->en_mxcsr; penv_xmm->en_mxcsr_mask = penv_fpreg->en_mxcsr_mask & cpu_mxcsr_mask; /* FPU registers */ for (i = 0; i < 8; ++i) bcopy(fpregs->fpr_acc[i], sv_xmm->sv_fp[i].fp_acc.fp_bytes, 10); /* SSE registers */ for (i = 0; i < 16; ++i) bcopy(fpregs->fpr_xacc[i], sv_xmm->sv_xmm[i].xmm_bytes, 16); } /* externalize from td->pcb */ int fill_fpregs(struct thread *td, struct fpreg *fpregs) { KASSERT(td == curthread || TD_IS_SUSPENDED(td) || P_SHOULDSTOP(td->td_proc), ("not suspended thread %p", td)); fpugetregs(td); fill_fpregs_xmm(get_pcb_user_save_td(td), fpregs); return (0); } /* internalize to td->pcb */ int set_fpregs(struct thread *td, struct fpreg *fpregs) { critical_enter(); set_fpregs_xmm(fpregs, get_pcb_user_save_td(td)); fpuuserinited(td); critical_exit(); return (0); } /* * Get machine context. */ int get_mcontext(struct thread *td, mcontext_t *mcp, int flags) { struct pcb *pcb; struct trapframe *tp; pcb = td->td_pcb; tp = td->td_frame; PROC_LOCK(curthread->td_proc); mcp->mc_onstack = sigonstack(tp->tf_rsp); PROC_UNLOCK(curthread->td_proc); mcp->mc_r15 = tp->tf_r15; mcp->mc_r14 = tp->tf_r14; mcp->mc_r13 = tp->tf_r13; mcp->mc_r12 = tp->tf_r12; mcp->mc_r11 = tp->tf_r11; mcp->mc_r10 = tp->tf_r10; mcp->mc_r9 = tp->tf_r9; mcp->mc_r8 = tp->tf_r8; mcp->mc_rdi = tp->tf_rdi; mcp->mc_rsi = tp->tf_rsi; mcp->mc_rbp = tp->tf_rbp; mcp->mc_rbx = tp->tf_rbx; mcp->mc_rcx = tp->tf_rcx; mcp->mc_rflags = tp->tf_rflags; if (flags & GET_MC_CLEAR_RET) { mcp->mc_rax = 0; mcp->mc_rdx = 0; mcp->mc_rflags &= ~PSL_C; } else { mcp->mc_rax = tp->tf_rax; mcp->mc_rdx = tp->tf_rdx; } mcp->mc_rip = tp->tf_rip; mcp->mc_cs = tp->tf_cs; mcp->mc_rsp = tp->tf_rsp; mcp->mc_ss = tp->tf_ss; mcp->mc_ds = tp->tf_ds; mcp->mc_es = tp->tf_es; mcp->mc_fs = tp->tf_fs; mcp->mc_gs = tp->tf_gs; mcp->mc_flags = tp->tf_flags; mcp->mc_len = sizeof(*mcp); get_fpcontext(td, mcp, NULL, 0); update_pcb_bases(pcb); mcp->mc_fsbase = pcb->pcb_fsbase; mcp->mc_gsbase = pcb->pcb_gsbase; mcp->mc_xfpustate = 0; mcp->mc_xfpustate_len = 0; bzero(mcp->mc_spare, sizeof(mcp->mc_spare)); return (0); } /* * Set machine context. * * However, we don't set any but the user modifiable flags, and we won't * touch the cs selector. */ int set_mcontext(struct thread *td, mcontext_t *mcp) { struct pcb *pcb; struct trapframe *tp; char *xfpustate; long rflags; int ret; pcb = td->td_pcb; tp = td->td_frame; if (mcp->mc_len != sizeof(*mcp) || (mcp->mc_flags & ~_MC_FLAG_MASK) != 0) return (EINVAL); rflags = (mcp->mc_rflags & PSL_USERCHANGE) | (tp->tf_rflags & ~PSL_USERCHANGE); if (mcp->mc_flags & _MC_HASFPXSTATE) { if (mcp->mc_xfpustate_len > cpu_max_ext_state_size - sizeof(struct savefpu)) return (EINVAL); xfpustate = __builtin_alloca(mcp->mc_xfpustate_len); ret = copyin((void *)mcp->mc_xfpustate, xfpustate, mcp->mc_xfpustate_len); if (ret != 0) return (ret); } else xfpustate = NULL; ret = set_fpcontext(td, mcp, xfpustate, mcp->mc_xfpustate_len); if (ret != 0) return (ret); tp->tf_r15 = mcp->mc_r15; tp->tf_r14 = mcp->mc_r14; tp->tf_r13 = mcp->mc_r13; tp->tf_r12 = mcp->mc_r12; tp->tf_r11 = mcp->mc_r11; tp->tf_r10 = mcp->mc_r10; tp->tf_r9 = mcp->mc_r9; tp->tf_r8 = mcp->mc_r8; tp->tf_rdi = mcp->mc_rdi; tp->tf_rsi = mcp->mc_rsi; tp->tf_rbp = mcp->mc_rbp; tp->tf_rbx = mcp->mc_rbx; tp->tf_rdx = mcp->mc_rdx; tp->tf_rcx = mcp->mc_rcx; tp->tf_rax = mcp->mc_rax; tp->tf_rip = mcp->mc_rip; tp->tf_rflags = rflags; tp->tf_rsp = mcp->mc_rsp; tp->tf_ss = mcp->mc_ss; tp->tf_flags = mcp->mc_flags; if (tp->tf_flags & TF_HASSEGS) { tp->tf_ds = mcp->mc_ds; tp->tf_es = mcp->mc_es; tp->tf_fs = mcp->mc_fs; tp->tf_gs = mcp->mc_gs; } set_pcb_flags(pcb, PCB_FULL_IRET); if (mcp->mc_flags & _MC_HASBASES) { pcb->pcb_fsbase = mcp->mc_fsbase; pcb->pcb_gsbase = mcp->mc_gsbase; } return (0); } static void get_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpusave, size_t xfpusave_len) { size_t max_len, len; mcp->mc_ownedfp = fpugetregs(td); bcopy(get_pcb_user_save_td(td), &mcp->mc_fpstate[0], sizeof(mcp->mc_fpstate)); mcp->mc_fpformat = fpuformat(); if (!use_xsave || xfpusave_len == 0) return; max_len = cpu_max_ext_state_size - sizeof(struct savefpu); len = xfpusave_len; if (len > max_len) { len = max_len; bzero(xfpusave + max_len, len - max_len); } mcp->mc_flags |= _MC_HASFPXSTATE; mcp->mc_xfpustate_len = len; bcopy(get_pcb_user_save_td(td) + 1, xfpusave, len); } static int set_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpustate, size_t xfpustate_len) { int error; if (mcp->mc_fpformat == _MC_FPFMT_NODEV) return (0); else if (mcp->mc_fpformat != _MC_FPFMT_XMM) return (EINVAL); else if (mcp->mc_ownedfp == _MC_FPOWNED_NONE) { /* We don't care what state is left in the FPU or PCB. */ fpstate_drop(td); error = 0; } else if (mcp->mc_ownedfp == _MC_FPOWNED_FPU || mcp->mc_ownedfp == _MC_FPOWNED_PCB) { error = fpusetregs(td, (struct savefpu *)&mcp->mc_fpstate, xfpustate, xfpustate_len); } else return (EINVAL); return (error); } void fpstate_drop(struct thread *td) { KASSERT(PCB_USER_FPU(td->td_pcb), ("fpstate_drop: kernel-owned fpu")); critical_enter(); if (PCPU_GET(fpcurthread) == td) fpudrop(); /* * XXX force a full drop of the fpu. The above only drops it if we * owned it. * * XXX I don't much like fpugetuserregs()'s semantics of doing a full * drop. Dropping only to the pcb matches fnsave's behaviour. * We only need to drop to !PCB_INITDONE in sendsig(). But * sendsig() is the only caller of fpugetuserregs()... perhaps we just * have too many layers. */ clear_pcb_flags(curthread->td_pcb, PCB_FPUINITDONE | PCB_USERFPUINITDONE); critical_exit(); } int fill_dbregs(struct thread *td, struct dbreg *dbregs) { struct pcb *pcb; if (td == NULL) { dbregs->dr[0] = rdr0(); dbregs->dr[1] = rdr1(); dbregs->dr[2] = rdr2(); dbregs->dr[3] = rdr3(); dbregs->dr[6] = rdr6(); dbregs->dr[7] = rdr7(); } else { pcb = td->td_pcb; dbregs->dr[0] = pcb->pcb_dr0; dbregs->dr[1] = pcb->pcb_dr1; dbregs->dr[2] = pcb->pcb_dr2; dbregs->dr[3] = pcb->pcb_dr3; dbregs->dr[6] = pcb->pcb_dr6; dbregs->dr[7] = pcb->pcb_dr7; } dbregs->dr[4] = 0; dbregs->dr[5] = 0; dbregs->dr[8] = 0; dbregs->dr[9] = 0; dbregs->dr[10] = 0; dbregs->dr[11] = 0; dbregs->dr[12] = 0; dbregs->dr[13] = 0; dbregs->dr[14] = 0; dbregs->dr[15] = 0; return (0); } int set_dbregs(struct thread *td, struct dbreg *dbregs) { struct pcb *pcb; int i; if (td == NULL) { load_dr0(dbregs->dr[0]); load_dr1(dbregs->dr[1]); load_dr2(dbregs->dr[2]); load_dr3(dbregs->dr[3]); load_dr6(dbregs->dr[6]); load_dr7(dbregs->dr[7]); } else { /* * Don't let an illegal value for dr7 get set. Specifically, * check for undefined settings. Setting these bit patterns * result in undefined behaviour and can lead to an unexpected * TRCTRAP or a general protection fault right here. * Upper bits of dr6 and dr7 must not be set */ for (i = 0; i < 4; i++) { if (DBREG_DR7_ACCESS(dbregs->dr[7], i) == 0x02) return (EINVAL); if (td->td_frame->tf_cs == _ucode32sel && DBREG_DR7_LEN(dbregs->dr[7], i) == DBREG_DR7_LEN_8) return (EINVAL); } if ((dbregs->dr[6] & 0xffffffff00000000ul) != 0 || (dbregs->dr[7] & 0xffffffff00000000ul) != 0) return (EINVAL); pcb = td->td_pcb; /* * Don't let a process set a breakpoint that is not within the * process's address space. If a process could do this, it * could halt the system by setting a breakpoint in the kernel * (if ddb was enabled). Thus, we need to check to make sure * that no breakpoints are being enabled for addresses outside * process's address space. * * XXX - what about when the watched area of the user's * address space is written into from within the kernel * ... wouldn't that still cause a breakpoint to be generated * from within kernel mode? */ if (DBREG_DR7_ENABLED(dbregs->dr[7], 0)) { /* dr0 is enabled */ if (dbregs->dr[0] >= VM_MAXUSER_ADDRESS) return (EINVAL); } if (DBREG_DR7_ENABLED(dbregs->dr[7], 1)) { /* dr1 is enabled */ if (dbregs->dr[1] >= VM_MAXUSER_ADDRESS) return (EINVAL); } if (DBREG_DR7_ENABLED(dbregs->dr[7], 2)) { /* dr2 is enabled */ if (dbregs->dr[2] >= VM_MAXUSER_ADDRESS) return (EINVAL); } if (DBREG_DR7_ENABLED(dbregs->dr[7], 3)) { /* dr3 is enabled */ if (dbregs->dr[3] >= VM_MAXUSER_ADDRESS) return (EINVAL); } pcb->pcb_dr0 = dbregs->dr[0]; pcb->pcb_dr1 = dbregs->dr[1]; pcb->pcb_dr2 = dbregs->dr[2]; pcb->pcb_dr3 = dbregs->dr[3]; pcb->pcb_dr6 = dbregs->dr[6]; pcb->pcb_dr7 = dbregs->dr[7]; set_pcb_flags(pcb, PCB_DBREGS); } return (0); } void reset_dbregs(void) { load_dr7(0); /* Turn off the control bits first */ load_dr0(0); load_dr1(0); load_dr2(0); load_dr3(0); load_dr6(0); } /* * Return > 0 if a hardware breakpoint has been hit, and the * breakpoint was in user space. Return 0, otherwise. */ int user_dbreg_trap(register_t dr6) { u_int64_t dr7; u_int64_t bp; /* breakpoint bits extracted from dr6 */ int nbp; /* number of breakpoints that triggered */ caddr_t addr[4]; /* breakpoint addresses */ int i; bp = dr6 & DBREG_DR6_BMASK; if (bp == 0) { /* * None of the breakpoint bits are set meaning this * trap was not caused by any of the debug registers */ return 0; } dr7 = rdr7(); if ((dr7 & 0x000000ff) == 0) { /* * all GE and LE bits in the dr7 register are zero, * thus the trap couldn't have been caused by the * hardware debug registers */ return 0; } nbp = 0; /* * at least one of the breakpoints were hit, check to see * which ones and if any of them are user space addresses */ if (bp & 0x01) { addr[nbp++] = (caddr_t)rdr0(); } if (bp & 0x02) { addr[nbp++] = (caddr_t)rdr1(); } if (bp & 0x04) { addr[nbp++] = (caddr_t)rdr2(); } if (bp & 0x08) { addr[nbp++] = (caddr_t)rdr3(); } for (i = 0; i < nbp; i++) { if (addr[i] < (caddr_t)VM_MAXUSER_ADDRESS) { /* * addr[i] is in user space */ return nbp; } } /* * None of the breakpoints are in user space. */ return 0; } /* * The pcb_flags is only modified by current thread, or by other threads * when current thread is stopped. However, current thread may change it * from the interrupt context in cpu_switch(), or in the trap handler. * When we read-modify-write pcb_flags from C sources, compiler may generate * code that is not atomic regarding the interrupt handler. If a trap or * interrupt happens and any flag is modified from the handler, it can be * clobbered with the cached value later. Therefore, we implement setting * and clearing flags with single-instruction functions, which do not race * with possible modification of the flags from the trap or interrupt context, * because traps and interrupts are executed only on instruction boundary. */ void set_pcb_flags_raw(struct pcb *pcb, const u_int flags) { __asm __volatile("orl %1,%0" : "=m" (pcb->pcb_flags) : "ir" (flags), "m" (pcb->pcb_flags) : "cc", "memory"); } /* * The support for RDFSBASE, WRFSBASE and similar instructions for %gs * base requires that kernel saves MSR_FSBASE and MSR_{K,}GSBASE into * pcb if user space modified the bases. We must save on the context * switch or if the return to usermode happens through the doreti. * * Tracking of both events is performed by the pcb flag PCB_FULL_IRET, * which have a consequence that the base MSRs must be saved each time * the PCB_FULL_IRET flag is set. We disable interrupts to sync with * context switches. */ static void set_pcb_flags_fsgsbase(struct pcb *pcb, const u_int flags) { register_t r; if (curpcb == pcb && (flags & PCB_FULL_IRET) != 0 && (pcb->pcb_flags & PCB_FULL_IRET) == 0) { r = intr_disable(); if ((pcb->pcb_flags & PCB_FULL_IRET) == 0) { if (rfs() == _ufssel) pcb->pcb_fsbase = rdfsbase(); if (rgs() == _ugssel) pcb->pcb_gsbase = rdmsr(MSR_KGSBASE); } set_pcb_flags_raw(pcb, flags); intr_restore(r); } else { set_pcb_flags_raw(pcb, flags); } } DEFINE_IFUNC(, void, set_pcb_flags, (struct pcb *, const u_int)) { return ((cpu_stdext_feature & CPUID_STDEXT_FSGSBASE) != 0 ? set_pcb_flags_fsgsbase : set_pcb_flags_raw); } void clear_pcb_flags(struct pcb *pcb, const u_int flags) { __asm __volatile("andl %1,%0" : "=m" (pcb->pcb_flags) : "ir" (~flags), "m" (pcb->pcb_flags) : "cc", "memory"); } #ifdef KDB /* * Provide inb() and outb() as functions. They are normally only available as * inline functions, thus cannot be called from the debugger. */ /* silence compiler warnings */ u_char inb_(u_short); void outb_(u_short, u_char); u_char inb_(u_short port) { return inb(port); } void outb_(u_short port, u_char data) { outb(port, data); } #endif /* KDB */ #undef memset #undef memmove #undef memcpy void *memset_std(void *buf, int c, size_t len); void *memset_erms(void *buf, int c, size_t len); void *memmove_std(void * _Nonnull dst, const void * _Nonnull src, size_t len); void *memmove_erms(void * _Nonnull dst, const void * _Nonnull src, size_t len); void *memcpy_std(void * _Nonnull dst, const void * _Nonnull src, size_t len); void *memcpy_erms(void * _Nonnull dst, const void * _Nonnull src, size_t len); #ifdef KCSAN /* * These fail to build as ifuncs when used with KCSAN. */ void * memset(void *buf, int c, size_t len) { return (memset_std(buf, c, len)); } void * memmove(void * _Nonnull dst, const void * _Nonnull src, size_t len) { return (memmove_std(dst, src, len)); } void * memcpy(void * _Nonnull dst, const void * _Nonnull src, size_t len) { return (memcpy_std(dst, src, len)); } #else DEFINE_IFUNC(, void *, memset, (void *, int, size_t)) { return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ? memset_erms : memset_std); } DEFINE_IFUNC(, void *, memmove, (void * _Nonnull, const void * _Nonnull, size_t)) { return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ? memmove_erms : memmove_std); } DEFINE_IFUNC(, void *, memcpy, (void * _Nonnull, const void * _Nonnull,size_t)) { return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ? memcpy_erms : memcpy_std); } #endif void pagezero_std(void *addr); void pagezero_erms(void *addr); DEFINE_IFUNC(, void , pagezero, (void *)) { return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ? pagezero_erms : pagezero_std); } Index: head/sys/amd64/amd64/minidump_machdep.c =================================================================== --- head/sys/amd64/amd64/minidump_machdep.c (revision 366710) +++ head/sys/amd64/amd64/minidump_machdep.c (revision 366711) @@ -1,451 +1,452 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2006 Peter Wemm * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_pmap.h" #include "opt_watchdog.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #include CTASSERT(sizeof(struct kerneldumpheader) == 512); static struct kerneldumpheader kdh; /* Handle chunked writes. */ static size_t fragsz; static void *dump_va; static size_t counter, progress, dumpsize, wdog_next; static int dump_retry_count = 5; SYSCTL_INT(_machdep, OID_AUTO, dump_retry_count, CTLFLAG_RWTUN, &dump_retry_count, 0, "Number of times dump has to retry before bailing out"); static int is_dumpable(vm_paddr_t pa) { vm_page_t m; int i; if ((m = vm_phys_paddr_to_vm_page(pa)) != NULL) return ((m->flags & PG_NODUMP) == 0); for (i = 0; dump_avail[i] != 0 || dump_avail[i + 1] != 0; i += 2) { if (pa >= dump_avail[i] && pa < dump_avail[i + 1]) return (1); } return (0); } #define PG2MB(pgs) (((pgs) + (1 << 8) - 1) >> 8) static int blk_flush(struct dumperinfo *di) { int error; if (fragsz == 0) return (0); error = dump_append(di, dump_va, 0, fragsz); fragsz = 0; return (error); } static struct { int min_per; int max_per; int visited; } progress_track[10] = { { 0, 10, 0}, { 10, 20, 0}, { 20, 30, 0}, { 30, 40, 0}, { 40, 50, 0}, { 50, 60, 0}, { 60, 70, 0}, { 70, 80, 0}, { 80, 90, 0}, { 90, 100, 0} }; static void report_progress(size_t progress, size_t dumpsize) { int sofar, i; sofar = 100 - ((progress * 100) / dumpsize); for (i = 0; i < nitems(progress_track); i++) { if (sofar < progress_track[i].min_per || sofar > progress_track[i].max_per) continue; if (progress_track[i].visited) return; progress_track[i].visited = 1; printf("..%d%%", sofar); return; } } /* Pat the watchdog approximately every 128MB of the dump. */ #define WDOG_DUMP_INTERVAL (128 * 1024 * 1024) static int blk_write(struct dumperinfo *di, char *ptr, vm_paddr_t pa, size_t sz) { size_t len; int error, i, c; u_int maxdumpsz; maxdumpsz = min(di->maxiosize, MAXDUMPPGS * PAGE_SIZE); if (maxdumpsz == 0) /* seatbelt */ maxdumpsz = PAGE_SIZE; error = 0; if ((sz % PAGE_SIZE) != 0) { printf("size not page aligned\n"); return (EINVAL); } if (ptr != NULL && pa != 0) { printf("cant have both va and pa!\n"); return (EINVAL); } if ((((uintptr_t)pa) % PAGE_SIZE) != 0) { printf("address not page aligned %p\n", ptr); return (EINVAL); } if (ptr != NULL) { /* If we're doing a virtual dump, flush any pre-existing pa pages */ error = blk_flush(di); if (error) return (error); } while (sz) { len = maxdumpsz - fragsz; if (len > sz) len = sz; counter += len; progress -= len; if (counter >> 24) { report_progress(progress, dumpsize); counter &= (1<<24) - 1; } if (progress <= wdog_next) { wdog_kern_pat(WD_LASTVAL); if (wdog_next > WDOG_DUMP_INTERVAL) wdog_next -= WDOG_DUMP_INTERVAL; else wdog_next = 0; } if (ptr) { error = dump_append(di, ptr, 0, len); if (error) return (error); ptr += len; sz -= len; } else { for (i = 0; i < len; i += PAGE_SIZE) dump_va = pmap_kenter_temporary(pa + i, (i + fragsz) >> PAGE_SHIFT); fragsz += len; pa += len; sz -= len; if (fragsz == maxdumpsz) { error = blk_flush(di); if (error) return (error); } } /* Check for user abort. */ c = cncheckc(); if (c == 0x03) return (ECANCELED); if (c != -1) printf(" (CTRL-C to abort) "); } return (0); } /* A fake page table page, to avoid having to handle both 4K and 2M pages */ static pd_entry_t fakepd[NPDEPG]; int minidumpsys(struct dumperinfo *di) { uint32_t pmapsize; vm_offset_t va; int error; uint64_t *pml4, *pdp, *pd, *pt, pa; int i, ii, j, k, n; int retry_count; struct minidumphdr mdhdr; retry_count = 0; retry: retry_count++; counter = 0; for (i = 0; i < nitems(progress_track); i++) progress_track[i].visited = 0; /* Walk page table pages, set bits in vm_page_dump */ pmapsize = 0; for (va = VM_MIN_KERNEL_ADDRESS; va < MAX(KERNBASE + nkpt * NBPDR, kernel_vm_end); ) { /* * We always write a page, even if it is zero. Each * page written corresponds to 1GB of space */ pmapsize += PAGE_SIZE; ii = pmap_pml4e_index(va); pml4 = (uint64_t *)PHYS_TO_DMAP(KPML4phys) + ii; pdp = (uint64_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME); i = pmap_pdpe_index(va); if ((pdp[i] & PG_V) == 0) { va += NBPDP; continue; } /* * 1GB page is represented as 512 2MB pages in a dump. */ if ((pdp[i] & PG_PS) != 0) { va += NBPDP; pa = pdp[i] & PG_PS_FRAME; for (n = 0; n < NPDEPG * NPTEPG; n++) { if (is_dumpable(pa)) dump_add_page(pa); pa += PAGE_SIZE; } continue; } pd = (uint64_t *)PHYS_TO_DMAP(pdp[i] & PG_FRAME); for (n = 0; n < NPDEPG; n++, va += NBPDR) { j = pmap_pde_index(va); if ((pd[j] & PG_V) == 0) continue; if ((pd[j] & PG_PS) != 0) { /* This is an entire 2M page. */ pa = pd[j] & PG_PS_FRAME; for (k = 0; k < NPTEPG; k++) { if (is_dumpable(pa)) dump_add_page(pa); pa += PAGE_SIZE; } continue; } pa = pd[j] & PG_FRAME; /* set bit for this PTE page */ if (is_dumpable(pa)) dump_add_page(pa); /* and for each valid page in this 2MB block */ pt = (uint64_t *)PHYS_TO_DMAP(pd[j] & PG_FRAME); for (k = 0; k < NPTEPG; k++) { if ((pt[k] & PG_V) == 0) continue; pa = pt[k] & PG_FRAME; if (is_dumpable(pa)) dump_add_page(pa); } } } /* Calculate dump size. */ dumpsize = pmapsize; dumpsize += round_page(msgbufp->msg_size); dumpsize += round_page(sizeof(dump_avail)); dumpsize += round_page(BITSET_SIZE(vm_page_dump_pages)); VM_PAGE_DUMP_FOREACH(pa) { /* Clear out undumpable pages now if needed */ if (is_dumpable(pa)) { dumpsize += PAGE_SIZE; } else { dump_drop_page(pa); } } dumpsize += PAGE_SIZE; wdog_next = progress = dumpsize; /* Initialize mdhdr */ bzero(&mdhdr, sizeof(mdhdr)); strcpy(mdhdr.magic, MINIDUMP_MAGIC); mdhdr.version = MINIDUMP_VERSION; mdhdr.msgbufsize = msgbufp->msg_size; mdhdr.bitmapsize = round_page(BITSET_SIZE(vm_page_dump_pages)); mdhdr.pmapsize = pmapsize; mdhdr.kernbase = VM_MIN_KERNEL_ADDRESS; mdhdr.dmapbase = DMAP_MIN_ADDRESS; mdhdr.dmapend = DMAP_MAX_ADDRESS; mdhdr.dumpavailsize = round_page(sizeof(dump_avail)); dump_init_header(di, &kdh, KERNELDUMPMAGIC, KERNELDUMP_AMD64_VERSION, dumpsize); error = dump_start(di, &kdh); if (error != 0) goto fail; printf("Dumping %llu out of %ju MB:", (long long)dumpsize >> 20, ptoa((uintmax_t)physmem) / 1048576); /* Dump my header */ bzero(&fakepd, sizeof(fakepd)); bcopy(&mdhdr, &fakepd, sizeof(mdhdr)); error = blk_write(di, (char *)&fakepd, 0, PAGE_SIZE); if (error) goto fail; /* Dump msgbuf up front */ error = blk_write(di, (char *)msgbufp->msg_ptr, 0, round_page(msgbufp->msg_size)); if (error) goto fail; /* Dump dump_avail */ _Static_assert(sizeof(dump_avail) <= sizeof(fakepd), "Large dump_avail not handled"); bzero(&fakepd, sizeof(fakepd)); memcpy(fakepd, dump_avail, sizeof(dump_avail)); error = blk_write(di, (char *)fakepd, 0, PAGE_SIZE); if (error) goto fail; /* Dump bitmap */ error = blk_write(di, (char *)vm_page_dump, 0, round_page(BITSET_SIZE(vm_page_dump_pages))); if (error) goto fail; /* Dump kernel page directory pages */ bzero(fakepd, sizeof(fakepd)); for (va = VM_MIN_KERNEL_ADDRESS; va < MAX(KERNBASE + nkpt * NBPDR, kernel_vm_end); va += NBPDP) { ii = pmap_pml4e_index(va); pml4 = (uint64_t *)PHYS_TO_DMAP(KPML4phys) + ii; pdp = (uint64_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME); i = pmap_pdpe_index(va); /* We always write a page, even if it is zero */ if ((pdp[i] & PG_V) == 0) { error = blk_write(di, (char *)&fakepd, 0, PAGE_SIZE); if (error) goto fail; /* flush, in case we reuse fakepd in the same block */ error = blk_flush(di); if (error) goto fail; continue; } /* 1GB page is represented as 512 2MB pages in a dump */ if ((pdp[i] & PG_PS) != 0) { /* PDPE and PDP have identical layout in this case */ fakepd[0] = pdp[i]; for (j = 1; j < NPDEPG; j++) fakepd[j] = fakepd[j - 1] + NBPDR; error = blk_write(di, (char *)&fakepd, 0, PAGE_SIZE); if (error) goto fail; /* flush, in case we reuse fakepd in the same block */ error = blk_flush(di); if (error) goto fail; bzero(fakepd, sizeof(fakepd)); continue; } pd = (uint64_t *)PHYS_TO_DMAP(pdp[i] & PG_FRAME); error = blk_write(di, (char *)pd, 0, PAGE_SIZE); if (error) goto fail; error = blk_flush(di); if (error) goto fail; } /* Dump memory chunks */ VM_PAGE_DUMP_FOREACH(pa) { error = blk_write(di, 0, pa, PAGE_SIZE); if (error) goto fail; } error = blk_flush(di); if (error) goto fail; error = dump_finish(di, &kdh); if (error != 0) goto fail; printf("\nDump complete\n"); return (0); fail: if (error < 0) error = -error; printf("\n"); if (error == ENOSPC) { printf("Dump map grown while dumping. "); if (retry_count < dump_retry_count) { printf("Retrying...\n"); goto retry; } printf("Dump failed.\n"); } else if (error == ECANCELED) printf("Dump aborted\n"); else if (error == E2BIG) { printf("Dump failed. Partition too small (about %lluMB were " "needed this time).\n", (long long)dumpsize >> 20); } else printf("** DUMP FAILED (ERROR %d) **\n", error); return (error); } Index: head/sys/amd64/amd64/pmap.c =================================================================== --- head/sys/amd64/amd64/pmap.c (revision 366710) +++ head/sys/amd64/amd64/pmap.c (revision 366711) @@ -1,11427 +1,11428 @@ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. * Copyright (c) 1994 David Greenman * All rights reserved. * Copyright (c) 2003 Peter Wemm * All rights reserved. * Copyright (c) 2005-2010 Alan L. Cox * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department and William Jolitz of UUNET Technologies Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 */ /*- * Copyright (c) 2003 Networks Associates Technology, Inc. * Copyright (c) 2014-2020 The FreeBSD Foundation * All rights reserved. * * This software was developed for the FreeBSD Project by Jake Burkholder, * Safeport Network Services, and Network Associates Laboratories, the * Security Research Division of Network Associates, Inc. under * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA * CHATS research program. * * Portions of this software were developed by * Konstantin Belousov under sponsorship from * the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #define AMD64_NPT_AWARE #include __FBSDID("$FreeBSD$"); /* * Manages physical address maps. * * Since the information managed by this module is * also stored by the logical address mapping module, * this module may throw away valid virtual-to-physical * mappings at almost any time. However, invalidations * of virtual-to-physical mappings must be done as * requested. * * In order to cope with hardware architectures which * make virtual-to-physical map invalidates expensive, * this module may delay invalidate or reduced protection * operations until such time as they are actually * necessary. This module is given full information as * to which processors are currently using which maps, * and to when physical maps must be made correct. */ #include "opt_ddb.h" #include "opt_pmap.h" #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #include #endif #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #include #include #include #include #include #ifdef SMP #include #endif #include #include #ifdef NUMA #define PMAP_MEMDOM MAXMEMDOM #else #define PMAP_MEMDOM 1 #endif static __inline boolean_t pmap_type_guest(pmap_t pmap) { return ((pmap->pm_type == PT_EPT) || (pmap->pm_type == PT_RVI)); } static __inline boolean_t pmap_emulate_ad_bits(pmap_t pmap) { return ((pmap->pm_flags & PMAP_EMULATE_AD_BITS) != 0); } static __inline pt_entry_t pmap_valid_bit(pmap_t pmap) { pt_entry_t mask; switch (pmap->pm_type) { case PT_X86: case PT_RVI: mask = X86_PG_V; break; case PT_EPT: if (pmap_emulate_ad_bits(pmap)) mask = EPT_PG_EMUL_V; else mask = EPT_PG_READ; break; default: panic("pmap_valid_bit: invalid pm_type %d", pmap->pm_type); } return (mask); } static __inline pt_entry_t pmap_rw_bit(pmap_t pmap) { pt_entry_t mask; switch (pmap->pm_type) { case PT_X86: case PT_RVI: mask = X86_PG_RW; break; case PT_EPT: if (pmap_emulate_ad_bits(pmap)) mask = EPT_PG_EMUL_RW; else mask = EPT_PG_WRITE; break; default: panic("pmap_rw_bit: invalid pm_type %d", pmap->pm_type); } return (mask); } static pt_entry_t pg_g; static __inline pt_entry_t pmap_global_bit(pmap_t pmap) { pt_entry_t mask; switch (pmap->pm_type) { case PT_X86: mask = pg_g; break; case PT_RVI: case PT_EPT: mask = 0; break; default: panic("pmap_global_bit: invalid pm_type %d", pmap->pm_type); } return (mask); } static __inline pt_entry_t pmap_accessed_bit(pmap_t pmap) { pt_entry_t mask; switch (pmap->pm_type) { case PT_X86: case PT_RVI: mask = X86_PG_A; break; case PT_EPT: if (pmap_emulate_ad_bits(pmap)) mask = EPT_PG_READ; else mask = EPT_PG_A; break; default: panic("pmap_accessed_bit: invalid pm_type %d", pmap->pm_type); } return (mask); } static __inline pt_entry_t pmap_modified_bit(pmap_t pmap) { pt_entry_t mask; switch (pmap->pm_type) { case PT_X86: case PT_RVI: mask = X86_PG_M; break; case PT_EPT: if (pmap_emulate_ad_bits(pmap)) mask = EPT_PG_WRITE; else mask = EPT_PG_M; break; default: panic("pmap_modified_bit: invalid pm_type %d", pmap->pm_type); } return (mask); } static __inline pt_entry_t pmap_pku_mask_bit(pmap_t pmap) { return (pmap->pm_type == PT_X86 ? X86_PG_PKU_MASK : 0); } #if !defined(DIAGNOSTIC) #ifdef __GNUC_GNU_INLINE__ #define PMAP_INLINE __attribute__((__gnu_inline__)) inline #else #define PMAP_INLINE extern inline #endif #else #define PMAP_INLINE #endif #ifdef PV_STATS #define PV_STAT(x) do { x ; } while (0) #else #define PV_STAT(x) do { } while (0) #endif #undef pa_index #ifdef NUMA #define pa_index(pa) ({ \ KASSERT((pa) <= vm_phys_segs[vm_phys_nsegs - 1].end, \ ("address %lx beyond the last segment", (pa))); \ (pa) >> PDRSHIFT; \ }) #define pa_to_pmdp(pa) (&pv_table[pa_index(pa)]) #define pa_to_pvh(pa) (&(pa_to_pmdp(pa)->pv_page)) #define PHYS_TO_PV_LIST_LOCK(pa) ({ \ struct rwlock *_lock; \ if (__predict_false((pa) > pmap_last_pa)) \ _lock = &pv_dummy_large.pv_lock; \ else \ _lock = &(pa_to_pmdp(pa)->pv_lock); \ _lock; \ }) #else #define pa_index(pa) ((pa) >> PDRSHIFT) #define pa_to_pvh(pa) (&pv_table[pa_index(pa)]) #define NPV_LIST_LOCKS MAXCPU #define PHYS_TO_PV_LIST_LOCK(pa) \ (&pv_list_locks[pa_index(pa) % NPV_LIST_LOCKS]) #endif #define CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa) do { \ struct rwlock **_lockp = (lockp); \ struct rwlock *_new_lock; \ \ _new_lock = PHYS_TO_PV_LIST_LOCK(pa); \ if (_new_lock != *_lockp) { \ if (*_lockp != NULL) \ rw_wunlock(*_lockp); \ *_lockp = _new_lock; \ rw_wlock(*_lockp); \ } \ } while (0) #define CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m) \ CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m)) #define RELEASE_PV_LIST_LOCK(lockp) do { \ struct rwlock **_lockp = (lockp); \ \ if (*_lockp != NULL) { \ rw_wunlock(*_lockp); \ *_lockp = NULL; \ } \ } while (0) #define VM_PAGE_TO_PV_LIST_LOCK(m) \ PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m)) struct pmap kernel_pmap_store; vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ int nkpt; SYSCTL_INT(_machdep, OID_AUTO, nkpt, CTLFLAG_RD, &nkpt, 0, "Number of kernel page table pages allocated on bootup"); static int ndmpdp; vm_paddr_t dmaplimit; vm_offset_t kernel_vm_end = VM_MIN_KERNEL_ADDRESS; pt_entry_t pg_nx; static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "VM/pmap parameters"); static int pg_ps_enabled = 1; SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &pg_ps_enabled, 0, "Are large page mappings enabled?"); int __read_frequently la57 = 0; SYSCTL_INT(_vm_pmap, OID_AUTO, la57, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &la57, 0, "5-level paging for host is enabled"); static bool pmap_is_la57(pmap_t pmap) { if (pmap->pm_type == PT_X86) return (la57); return (false); /* XXXKIB handle EPT */ } #define PAT_INDEX_SIZE 8 static int pat_index[PAT_INDEX_SIZE]; /* cache mode to PAT index conversion */ static u_int64_t KPTphys; /* phys addr of kernel level 1 */ static u_int64_t KPDphys; /* phys addr of kernel level 2 */ u_int64_t KPDPphys; /* phys addr of kernel level 3 */ u_int64_t KPML4phys; /* phys addr of kernel level 4 */ u_int64_t KPML5phys; /* phys addr of kernel level 5, if supported */ static pml4_entry_t *kernel_pml4; static u_int64_t DMPDphys; /* phys addr of direct mapped level 2 */ static u_int64_t DMPDPphys; /* phys addr of direct mapped level 3 */ static int ndmpdpphys; /* number of DMPDPphys pages */ static vm_paddr_t KERNend; /* phys addr of end of bootstrap data */ /* * pmap_mapdev support pre initialization (i.e. console) */ #define PMAP_PREINIT_MAPPING_COUNT 8 static struct pmap_preinit_mapping { vm_paddr_t pa; vm_offset_t va; vm_size_t sz; int mode; } pmap_preinit_mapping[PMAP_PREINIT_MAPPING_COUNT]; static int pmap_initialized; /* * Data for the pv entry allocation mechanism. * Updates to pv_invl_gen are protected by the pv list lock but reads are not. */ #ifdef NUMA static __inline int pc_to_domain(struct pv_chunk *pc) { return (_vm_phys_domain(DMAP_TO_PHYS((vm_offset_t)pc))); } #else static __inline int pc_to_domain(struct pv_chunk *pc __unused) { return (0); } #endif struct pv_chunks_list { struct mtx pvc_lock; TAILQ_HEAD(pch, pv_chunk) pvc_list; int active_reclaims; } __aligned(CACHE_LINE_SIZE); struct pv_chunks_list __exclusive_cache_line pv_chunks[PMAP_MEMDOM]; #ifdef NUMA struct pmap_large_md_page { struct rwlock pv_lock; struct md_page pv_page; u_long pv_invl_gen; }; __exclusive_cache_line static struct pmap_large_md_page pv_dummy_large; #define pv_dummy pv_dummy_large.pv_page __read_mostly static struct pmap_large_md_page *pv_table; __read_mostly vm_paddr_t pmap_last_pa; #else static struct rwlock __exclusive_cache_line pv_list_locks[NPV_LIST_LOCKS]; static u_long pv_invl_gen[NPV_LIST_LOCKS]; static struct md_page *pv_table; static struct md_page pv_dummy; #endif /* * All those kernel PT submaps that BSD is so fond of */ pt_entry_t *CMAP1 = NULL; caddr_t CADDR1 = 0; static vm_offset_t qframe = 0; static struct mtx qframe_mtx; static int pmap_flags = PMAP_PDE_SUPERPAGE; /* flags for x86 pmaps */ static vmem_t *large_vmem; static u_int lm_ents; #define PMAP_ADDRESS_IN_LARGEMAP(va) ((va) >= LARGEMAP_MIN_ADDRESS && \ (va) < LARGEMAP_MIN_ADDRESS + NBPML4 * (u_long)lm_ents) int pmap_pcid_enabled = 1; SYSCTL_INT(_vm_pmap, OID_AUTO, pcid_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &pmap_pcid_enabled, 0, "Is TLB Context ID enabled ?"); int invpcid_works = 0; SYSCTL_INT(_vm_pmap, OID_AUTO, invpcid_works, CTLFLAG_RD, &invpcid_works, 0, "Is the invpcid instruction available ?"); int __read_frequently pti = 0; SYSCTL_INT(_vm_pmap, OID_AUTO, pti, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &pti, 0, "Page Table Isolation enabled"); static vm_object_t pti_obj; static pml4_entry_t *pti_pml4; static vm_pindex_t pti_pg_idx; static bool pti_finalized; struct pmap_pkru_range { struct rs_el pkru_rs_el; u_int pkru_keyidx; int pkru_flags; }; static uma_zone_t pmap_pkru_ranges_zone; static bool pmap_pkru_same(pmap_t pmap, vm_offset_t sva, vm_offset_t eva); static pt_entry_t pmap_pkru_get(pmap_t pmap, vm_offset_t va); static void pmap_pkru_on_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva); static void *pkru_dup_range(void *ctx, void *data); static void pkru_free_range(void *ctx, void *node); static int pmap_pkru_copy(pmap_t dst_pmap, pmap_t src_pmap); static int pmap_pkru_deassign(pmap_t pmap, vm_offset_t sva, vm_offset_t eva); static void pmap_pkru_deassign_all(pmap_t pmap); static int pmap_pcid_save_cnt_proc(SYSCTL_HANDLER_ARGS) { int i; uint64_t res; res = 0; CPU_FOREACH(i) { res += cpuid_to_pcpu[i]->pc_pm_save_cnt; } return (sysctl_handle_64(oidp, &res, 0, req)); } SYSCTL_PROC(_vm_pmap, OID_AUTO, pcid_save_cnt, CTLTYPE_U64 | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, pmap_pcid_save_cnt_proc, "QU", "Count of saved TLB context on switch"); static LIST_HEAD(, pmap_invl_gen) pmap_invl_gen_tracker = LIST_HEAD_INITIALIZER(&pmap_invl_gen_tracker); static struct mtx invl_gen_mtx; /* Fake lock object to satisfy turnstiles interface. */ static struct lock_object invl_gen_ts = { .lo_name = "invlts", }; static struct pmap_invl_gen pmap_invl_gen_head = { .gen = 1, .next = NULL, }; static u_long pmap_invl_gen = 1; static int pmap_invl_waiters; static struct callout pmap_invl_callout; static bool pmap_invl_callout_inited; #define PMAP_ASSERT_NOT_IN_DI() \ KASSERT(pmap_not_in_di(), ("DI already started")) static bool pmap_di_locked(void) { int tun; if ((cpu_feature2 & CPUID2_CX16) == 0) return (true); tun = 0; TUNABLE_INT_FETCH("vm.pmap.di_locked", &tun); return (tun != 0); } static int sysctl_pmap_di_locked(SYSCTL_HANDLER_ARGS) { int locked; locked = pmap_di_locked(); return (sysctl_handle_int(oidp, &locked, 0, req)); } SYSCTL_PROC(_vm_pmap, OID_AUTO, di_locked, CTLTYPE_INT | CTLFLAG_RDTUN | CTLFLAG_MPSAFE, 0, 0, sysctl_pmap_di_locked, "", "Locked delayed invalidation"); static bool pmap_not_in_di_l(void); static bool pmap_not_in_di_u(void); DEFINE_IFUNC(, bool, pmap_not_in_di, (void)) { return (pmap_di_locked() ? pmap_not_in_di_l : pmap_not_in_di_u); } static bool pmap_not_in_di_l(void) { struct pmap_invl_gen *invl_gen; invl_gen = &curthread->td_md.md_invl_gen; return (invl_gen->gen == 0); } static void pmap_thread_init_invl_gen_l(struct thread *td) { struct pmap_invl_gen *invl_gen; invl_gen = &td->td_md.md_invl_gen; invl_gen->gen = 0; } static void pmap_delayed_invl_wait_block(u_long *m_gen, u_long *invl_gen) { struct turnstile *ts; ts = turnstile_trywait(&invl_gen_ts); if (*m_gen > atomic_load_long(invl_gen)) turnstile_wait(ts, NULL, TS_SHARED_QUEUE); else turnstile_cancel(ts); } static void pmap_delayed_invl_finish_unblock(u_long new_gen) { struct turnstile *ts; turnstile_chain_lock(&invl_gen_ts); ts = turnstile_lookup(&invl_gen_ts); if (new_gen != 0) pmap_invl_gen = new_gen; if (ts != NULL) { turnstile_broadcast(ts, TS_SHARED_QUEUE); turnstile_unpend(ts); } turnstile_chain_unlock(&invl_gen_ts); } /* * Start a new Delayed Invalidation (DI) block of code, executed by * the current thread. Within a DI block, the current thread may * destroy both the page table and PV list entries for a mapping and * then release the corresponding PV list lock before ensuring that * the mapping is flushed from the TLBs of any processors with the * pmap active. */ static void pmap_delayed_invl_start_l(void) { struct pmap_invl_gen *invl_gen; u_long currgen; invl_gen = &curthread->td_md.md_invl_gen; PMAP_ASSERT_NOT_IN_DI(); mtx_lock(&invl_gen_mtx); if (LIST_EMPTY(&pmap_invl_gen_tracker)) currgen = pmap_invl_gen; else currgen = LIST_FIRST(&pmap_invl_gen_tracker)->gen; invl_gen->gen = currgen + 1; LIST_INSERT_HEAD(&pmap_invl_gen_tracker, invl_gen, link); mtx_unlock(&invl_gen_mtx); } /* * Finish the DI block, previously started by the current thread. All * required TLB flushes for the pages marked by * pmap_delayed_invl_page() must be finished before this function is * called. * * This function works by bumping the global DI generation number to * the generation number of the current thread's DI, unless there is a * pending DI that started earlier. In the latter case, bumping the * global DI generation number would incorrectly signal that the * earlier DI had finished. Instead, this function bumps the earlier * DI's generation number to match the generation number of the * current thread's DI. */ static void pmap_delayed_invl_finish_l(void) { struct pmap_invl_gen *invl_gen, *next; invl_gen = &curthread->td_md.md_invl_gen; KASSERT(invl_gen->gen != 0, ("missed invl_start")); mtx_lock(&invl_gen_mtx); next = LIST_NEXT(invl_gen, link); if (next == NULL) pmap_delayed_invl_finish_unblock(invl_gen->gen); else next->gen = invl_gen->gen; LIST_REMOVE(invl_gen, link); mtx_unlock(&invl_gen_mtx); invl_gen->gen = 0; } static bool pmap_not_in_di_u(void) { struct pmap_invl_gen *invl_gen; invl_gen = &curthread->td_md.md_invl_gen; return (((uintptr_t)invl_gen->next & PMAP_INVL_GEN_NEXT_INVALID) != 0); } static void pmap_thread_init_invl_gen_u(struct thread *td) { struct pmap_invl_gen *invl_gen; invl_gen = &td->td_md.md_invl_gen; invl_gen->gen = 0; invl_gen->next = (void *)PMAP_INVL_GEN_NEXT_INVALID; } static bool pmap_di_load_invl(struct pmap_invl_gen *ptr, struct pmap_invl_gen *out) { uint64_t new_high, new_low, old_high, old_low; char res; old_low = new_low = 0; old_high = new_high = (uintptr_t)0; __asm volatile("lock;cmpxchg16b\t%1" : "=@cce" (res), "+m" (*ptr), "+a" (old_low), "+d" (old_high) : "b"(new_low), "c" (new_high) : "memory", "cc"); if (res == 0) { if ((old_high & PMAP_INVL_GEN_NEXT_INVALID) != 0) return (false); out->gen = old_low; out->next = (void *)old_high; } else { out->gen = new_low; out->next = (void *)new_high; } return (true); } static bool pmap_di_store_invl(struct pmap_invl_gen *ptr, struct pmap_invl_gen *old_val, struct pmap_invl_gen *new_val) { uint64_t new_high, new_low, old_high, old_low; char res; new_low = new_val->gen; new_high = (uintptr_t)new_val->next; old_low = old_val->gen; old_high = (uintptr_t)old_val->next; __asm volatile("lock;cmpxchg16b\t%1" : "=@cce" (res), "+m" (*ptr), "+a" (old_low), "+d" (old_high) : "b"(new_low), "c" (new_high) : "memory", "cc"); return (res); } #ifdef PV_STATS static long invl_start_restart; SYSCTL_LONG(_vm_pmap, OID_AUTO, invl_start_restart, CTLFLAG_RD, &invl_start_restart, 0, ""); static long invl_finish_restart; SYSCTL_LONG(_vm_pmap, OID_AUTO, invl_finish_restart, CTLFLAG_RD, &invl_finish_restart, 0, ""); static int invl_max_qlen; SYSCTL_INT(_vm_pmap, OID_AUTO, invl_max_qlen, CTLFLAG_RD, &invl_max_qlen, 0, ""); #endif #define di_delay locks_delay static void pmap_delayed_invl_start_u(void) { struct pmap_invl_gen *invl_gen, *p, prev, new_prev; struct thread *td; struct lock_delay_arg lda; uintptr_t prevl; u_char pri; #ifdef PV_STATS int i, ii; #endif td = curthread; invl_gen = &td->td_md.md_invl_gen; PMAP_ASSERT_NOT_IN_DI(); lock_delay_arg_init(&lda, &di_delay); invl_gen->saved_pri = 0; pri = td->td_base_pri; if (pri > PVM) { thread_lock(td); pri = td->td_base_pri; if (pri > PVM) { invl_gen->saved_pri = pri; sched_prio(td, PVM); } thread_unlock(td); } again: PV_STAT(i = 0); for (p = &pmap_invl_gen_head;; p = prev.next) { PV_STAT(i++); prevl = (uintptr_t)atomic_load_ptr(&p->next); if ((prevl & PMAP_INVL_GEN_NEXT_INVALID) != 0) { PV_STAT(atomic_add_long(&invl_start_restart, 1)); lock_delay(&lda); goto again; } if (prevl == 0) break; prev.next = (void *)prevl; } #ifdef PV_STATS if ((ii = invl_max_qlen) < i) atomic_cmpset_int(&invl_max_qlen, ii, i); #endif if (!pmap_di_load_invl(p, &prev) || prev.next != NULL) { PV_STAT(atomic_add_long(&invl_start_restart, 1)); lock_delay(&lda); goto again; } new_prev.gen = prev.gen; new_prev.next = invl_gen; invl_gen->gen = prev.gen + 1; /* Formal fence between store to invl->gen and updating *p. */ atomic_thread_fence_rel(); /* * After inserting an invl_gen element with invalid bit set, * this thread blocks any other thread trying to enter the * delayed invalidation block. Do not allow to remove us from * the CPU, because it causes starvation for other threads. */ critical_enter(); /* * ABA for *p is not possible there, since p->gen can only * increase. So if the *p thread finished its di, then * started a new one and got inserted into the list at the * same place, its gen will appear greater than the previously * read gen. */ if (!pmap_di_store_invl(p, &prev, &new_prev)) { critical_exit(); PV_STAT(atomic_add_long(&invl_start_restart, 1)); lock_delay(&lda); goto again; } /* * There we clear PMAP_INVL_GEN_NEXT_INVALID in * invl_gen->next, allowing other threads to iterate past us. * pmap_di_store_invl() provides fence between the generation * write and the update of next. */ invl_gen->next = NULL; critical_exit(); } static bool pmap_delayed_invl_finish_u_crit(struct pmap_invl_gen *invl_gen, struct pmap_invl_gen *p) { struct pmap_invl_gen prev, new_prev; u_long mygen; /* * Load invl_gen->gen after setting invl_gen->next * PMAP_INVL_GEN_NEXT_INVALID. This prevents larger * generations to propagate to our invl_gen->gen. Lock prefix * in atomic_set_ptr() worked as seq_cst fence. */ mygen = atomic_load_long(&invl_gen->gen); if (!pmap_di_load_invl(p, &prev) || prev.next != invl_gen) return (false); KASSERT(prev.gen < mygen, ("invalid di gen sequence %lu %lu", prev.gen, mygen)); new_prev.gen = mygen; new_prev.next = (void *)((uintptr_t)invl_gen->next & ~PMAP_INVL_GEN_NEXT_INVALID); /* Formal fence between load of prev and storing update to it. */ atomic_thread_fence_rel(); return (pmap_di_store_invl(p, &prev, &new_prev)); } static void pmap_delayed_invl_finish_u(void) { struct pmap_invl_gen *invl_gen, *p; struct thread *td; struct lock_delay_arg lda; uintptr_t prevl; td = curthread; invl_gen = &td->td_md.md_invl_gen; KASSERT(invl_gen->gen != 0, ("missed invl_start: gen 0")); KASSERT(((uintptr_t)invl_gen->next & PMAP_INVL_GEN_NEXT_INVALID) == 0, ("missed invl_start: INVALID")); lock_delay_arg_init(&lda, &di_delay); again: for (p = &pmap_invl_gen_head; p != NULL; p = (void *)prevl) { prevl = (uintptr_t)atomic_load_ptr(&p->next); if ((prevl & PMAP_INVL_GEN_NEXT_INVALID) != 0) { PV_STAT(atomic_add_long(&invl_finish_restart, 1)); lock_delay(&lda); goto again; } if ((void *)prevl == invl_gen) break; } /* * It is legitimate to not find ourself on the list if a * thread before us finished its DI and started it again. */ if (__predict_false(p == NULL)) { PV_STAT(atomic_add_long(&invl_finish_restart, 1)); lock_delay(&lda); goto again; } critical_enter(); atomic_set_ptr((uintptr_t *)&invl_gen->next, PMAP_INVL_GEN_NEXT_INVALID); if (!pmap_delayed_invl_finish_u_crit(invl_gen, p)) { atomic_clear_ptr((uintptr_t *)&invl_gen->next, PMAP_INVL_GEN_NEXT_INVALID); critical_exit(); PV_STAT(atomic_add_long(&invl_finish_restart, 1)); lock_delay(&lda); goto again; } critical_exit(); if (atomic_load_int(&pmap_invl_waiters) > 0) pmap_delayed_invl_finish_unblock(0); if (invl_gen->saved_pri != 0) { thread_lock(td); sched_prio(td, invl_gen->saved_pri); thread_unlock(td); } } #ifdef DDB DB_SHOW_COMMAND(di_queue, pmap_di_queue) { struct pmap_invl_gen *p, *pn; struct thread *td; uintptr_t nextl; bool first; for (p = &pmap_invl_gen_head, first = true; p != NULL; p = pn, first = false) { nextl = (uintptr_t)atomic_load_ptr(&p->next); pn = (void *)(nextl & ~PMAP_INVL_GEN_NEXT_INVALID); td = first ? NULL : __containerof(p, struct thread, td_md.md_invl_gen); db_printf("gen %lu inv %d td %p tid %d\n", p->gen, (nextl & PMAP_INVL_GEN_NEXT_INVALID) != 0, td, td != NULL ? td->td_tid : -1); } } #endif #ifdef PV_STATS static long invl_wait; SYSCTL_LONG(_vm_pmap, OID_AUTO, invl_wait, CTLFLAG_RD, &invl_wait, 0, "Number of times DI invalidation blocked pmap_remove_all/write"); static long invl_wait_slow; SYSCTL_LONG(_vm_pmap, OID_AUTO, invl_wait_slow, CTLFLAG_RD, &invl_wait_slow, 0, "Number of slow invalidation waits for lockless DI"); #endif #ifdef NUMA static u_long * pmap_delayed_invl_genp(vm_page_t m) { vm_paddr_t pa; u_long *gen; pa = VM_PAGE_TO_PHYS(m); if (__predict_false((pa) > pmap_last_pa)) gen = &pv_dummy_large.pv_invl_gen; else gen = &(pa_to_pmdp(pa)->pv_invl_gen); return (gen); } #else static u_long * pmap_delayed_invl_genp(vm_page_t m) { return (&pv_invl_gen[pa_index(VM_PAGE_TO_PHYS(m)) % NPV_LIST_LOCKS]); } #endif static void pmap_delayed_invl_callout_func(void *arg __unused) { if (atomic_load_int(&pmap_invl_waiters) == 0) return; pmap_delayed_invl_finish_unblock(0); } static void pmap_delayed_invl_callout_init(void *arg __unused) { if (pmap_di_locked()) return; callout_init(&pmap_invl_callout, 1); pmap_invl_callout_inited = true; } SYSINIT(pmap_di_callout, SI_SUB_CPU + 1, SI_ORDER_ANY, pmap_delayed_invl_callout_init, NULL); /* * Ensure that all currently executing DI blocks, that need to flush * TLB for the given page m, actually flushed the TLB at the time the * function returned. If the page m has an empty PV list and we call * pmap_delayed_invl_wait(), upon its return we know that no CPU has a * valid mapping for the page m in either its page table or TLB. * * This function works by blocking until the global DI generation * number catches up with the generation number associated with the * given page m and its PV list. Since this function's callers * typically own an object lock and sometimes own a page lock, it * cannot sleep. Instead, it blocks on a turnstile to relinquish the * processor. */ static void pmap_delayed_invl_wait_l(vm_page_t m) { u_long *m_gen; #ifdef PV_STATS bool accounted = false; #endif m_gen = pmap_delayed_invl_genp(m); while (*m_gen > pmap_invl_gen) { #ifdef PV_STATS if (!accounted) { atomic_add_long(&invl_wait, 1); accounted = true; } #endif pmap_delayed_invl_wait_block(m_gen, &pmap_invl_gen); } } static void pmap_delayed_invl_wait_u(vm_page_t m) { u_long *m_gen; struct lock_delay_arg lda; bool fast; fast = true; m_gen = pmap_delayed_invl_genp(m); lock_delay_arg_init(&lda, &di_delay); while (*m_gen > atomic_load_long(&pmap_invl_gen_head.gen)) { if (fast || !pmap_invl_callout_inited) { PV_STAT(atomic_add_long(&invl_wait, 1)); lock_delay(&lda); fast = false; } else { /* * The page's invalidation generation number * is still below the current thread's number. * Prepare to block so that we do not waste * CPU cycles or worse, suffer livelock. * * Since it is impossible to block without * racing with pmap_delayed_invl_finish_u(), * prepare for the race by incrementing * pmap_invl_waiters and arming a 1-tick * callout which will unblock us if we lose * the race. */ atomic_add_int(&pmap_invl_waiters, 1); /* * Re-check the current thread's invalidation * generation after incrementing * pmap_invl_waiters, so that there is no race * with pmap_delayed_invl_finish_u() setting * the page generation and checking * pmap_invl_waiters. The only race allowed * is for a missed unblock, which is handled * by the callout. */ if (*m_gen > atomic_load_long(&pmap_invl_gen_head.gen)) { callout_reset(&pmap_invl_callout, 1, pmap_delayed_invl_callout_func, NULL); PV_STAT(atomic_add_long(&invl_wait_slow, 1)); pmap_delayed_invl_wait_block(m_gen, &pmap_invl_gen_head.gen); } atomic_add_int(&pmap_invl_waiters, -1); } } } DEFINE_IFUNC(, void, pmap_thread_init_invl_gen, (struct thread *)) { return (pmap_di_locked() ? pmap_thread_init_invl_gen_l : pmap_thread_init_invl_gen_u); } DEFINE_IFUNC(static, void, pmap_delayed_invl_start, (void)) { return (pmap_di_locked() ? pmap_delayed_invl_start_l : pmap_delayed_invl_start_u); } DEFINE_IFUNC(static, void, pmap_delayed_invl_finish, (void)) { return (pmap_di_locked() ? pmap_delayed_invl_finish_l : pmap_delayed_invl_finish_u); } DEFINE_IFUNC(static, void, pmap_delayed_invl_wait, (vm_page_t)) { return (pmap_di_locked() ? pmap_delayed_invl_wait_l : pmap_delayed_invl_wait_u); } /* * Mark the page m's PV list as participating in the current thread's * DI block. Any threads concurrently using m's PV list to remove or * restrict all mappings to m will wait for the current thread's DI * block to complete before proceeding. * * The function works by setting the DI generation number for m's PV * list to at least the DI generation number of the current thread. * This forces a caller of pmap_delayed_invl_wait() to block until * current thread calls pmap_delayed_invl_finish(). */ static void pmap_delayed_invl_page(vm_page_t m) { u_long gen, *m_gen; rw_assert(VM_PAGE_TO_PV_LIST_LOCK(m), RA_WLOCKED); gen = curthread->td_md.md_invl_gen.gen; if (gen == 0) return; m_gen = pmap_delayed_invl_genp(m); if (*m_gen < gen) *m_gen = gen; } /* * Crashdump maps. */ static caddr_t crashdumpmap; /* * Internal flags for pmap_enter()'s helper functions. */ #define PMAP_ENTER_NORECLAIM 0x1000000 /* Don't reclaim PV entries. */ #define PMAP_ENTER_NOREPLACE 0x2000000 /* Don't replace mappings. */ /* * Internal flags for pmap_mapdev_internal() and * pmap_change_props_locked(). */ #define MAPDEV_FLUSHCACHE 0x00000001 /* Flush cache after mapping. */ #define MAPDEV_SETATTR 0x00000002 /* Modify existing attrs. */ #define MAPDEV_ASSERTVALID 0x00000004 /* Assert mapping validity. */ TAILQ_HEAD(pv_chunklist, pv_chunk); static void free_pv_chunk(struct pv_chunk *pc); static void free_pv_chunk_batch(struct pv_chunklist *batch); static void free_pv_entry(pmap_t pmap, pv_entry_t pv); static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp); static int popcnt_pc_map_pq(uint64_t *map); static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp); static void reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp); static void pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, struct rwlock **lockp); static bool pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, pd_entry_t pde, u_int flags, struct rwlock **lockp); #if VM_NRESERVLEVEL > 0 static void pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, struct rwlock **lockp); #endif static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va); static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va); static void pmap_abort_ptp(pmap_t pmap, vm_offset_t va, vm_page_t mpte); static int pmap_change_props_locked(vm_offset_t va, vm_size_t size, vm_prot_t prot, int mode, int flags); static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va); static boolean_t pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, struct rwlock **lockp); static boolean_t pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe, vm_offset_t va); static bool pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, struct rwlock **lockp); static int pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde, u_int flags, vm_page_t m, struct rwlock **lockp); static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp); static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte); static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted); static void pmap_invalidate_cache_range_selfsnoop(vm_offset_t sva, vm_offset_t eva); static void pmap_invalidate_cache_range_all(vm_offset_t sva, vm_offset_t eva); static void pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va, pd_entry_t pde); static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode); static vm_page_t pmap_large_map_getptp_unlocked(void); static vm_paddr_t pmap_large_map_kextract(vm_offset_t va); #if VM_NRESERVLEVEL > 0 static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, struct rwlock **lockp); #endif static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot); static void pmap_pte_props(pt_entry_t *pte, u_long bits, u_long mask); static void pmap_pti_add_kva_locked(vm_offset_t sva, vm_offset_t eva, bool exec); static pdp_entry_t *pmap_pti_pdpe(vm_offset_t va); static pd_entry_t *pmap_pti_pde(vm_offset_t va); static void pmap_pti_wire_pte(void *pte); static int pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, struct spglist *free, struct rwlock **lockp); static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva, pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp); static vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va); static void pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, struct spglist *free); static bool pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, pd_entry_t *pde, struct spglist *free, struct rwlock **lockp); static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m, struct rwlock **lockp); static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde); static void pmap_update_pde_invalidate(pmap_t, vm_offset_t va, pd_entry_t pde); static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp, vm_offset_t va); static pd_entry_t *pmap_alloc_pde(pmap_t pmap, vm_offset_t va, vm_page_t *pdpgp, struct rwlock **lockp); static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, struct rwlock **lockp); static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free); static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *); /********************/ /* Inline functions */ /********************/ /* * Return a non-clipped indexes for a given VA, which are page table * pages indexes at the corresponding level. */ static __inline vm_pindex_t pmap_pde_pindex(vm_offset_t va) { return (va >> PDRSHIFT); } static __inline vm_pindex_t pmap_pdpe_pindex(vm_offset_t va) { return (NUPDE + (va >> PDPSHIFT)); } static __inline vm_pindex_t pmap_pml4e_pindex(vm_offset_t va) { return (NUPDE + NUPDPE + (va >> PML4SHIFT)); } static __inline vm_pindex_t pmap_pml5e_pindex(vm_offset_t va) { return (NUPDE + NUPDPE + NUPML4E + (va >> PML5SHIFT)); } static __inline pml4_entry_t * pmap_pml5e(pmap_t pmap, vm_offset_t va) { MPASS(pmap_is_la57(pmap)); return (&pmap->pm_pmltop[pmap_pml5e_index(va)]); } static __inline pml4_entry_t * pmap_pml5e_u(pmap_t pmap, vm_offset_t va) { MPASS(pmap_is_la57(pmap)); return (&pmap->pm_pmltopu[pmap_pml5e_index(va)]); } static __inline pml4_entry_t * pmap_pml5e_to_pml4e(pml5_entry_t *pml5e, vm_offset_t va) { pml4_entry_t *pml4e; /* XXX MPASS(pmap_is_la57(pmap); */ pml4e = (pml4_entry_t *)PHYS_TO_DMAP(*pml5e & PG_FRAME); return (&pml4e[pmap_pml4e_index(va)]); } /* Return a pointer to the PML4 slot that corresponds to a VA */ static __inline pml4_entry_t * pmap_pml4e(pmap_t pmap, vm_offset_t va) { pml5_entry_t *pml5e; pml4_entry_t *pml4e; pt_entry_t PG_V; if (pmap_is_la57(pmap)) { pml5e = pmap_pml5e(pmap, va); PG_V = pmap_valid_bit(pmap); if ((*pml5e & PG_V) == 0) return (NULL); pml4e = (pml4_entry_t *)PHYS_TO_DMAP(*pml5e & PG_FRAME); } else { pml4e = pmap->pm_pmltop; } return (&pml4e[pmap_pml4e_index(va)]); } static __inline pml4_entry_t * pmap_pml4e_u(pmap_t pmap, vm_offset_t va) { MPASS(!pmap_is_la57(pmap)); return (&pmap->pm_pmltopu[pmap_pml4e_index(va)]); } /* Return a pointer to the PDP slot that corresponds to a VA */ static __inline pdp_entry_t * pmap_pml4e_to_pdpe(pml4_entry_t *pml4e, vm_offset_t va) { pdp_entry_t *pdpe; pdpe = (pdp_entry_t *)PHYS_TO_DMAP(*pml4e & PG_FRAME); return (&pdpe[pmap_pdpe_index(va)]); } /* Return a pointer to the PDP slot that corresponds to a VA */ static __inline pdp_entry_t * pmap_pdpe(pmap_t pmap, vm_offset_t va) { pml4_entry_t *pml4e; pt_entry_t PG_V; PG_V = pmap_valid_bit(pmap); pml4e = pmap_pml4e(pmap, va); if (pml4e == NULL || (*pml4e & PG_V) == 0) return (NULL); return (pmap_pml4e_to_pdpe(pml4e, va)); } /* Return a pointer to the PD slot that corresponds to a VA */ static __inline pd_entry_t * pmap_pdpe_to_pde(pdp_entry_t *pdpe, vm_offset_t va) { pd_entry_t *pde; KASSERT((*pdpe & PG_PS) == 0, ("%s: pdpe %#lx is a leaf", __func__, *pdpe)); pde = (pd_entry_t *)PHYS_TO_DMAP(*pdpe & PG_FRAME); return (&pde[pmap_pde_index(va)]); } /* Return a pointer to the PD slot that corresponds to a VA */ static __inline pd_entry_t * pmap_pde(pmap_t pmap, vm_offset_t va) { pdp_entry_t *pdpe; pt_entry_t PG_V; PG_V = pmap_valid_bit(pmap); pdpe = pmap_pdpe(pmap, va); if (pdpe == NULL || (*pdpe & PG_V) == 0) return (NULL); KASSERT((*pdpe & PG_PS) == 0, ("pmap_pde for 1G page, pmap %p va %#lx", pmap, va)); return (pmap_pdpe_to_pde(pdpe, va)); } /* Return a pointer to the PT slot that corresponds to a VA */ static __inline pt_entry_t * pmap_pde_to_pte(pd_entry_t *pde, vm_offset_t va) { pt_entry_t *pte; KASSERT((*pde & PG_PS) == 0, ("%s: pde %#lx is a leaf", __func__, *pde)); pte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME); return (&pte[pmap_pte_index(va)]); } /* Return a pointer to the PT slot that corresponds to a VA */ static __inline pt_entry_t * pmap_pte(pmap_t pmap, vm_offset_t va) { pd_entry_t *pde; pt_entry_t PG_V; PG_V = pmap_valid_bit(pmap); pde = pmap_pde(pmap, va); if (pde == NULL || (*pde & PG_V) == 0) return (NULL); if ((*pde & PG_PS) != 0) /* compat with i386 pmap_pte() */ return ((pt_entry_t *)pde); return (pmap_pde_to_pte(pde, va)); } static __inline void pmap_resident_count_inc(pmap_t pmap, int count) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); pmap->pm_stats.resident_count += count; } static __inline void pmap_resident_count_dec(pmap_t pmap, int count) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT(pmap->pm_stats.resident_count >= count, ("pmap %p resident count underflow %ld %d", pmap, pmap->pm_stats.resident_count, count)); pmap->pm_stats.resident_count -= count; } PMAP_INLINE pt_entry_t * vtopte(vm_offset_t va) { u_int64_t mask; KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopte on a uva/gpa 0x%0lx", va)); if (la57) { mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT + NPML5EPGSHIFT)) - 1); return (P5Tmap + ((va >> PAGE_SHIFT) & mask)); } else { mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1); return (P4Tmap + ((va >> PAGE_SHIFT) & mask)); } } static __inline pd_entry_t * vtopde(vm_offset_t va) { u_int64_t mask; KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopde on a uva/gpa 0x%0lx", va)); if (la57) { mask = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT + NPML5EPGSHIFT)) - 1); return (P5Dmap + ((va >> PDRSHIFT) & mask)); } else { mask = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1); return (P4Dmap + ((va >> PDRSHIFT) & mask)); } } static u_int64_t allocpages(vm_paddr_t *firstaddr, int n) { u_int64_t ret; ret = *firstaddr; bzero((void *)ret, n * PAGE_SIZE); *firstaddr += n * PAGE_SIZE; return (ret); } CTASSERT(powerof2(NDMPML4E)); /* number of kernel PDP slots */ #define NKPDPE(ptpgs) howmany(ptpgs, NPDEPG) static void nkpt_init(vm_paddr_t addr) { int pt_pages; #ifdef NKPT pt_pages = NKPT; #else pt_pages = howmany(addr, 1 << PDRSHIFT); pt_pages += NKPDPE(pt_pages); /* * Add some slop beyond the bare minimum required for bootstrapping * the kernel. * * This is quite important when allocating KVA for kernel modules. * The modules are required to be linked in the negative 2GB of * the address space. If we run out of KVA in this region then * pmap_growkernel() will need to allocate page table pages to map * the entire 512GB of KVA space which is an unnecessary tax on * physical memory. * * Secondly, device memory mapped as part of setting up the low- * level console(s) is taken from KVA, starting at virtual_avail. * This is because cninit() is called after pmap_bootstrap() but * before vm_init() and pmap_init(). 20MB for a frame buffer is * not uncommon. */ pt_pages += 32; /* 64MB additional slop. */ #endif nkpt = pt_pages; } /* * Returns the proper write/execute permission for a physical page that is * part of the initial boot allocations. * * If the page has kernel text, it is marked as read-only. If the page has * kernel read-only data, it is marked as read-only/not-executable. If the * page has only read-write data, it is marked as read-write/not-executable. * If the page is below/above the kernel range, it is marked as read-write. * * This function operates on 2M pages, since we map the kernel space that * way. */ static inline pt_entry_t bootaddr_rwx(vm_paddr_t pa) { /* * The kernel is loaded at a 2MB-aligned address, and memory below that * need not be executable. The .bss section is padded to a 2MB * boundary, so memory following the kernel need not be executable * either. Preloaded kernel modules have their mapping permissions * fixed up by the linker. */ if (pa < trunc_2mpage(btext - KERNBASE) || pa >= trunc_2mpage(_end - KERNBASE)) return (X86_PG_RW | pg_nx); /* * The linker should ensure that the read-only and read-write * portions don't share the same 2M page, so this shouldn't * impact read-only data. However, in any case, any page with * read-write data needs to be read-write. */ if (pa >= trunc_2mpage(brwsection - KERNBASE)) return (X86_PG_RW | pg_nx); /* * Mark any 2M page containing kernel text as read-only. Mark * other pages with read-only data as read-only and not executable. * (It is likely a small portion of the read-only data section will * be marked as read-only, but executable. This should be acceptable * since the read-only protection will keep the data from changing.) * Note that fixups to the .text section will still work until we * set CR0.WP. */ if (pa < round_2mpage(etext - KERNBASE)) return (0); return (pg_nx); } static void create_pagetables(vm_paddr_t *firstaddr) { int i, j, ndm1g, nkpdpe, nkdmpde; pd_entry_t *pd_p; pdp_entry_t *pdp_p; pml4_entry_t *p4_p; uint64_t DMPDkernphys; /* Allocate page table pages for the direct map */ ndmpdp = howmany(ptoa(Maxmem), NBPDP); if (ndmpdp < 4) /* Minimum 4GB of dirmap */ ndmpdp = 4; ndmpdpphys = howmany(ndmpdp, NPDPEPG); if (ndmpdpphys > NDMPML4E) { /* * Each NDMPML4E allows 512 GB, so limit to that, * and then readjust ndmpdp and ndmpdpphys. */ printf("NDMPML4E limits system to %d GB\n", NDMPML4E * 512); Maxmem = atop(NDMPML4E * NBPML4); ndmpdpphys = NDMPML4E; ndmpdp = NDMPML4E * NPDEPG; } DMPDPphys = allocpages(firstaddr, ndmpdpphys); ndm1g = 0; if ((amd_feature & AMDID_PAGE1GB) != 0) { /* * Calculate the number of 1G pages that will fully fit in * Maxmem. */ ndm1g = ptoa(Maxmem) >> PDPSHIFT; /* * Allocate 2M pages for the kernel. These will be used in * place of the first one or more 1G pages from ndm1g. */ nkdmpde = howmany((vm_offset_t)(brwsection - KERNBASE), NBPDP); DMPDkernphys = allocpages(firstaddr, nkdmpde); } if (ndm1g < ndmpdp) DMPDphys = allocpages(firstaddr, ndmpdp - ndm1g); dmaplimit = (vm_paddr_t)ndmpdp << PDPSHIFT; /* Allocate pages */ KPML4phys = allocpages(firstaddr, 1); KPDPphys = allocpages(firstaddr, NKPML4E); /* * Allocate the initial number of kernel page table pages required to * bootstrap. We defer this until after all memory-size dependent * allocations are done (e.g. direct map), so that we don't have to * build in too much slop in our estimate. * * Note that when NKPML4E > 1, we have an empty page underneath * all but the KPML4I'th one, so we need NKPML4E-1 extra (zeroed) * pages. (pmap_enter requires a PD page to exist for each KPML4E.) */ nkpt_init(*firstaddr); nkpdpe = NKPDPE(nkpt); KPTphys = allocpages(firstaddr, nkpt); KPDphys = allocpages(firstaddr, nkpdpe); /* * Connect the zero-filled PT pages to their PD entries. This * implicitly maps the PT pages at their correct locations within * the PTmap. */ pd_p = (pd_entry_t *)KPDphys; for (i = 0; i < nkpt; i++) pd_p[i] = (KPTphys + ptoa(i)) | X86_PG_RW | X86_PG_V; /* * Map from physical address zero to the end of loader preallocated * memory using 2MB pages. This replaces some of the PD entries * created above. */ for (i = 0; (i << PDRSHIFT) < KERNend; i++) /* Preset PG_M and PG_A because demotion expects it. */ pd_p[i] = (i << PDRSHIFT) | X86_PG_V | PG_PS | pg_g | X86_PG_M | X86_PG_A | bootaddr_rwx(i << PDRSHIFT); /* * Because we map the physical blocks in 2M pages, adjust firstaddr * to record the physical blocks we've actually mapped into kernel * virtual address space. */ if (*firstaddr < round_2mpage(KERNend)) *firstaddr = round_2mpage(KERNend); /* And connect up the PD to the PDP (leaving room for L4 pages) */ pdp_p = (pdp_entry_t *)(KPDPphys + ptoa(KPML4I - KPML4BASE)); for (i = 0; i < nkpdpe; i++) pdp_p[i + KPDPI] = (KPDphys + ptoa(i)) | X86_PG_RW | X86_PG_V; /* * Now, set up the direct map region using 2MB and/or 1GB pages. If * the end of physical memory is not aligned to a 1GB page boundary, * then the residual physical memory is mapped with 2MB pages. Later, * if pmap_mapdev{_attr}() uses the direct map for non-write-back * memory, pmap_change_attr() will demote any 2MB or 1GB page mappings * that are partially used. */ pd_p = (pd_entry_t *)DMPDphys; for (i = NPDEPG * ndm1g, j = 0; i < NPDEPG * ndmpdp; i++, j++) { pd_p[j] = (vm_paddr_t)i << PDRSHIFT; /* Preset PG_M and PG_A because demotion expects it. */ pd_p[j] |= X86_PG_RW | X86_PG_V | PG_PS | pg_g | X86_PG_M | X86_PG_A | pg_nx; } pdp_p = (pdp_entry_t *)DMPDPphys; for (i = 0; i < ndm1g; i++) { pdp_p[i] = (vm_paddr_t)i << PDPSHIFT; /* Preset PG_M and PG_A because demotion expects it. */ pdp_p[i] |= X86_PG_RW | X86_PG_V | PG_PS | pg_g | X86_PG_M | X86_PG_A | pg_nx; } for (j = 0; i < ndmpdp; i++, j++) { pdp_p[i] = DMPDphys + ptoa(j); pdp_p[i] |= X86_PG_RW | X86_PG_V | pg_nx; } /* * Instead of using a 1G page for the memory containing the kernel, * use 2M pages with read-only and no-execute permissions. (If using 1G * pages, this will partially overwrite the PDPEs above.) */ if (ndm1g) { pd_p = (pd_entry_t *)DMPDkernphys; for (i = 0; i < (NPDEPG * nkdmpde); i++) pd_p[i] = (i << PDRSHIFT) | X86_PG_V | PG_PS | pg_g | X86_PG_M | X86_PG_A | pg_nx | bootaddr_rwx(i << PDRSHIFT); for (i = 0; i < nkdmpde; i++) pdp_p[i] = (DMPDkernphys + ptoa(i)) | X86_PG_RW | X86_PG_V | pg_nx; } /* And recursively map PML4 to itself in order to get PTmap */ p4_p = (pml4_entry_t *)KPML4phys; p4_p[PML4PML4I] = KPML4phys; p4_p[PML4PML4I] |= X86_PG_RW | X86_PG_V | pg_nx; /* Connect the Direct Map slot(s) up to the PML4. */ for (i = 0; i < ndmpdpphys; i++) { p4_p[DMPML4I + i] = DMPDPphys + ptoa(i); p4_p[DMPML4I + i] |= X86_PG_RW | X86_PG_V | pg_nx; } /* Connect the KVA slots up to the PML4 */ for (i = 0; i < NKPML4E; i++) { p4_p[KPML4BASE + i] = KPDPphys + ptoa(i); p4_p[KPML4BASE + i] |= X86_PG_RW | X86_PG_V; } kernel_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(KPML4phys); } /* * Bootstrap the system enough to run with virtual memory. * * On amd64 this is called after mapping has already been enabled * and just syncs the pmap module with what has already been done. * [We can't call it easily with mapping off since the kernel is not * mapped with PA == VA, hence we would have to relocate every address * from the linked base (virtual) address "KERNBASE" to the actual * (physical) address starting relative to 0] */ void pmap_bootstrap(vm_paddr_t *firstaddr) { vm_offset_t va; pt_entry_t *pte, *pcpu_pte; struct region_descriptor r_gdt; uint64_t cr4, pcpu_phys; u_long res; int i; KERNend = *firstaddr; res = atop(KERNend - (vm_paddr_t)kernphys); if (!pti) pg_g = X86_PG_G; /* * Create an initial set of page tables to run the kernel in. */ create_pagetables(firstaddr); pcpu_phys = allocpages(firstaddr, MAXCPU); /* * Add a physical memory segment (vm_phys_seg) corresponding to the * preallocated kernel page table pages so that vm_page structures * representing these pages will be created. The vm_page structures * are required for promotion of the corresponding kernel virtual * addresses to superpage mappings. */ vm_phys_early_add_seg(KPTphys, KPTphys + ptoa(nkpt)); /* * Account for the virtual addresses mapped by create_pagetables(). */ virtual_avail = (vm_offset_t)KERNBASE + round_2mpage(KERNend); virtual_end = VM_MAX_KERNEL_ADDRESS; /* * Enable PG_G global pages, then switch to the kernel page * table from the bootstrap page table. After the switch, it * is possible to enable SMEP and SMAP since PG_U bits are * correct now. */ cr4 = rcr4(); cr4 |= CR4_PGE; load_cr4(cr4); load_cr3(KPML4phys); if (cpu_stdext_feature & CPUID_STDEXT_SMEP) cr4 |= CR4_SMEP; if (cpu_stdext_feature & CPUID_STDEXT_SMAP) cr4 |= CR4_SMAP; load_cr4(cr4); /* * Initialize the kernel pmap (which is statically allocated). * Count bootstrap data as being resident in case any of this data is * later unmapped (using pmap_remove()) and freed. */ PMAP_LOCK_INIT(kernel_pmap); kernel_pmap->pm_pmltop = kernel_pml4; kernel_pmap->pm_cr3 = KPML4phys; kernel_pmap->pm_ucr3 = PMAP_NO_CR3; CPU_FILL(&kernel_pmap->pm_active); /* don't allow deactivation */ TAILQ_INIT(&kernel_pmap->pm_pvchunk); kernel_pmap->pm_stats.resident_count = res; kernel_pmap->pm_flags = pmap_flags; /* * Initialize the TLB invalidations generation number lock. */ mtx_init(&invl_gen_mtx, "invlgn", NULL, MTX_DEF); /* * Reserve some special page table entries/VA space for temporary * mapping of pages. */ #define SYSMAP(c, p, v, n) \ v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n); va = virtual_avail; pte = vtopte(va); /* * Crashdump maps. The first page is reused as CMAP1 for the * memory test. */ SYSMAP(caddr_t, CMAP1, crashdumpmap, MAXDUMPPGS) CADDR1 = crashdumpmap; SYSMAP(struct pcpu *, pcpu_pte, __pcpu, MAXCPU); virtual_avail = va; for (i = 0; i < MAXCPU; i++) { pcpu_pte[i] = (pcpu_phys + ptoa(i)) | X86_PG_V | X86_PG_RW | pg_g | pg_nx | X86_PG_M | X86_PG_A; } /* * Re-initialize PCPU area for BSP after switching. * Make hardware use gdt and common_tss from the new PCPU. */ STAILQ_INIT(&cpuhead); wrmsr(MSR_GSBASE, (uint64_t)&__pcpu[0]); pcpu_init(&__pcpu[0], 0, sizeof(struct pcpu)); amd64_bsp_pcpu_init1(&__pcpu[0]); amd64_bsp_ist_init(&__pcpu[0]); __pcpu[0].pc_common_tss.tss_iobase = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE; memcpy(__pcpu[0].pc_gdt, temp_bsp_pcpu.pc_gdt, NGDT * sizeof(struct user_segment_descriptor)); gdt_segs[GPROC0_SEL].ssd_base = (uintptr_t)&__pcpu[0].pc_common_tss; ssdtosyssd(&gdt_segs[GPROC0_SEL], (struct system_segment_descriptor *)&__pcpu[0].pc_gdt[GPROC0_SEL]); r_gdt.rd_limit = NGDT * sizeof(struct user_segment_descriptor) - 1; r_gdt.rd_base = (long)__pcpu[0].pc_gdt; lgdt(&r_gdt); wrmsr(MSR_GSBASE, (uint64_t)&__pcpu[0]); ltr(GSEL(GPROC0_SEL, SEL_KPL)); __pcpu[0].pc_dynamic = temp_bsp_pcpu.pc_dynamic; __pcpu[0].pc_acpi_id = temp_bsp_pcpu.pc_acpi_id; /* * Initialize the PAT MSR. * pmap_init_pat() clears and sets CR4_PGE, which, as a * side-effect, invalidates stale PG_G TLB entries that might * have been created in our pre-boot environment. */ pmap_init_pat(); /* Initialize TLB Context Id. */ if (pmap_pcid_enabled) { for (i = 0; i < MAXCPU; i++) { kernel_pmap->pm_pcids[i].pm_pcid = PMAP_PCID_KERN; kernel_pmap->pm_pcids[i].pm_gen = 1; } /* * PMAP_PCID_KERN + 1 is used for initialization of * proc0 pmap. The pmap' pcid state might be used by * EFIRT entry before first context switch, so it * needs to be valid. */ PCPU_SET(pcid_next, PMAP_PCID_KERN + 2); PCPU_SET(pcid_gen, 1); /* * pcpu area for APs is zeroed during AP startup. * pc_pcid_next and pc_pcid_gen are initialized by AP * during pcpu setup. */ load_cr4(rcr4() | CR4_PCIDE); } } /* * Setup the PAT MSR. */ void pmap_init_pat(void) { uint64_t pat_msr; u_long cr0, cr4; int i; /* Bail if this CPU doesn't implement PAT. */ if ((cpu_feature & CPUID_PAT) == 0) panic("no PAT??"); /* Set default PAT index table. */ for (i = 0; i < PAT_INDEX_SIZE; i++) pat_index[i] = -1; pat_index[PAT_WRITE_BACK] = 0; pat_index[PAT_WRITE_THROUGH] = 1; pat_index[PAT_UNCACHEABLE] = 3; pat_index[PAT_WRITE_COMBINING] = 6; pat_index[PAT_WRITE_PROTECTED] = 5; pat_index[PAT_UNCACHED] = 2; /* * Initialize default PAT entries. * Leave the indices 0-3 at the default of WB, WT, UC-, and UC. * Program 5 and 6 as WP and WC. * * Leave 4 and 7 as WB and UC. Note that a recursive page table * mapping for a 2M page uses a PAT value with the bit 3 set due * to its overload with PG_PS. */ pat_msr = PAT_VALUE(0, PAT_WRITE_BACK) | PAT_VALUE(1, PAT_WRITE_THROUGH) | PAT_VALUE(2, PAT_UNCACHED) | PAT_VALUE(3, PAT_UNCACHEABLE) | PAT_VALUE(4, PAT_WRITE_BACK) | PAT_VALUE(5, PAT_WRITE_PROTECTED) | PAT_VALUE(6, PAT_WRITE_COMBINING) | PAT_VALUE(7, PAT_UNCACHEABLE); /* Disable PGE. */ cr4 = rcr4(); load_cr4(cr4 & ~CR4_PGE); /* Disable caches (CD = 1, NW = 0). */ cr0 = rcr0(); load_cr0((cr0 & ~CR0_NW) | CR0_CD); /* Flushes caches and TLBs. */ wbinvd(); invltlb(); /* Update PAT and index table. */ wrmsr(MSR_PAT, pat_msr); /* Flush caches and TLBs again. */ wbinvd(); invltlb(); /* Restore caches and PGE. */ load_cr0(cr0); load_cr4(cr4); } extern const char la57_trampoline[], la57_trampoline_gdt_desc[], la57_trampoline_gdt[], la57_trampoline_end[]; static void pmap_bootstrap_la57(void *arg __unused) { char *v_code; pml5_entry_t *v_pml5; pml4_entry_t *v_pml4; pdp_entry_t *v_pdp; pd_entry_t *v_pd; pt_entry_t *v_pt; vm_page_t m_code, m_pml4, m_pdp, m_pd, m_pt, m_pml5; void (*la57_tramp)(uint64_t pml5); struct region_descriptor r_gdt; if ((cpu_stdext_feature2 & CPUID_STDEXT2_LA57) == 0) return; if (!TUNABLE_INT_FETCH("vm.pmap.la57", &la57)) la57 = 1; if (!la57) return; r_gdt.rd_limit = NGDT * sizeof(struct user_segment_descriptor) - 1; r_gdt.rd_base = (long)__pcpu[0].pc_gdt; m_code = vm_page_alloc_contig(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY | VM_ALLOC_ZERO | VM_ALLOC_NOOBJ, 1, 0, (1ULL << 32), PAGE_SIZE, 0, VM_MEMATTR_DEFAULT); if ((m_code->flags & PG_ZERO) == 0) pmap_zero_page(m_code); v_code = (char *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_code)); m_pml5 = vm_page_alloc_contig(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY | VM_ALLOC_ZERO | VM_ALLOC_NOOBJ, 1, 0, (1ULL << 32), PAGE_SIZE, 0, VM_MEMATTR_DEFAULT); if ((m_pml5->flags & PG_ZERO) == 0) pmap_zero_page(m_pml5); KPML5phys = VM_PAGE_TO_PHYS(m_pml5); v_pml5 = (pml5_entry_t *)PHYS_TO_DMAP(KPML5phys); m_pml4 = vm_page_alloc_contig(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY | VM_ALLOC_ZERO | VM_ALLOC_NOOBJ, 1, 0, (1ULL << 32), PAGE_SIZE, 0, VM_MEMATTR_DEFAULT); if ((m_pml4->flags & PG_ZERO) == 0) pmap_zero_page(m_pml4); v_pml4 = (pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pml4)); m_pdp = vm_page_alloc_contig(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY | VM_ALLOC_ZERO | VM_ALLOC_NOOBJ, 1, 0, (1ULL << 32), PAGE_SIZE, 0, VM_MEMATTR_DEFAULT); if ((m_pdp->flags & PG_ZERO) == 0) pmap_zero_page(m_pdp); v_pdp = (pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pdp)); m_pd = vm_page_alloc_contig(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY | VM_ALLOC_ZERO | VM_ALLOC_NOOBJ, 1, 0, (1ULL << 32), PAGE_SIZE, 0, VM_MEMATTR_DEFAULT); if ((m_pd->flags & PG_ZERO) == 0) pmap_zero_page(m_pd); v_pd = (pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pd)); m_pt = vm_page_alloc_contig(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY | VM_ALLOC_ZERO | VM_ALLOC_NOOBJ, 1, 0, (1ULL << 32), PAGE_SIZE, 0, VM_MEMATTR_DEFAULT); if ((m_pt->flags & PG_ZERO) == 0) pmap_zero_page(m_pt); v_pt = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pt)); /* * Map m_code 1:1, it appears below 4G in KVA due to physical * address being below 4G. Since kernel KVA is in upper half, * the pml4e should be zero and free for temporary use. */ kernel_pmap->pm_pmltop[pmap_pml4e_index(VM_PAGE_TO_PHYS(m_code))] = VM_PAGE_TO_PHYS(m_pdp) | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M; v_pdp[pmap_pdpe_index(VM_PAGE_TO_PHYS(m_code))] = VM_PAGE_TO_PHYS(m_pd) | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M; v_pd[pmap_pde_index(VM_PAGE_TO_PHYS(m_code))] = VM_PAGE_TO_PHYS(m_pt) | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M; v_pt[pmap_pte_index(VM_PAGE_TO_PHYS(m_code))] = VM_PAGE_TO_PHYS(m_code) | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M; /* * Add pml5 entry at top of KVA pointing to existing pml4 table, * entering all existing kernel mappings into level 5 table. */ v_pml5[pmap_pml5e_index(UPT_MAX_ADDRESS)] = KPML4phys | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M | pg_g; /* * Add pml5 entry for 1:1 trampoline mapping after LA57 is turned on. */ v_pml5[pmap_pml5e_index(VM_PAGE_TO_PHYS(m_code))] = VM_PAGE_TO_PHYS(m_pml4) | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M; v_pml4[pmap_pml4e_index(VM_PAGE_TO_PHYS(m_code))] = VM_PAGE_TO_PHYS(m_pdp) | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M; /* * Copy and call the 48->57 trampoline, hope we return there, alive. */ bcopy(la57_trampoline, v_code, la57_trampoline_end - la57_trampoline); *(u_long *)(v_code + 2 + (la57_trampoline_gdt_desc - la57_trampoline)) = la57_trampoline_gdt - la57_trampoline + VM_PAGE_TO_PHYS(m_code); la57_tramp = (void (*)(uint64_t))VM_PAGE_TO_PHYS(m_code); la57_tramp(KPML5phys); /* * gdt was necessary reset, switch back to our gdt. */ lgdt(&r_gdt); wrmsr(MSR_GSBASE, (uint64_t)&__pcpu[0]); load_ds(_udatasel); load_es(_udatasel); load_fs(_ufssel); ssdtosyssd(&gdt_segs[GPROC0_SEL], (struct system_segment_descriptor *)&__pcpu[0].pc_gdt[GPROC0_SEL]); ltr(GSEL(GPROC0_SEL, SEL_KPL)); /* * Now unmap the trampoline, and free the pages. * Clear pml5 entry used for 1:1 trampoline mapping. */ pte_clear(&v_pml5[pmap_pml5e_index(VM_PAGE_TO_PHYS(m_code))]); invlpg((vm_offset_t)v_code); vm_page_free(m_code); vm_page_free(m_pdp); vm_page_free(m_pd); vm_page_free(m_pt); /* * Recursively map PML5 to itself in order to get PTmap and * PDmap. */ v_pml5[PML5PML5I] = KPML5phys | X86_PG_RW | X86_PG_V | pg_nx; kernel_pmap->pm_cr3 = KPML5phys; kernel_pmap->pm_pmltop = v_pml5; } SYSINIT(la57, SI_SUB_KMEM, SI_ORDER_ANY, pmap_bootstrap_la57, NULL); /* * Initialize a vm_page's machine-dependent fields. */ void pmap_page_init(vm_page_t m) { TAILQ_INIT(&m->md.pv_list); m->md.pat_mode = PAT_WRITE_BACK; } static int pmap_allow_2m_x_ept; SYSCTL_INT(_vm_pmap, OID_AUTO, allow_2m_x_ept, CTLFLAG_RWTUN | CTLFLAG_NOFETCH, &pmap_allow_2m_x_ept, 0, "Allow executable superpage mappings in EPT"); void pmap_allow_2m_x_ept_recalculate(void) { /* * SKL002, SKL012S. Since the EPT format is only used by * Intel CPUs, the vendor check is merely a formality. */ if (!(cpu_vendor_id != CPU_VENDOR_INTEL || (cpu_ia32_arch_caps & IA32_ARCH_CAP_IF_PSCHANGE_MC_NO) != 0 || (CPUID_TO_FAMILY(cpu_id) == 0x6 && (CPUID_TO_MODEL(cpu_id) == 0x26 || /* Atoms */ CPUID_TO_MODEL(cpu_id) == 0x27 || CPUID_TO_MODEL(cpu_id) == 0x35 || CPUID_TO_MODEL(cpu_id) == 0x36 || CPUID_TO_MODEL(cpu_id) == 0x37 || CPUID_TO_MODEL(cpu_id) == 0x86 || CPUID_TO_MODEL(cpu_id) == 0x1c || CPUID_TO_MODEL(cpu_id) == 0x4a || CPUID_TO_MODEL(cpu_id) == 0x4c || CPUID_TO_MODEL(cpu_id) == 0x4d || CPUID_TO_MODEL(cpu_id) == 0x5a || CPUID_TO_MODEL(cpu_id) == 0x5c || CPUID_TO_MODEL(cpu_id) == 0x5d || CPUID_TO_MODEL(cpu_id) == 0x5f || CPUID_TO_MODEL(cpu_id) == 0x6e || CPUID_TO_MODEL(cpu_id) == 0x7a || CPUID_TO_MODEL(cpu_id) == 0x57 || /* Knights */ CPUID_TO_MODEL(cpu_id) == 0x85)))) pmap_allow_2m_x_ept = 1; TUNABLE_INT_FETCH("hw.allow_2m_x_ept", &pmap_allow_2m_x_ept); } static bool pmap_allow_2m_x_page(pmap_t pmap, bool executable) { return (pmap->pm_type != PT_EPT || !executable || !pmap_allow_2m_x_ept); } #ifdef NUMA static void pmap_init_pv_table(void) { struct pmap_large_md_page *pvd; vm_size_t s; long start, end, highest, pv_npg; int domain, i, j, pages; /* * We strongly depend on the size being a power of two, so the assert * is overzealous. However, should the struct be resized to a * different power of two, the code below needs to be revisited. */ CTASSERT((sizeof(*pvd) == 64)); /* * Calculate the size of the array. */ pmap_last_pa = vm_phys_segs[vm_phys_nsegs - 1].end; pv_npg = howmany(pmap_last_pa, NBPDR); s = (vm_size_t)pv_npg * sizeof(struct pmap_large_md_page); s = round_page(s); pv_table = (struct pmap_large_md_page *)kva_alloc(s); if (pv_table == NULL) panic("%s: kva_alloc failed\n", __func__); /* * Iterate physical segments to allocate space for respective pages. */ highest = -1; s = 0; for (i = 0; i < vm_phys_nsegs; i++) { end = vm_phys_segs[i].end / NBPDR; domain = vm_phys_segs[i].domain; if (highest >= end) continue; start = highest + 1; pvd = &pv_table[start]; pages = end - start + 1; s = round_page(pages * sizeof(*pvd)); highest = start + (s / sizeof(*pvd)) - 1; for (j = 0; j < s; j += PAGE_SIZE) { vm_page_t m = vm_page_alloc_domain(NULL, 0, domain, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ); if (m == NULL) panic("vm_page_alloc_domain failed for %lx\n", (vm_offset_t)pvd + j); pmap_qenter((vm_offset_t)pvd + j, &m, 1); } for (j = 0; j < s / sizeof(*pvd); j++) { rw_init_flags(&pvd->pv_lock, "pmap pv list", RW_NEW); TAILQ_INIT(&pvd->pv_page.pv_list); pvd->pv_page.pv_gen = 0; pvd->pv_page.pat_mode = 0; pvd->pv_invl_gen = 0; pvd++; } } pvd = &pv_dummy_large; rw_init_flags(&pvd->pv_lock, "pmap pv list dummy", RW_NEW); TAILQ_INIT(&pvd->pv_page.pv_list); pvd->pv_page.pv_gen = 0; pvd->pv_page.pat_mode = 0; pvd->pv_invl_gen = 0; } #else static void pmap_init_pv_table(void) { vm_size_t s; long i, pv_npg; /* * Initialize the pool of pv list locks. */ for (i = 0; i < NPV_LIST_LOCKS; i++) rw_init(&pv_list_locks[i], "pmap pv list"); /* * Calculate the size of the pv head table for superpages. */ pv_npg = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, NBPDR); /* * Allocate memory for the pv head table for superpages. */ s = (vm_size_t)pv_npg * sizeof(struct md_page); s = round_page(s); pv_table = (struct md_page *)kmem_malloc(s, M_WAITOK | M_ZERO); for (i = 0; i < pv_npg; i++) TAILQ_INIT(&pv_table[i].pv_list); TAILQ_INIT(&pv_dummy.pv_list); } #endif /* * Initialize the pmap module. * Called by vm_init, to initialize any structures that the pmap * system needs to map virtual memory. */ void pmap_init(void) { struct pmap_preinit_mapping *ppim; vm_page_t m, mpte; int error, i, ret, skz63; /* L1TF, reserve page @0 unconditionally */ vm_page_blacklist_add(0, bootverbose); /* Detect bare-metal Skylake Server and Skylake-X. */ if (vm_guest == VM_GUEST_NO && cpu_vendor_id == CPU_VENDOR_INTEL && CPUID_TO_FAMILY(cpu_id) == 0x6 && CPUID_TO_MODEL(cpu_id) == 0x55) { /* * Skylake-X errata SKZ63. Processor May Hang When * Executing Code In an HLE Transaction Region between * 40000000H and 403FFFFFH. * * Mark the pages in the range as preallocated. It * seems to be impossible to distinguish between * Skylake Server and Skylake X. */ skz63 = 1; TUNABLE_INT_FETCH("hw.skz63_enable", &skz63); if (skz63 != 0) { if (bootverbose) printf("SKZ63: skipping 4M RAM starting " "at physical 1G\n"); for (i = 0; i < atop(0x400000); i++) { ret = vm_page_blacklist_add(0x40000000 + ptoa(i), FALSE); if (!ret && bootverbose) printf("page at %#lx already used\n", 0x40000000 + ptoa(i)); } } } /* IFU */ pmap_allow_2m_x_ept_recalculate(); /* * Initialize the vm page array entries for the kernel pmap's * page table pages. */ PMAP_LOCK(kernel_pmap); for (i = 0; i < nkpt; i++) { mpte = PHYS_TO_VM_PAGE(KPTphys + (i << PAGE_SHIFT)); KASSERT(mpte >= vm_page_array && mpte < &vm_page_array[vm_page_array_size], ("pmap_init: page table page is out of range")); mpte->pindex = pmap_pde_pindex(KERNBASE) + i; mpte->phys_addr = KPTphys + (i << PAGE_SHIFT); mpte->ref_count = 1; /* * Collect the page table pages that were replaced by a 2MB * page in create_pagetables(). They are zero filled. */ if ((vm_paddr_t)i << PDRSHIFT < KERNend && pmap_insert_pt_page(kernel_pmap, mpte, false)) panic("pmap_init: pmap_insert_pt_page failed"); } PMAP_UNLOCK(kernel_pmap); vm_wire_add(nkpt); /* * If the kernel is running on a virtual machine, then it must assume * that MCA is enabled by the hypervisor. Moreover, the kernel must * be prepared for the hypervisor changing the vendor and family that * are reported by CPUID. Consequently, the workaround for AMD Family * 10h Erratum 383 is enabled if the processor's feature set does not * include at least one feature that is only supported by older Intel * or newer AMD processors. */ if (vm_guest != VM_GUEST_NO && (cpu_feature & CPUID_SS) == 0 && (cpu_feature2 & (CPUID2_SSSE3 | CPUID2_SSE41 | CPUID2_AESNI | CPUID2_AVX | CPUID2_XSAVE)) == 0 && (amd_feature2 & (AMDID2_XOP | AMDID2_FMA4)) == 0) workaround_erratum383 = 1; /* * Are large page mappings enabled? */ TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled); if (pg_ps_enabled) { KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0, ("pmap_init: can't assign to pagesizes[1]")); pagesizes[1] = NBPDR; if ((amd_feature & AMDID_PAGE1GB) != 0) { KASSERT(MAXPAGESIZES > 2 && pagesizes[2] == 0, ("pmap_init: can't assign to pagesizes[2]")); pagesizes[2] = NBPDP; } } /* * Initialize pv chunk lists. */ for (i = 0; i < PMAP_MEMDOM; i++) { mtx_init(&pv_chunks[i].pvc_lock, "pmap pv chunk list", NULL, MTX_DEF); TAILQ_INIT(&pv_chunks[i].pvc_list); } pmap_init_pv_table(); pmap_initialized = 1; for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { ppim = pmap_preinit_mapping + i; if (ppim->va == 0) continue; /* Make the direct map consistent */ if (ppim->pa < dmaplimit && ppim->pa + ppim->sz <= dmaplimit) { (void)pmap_change_attr(PHYS_TO_DMAP(ppim->pa), ppim->sz, ppim->mode); } if (!bootverbose) continue; printf("PPIM %u: PA=%#lx, VA=%#lx, size=%#lx, mode=%#x\n", i, ppim->pa, ppim->va, ppim->sz, ppim->mode); } mtx_init(&qframe_mtx, "qfrmlk", NULL, MTX_SPIN); error = vmem_alloc(kernel_arena, PAGE_SIZE, M_BESTFIT | M_WAITOK, (vmem_addr_t *)&qframe); if (error != 0) panic("qframe allocation failed"); lm_ents = 8; TUNABLE_INT_FETCH("vm.pmap.large_map_pml4_entries", &lm_ents); if (lm_ents > LMEPML4I - LMSPML4I + 1) lm_ents = LMEPML4I - LMSPML4I + 1; if (bootverbose) printf("pmap: large map %u PML4 slots (%lu GB)\n", lm_ents, (u_long)lm_ents * (NBPML4 / 1024 / 1024 / 1024)); if (lm_ents != 0) { large_vmem = vmem_create("large", LARGEMAP_MIN_ADDRESS, (vmem_size_t)lm_ents * NBPML4, PAGE_SIZE, 0, M_WAITOK); if (large_vmem == NULL) { printf("pmap: cannot create large map\n"); lm_ents = 0; } for (i = 0; i < lm_ents; i++) { m = pmap_large_map_getptp_unlocked(); /* XXXKIB la57 */ kernel_pml4[LMSPML4I + i] = X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M | pg_nx | VM_PAGE_TO_PHYS(m); } } } SYSCTL_UINT(_vm_pmap, OID_AUTO, large_map_pml4_entries, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &lm_ents, 0, "Maximum number of PML4 entries for use by large map (tunable). " "Each entry corresponds to 512GB of address space."); static SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "2MB page mapping counters"); static u_long pmap_pde_demotions; SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, demotions, CTLFLAG_RD, &pmap_pde_demotions, 0, "2MB page demotions"); static u_long pmap_pde_mappings; SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, mappings, CTLFLAG_RD, &pmap_pde_mappings, 0, "2MB page mappings"); static u_long pmap_pde_p_failures; SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, p_failures, CTLFLAG_RD, &pmap_pde_p_failures, 0, "2MB page promotion failures"); static u_long pmap_pde_promotions; SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, promotions, CTLFLAG_RD, &pmap_pde_promotions, 0, "2MB page promotions"); static SYSCTL_NODE(_vm_pmap, OID_AUTO, pdpe, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "1GB page mapping counters"); static u_long pmap_pdpe_demotions; SYSCTL_ULONG(_vm_pmap_pdpe, OID_AUTO, demotions, CTLFLAG_RD, &pmap_pdpe_demotions, 0, "1GB page demotions"); /*************************************************** * Low level helper routines..... ***************************************************/ static pt_entry_t pmap_swap_pat(pmap_t pmap, pt_entry_t entry) { int x86_pat_bits = X86_PG_PTE_PAT | X86_PG_PDE_PAT; switch (pmap->pm_type) { case PT_X86: case PT_RVI: /* Verify that both PAT bits are not set at the same time */ KASSERT((entry & x86_pat_bits) != x86_pat_bits, ("Invalid PAT bits in entry %#lx", entry)); /* Swap the PAT bits if one of them is set */ if ((entry & x86_pat_bits) != 0) entry ^= x86_pat_bits; break; case PT_EPT: /* * Nothing to do - the memory attributes are represented * the same way for regular pages and superpages. */ break; default: panic("pmap_switch_pat_bits: bad pm_type %d", pmap->pm_type); } return (entry); } boolean_t pmap_is_valid_memattr(pmap_t pmap __unused, vm_memattr_t mode) { return (mode >= 0 && mode < PAT_INDEX_SIZE && pat_index[(int)mode] >= 0); } /* * Determine the appropriate bits to set in a PTE or PDE for a specified * caching mode. */ int pmap_cache_bits(pmap_t pmap, int mode, boolean_t is_pde) { int cache_bits, pat_flag, pat_idx; if (!pmap_is_valid_memattr(pmap, mode)) panic("Unknown caching mode %d\n", mode); switch (pmap->pm_type) { case PT_X86: case PT_RVI: /* The PAT bit is different for PTE's and PDE's. */ pat_flag = is_pde ? X86_PG_PDE_PAT : X86_PG_PTE_PAT; /* Map the caching mode to a PAT index. */ pat_idx = pat_index[mode]; /* Map the 3-bit index value into the PAT, PCD, and PWT bits. */ cache_bits = 0; if (pat_idx & 0x4) cache_bits |= pat_flag; if (pat_idx & 0x2) cache_bits |= PG_NC_PCD; if (pat_idx & 0x1) cache_bits |= PG_NC_PWT; break; case PT_EPT: cache_bits = EPT_PG_IGNORE_PAT | EPT_PG_MEMORY_TYPE(mode); break; default: panic("unsupported pmap type %d", pmap->pm_type); } return (cache_bits); } static int pmap_cache_mask(pmap_t pmap, boolean_t is_pde) { int mask; switch (pmap->pm_type) { case PT_X86: case PT_RVI: mask = is_pde ? X86_PG_PDE_CACHE : X86_PG_PTE_CACHE; break; case PT_EPT: mask = EPT_PG_IGNORE_PAT | EPT_PG_MEMORY_TYPE(0x7); break; default: panic("pmap_cache_mask: invalid pm_type %d", pmap->pm_type); } return (mask); } static int pmap_pat_index(pmap_t pmap, pt_entry_t pte, bool is_pde) { int pat_flag, pat_idx; pat_idx = 0; switch (pmap->pm_type) { case PT_X86: case PT_RVI: /* The PAT bit is different for PTE's and PDE's. */ pat_flag = is_pde ? X86_PG_PDE_PAT : X86_PG_PTE_PAT; if ((pte & pat_flag) != 0) pat_idx |= 0x4; if ((pte & PG_NC_PCD) != 0) pat_idx |= 0x2; if ((pte & PG_NC_PWT) != 0) pat_idx |= 0x1; break; case PT_EPT: if ((pte & EPT_PG_IGNORE_PAT) != 0) panic("EPT PTE %#lx has no PAT memory type", pte); pat_idx = (pte & EPT_PG_MEMORY_TYPE(0x7)) >> 3; break; } /* See pmap_init_pat(). */ if (pat_idx == 4) pat_idx = 0; if (pat_idx == 7) pat_idx = 3; return (pat_idx); } bool pmap_ps_enabled(pmap_t pmap) { return (pg_ps_enabled && (pmap->pm_flags & PMAP_PDE_SUPERPAGE) != 0); } static void pmap_update_pde_store(pmap_t pmap, pd_entry_t *pde, pd_entry_t newpde) { switch (pmap->pm_type) { case PT_X86: break; case PT_RVI: case PT_EPT: /* * XXX * This is a little bogus since the generation number is * supposed to be bumped up when a region of the address * space is invalidated in the page tables. * * In this case the old PDE entry is valid but yet we want * to make sure that any mappings using the old entry are * invalidated in the TLB. * * The reason this works as expected is because we rendezvous * "all" host cpus and force any vcpu context to exit as a * side-effect. */ atomic_add_acq_long(&pmap->pm_eptgen, 1); break; default: panic("pmap_update_pde_store: bad pm_type %d", pmap->pm_type); } pde_store(pde, newpde); } /* * After changing the page size for the specified virtual address in the page * table, flush the corresponding entries from the processor's TLB. Only the * calling processor's TLB is affected. * * The calling thread must be pinned to a processor. */ static void pmap_update_pde_invalidate(pmap_t pmap, vm_offset_t va, pd_entry_t newpde) { pt_entry_t PG_G; if (pmap_type_guest(pmap)) return; KASSERT(pmap->pm_type == PT_X86, ("pmap_update_pde_invalidate: invalid type %d", pmap->pm_type)); PG_G = pmap_global_bit(pmap); if ((newpde & PG_PS) == 0) /* Demotion: flush a specific 2MB page mapping. */ invlpg(va); else if ((newpde & PG_G) == 0) /* * Promotion: flush every 4KB page mapping from the TLB * because there are too many to flush individually. */ invltlb(); else { /* * Promotion: flush every 4KB page mapping from the TLB, * including any global (PG_G) mappings. */ invltlb_glob(); } } #ifdef SMP /* * For SMP, these functions have to use the IPI mechanism for coherence. * * N.B.: Before calling any of the following TLB invalidation functions, * the calling processor must ensure that all stores updating a non- * kernel page table are globally performed. Otherwise, another * processor could cache an old, pre-update entry without being * invalidated. This can happen one of two ways: (1) The pmap becomes * active on another processor after its pm_active field is checked by * one of the following functions but before a store updating the page * table is globally performed. (2) The pmap becomes active on another * processor before its pm_active field is checked but due to * speculative loads one of the following functions stills reads the * pmap as inactive on the other processor. * * The kernel page table is exempt because its pm_active field is * immutable. The kernel page table is always active on every * processor. */ /* * Interrupt the cpus that are executing in the guest context. * This will force the vcpu to exit and the cached EPT mappings * will be invalidated by the host before the next vmresume. */ static __inline void pmap_invalidate_ept(pmap_t pmap) { int ipinum; sched_pin(); KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active), ("pmap_invalidate_ept: absurd pm_active")); /* * The TLB mappings associated with a vcpu context are not * flushed each time a different vcpu is chosen to execute. * * This is in contrast with a process's vtop mappings that * are flushed from the TLB on each context switch. * * Therefore we need to do more than just a TLB shootdown on * the active cpus in 'pmap->pm_active'. To do this we keep * track of the number of invalidations performed on this pmap. * * Each vcpu keeps a cache of this counter and compares it * just before a vmresume. If the counter is out-of-date an * invept will be done to flush stale mappings from the TLB. */ atomic_add_acq_long(&pmap->pm_eptgen, 1); /* * Force the vcpu to exit and trap back into the hypervisor. */ ipinum = pmap->pm_flags & PMAP_NESTED_IPIMASK; ipi_selected(pmap->pm_active, ipinum); sched_unpin(); } static cpuset_t pmap_invalidate_cpu_mask(pmap_t pmap) { return (pmap == kernel_pmap ? all_cpus : pmap->pm_active); } static inline void pmap_invalidate_page_pcid(pmap_t pmap, vm_offset_t va, const bool invpcid_works1) { struct invpcid_descr d; uint64_t kcr3, ucr3; uint32_t pcid; u_int cpuid, i; cpuid = PCPU_GET(cpuid); if (pmap == PCPU_GET(curpmap)) { if (pmap->pm_ucr3 != PMAP_NO_CR3 && /* * If we context-switched right after * PCPU_GET(ucr3_load_mask), we could read the * ~CR3_PCID_SAVE mask, which causes us to skip * the code below to invalidate user pages. This * is handled in pmap_activate_sw_pcid_pti() by * clearing pm_gen if ucr3_load_mask is ~CR3_PCID_SAVE. */ PCPU_GET(ucr3_load_mask) == PMAP_UCR3_NOMASK) { /* * Because pm_pcid is recalculated on a * context switch, we must disable switching. * Otherwise, we might use a stale value * below. */ critical_enter(); pcid = pmap->pm_pcids[cpuid].pm_pcid; if (invpcid_works1) { d.pcid = pcid | PMAP_PCID_USER_PT; d.pad = 0; d.addr = va; invpcid(&d, INVPCID_ADDR); } else { kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE; ucr3 = pmap->pm_ucr3 | pcid | PMAP_PCID_USER_PT | CR3_PCID_SAVE; pmap_pti_pcid_invlpg(ucr3, kcr3, va); } critical_exit(); } } else pmap->pm_pcids[cpuid].pm_gen = 0; CPU_FOREACH(i) { if (cpuid != i) pmap->pm_pcids[i].pm_gen = 0; } /* * The fence is between stores to pm_gen and the read of the * pm_active mask. We need to ensure that it is impossible * for us to miss the bit update in pm_active and * simultaneously observe a non-zero pm_gen in * pmap_activate_sw(), otherwise TLB update is missed. * Without the fence, IA32 allows such an outcome. Note that * pm_active is updated by a locked operation, which provides * the reciprocal fence. */ atomic_thread_fence_seq_cst(); } static void pmap_invalidate_page_pcid_invpcid(pmap_t pmap, vm_offset_t va) { pmap_invalidate_page_pcid(pmap, va, true); } static void pmap_invalidate_page_pcid_noinvpcid(pmap_t pmap, vm_offset_t va) { pmap_invalidate_page_pcid(pmap, va, false); } static void pmap_invalidate_page_nopcid(pmap_t pmap, vm_offset_t va) { } DEFINE_IFUNC(static, void, pmap_invalidate_page_mode, (pmap_t, vm_offset_t)) { if (pmap_pcid_enabled) return (invpcid_works ? pmap_invalidate_page_pcid_invpcid : pmap_invalidate_page_pcid_noinvpcid); return (pmap_invalidate_page_nopcid); } static void pmap_invalidate_page_curcpu_cb(pmap_t pmap, vm_offset_t va, vm_offset_t addr2 __unused) { if (pmap == kernel_pmap) { invlpg(va); } else { if (pmap == PCPU_GET(curpmap)) invlpg(va); pmap_invalidate_page_mode(pmap, va); } } void pmap_invalidate_page(pmap_t pmap, vm_offset_t va) { if (pmap_type_guest(pmap)) { pmap_invalidate_ept(pmap); return; } KASSERT(pmap->pm_type == PT_X86, ("pmap_invalidate_page: invalid type %d", pmap->pm_type)); smp_masked_invlpg(pmap_invalidate_cpu_mask(pmap), va, pmap, pmap_invalidate_page_curcpu_cb); } /* 4k PTEs -- Chosen to exceed the total size of Broadwell L2 TLB */ #define PMAP_INVLPG_THRESHOLD (4 * 1024 * PAGE_SIZE) static void pmap_invalidate_range_pcid(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, const bool invpcid_works1) { struct invpcid_descr d; uint64_t kcr3, ucr3; uint32_t pcid; u_int cpuid, i; cpuid = PCPU_GET(cpuid); if (pmap == PCPU_GET(curpmap)) { if (pmap->pm_ucr3 != PMAP_NO_CR3 && PCPU_GET(ucr3_load_mask) == PMAP_UCR3_NOMASK) { critical_enter(); pcid = pmap->pm_pcids[cpuid].pm_pcid; if (invpcid_works1) { d.pcid = pcid | PMAP_PCID_USER_PT; d.pad = 0; d.addr = sva; for (; d.addr < eva; d.addr += PAGE_SIZE) invpcid(&d, INVPCID_ADDR); } else { kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE; ucr3 = pmap->pm_ucr3 | pcid | PMAP_PCID_USER_PT | CR3_PCID_SAVE; pmap_pti_pcid_invlrng(ucr3, kcr3, sva, eva); } critical_exit(); } } else pmap->pm_pcids[cpuid].pm_gen = 0; CPU_FOREACH(i) { if (cpuid != i) pmap->pm_pcids[i].pm_gen = 0; } /* See the comment in pmap_invalidate_page_pcid(). */ atomic_thread_fence_seq_cst(); } static void pmap_invalidate_range_pcid_invpcid(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { pmap_invalidate_range_pcid(pmap, sva, eva, true); } static void pmap_invalidate_range_pcid_noinvpcid(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { pmap_invalidate_range_pcid(pmap, sva, eva, false); } static void pmap_invalidate_range_nopcid(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { } DEFINE_IFUNC(static, void, pmap_invalidate_range_mode, (pmap_t, vm_offset_t, vm_offset_t)) { if (pmap_pcid_enabled) return (invpcid_works ? pmap_invalidate_range_pcid_invpcid : pmap_invalidate_range_pcid_noinvpcid); return (pmap_invalidate_range_nopcid); } static void pmap_invalidate_range_curcpu_cb(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t addr; if (pmap == kernel_pmap) { for (addr = sva; addr < eva; addr += PAGE_SIZE) invlpg(addr); } else { if (pmap == PCPU_GET(curpmap)) { for (addr = sva; addr < eva; addr += PAGE_SIZE) invlpg(addr); } pmap_invalidate_range_mode(pmap, sva, eva); } } void pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { if (eva - sva >= PMAP_INVLPG_THRESHOLD) { pmap_invalidate_all(pmap); return; } if (pmap_type_guest(pmap)) { pmap_invalidate_ept(pmap); return; } KASSERT(pmap->pm_type == PT_X86, ("pmap_invalidate_range: invalid type %d", pmap->pm_type)); smp_masked_invlpg_range(pmap_invalidate_cpu_mask(pmap), sva, eva, pmap, pmap_invalidate_range_curcpu_cb); } static inline void pmap_invalidate_all_pcid(pmap_t pmap, bool invpcid_works1) { struct invpcid_descr d; uint64_t kcr3; uint32_t pcid; u_int cpuid, i; if (pmap == kernel_pmap) { if (invpcid_works1) { bzero(&d, sizeof(d)); invpcid(&d, INVPCID_CTXGLOB); } else { invltlb_glob(); } } else { cpuid = PCPU_GET(cpuid); if (pmap == PCPU_GET(curpmap)) { critical_enter(); pcid = pmap->pm_pcids[cpuid].pm_pcid; if (invpcid_works1) { d.pcid = pcid; d.pad = 0; d.addr = 0; invpcid(&d, INVPCID_CTX); } else { kcr3 = pmap->pm_cr3 | pcid; load_cr3(kcr3); } if (pmap->pm_ucr3 != PMAP_NO_CR3) PCPU_SET(ucr3_load_mask, ~CR3_PCID_SAVE); critical_exit(); } else pmap->pm_pcids[cpuid].pm_gen = 0; CPU_FOREACH(i) { if (cpuid != i) pmap->pm_pcids[i].pm_gen = 0; } } /* See the comment in pmap_invalidate_page_pcid(). */ atomic_thread_fence_seq_cst(); } static void pmap_invalidate_all_pcid_invpcid(pmap_t pmap) { pmap_invalidate_all_pcid(pmap, true); } static void pmap_invalidate_all_pcid_noinvpcid(pmap_t pmap) { pmap_invalidate_all_pcid(pmap, false); } static void pmap_invalidate_all_nopcid(pmap_t pmap) { if (pmap == kernel_pmap) invltlb_glob(); else if (pmap == PCPU_GET(curpmap)) invltlb(); } DEFINE_IFUNC(static, void, pmap_invalidate_all_mode, (pmap_t)) { if (pmap_pcid_enabled) return (invpcid_works ? pmap_invalidate_all_pcid_invpcid : pmap_invalidate_all_pcid_noinvpcid); return (pmap_invalidate_all_nopcid); } static void pmap_invalidate_all_curcpu_cb(pmap_t pmap, vm_offset_t addr1 __unused, vm_offset_t addr2 __unused) { pmap_invalidate_all_mode(pmap); } void pmap_invalidate_all(pmap_t pmap) { if (pmap_type_guest(pmap)) { pmap_invalidate_ept(pmap); return; } KASSERT(pmap->pm_type == PT_X86, ("pmap_invalidate_all: invalid type %d", pmap->pm_type)); smp_masked_invltlb(pmap_invalidate_cpu_mask(pmap), pmap, pmap_invalidate_all_curcpu_cb); } static void pmap_invalidate_cache_curcpu_cb(pmap_t pmap __unused, vm_offset_t va __unused, vm_offset_t addr2 __unused) { wbinvd(); } void pmap_invalidate_cache(void) { smp_cache_flush(pmap_invalidate_cache_curcpu_cb); } struct pde_action { cpuset_t invalidate; /* processors that invalidate their TLB */ pmap_t pmap; vm_offset_t va; pd_entry_t *pde; pd_entry_t newpde; u_int store; /* processor that updates the PDE */ }; static void pmap_update_pde_action(void *arg) { struct pde_action *act = arg; if (act->store == PCPU_GET(cpuid)) pmap_update_pde_store(act->pmap, act->pde, act->newpde); } static void pmap_update_pde_teardown(void *arg) { struct pde_action *act = arg; if (CPU_ISSET(PCPU_GET(cpuid), &act->invalidate)) pmap_update_pde_invalidate(act->pmap, act->va, act->newpde); } /* * Change the page size for the specified virtual address in a way that * prevents any possibility of the TLB ever having two entries that map the * same virtual address using different page sizes. This is the recommended * workaround for Erratum 383 on AMD Family 10h processors. It prevents a * machine check exception for a TLB state that is improperly diagnosed as a * hardware error. */ static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde) { struct pde_action act; cpuset_t active, other_cpus; u_int cpuid; sched_pin(); cpuid = PCPU_GET(cpuid); other_cpus = all_cpus; CPU_CLR(cpuid, &other_cpus); if (pmap == kernel_pmap || pmap_type_guest(pmap)) active = all_cpus; else { active = pmap->pm_active; } if (CPU_OVERLAP(&active, &other_cpus)) { act.store = cpuid; act.invalidate = active; act.va = va; act.pmap = pmap; act.pde = pde; act.newpde = newpde; CPU_SET(cpuid, &active); smp_rendezvous_cpus(active, smp_no_rendezvous_barrier, pmap_update_pde_action, pmap_update_pde_teardown, &act); } else { pmap_update_pde_store(pmap, pde, newpde); if (CPU_ISSET(cpuid, &active)) pmap_update_pde_invalidate(pmap, va, newpde); } sched_unpin(); } #else /* !SMP */ /* * Normal, non-SMP, invalidation functions. */ void pmap_invalidate_page(pmap_t pmap, vm_offset_t va) { struct invpcid_descr d; uint64_t kcr3, ucr3; uint32_t pcid; if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) { pmap->pm_eptgen++; return; } KASSERT(pmap->pm_type == PT_X86, ("pmap_invalidate_range: unknown type %d", pmap->pm_type)); if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap)) { invlpg(va); if (pmap == PCPU_GET(curpmap) && pmap_pcid_enabled && pmap->pm_ucr3 != PMAP_NO_CR3) { critical_enter(); pcid = pmap->pm_pcids[0].pm_pcid; if (invpcid_works) { d.pcid = pcid | PMAP_PCID_USER_PT; d.pad = 0; d.addr = va; invpcid(&d, INVPCID_ADDR); } else { kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE; ucr3 = pmap->pm_ucr3 | pcid | PMAP_PCID_USER_PT | CR3_PCID_SAVE; pmap_pti_pcid_invlpg(ucr3, kcr3, va); } critical_exit(); } } else if (pmap_pcid_enabled) pmap->pm_pcids[0].pm_gen = 0; } void pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { struct invpcid_descr d; vm_offset_t addr; uint64_t kcr3, ucr3; if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) { pmap->pm_eptgen++; return; } KASSERT(pmap->pm_type == PT_X86, ("pmap_invalidate_range: unknown type %d", pmap->pm_type)); if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap)) { for (addr = sva; addr < eva; addr += PAGE_SIZE) invlpg(addr); if (pmap == PCPU_GET(curpmap) && pmap_pcid_enabled && pmap->pm_ucr3 != PMAP_NO_CR3) { critical_enter(); if (invpcid_works) { d.pcid = pmap->pm_pcids[0].pm_pcid | PMAP_PCID_USER_PT; d.pad = 0; d.addr = sva; for (; d.addr < eva; d.addr += PAGE_SIZE) invpcid(&d, INVPCID_ADDR); } else { kcr3 = pmap->pm_cr3 | pmap->pm_pcids[0]. pm_pcid | CR3_PCID_SAVE; ucr3 = pmap->pm_ucr3 | pmap->pm_pcids[0]. pm_pcid | PMAP_PCID_USER_PT | CR3_PCID_SAVE; pmap_pti_pcid_invlrng(ucr3, kcr3, sva, eva); } critical_exit(); } } else if (pmap_pcid_enabled) { pmap->pm_pcids[0].pm_gen = 0; } } void pmap_invalidate_all(pmap_t pmap) { struct invpcid_descr d; uint64_t kcr3, ucr3; if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) { pmap->pm_eptgen++; return; } KASSERT(pmap->pm_type == PT_X86, ("pmap_invalidate_all: unknown type %d", pmap->pm_type)); if (pmap == kernel_pmap) { if (pmap_pcid_enabled && invpcid_works) { bzero(&d, sizeof(d)); invpcid(&d, INVPCID_CTXGLOB); } else { invltlb_glob(); } } else if (pmap == PCPU_GET(curpmap)) { if (pmap_pcid_enabled) { critical_enter(); if (invpcid_works) { d.pcid = pmap->pm_pcids[0].pm_pcid; d.pad = 0; d.addr = 0; invpcid(&d, INVPCID_CTX); if (pmap->pm_ucr3 != PMAP_NO_CR3) { d.pcid |= PMAP_PCID_USER_PT; invpcid(&d, INVPCID_CTX); } } else { kcr3 = pmap->pm_cr3 | pmap->pm_pcids[0].pm_pcid; if (pmap->pm_ucr3 != PMAP_NO_CR3) { ucr3 = pmap->pm_ucr3 | pmap->pm_pcids[ 0].pm_pcid | PMAP_PCID_USER_PT; pmap_pti_pcid_invalidate(ucr3, kcr3); } else load_cr3(kcr3); } critical_exit(); } else { invltlb(); } } else if (pmap_pcid_enabled) { pmap->pm_pcids[0].pm_gen = 0; } } PMAP_INLINE void pmap_invalidate_cache(void) { wbinvd(); } static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde) { pmap_update_pde_store(pmap, pde, newpde); if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap)) pmap_update_pde_invalidate(pmap, va, newpde); else pmap->pm_pcids[0].pm_gen = 0; } #endif /* !SMP */ static void pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va, pd_entry_t pde) { /* * When the PDE has PG_PROMOTED set, the 2MB page mapping was created * by a promotion that did not invalidate the 512 4KB page mappings * that might exist in the TLB. Consequently, at this point, the TLB * may hold both 4KB and 2MB page mappings for the address range [va, * va + NBPDR). Therefore, the entire range must be invalidated here. * In contrast, when PG_PROMOTED is clear, the TLB will not hold any * 4KB page mappings for the address range [va, va + NBPDR), and so a * single INVLPG suffices to invalidate the 2MB page mapping from the * TLB. */ if ((pde & PG_PROMOTED) != 0) pmap_invalidate_range(pmap, va, va + NBPDR - 1); else pmap_invalidate_page(pmap, va); } DEFINE_IFUNC(, void, pmap_invalidate_cache_range, (vm_offset_t sva, vm_offset_t eva)) { if ((cpu_feature & CPUID_SS) != 0) return (pmap_invalidate_cache_range_selfsnoop); if ((cpu_feature & CPUID_CLFSH) != 0) return (pmap_force_invalidate_cache_range); return (pmap_invalidate_cache_range_all); } #define PMAP_CLFLUSH_THRESHOLD (2 * 1024 * 1024) static void pmap_invalidate_cache_range_check_align(vm_offset_t sva, vm_offset_t eva) { KASSERT((sva & PAGE_MASK) == 0, ("pmap_invalidate_cache_range: sva not page-aligned")); KASSERT((eva & PAGE_MASK) == 0, ("pmap_invalidate_cache_range: eva not page-aligned")); } static void pmap_invalidate_cache_range_selfsnoop(vm_offset_t sva, vm_offset_t eva) { pmap_invalidate_cache_range_check_align(sva, eva); } void pmap_force_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva) { sva &= ~(vm_offset_t)(cpu_clflush_line_size - 1); /* * XXX: Some CPUs fault, hang, or trash the local APIC * registers if we use CLFLUSH on the local APIC range. The * local APIC is always uncached, so we don't need to flush * for that range anyway. */ if (pmap_kextract(sva) == lapic_paddr) return; if ((cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0) { /* * Do per-cache line flush. Use a locked * instruction to insure that previous stores are * included in the write-back. The processor * propagates flush to other processors in the cache * coherence domain. */ atomic_thread_fence_seq_cst(); for (; sva < eva; sva += cpu_clflush_line_size) clflushopt(sva); atomic_thread_fence_seq_cst(); } else { /* * Writes are ordered by CLFLUSH on Intel CPUs. */ if (cpu_vendor_id != CPU_VENDOR_INTEL) mfence(); for (; sva < eva; sva += cpu_clflush_line_size) clflush(sva); if (cpu_vendor_id != CPU_VENDOR_INTEL) mfence(); } } static void pmap_invalidate_cache_range_all(vm_offset_t sva, vm_offset_t eva) { pmap_invalidate_cache_range_check_align(sva, eva); pmap_invalidate_cache(); } /* * Remove the specified set of pages from the data and instruction caches. * * In contrast to pmap_invalidate_cache_range(), this function does not * rely on the CPU's self-snoop feature, because it is intended for use * when moving pages into a different cache domain. */ void pmap_invalidate_cache_pages(vm_page_t *pages, int count) { vm_offset_t daddr, eva; int i; bool useclflushopt; useclflushopt = (cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0; if (count >= PMAP_CLFLUSH_THRESHOLD / PAGE_SIZE || ((cpu_feature & CPUID_CLFSH) == 0 && !useclflushopt)) pmap_invalidate_cache(); else { if (useclflushopt) atomic_thread_fence_seq_cst(); else if (cpu_vendor_id != CPU_VENDOR_INTEL) mfence(); for (i = 0; i < count; i++) { daddr = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pages[i])); eva = daddr + PAGE_SIZE; for (; daddr < eva; daddr += cpu_clflush_line_size) { if (useclflushopt) clflushopt(daddr); else clflush(daddr); } } if (useclflushopt) atomic_thread_fence_seq_cst(); else if (cpu_vendor_id != CPU_VENDOR_INTEL) mfence(); } } void pmap_flush_cache_range(vm_offset_t sva, vm_offset_t eva) { pmap_invalidate_cache_range_check_align(sva, eva); if ((cpu_stdext_feature & CPUID_STDEXT_CLWB) == 0) { pmap_force_invalidate_cache_range(sva, eva); return; } /* See comment in pmap_force_invalidate_cache_range(). */ if (pmap_kextract(sva) == lapic_paddr) return; atomic_thread_fence_seq_cst(); for (; sva < eva; sva += cpu_clflush_line_size) clwb(sva); atomic_thread_fence_seq_cst(); } void pmap_flush_cache_phys_range(vm_paddr_t spa, vm_paddr_t epa, vm_memattr_t mattr) { pt_entry_t *pte; vm_offset_t vaddr; int error, pte_bits; KASSERT((spa & PAGE_MASK) == 0, ("pmap_flush_cache_phys_range: spa not page-aligned")); KASSERT((epa & PAGE_MASK) == 0, ("pmap_flush_cache_phys_range: epa not page-aligned")); if (spa < dmaplimit) { pmap_flush_cache_range(PHYS_TO_DMAP(spa), PHYS_TO_DMAP(MIN( dmaplimit, epa))); if (dmaplimit >= epa) return; spa = dmaplimit; } pte_bits = pmap_cache_bits(kernel_pmap, mattr, 0) | X86_PG_RW | X86_PG_V; error = vmem_alloc(kernel_arena, PAGE_SIZE, M_BESTFIT | M_WAITOK, &vaddr); KASSERT(error == 0, ("vmem_alloc failed: %d", error)); pte = vtopte(vaddr); for (; spa < epa; spa += PAGE_SIZE) { sched_pin(); pte_store(pte, spa | pte_bits); invlpg(vaddr); /* XXXKIB atomic inside flush_cache_range are excessive */ pmap_flush_cache_range(vaddr, vaddr + PAGE_SIZE); sched_unpin(); } vmem_free(kernel_arena, vaddr, PAGE_SIZE); } /* * Routine: pmap_extract * Function: * Extract the physical page address associated * with the given map/virtual_address pair. */ vm_paddr_t pmap_extract(pmap_t pmap, vm_offset_t va) { pdp_entry_t *pdpe; pd_entry_t *pde; pt_entry_t *pte, PG_V; vm_paddr_t pa; pa = 0; PG_V = pmap_valid_bit(pmap); PMAP_LOCK(pmap); pdpe = pmap_pdpe(pmap, va); if (pdpe != NULL && (*pdpe & PG_V) != 0) { if ((*pdpe & PG_PS) != 0) pa = (*pdpe & PG_PS_FRAME) | (va & PDPMASK); else { pde = pmap_pdpe_to_pde(pdpe, va); if ((*pde & PG_V) != 0) { if ((*pde & PG_PS) != 0) { pa = (*pde & PG_PS_FRAME) | (va & PDRMASK); } else { pte = pmap_pde_to_pte(pde, va); pa = (*pte & PG_FRAME) | (va & PAGE_MASK); } } } } PMAP_UNLOCK(pmap); return (pa); } /* * Routine: pmap_extract_and_hold * Function: * Atomically extract and hold the physical page * with the given pmap and virtual address pair * if that mapping permits the given protection. */ vm_page_t pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) { pdp_entry_t pdpe, *pdpep; pd_entry_t pde, *pdep; pt_entry_t pte, PG_RW, PG_V; vm_page_t m; m = NULL; PG_RW = pmap_rw_bit(pmap); PG_V = pmap_valid_bit(pmap); PMAP_LOCK(pmap); pdpep = pmap_pdpe(pmap, va); if (pdpep == NULL || ((pdpe = *pdpep) & PG_V) == 0) goto out; if ((pdpe & PG_PS) != 0) { if ((pdpe & PG_RW) == 0 && (prot & VM_PROT_WRITE) != 0) goto out; m = PHYS_TO_VM_PAGE((pdpe & PG_PS_FRAME) | (va & PDPMASK)); goto check_page; } pdep = pmap_pdpe_to_pde(pdpep, va); if (pdep == NULL || ((pde = *pdep) & PG_V) == 0) goto out; if ((pde & PG_PS) != 0) { if ((pde & PG_RW) == 0 && (prot & VM_PROT_WRITE) != 0) goto out; m = PHYS_TO_VM_PAGE((pde & PG_PS_FRAME) | (va & PDRMASK)); goto check_page; } pte = *pmap_pde_to_pte(pdep, va); if ((pte & PG_V) == 0 || ((pte & PG_RW) == 0 && (prot & VM_PROT_WRITE) != 0)) goto out; m = PHYS_TO_VM_PAGE(pte & PG_FRAME); check_page: if (m != NULL && !vm_page_wire_mapped(m)) m = NULL; out: PMAP_UNLOCK(pmap); return (m); } vm_paddr_t pmap_kextract(vm_offset_t va) { pd_entry_t pde; vm_paddr_t pa; if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) { pa = DMAP_TO_PHYS(va); } else if (PMAP_ADDRESS_IN_LARGEMAP(va)) { pa = pmap_large_map_kextract(va); } else { pde = *vtopde(va); if (pde & PG_PS) { pa = (pde & PG_PS_FRAME) | (va & PDRMASK); } else { /* * Beware of a concurrent promotion that changes the * PDE at this point! For example, vtopte() must not * be used to access the PTE because it would use the * new PDE. It is, however, safe to use the old PDE * because the page table page is preserved by the * promotion. */ pa = *pmap_pde_to_pte(&pde, va); pa = (pa & PG_FRAME) | (va & PAGE_MASK); } } return (pa); } /*************************************************** * Low level mapping routines..... ***************************************************/ /* * Add a wired page to the kva. * Note: not SMP coherent. */ PMAP_INLINE void pmap_kenter(vm_offset_t va, vm_paddr_t pa) { pt_entry_t *pte; pte = vtopte(va); pte_store(pte, pa | X86_PG_RW | X86_PG_V | pg_g | pg_nx); } static __inline void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode) { pt_entry_t *pte; int cache_bits; pte = vtopte(va); cache_bits = pmap_cache_bits(kernel_pmap, mode, 0); pte_store(pte, pa | X86_PG_RW | X86_PG_V | pg_g | pg_nx | cache_bits); } /* * Remove a page from the kernel pagetables. * Note: not SMP coherent. */ PMAP_INLINE void pmap_kremove(vm_offset_t va) { pt_entry_t *pte; pte = vtopte(va); pte_clear(pte); } /* * Used to map a range of physical addresses into kernel * virtual address space. * * The value passed in '*virt' is a suggested virtual address for * the mapping. Architectures which can support a direct-mapped * physical to virtual region can return the appropriate address * within that region, leaving '*virt' unchanged. Other * architectures should map the pages starting at '*virt' and * update '*virt' with the first usable address after the mapped * region. */ vm_offset_t pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) { return PHYS_TO_DMAP(start); } /* * Add a list of wired pages to the kva * this routine is only used for temporary * kernel mappings that do not need to have * page modification or references recorded. * Note that old mappings are simply written * over. The page *must* be wired. * Note: SMP coherent. Uses a ranged shootdown IPI. */ void pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count) { pt_entry_t *endpte, oldpte, pa, *pte; vm_page_t m; int cache_bits; oldpte = 0; pte = vtopte(sva); endpte = pte + count; while (pte < endpte) { m = *ma++; cache_bits = pmap_cache_bits(kernel_pmap, m->md.pat_mode, 0); pa = VM_PAGE_TO_PHYS(m) | cache_bits; if ((*pte & (PG_FRAME | X86_PG_PTE_CACHE)) != pa) { oldpte |= *pte; pte_store(pte, pa | pg_g | pg_nx | X86_PG_RW | X86_PG_V); } pte++; } if (__predict_false((oldpte & X86_PG_V) != 0)) pmap_invalidate_range(kernel_pmap, sva, sva + count * PAGE_SIZE); } /* * This routine tears out page mappings from the * kernel -- it is meant only for temporary mappings. * Note: SMP coherent. Uses a ranged shootdown IPI. */ void pmap_qremove(vm_offset_t sva, int count) { vm_offset_t va; va = sva; while (count-- > 0) { KASSERT(va >= VM_MIN_KERNEL_ADDRESS, ("usermode va %lx", va)); pmap_kremove(va); va += PAGE_SIZE; } pmap_invalidate_range(kernel_pmap, sva, va); } /*************************************************** * Page table page management routines..... ***************************************************/ /* * Schedule the specified unused page table page to be freed. Specifically, * add the page to the specified list of pages that will be released to the * physical memory manager after the TLB has been updated. */ static __inline void pmap_add_delayed_free_list(vm_page_t m, struct spglist *free, boolean_t set_PG_ZERO) { if (set_PG_ZERO) m->flags |= PG_ZERO; else m->flags &= ~PG_ZERO; SLIST_INSERT_HEAD(free, m, plinks.s.ss); } /* * Inserts the specified page table page into the specified pmap's collection * of idle page table pages. Each of a pmap's page table pages is responsible * for mapping a distinct range of virtual addresses. The pmap's collection is * ordered by this virtual address range. * * If "promoted" is false, then the page table page "mpte" must be zero filled. */ static __inline int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); mpte->valid = promoted ? VM_PAGE_BITS_ALL : 0; return (vm_radix_insert(&pmap->pm_root, mpte)); } /* * Removes the page table page mapping the specified virtual address from the * specified pmap's collection of idle page table pages, and returns it. * Otherwise, returns NULL if there is no page table page corresponding to the * specified virtual address. */ static __inline vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); return (vm_radix_remove(&pmap->pm_root, pmap_pde_pindex(va))); } /* * Decrements a page table page's reference count, which is used to record the * number of valid page table entries within the page. If the reference count * drops to zero, then the page table page is unmapped. Returns TRUE if the * page table page was unmapped and FALSE otherwise. */ static inline boolean_t pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) { --m->ref_count; if (m->ref_count == 0) { _pmap_unwire_ptp(pmap, va, m, free); return (TRUE); } else return (FALSE); } static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) { pml5_entry_t *pml5; pml4_entry_t *pml4; pdp_entry_t *pdp; pd_entry_t *pd; vm_page_t pdpg, pdppg, pml4pg; PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * unmap the page table page */ if (m->pindex >= NUPDE + NUPDPE + NUPML4E) { /* PML4 page */ MPASS(pmap_is_la57(pmap)); pml5 = pmap_pml5e(pmap, va); *pml5 = 0; if (pmap->pm_pmltopu != NULL && va <= VM_MAXUSER_ADDRESS) { pml5 = pmap_pml5e_u(pmap, va); *pml5 = 0; } } else if (m->pindex >= NUPDE + NUPDPE) { /* PDP page */ pml4 = pmap_pml4e(pmap, va); *pml4 = 0; if (!pmap_is_la57(pmap) && pmap->pm_pmltopu != NULL && va <= VM_MAXUSER_ADDRESS) { pml4 = pmap_pml4e_u(pmap, va); *pml4 = 0; } } else if (m->pindex >= NUPDE) { /* PD page */ pdp = pmap_pdpe(pmap, va); *pdp = 0; } else { /* PTE page */ pd = pmap_pde(pmap, va); *pd = 0; } pmap_resident_count_dec(pmap, 1); if (m->pindex < NUPDE) { /* We just released a PT, unhold the matching PD */ pdpg = PHYS_TO_VM_PAGE(*pmap_pdpe(pmap, va) & PG_FRAME); pmap_unwire_ptp(pmap, va, pdpg, free); } else if (m->pindex < NUPDE + NUPDPE) { /* We just released a PD, unhold the matching PDP */ pdppg = PHYS_TO_VM_PAGE(*pmap_pml4e(pmap, va) & PG_FRAME); pmap_unwire_ptp(pmap, va, pdppg, free); } else if (m->pindex < NUPDE + NUPDPE + NUPML4E && pmap_is_la57(pmap)) { /* We just released a PDP, unhold the matching PML4 */ pml4pg = PHYS_TO_VM_PAGE(*pmap_pml5e(pmap, va) & PG_FRAME); pmap_unwire_ptp(pmap, va, pml4pg, free); } /* * Put page on a list so that it is released after * *ALL* TLB shootdown is done */ pmap_add_delayed_free_list(m, free, TRUE); } /* * After removing a page table entry, this routine is used to * conditionally free the page, and manage the reference count. */ static int pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde, struct spglist *free) { vm_page_t mpte; if (va >= VM_MAXUSER_ADDRESS) return (0); KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0")); mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME); return (pmap_unwire_ptp(pmap, va, mpte, free)); } /* * Release a page table page reference after a failed attempt to create a * mapping. */ static void pmap_abort_ptp(pmap_t pmap, vm_offset_t va, vm_page_t mpte) { struct spglist free; SLIST_INIT(&free); if (pmap_unwire_ptp(pmap, va, mpte, &free)) { /* * Although "va" was never mapped, paging-structure caches * could nonetheless have entries that refer to the freed * page table pages. Invalidate those entries. */ pmap_invalidate_page(pmap, va); vm_page_free_pages_toq(&free, true); } } void pmap_pinit0(pmap_t pmap) { struct proc *p; struct thread *td; int i; PMAP_LOCK_INIT(pmap); pmap->pm_pmltop = kernel_pmap->pm_pmltop; pmap->pm_pmltopu = NULL; pmap->pm_cr3 = kernel_pmap->pm_cr3; /* hack to keep pmap_pti_pcid_invalidate() alive */ pmap->pm_ucr3 = PMAP_NO_CR3; pmap->pm_root.rt_root = 0; CPU_ZERO(&pmap->pm_active); TAILQ_INIT(&pmap->pm_pvchunk); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); pmap->pm_flags = pmap_flags; CPU_FOREACH(i) { pmap->pm_pcids[i].pm_pcid = PMAP_PCID_KERN + 1; pmap->pm_pcids[i].pm_gen = 1; } pmap_activate_boot(pmap); td = curthread; if (pti) { p = td->td_proc; PROC_LOCK(p); p->p_md.md_flags |= P_MD_KPTI; PROC_UNLOCK(p); } pmap_thread_init_invl_gen(td); if ((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) { pmap_pkru_ranges_zone = uma_zcreate("pkru ranges", sizeof(struct pmap_pkru_range), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); } } void pmap_pinit_pml4(vm_page_t pml4pg) { pml4_entry_t *pm_pml4; int i; pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pg)); /* Wire in kernel global address entries. */ for (i = 0; i < NKPML4E; i++) { pm_pml4[KPML4BASE + i] = (KPDPphys + ptoa(i)) | X86_PG_RW | X86_PG_V; } for (i = 0; i < ndmpdpphys; i++) { pm_pml4[DMPML4I + i] = (DMPDPphys + ptoa(i)) | X86_PG_RW | X86_PG_V; } /* install self-referential address mapping entry(s) */ pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pml4pg) | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M; /* install large map entries if configured */ for (i = 0; i < lm_ents; i++) pm_pml4[LMSPML4I + i] = kernel_pmap->pm_pmltop[LMSPML4I + i]; } void pmap_pinit_pml5(vm_page_t pml5pg) { pml5_entry_t *pm_pml5; pm_pml5 = (pml5_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml5pg)); /* * Add pml5 entry at top of KVA pointing to existing pml4 table, * entering all existing kernel mappings into level 5 table. */ pm_pml5[pmap_pml5e_index(UPT_MAX_ADDRESS)] = KPML4phys | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M | pg_g | pmap_cache_bits(kernel_pmap, VM_MEMATTR_DEFAULT, FALSE); /* * Install self-referential address mapping entry. */ pm_pml5[PML5PML5I] = VM_PAGE_TO_PHYS(pml5pg) | X86_PG_RW | X86_PG_V | X86_PG_M | X86_PG_A | pmap_cache_bits(kernel_pmap, VM_MEMATTR_DEFAULT, FALSE); } static void pmap_pinit_pml4_pti(vm_page_t pml4pgu) { pml4_entry_t *pm_pml4u; int i; pm_pml4u = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pgu)); for (i = 0; i < NPML4EPG; i++) pm_pml4u[i] = pti_pml4[i]; } static void pmap_pinit_pml5_pti(vm_page_t pml5pgu) { pml5_entry_t *pm_pml5u; pm_pml5u = (pml5_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml5pgu)); /* * Add pml5 entry at top of KVA pointing to existing pml4 pti * table, entering all kernel mappings needed for usermode * into level 5 table. */ pm_pml5u[pmap_pml5e_index(UPT_MAX_ADDRESS)] = pmap_kextract((vm_offset_t)pti_pml4) | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M | pg_g | pmap_cache_bits(kernel_pmap, VM_MEMATTR_DEFAULT, FALSE); } /* * Initialize a preallocated and zeroed pmap structure, * such as one in a vmspace structure. */ int pmap_pinit_type(pmap_t pmap, enum pmap_type pm_type, int flags) { vm_page_t pmltop_pg, pmltop_pgu; vm_paddr_t pmltop_phys; int i; /* * allocate the page directory page */ pmltop_pg = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO | VM_ALLOC_WAITOK); pmltop_phys = VM_PAGE_TO_PHYS(pmltop_pg); pmap->pm_pmltop = (pml5_entry_t *)PHYS_TO_DMAP(pmltop_phys); CPU_FOREACH(i) { pmap->pm_pcids[i].pm_pcid = PMAP_PCID_NONE; pmap->pm_pcids[i].pm_gen = 0; } pmap->pm_cr3 = PMAP_NO_CR3; /* initialize to an invalid value */ pmap->pm_ucr3 = PMAP_NO_CR3; pmap->pm_pmltopu = NULL; pmap->pm_type = pm_type; if ((pmltop_pg->flags & PG_ZERO) == 0) pagezero(pmap->pm_pmltop); /* * Do not install the host kernel mappings in the nested page * tables. These mappings are meaningless in the guest physical * address space. * Install minimal kernel mappings in PTI case. */ if (pm_type == PT_X86) { pmap->pm_cr3 = pmltop_phys; if (pmap_is_la57(pmap)) pmap_pinit_pml5(pmltop_pg); else pmap_pinit_pml4(pmltop_pg); if ((curproc->p_md.md_flags & P_MD_KPTI) != 0) { pmltop_pgu = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_WAITOK); pmap->pm_pmltopu = (pml4_entry_t *)PHYS_TO_DMAP( VM_PAGE_TO_PHYS(pmltop_pgu)); if (pmap_is_la57(pmap)) pmap_pinit_pml5_pti(pmltop_pgu); else pmap_pinit_pml4_pti(pmltop_pgu); pmap->pm_ucr3 = VM_PAGE_TO_PHYS(pmltop_pgu); } if ((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) { rangeset_init(&pmap->pm_pkru, pkru_dup_range, pkru_free_range, pmap, M_NOWAIT); } } pmap->pm_root.rt_root = 0; CPU_ZERO(&pmap->pm_active); TAILQ_INIT(&pmap->pm_pvchunk); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); pmap->pm_flags = flags; pmap->pm_eptgen = 0; return (1); } int pmap_pinit(pmap_t pmap) { return (pmap_pinit_type(pmap, PT_X86, pmap_flags)); } static void pmap_allocpte_free_unref(pmap_t pmap, vm_offset_t va, pt_entry_t *pte) { vm_page_t mpg; struct spglist free; mpg = PHYS_TO_VM_PAGE(*pte & PG_FRAME); if (mpg->ref_count != 0) return; SLIST_INIT(&free); _pmap_unwire_ptp(pmap, va, mpg, &free); pmap_invalidate_page(pmap, va); vm_page_free_pages_toq(&free, true); } static pml4_entry_t * pmap_allocpte_getpml4(pmap_t pmap, struct rwlock **lockp, vm_offset_t va, bool addref) { vm_pindex_t pml5index; pml5_entry_t *pml5; pml4_entry_t *pml4; vm_page_t pml4pg; pt_entry_t PG_V; bool allocated; if (!pmap_is_la57(pmap)) return (&pmap->pm_pmltop[pmap_pml4e_index(va)]); PG_V = pmap_valid_bit(pmap); pml5index = pmap_pml5e_index(va); pml5 = &pmap->pm_pmltop[pml5index]; if ((*pml5 & PG_V) == 0) { if (_pmap_allocpte(pmap, pmap_pml5e_pindex(va), lockp, va) == NULL) return (NULL); allocated = true; } else { allocated = false; } pml4 = (pml4_entry_t *)PHYS_TO_DMAP(*pml5 & PG_FRAME); pml4 = &pml4[pmap_pml4e_index(va)]; if ((*pml4 & PG_V) == 0) { pml4pg = PHYS_TO_VM_PAGE(*pml5 & PG_FRAME); if (allocated && !addref) pml4pg->ref_count--; else if (!allocated && addref) pml4pg->ref_count++; } return (pml4); } static pdp_entry_t * pmap_allocpte_getpdp(pmap_t pmap, struct rwlock **lockp, vm_offset_t va, bool addref) { vm_page_t pdppg; pml4_entry_t *pml4; pdp_entry_t *pdp; pt_entry_t PG_V; bool allocated; PG_V = pmap_valid_bit(pmap); pml4 = pmap_allocpte_getpml4(pmap, lockp, va, false); if (pml4 == NULL) return (NULL); if ((*pml4 & PG_V) == 0) { /* Have to allocate a new pdp, recurse */ if (_pmap_allocpte(pmap, pmap_pml4e_pindex(va), lockp, va) == NULL) { if (pmap_is_la57(pmap)) pmap_allocpte_free_unref(pmap, va, pmap_pml5e(pmap, va)); return (NULL); } allocated = true; } else { allocated = false; } pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME); pdp = &pdp[pmap_pdpe_index(va)]; if ((*pdp & PG_V) == 0) { pdppg = PHYS_TO_VM_PAGE(*pml4 & PG_FRAME); if (allocated && !addref) pdppg->ref_count--; else if (!allocated && addref) pdppg->ref_count++; } return (pdp); } /* * This routine is called if the desired page table page does not exist. * * If page table page allocation fails, this routine may sleep before * returning NULL. It sleeps only if a lock pointer was given. * * Note: If a page allocation fails at page table level two, three, or four, * up to three pages may be held during the wait, only to be released * afterwards. This conservative approach is easily argued to avoid * race conditions. * * The ptepindexes, i.e. page indices, of the page table pages encountered * while translating virtual address va are defined as follows: * - for the page table page (last level), * ptepindex = pmap_pde_pindex(va) = va >> PDRSHIFT, * in other words, it is just the index of the PDE that maps the page * table page. * - for the page directory page, * ptepindex = NUPDE (number of userland PD entries) + * (pmap_pde_index(va) >> NPDEPGSHIFT) * i.e. index of PDPE is put after the last index of PDE, * - for the page directory pointer page, * ptepindex = NUPDE + NUPDPE + (pmap_pde_index(va) >> (NPDEPGSHIFT + * NPML4EPGSHIFT), * i.e. index of pml4e is put after the last index of PDPE, * - for the PML4 page (if LA57 mode is enabled), * ptepindex = NUPDE + NUPDPE + NUPML4E + (pmap_pde_index(va) >> * (NPDEPGSHIFT + NPML4EPGSHIFT + NPML5EPGSHIFT), * i.e. index of pml5e is put after the last index of PML4E. * * Define an order on the paging entries, where all entries of the * same height are put together, then heights are put from deepest to * root. Then ptexpindex is the sequential number of the * corresponding paging entry in this order. * * The values of NUPDE, NUPDPE, and NUPML4E are determined by the size of * LA57 paging structures even in LA48 paging mode. Moreover, the * ptepindexes are calculated as if the paging structures were 5-level * regardless of the actual mode of operation. * * The root page at PML4/PML5 does not participate in this indexing scheme, * since it is statically allocated by pmap_pinit() and not by _pmap_allocpte(). */ static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp, vm_offset_t va __unused) { vm_pindex_t pml5index, pml4index; pml5_entry_t *pml5, *pml5u; pml4_entry_t *pml4, *pml4u; pdp_entry_t *pdp; pd_entry_t *pd; vm_page_t m, pdpg; pt_entry_t PG_A, PG_M, PG_RW, PG_V; PMAP_LOCK_ASSERT(pmap, MA_OWNED); PG_A = pmap_accessed_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_V = pmap_valid_bit(pmap); PG_RW = pmap_rw_bit(pmap); /* * Allocate a page table page. */ if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) { if (lockp != NULL) { RELEASE_PV_LIST_LOCK(lockp); PMAP_UNLOCK(pmap); PMAP_ASSERT_NOT_IN_DI(); vm_wait(NULL); PMAP_LOCK(pmap); } /* * Indicate the need to retry. While waiting, the page table * page may have been allocated. */ return (NULL); } if ((m->flags & PG_ZERO) == 0) pmap_zero_page(m); /* * Map the pagetable page into the process address space, if * it isn't already there. */ if (ptepindex >= NUPDE + NUPDPE + NUPML4E) { MPASS(pmap_is_la57(pmap)); pml5index = pmap_pml5e_index(va); pml5 = &pmap->pm_pmltop[pml5index]; KASSERT((*pml5 & PG_V) == 0, ("pmap %p va %#lx pml5 %#lx", pmap, va, *pml5)); *pml5 = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; if (pmap->pm_pmltopu != NULL && pml5index < NUPML5E) { if (pmap->pm_ucr3 != PMAP_NO_CR3) *pml5 |= pg_nx; pml5u = &pmap->pm_pmltopu[pml5index]; *pml5u = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; } } else if (ptepindex >= NUPDE + NUPDPE) { pml4index = pmap_pml4e_index(va); /* Wire up a new PDPE page */ pml4 = pmap_allocpte_getpml4(pmap, lockp, va, true); if (pml4 == NULL) { vm_page_unwire_noq(m); vm_page_free_zero(m); return (NULL); } KASSERT((*pml4 & PG_V) == 0, ("pmap %p va %#lx pml4 %#lx", pmap, va, *pml4)); *pml4 = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; if (!pmap_is_la57(pmap) && pmap->pm_pmltopu != NULL && pml4index < NUPML4E) { /* * PTI: Make all user-space mappings in the * kernel-mode page table no-execute so that * we detect any programming errors that leave * the kernel-mode page table active on return * to user space. */ if (pmap->pm_ucr3 != PMAP_NO_CR3) *pml4 |= pg_nx; pml4u = &pmap->pm_pmltopu[pml4index]; *pml4u = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; } } else if (ptepindex >= NUPDE) { /* Wire up a new PDE page */ pdp = pmap_allocpte_getpdp(pmap, lockp, va, true); if (pdp == NULL) { vm_page_unwire_noq(m); vm_page_free_zero(m); return (NULL); } KASSERT((*pdp & PG_V) == 0, ("pmap %p va %#lx pdp %#lx", pmap, va, *pdp)); *pdp = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; } else { /* Wire up a new PTE page */ pdp = pmap_allocpte_getpdp(pmap, lockp, va, false); if (pdp == NULL) { vm_page_unwire_noq(m); vm_page_free_zero(m); return (NULL); } if ((*pdp & PG_V) == 0) { /* Have to allocate a new pd, recurse */ if (_pmap_allocpte(pmap, pmap_pdpe_pindex(va), lockp, va) == NULL) { pmap_allocpte_free_unref(pmap, va, pmap_pml4e(pmap, va)); vm_page_unwire_noq(m); vm_page_free_zero(m); return (NULL); } } else { /* Add reference to the pd page */ pdpg = PHYS_TO_VM_PAGE(*pdp & PG_FRAME); pdpg->ref_count++; } pd = (pd_entry_t *)PHYS_TO_DMAP(*pdp & PG_FRAME); /* Now we know where the page directory page is */ pd = &pd[pmap_pde_index(va)]; KASSERT((*pd & PG_V) == 0, ("pmap %p va %#lx pd %#lx", pmap, va, *pd)); *pd = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; } pmap_resident_count_inc(pmap, 1); return (m); } static pd_entry_t * pmap_alloc_pde(pmap_t pmap, vm_offset_t va, vm_page_t *pdpgp, struct rwlock **lockp) { pdp_entry_t *pdpe, PG_V; pd_entry_t *pde; vm_page_t pdpg; vm_pindex_t pdpindex; PG_V = pmap_valid_bit(pmap); retry: pdpe = pmap_pdpe(pmap, va); if (pdpe != NULL && (*pdpe & PG_V) != 0) { pde = pmap_pdpe_to_pde(pdpe, va); if (va < VM_MAXUSER_ADDRESS) { /* Add a reference to the pd page. */ pdpg = PHYS_TO_VM_PAGE(*pdpe & PG_FRAME); pdpg->ref_count++; } else pdpg = NULL; } else if (va < VM_MAXUSER_ADDRESS) { /* Allocate a pd page. */ pdpindex = pmap_pde_pindex(va) >> NPDPEPGSHIFT; pdpg = _pmap_allocpte(pmap, NUPDE + pdpindex, lockp, va); if (pdpg == NULL) { if (lockp != NULL) goto retry; else return (NULL); } pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg)); pde = &pde[pmap_pde_index(va)]; } else panic("pmap_alloc_pde: missing page table page for va %#lx", va); *pdpgp = pdpg; return (pde); } static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) { vm_pindex_t ptepindex; pd_entry_t *pd, PG_V; vm_page_t m; PG_V = pmap_valid_bit(pmap); /* * Calculate pagetable page index */ ptepindex = pmap_pde_pindex(va); retry: /* * Get the page directory entry */ pd = pmap_pde(pmap, va); /* * This supports switching from a 2MB page to a * normal 4K page. */ if (pd != NULL && (*pd & (PG_PS | PG_V)) == (PG_PS | PG_V)) { if (!pmap_demote_pde_locked(pmap, pd, va, lockp)) { /* * Invalidation of the 2MB page mapping may have caused * the deallocation of the underlying PD page. */ pd = NULL; } } /* * If the page table page is mapped, we just increment the * hold count, and activate it. */ if (pd != NULL && (*pd & PG_V) != 0) { m = PHYS_TO_VM_PAGE(*pd & PG_FRAME); m->ref_count++; } else { /* * Here if the pte page isn't mapped, or if it has been * deallocated. */ m = _pmap_allocpte(pmap, ptepindex, lockp, va); if (m == NULL && lockp != NULL) goto retry; } return (m); } /*************************************************** * Pmap allocation/deallocation routines. ***************************************************/ /* * Release any resources held by the given physical map. * Called when a pmap initialized by pmap_pinit is being released. * Should only be called if the map contains no valid mappings. */ void pmap_release(pmap_t pmap) { vm_page_t m; int i; KASSERT(pmap->pm_stats.resident_count == 0, ("pmap_release: pmap %p resident count %ld != 0", pmap, pmap->pm_stats.resident_count)); KASSERT(vm_radix_is_empty(&pmap->pm_root), ("pmap_release: pmap %p has reserved page table page(s)", pmap)); KASSERT(CPU_EMPTY(&pmap->pm_active), ("releasing active pmap %p", pmap)); m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_pmltop)); if (pmap_is_la57(pmap)) { pmap->pm_pmltop[pmap_pml5e_index(UPT_MAX_ADDRESS)] = 0; pmap->pm_pmltop[PML5PML5I] = 0; } else { for (i = 0; i < NKPML4E; i++) /* KVA */ pmap->pm_pmltop[KPML4BASE + i] = 0; for (i = 0; i < ndmpdpphys; i++)/* Direct Map */ pmap->pm_pmltop[DMPML4I + i] = 0; pmap->pm_pmltop[PML4PML4I] = 0; /* Recursive Mapping */ for (i = 0; i < lm_ents; i++) /* Large Map */ pmap->pm_pmltop[LMSPML4I + i] = 0; } vm_page_unwire_noq(m); vm_page_free_zero(m); if (pmap->pm_pmltopu != NULL) { m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap-> pm_pmltopu)); vm_page_unwire_noq(m); vm_page_free(m); } if (pmap->pm_type == PT_X86 && (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) rangeset_fini(&pmap->pm_pkru); } static int kvm_size(SYSCTL_HANDLER_ARGS) { unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS; return sysctl_handle_long(oidp, &ksize, 0, req); } SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0, kvm_size, "LU", "Size of KVM"); static int kvm_free(SYSCTL_HANDLER_ARGS) { unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end; return sysctl_handle_long(oidp, &kfree, 0, req); } SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0, kvm_free, "LU", "Amount of KVM free"); /* * Allocate physical memory for the vm_page array and map it into KVA, * attempting to back the vm_pages with domain-local memory. */ void pmap_page_array_startup(long pages) { pdp_entry_t *pdpe; pd_entry_t *pde, newpdir; vm_offset_t va, start, end; vm_paddr_t pa; long pfn; int domain, i; vm_page_array_size = pages; start = VM_MIN_KERNEL_ADDRESS; end = start + pages * sizeof(struct vm_page); for (va = start; va < end; va += NBPDR) { pfn = first_page + (va - start) / sizeof(struct vm_page); domain = _vm_phys_domain(ptoa(pfn)); pdpe = pmap_pdpe(kernel_pmap, va); if ((*pdpe & X86_PG_V) == 0) { pa = vm_phys_early_alloc(domain, PAGE_SIZE); dump_add_page(pa); pagezero((void *)PHYS_TO_DMAP(pa)); *pdpe = (pdp_entry_t)(pa | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M); } pde = pmap_pdpe_to_pde(pdpe, va); if ((*pde & X86_PG_V) != 0) panic("Unexpected pde"); pa = vm_phys_early_alloc(domain, NBPDR); for (i = 0; i < NPDEPG; i++) dump_add_page(pa + i * PAGE_SIZE); newpdir = (pd_entry_t)(pa | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M | PG_PS | pg_g | pg_nx); pde_store(pde, newpdir); } vm_page_array = (vm_page_t)start; } /* * grow the number of kernel page table entries, if needed */ void pmap_growkernel(vm_offset_t addr) { vm_paddr_t paddr; vm_page_t nkpg; pd_entry_t *pde, newpdir; pdp_entry_t *pdpe; mtx_assert(&kernel_map->system_mtx, MA_OWNED); /* * Return if "addr" is within the range of kernel page table pages * that were preallocated during pmap bootstrap. Moreover, leave * "kernel_vm_end" and the kernel page table as they were. * * The correctness of this action is based on the following * argument: vm_map_insert() allocates contiguous ranges of the * kernel virtual address space. It calls this function if a range * ends after "kernel_vm_end". If the kernel is mapped between * "kernel_vm_end" and "addr", then the range cannot begin at * "kernel_vm_end". In fact, its beginning address cannot be less * than the kernel. Thus, there is no immediate need to allocate * any new kernel page table pages between "kernel_vm_end" and * "KERNBASE". */ if (KERNBASE < addr && addr <= KERNBASE + nkpt * NBPDR) return; addr = roundup2(addr, NBPDR); if (addr - 1 >= vm_map_max(kernel_map)) addr = vm_map_max(kernel_map); while (kernel_vm_end < addr) { pdpe = pmap_pdpe(kernel_pmap, kernel_vm_end); if ((*pdpe & X86_PG_V) == 0) { /* We need a new PDP entry */ nkpg = vm_page_alloc(NULL, kernel_vm_end >> PDPSHIFT, VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); if (nkpg == NULL) panic("pmap_growkernel: no memory to grow kernel"); if ((nkpg->flags & PG_ZERO) == 0) pmap_zero_page(nkpg); paddr = VM_PAGE_TO_PHYS(nkpg); *pdpe = (pdp_entry_t)(paddr | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M); continue; /* try again */ } pde = pmap_pdpe_to_pde(pdpe, kernel_vm_end); if ((*pde & X86_PG_V) != 0) { kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK; if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { kernel_vm_end = vm_map_max(kernel_map); break; } continue; } nkpg = vm_page_alloc(NULL, pmap_pde_pindex(kernel_vm_end), VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); if (nkpg == NULL) panic("pmap_growkernel: no memory to grow kernel"); if ((nkpg->flags & PG_ZERO) == 0) pmap_zero_page(nkpg); paddr = VM_PAGE_TO_PHYS(nkpg); newpdir = paddr | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M; pde_store(pde, newpdir); kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK; if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { kernel_vm_end = vm_map_max(kernel_map); break; } } } /*************************************************** * page management routines. ***************************************************/ CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE); CTASSERT(_NPCM == 3); CTASSERT(_NPCPV == 168); static __inline struct pv_chunk * pv_to_chunk(pv_entry_t pv) { return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK)); } #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap) #define PC_FREE0 0xfffffffffffffffful #define PC_FREE1 0xfffffffffffffffful #define PC_FREE2 0x000000fffffffffful static const uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1, PC_FREE2 }; #ifdef PV_STATS static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail; SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0, "Current number of pv entry chunks"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0, "Current number of pv entry chunks allocated"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0, "Current number of pv entry chunks frees"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0, "Number of times tried to get a chunk page but failed."); static long pv_entry_frees, pv_entry_allocs, pv_entry_count; static int pv_entry_spare; SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0, "Current number of pv entry frees"); SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0, "Current number of pv entry allocs"); SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0, "Current number of pv entries"); SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0, "Current number of spare pv entries"); #endif static void reclaim_pv_chunk_leave_pmap(pmap_t pmap, pmap_t locked_pmap, bool start_di) { if (pmap == NULL) return; pmap_invalidate_all(pmap); if (pmap != locked_pmap) PMAP_UNLOCK(pmap); if (start_di) pmap_delayed_invl_finish(); } /* * We are in a serious low memory condition. Resort to * drastic measures to free some pages so we can allocate * another pv entry chunk. * * Returns NULL if PV entries were reclaimed from the specified pmap. * * We do not, however, unmap 2mpages because subsequent accesses will * allocate per-page pv entries until repromotion occurs, thereby * exacerbating the shortage of free pv entries. */ static vm_page_t reclaim_pv_chunk_domain(pmap_t locked_pmap, struct rwlock **lockp, int domain) { struct pv_chunks_list *pvc; struct pv_chunk *pc, *pc_marker, *pc_marker_end; struct pv_chunk_header pc_marker_b, pc_marker_end_b; struct md_page *pvh; pd_entry_t *pde; pmap_t next_pmap, pmap; pt_entry_t *pte, tpte; pt_entry_t PG_G, PG_A, PG_M, PG_RW; pv_entry_t pv; vm_offset_t va; vm_page_t m, m_pc; struct spglist free; uint64_t inuse; int bit, field, freed; bool start_di, restart; PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED); KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL")); pmap = NULL; m_pc = NULL; PG_G = PG_A = PG_M = PG_RW = 0; SLIST_INIT(&free); bzero(&pc_marker_b, sizeof(pc_marker_b)); bzero(&pc_marker_end_b, sizeof(pc_marker_end_b)); pc_marker = (struct pv_chunk *)&pc_marker_b; pc_marker_end = (struct pv_chunk *)&pc_marker_end_b; /* * A delayed invalidation block should already be active if * pmap_advise() or pmap_remove() called this function by way * of pmap_demote_pde_locked(). */ start_di = pmap_not_in_di(); pvc = &pv_chunks[domain]; mtx_lock(&pvc->pvc_lock); pvc->active_reclaims++; TAILQ_INSERT_HEAD(&pvc->pvc_list, pc_marker, pc_lru); TAILQ_INSERT_TAIL(&pvc->pvc_list, pc_marker_end, pc_lru); while ((pc = TAILQ_NEXT(pc_marker, pc_lru)) != pc_marker_end && SLIST_EMPTY(&free)) { next_pmap = pc->pc_pmap; if (next_pmap == NULL) { /* * The next chunk is a marker. However, it is * not our marker, so active_reclaims must be * > 1. Consequently, the next_chunk code * will not rotate the pv_chunks list. */ goto next_chunk; } mtx_unlock(&pvc->pvc_lock); /* * A pv_chunk can only be removed from the pc_lru list * when both pc_chunks_mutex is owned and the * corresponding pmap is locked. */ if (pmap != next_pmap) { restart = false; reclaim_pv_chunk_leave_pmap(pmap, locked_pmap, start_di); pmap = next_pmap; /* Avoid deadlock and lock recursion. */ if (pmap > locked_pmap) { RELEASE_PV_LIST_LOCK(lockp); PMAP_LOCK(pmap); if (start_di) pmap_delayed_invl_start(); mtx_lock(&pvc->pvc_lock); restart = true; } else if (pmap != locked_pmap) { if (PMAP_TRYLOCK(pmap)) { if (start_di) pmap_delayed_invl_start(); mtx_lock(&pvc->pvc_lock); restart = true; } else { pmap = NULL; /* pmap is not locked */ mtx_lock(&pvc->pvc_lock); pc = TAILQ_NEXT(pc_marker, pc_lru); if (pc == NULL || pc->pc_pmap != next_pmap) continue; goto next_chunk; } } else if (start_di) pmap_delayed_invl_start(); PG_G = pmap_global_bit(pmap); PG_A = pmap_accessed_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); if (restart) continue; } /* * Destroy every non-wired, 4 KB page mapping in the chunk. */ freed = 0; for (field = 0; field < _NPCM; field++) { for (inuse = ~pc->pc_map[field] & pc_freemask[field]; inuse != 0; inuse &= ~(1UL << bit)) { bit = bsfq(inuse); pv = &pc->pc_pventry[field * 64 + bit]; va = pv->pv_va; pde = pmap_pde(pmap, va); if ((*pde & PG_PS) != 0) continue; pte = pmap_pde_to_pte(pde, va); if ((*pte & PG_W) != 0) continue; tpte = pte_load_clear(pte); if ((tpte & PG_G) != 0) pmap_invalidate_page(pmap, va); m = PHYS_TO_VM_PAGE(tpte & PG_FRAME); if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); if ((tpte & PG_A) != 0) vm_page_aflag_set(m, PGA_REFERENCED); CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); if (TAILQ_EMPTY(&pvh->pv_list)) { vm_page_aflag_clear(m, PGA_WRITEABLE); } } pmap_delayed_invl_page(m); pc->pc_map[field] |= 1UL << bit; pmap_unuse_pt(pmap, va, *pde, &free); freed++; } } if (freed == 0) { mtx_lock(&pvc->pvc_lock); goto next_chunk; } /* Every freed mapping is for a 4 KB page. */ pmap_resident_count_dec(pmap, freed); PV_STAT(atomic_add_long(&pv_entry_frees, freed)); PV_STAT(atomic_add_int(&pv_entry_spare, freed)); PV_STAT(atomic_subtract_long(&pv_entry_count, freed)); TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); if (pc->pc_map[0] == PC_FREE0 && pc->pc_map[1] == PC_FREE1 && pc->pc_map[2] == PC_FREE2) { PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV)); PV_STAT(atomic_subtract_int(&pc_chunk_count, 1)); PV_STAT(atomic_add_int(&pc_chunk_frees, 1)); /* Entire chunk is free; return it. */ m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); dump_drop_page(m_pc->phys_addr); mtx_lock(&pvc->pvc_lock); TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru); break; } TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); mtx_lock(&pvc->pvc_lock); /* One freed pv entry in locked_pmap is sufficient. */ if (pmap == locked_pmap) break; next_chunk: TAILQ_REMOVE(&pvc->pvc_list, pc_marker, pc_lru); TAILQ_INSERT_AFTER(&pvc->pvc_list, pc, pc_marker, pc_lru); if (pvc->active_reclaims == 1 && pmap != NULL) { /* * Rotate the pv chunks list so that we do not * scan the same pv chunks that could not be * freed (because they contained a wired * and/or superpage mapping) on every * invocation of reclaim_pv_chunk(). */ while ((pc = TAILQ_FIRST(&pvc->pvc_list)) != pc_marker) { MPASS(pc->pc_pmap != NULL); TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru); TAILQ_INSERT_TAIL(&pvc->pvc_list, pc, pc_lru); } } } TAILQ_REMOVE(&pvc->pvc_list, pc_marker, pc_lru); TAILQ_REMOVE(&pvc->pvc_list, pc_marker_end, pc_lru); pvc->active_reclaims--; mtx_unlock(&pvc->pvc_lock); reclaim_pv_chunk_leave_pmap(pmap, locked_pmap, start_di); if (m_pc == NULL && !SLIST_EMPTY(&free)) { m_pc = SLIST_FIRST(&free); SLIST_REMOVE_HEAD(&free, plinks.s.ss); /* Recycle a freed page table page. */ m_pc->ref_count = 1; } vm_page_free_pages_toq(&free, true); return (m_pc); } static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp) { vm_page_t m; int i, domain; domain = PCPU_GET(domain); for (i = 0; i < vm_ndomains; i++) { m = reclaim_pv_chunk_domain(locked_pmap, lockp, domain); if (m != NULL) break; domain = (domain + 1) % vm_ndomains; } return (m); } /* * free the pv_entry back to the free list */ static void free_pv_entry(pmap_t pmap, pv_entry_t pv) { struct pv_chunk *pc; int idx, field, bit; PMAP_LOCK_ASSERT(pmap, MA_OWNED); PV_STAT(atomic_add_long(&pv_entry_frees, 1)); PV_STAT(atomic_add_int(&pv_entry_spare, 1)); PV_STAT(atomic_subtract_long(&pv_entry_count, 1)); pc = pv_to_chunk(pv); idx = pv - &pc->pc_pventry[0]; field = idx / 64; bit = idx % 64; pc->pc_map[field] |= 1ul << bit; if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1 || pc->pc_map[2] != PC_FREE2) { /* 98% of the time, pc is already at the head of the list. */ if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); } return; } TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); free_pv_chunk(pc); } static void free_pv_chunk_dequeued(struct pv_chunk *pc) { vm_page_t m; PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV)); PV_STAT(atomic_subtract_int(&pc_chunk_count, 1)); PV_STAT(atomic_add_int(&pc_chunk_frees, 1)); /* entire chunk is free, return it */ m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); dump_drop_page(m->phys_addr); vm_page_unwire_noq(m); vm_page_free(m); } static void free_pv_chunk(struct pv_chunk *pc) { struct pv_chunks_list *pvc; pvc = &pv_chunks[pc_to_domain(pc)]; mtx_lock(&pvc->pvc_lock); TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru); mtx_unlock(&pvc->pvc_lock); free_pv_chunk_dequeued(pc); } static void free_pv_chunk_batch(struct pv_chunklist *batch) { struct pv_chunks_list *pvc; struct pv_chunk *pc, *npc; int i; for (i = 0; i < vm_ndomains; i++) { if (TAILQ_EMPTY(&batch[i])) continue; pvc = &pv_chunks[i]; mtx_lock(&pvc->pvc_lock); TAILQ_FOREACH(pc, &batch[i], pc_list) { TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru); } mtx_unlock(&pvc->pvc_lock); } for (i = 0; i < vm_ndomains; i++) { TAILQ_FOREACH_SAFE(pc, &batch[i], pc_list, npc) { free_pv_chunk_dequeued(pc); } } } /* * Returns a new PV entry, allocating a new PV chunk from the system when * needed. If this PV chunk allocation fails and a PV list lock pointer was * given, a PV chunk is reclaimed from an arbitrary pmap. Otherwise, NULL is * returned. * * The given PV list lock may be released. */ static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp) { struct pv_chunks_list *pvc; int bit, field; pv_entry_t pv; struct pv_chunk *pc; vm_page_t m; PMAP_LOCK_ASSERT(pmap, MA_OWNED); PV_STAT(atomic_add_long(&pv_entry_allocs, 1)); retry: pc = TAILQ_FIRST(&pmap->pm_pvchunk); if (pc != NULL) { for (field = 0; field < _NPCM; field++) { if (pc->pc_map[field]) { bit = bsfq(pc->pc_map[field]); break; } } if (field < _NPCM) { pv = &pc->pc_pventry[field * 64 + bit]; pc->pc_map[field] &= ~(1ul << bit); /* If this was the last item, move it to tail */ if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); } PV_STAT(atomic_add_long(&pv_entry_count, 1)); PV_STAT(atomic_subtract_int(&pv_entry_spare, 1)); return (pv); } } /* No free items, allocate another chunk */ m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED); if (m == NULL) { if (lockp == NULL) { PV_STAT(pc_chunk_tryfail++); return (NULL); } m = reclaim_pv_chunk(pmap, lockp); if (m == NULL) goto retry; } PV_STAT(atomic_add_int(&pc_chunk_count, 1)); PV_STAT(atomic_add_int(&pc_chunk_allocs, 1)); dump_add_page(m->phys_addr); pc = (void *)PHYS_TO_DMAP(m->phys_addr); pc->pc_pmap = pmap; pc->pc_map[0] = PC_FREE0 & ~1ul; /* preallocated bit 0 */ pc->pc_map[1] = PC_FREE1; pc->pc_map[2] = PC_FREE2; pvc = &pv_chunks[_vm_phys_domain(m->phys_addr)]; mtx_lock(&pvc->pvc_lock); TAILQ_INSERT_TAIL(&pvc->pvc_list, pc, pc_lru); mtx_unlock(&pvc->pvc_lock); pv = &pc->pc_pventry[0]; TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); PV_STAT(atomic_add_long(&pv_entry_count, 1)); PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1)); return (pv); } /* * Returns the number of one bits within the given PV chunk map. * * The erratas for Intel processors state that "POPCNT Instruction May * Take Longer to Execute Than Expected". It is believed that the * issue is the spurious dependency on the destination register. * Provide a hint to the register rename logic that the destination * value is overwritten, by clearing it, as suggested in the * optimization manual. It should be cheap for unaffected processors * as well. * * Reference numbers for erratas are * 4th Gen Core: HSD146 * 5th Gen Core: BDM85 * 6th Gen Core: SKL029 */ static int popcnt_pc_map_pq(uint64_t *map) { u_long result, tmp; __asm __volatile("xorl %k0,%k0;popcntq %2,%0;" "xorl %k1,%k1;popcntq %3,%1;addl %k1,%k0;" "xorl %k1,%k1;popcntq %4,%1;addl %k1,%k0" : "=&r" (result), "=&r" (tmp) : "m" (map[0]), "m" (map[1]), "m" (map[2])); return (result); } /* * Ensure that the number of spare PV entries in the specified pmap meets or * exceeds the given count, "needed". * * The given PV list lock may be released. */ static void reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp) { struct pv_chunks_list *pvc; struct pch new_tail[PMAP_MEMDOM]; struct pv_chunk *pc; vm_page_t m; int avail, free, i; bool reclaimed; PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL")); /* * Newly allocated PV chunks must be stored in a private list until * the required number of PV chunks have been allocated. Otherwise, * reclaim_pv_chunk() could recycle one of these chunks. In * contrast, these chunks must be added to the pmap upon allocation. */ for (i = 0; i < PMAP_MEMDOM; i++) TAILQ_INIT(&new_tail[i]); retry: avail = 0; TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) { #ifndef __POPCNT__ if ((cpu_feature2 & CPUID2_POPCNT) == 0) bit_count((bitstr_t *)pc->pc_map, 0, sizeof(pc->pc_map) * NBBY, &free); else #endif free = popcnt_pc_map_pq(pc->pc_map); if (free == 0) break; avail += free; if (avail >= needed) break; } for (reclaimed = false; avail < needed; avail += _NPCPV) { m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED); if (m == NULL) { m = reclaim_pv_chunk(pmap, lockp); if (m == NULL) goto retry; reclaimed = true; } PV_STAT(atomic_add_int(&pc_chunk_count, 1)); PV_STAT(atomic_add_int(&pc_chunk_allocs, 1)); dump_add_page(m->phys_addr); pc = (void *)PHYS_TO_DMAP(m->phys_addr); pc->pc_pmap = pmap; pc->pc_map[0] = PC_FREE0; pc->pc_map[1] = PC_FREE1; pc->pc_map[2] = PC_FREE2; TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&new_tail[pc_to_domain(pc)], pc, pc_lru); PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV)); /* * The reclaim might have freed a chunk from the current pmap. * If that chunk contained available entries, we need to * re-count the number of available entries. */ if (reclaimed) goto retry; } for (i = 0; i < vm_ndomains; i++) { if (TAILQ_EMPTY(&new_tail[i])) continue; pvc = &pv_chunks[i]; mtx_lock(&pvc->pvc_lock); TAILQ_CONCAT(&pvc->pvc_list, &new_tail[i], pc_lru); mtx_unlock(&pvc->pvc_lock); } } /* * First find and then remove the pv entry for the specified pmap and virtual * address from the specified pv list. Returns the pv entry if found and NULL * otherwise. This operation can be performed on pv lists for either 4KB or * 2MB page mappings. */ static __inline pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) { pv_entry_t pv; TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { if (pmap == PV_PMAP(pv) && va == pv->pv_va) { TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); pvh->pv_gen++; break; } } return (pv); } /* * After demotion from a 2MB page mapping to 512 4KB page mappings, * destroy the pv entry for the 2MB page mapping and reinstantiate the pv * entries for each of the 4KB page mappings. */ static void pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, struct rwlock **lockp) { struct md_page *pvh; struct pv_chunk *pc; pv_entry_t pv; vm_offset_t va_last; vm_page_t m; int bit, field; PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT((pa & PDRMASK) == 0, ("pmap_pv_demote_pde: pa is not 2mpage aligned")); CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); /* * Transfer the 2mpage's pv entry for this mapping to the first * page's pv list. Once this transfer begins, the pv list lock * must not be released until the last pv entry is reinstantiated. */ pvh = pa_to_pvh(pa); va = trunc_2mpage(va); pv = pmap_pvh_remove(pvh, pmap, va); KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found")); m = PHYS_TO_VM_PAGE(pa); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; /* Instantiate the remaining NPTEPG - 1 pv entries. */ PV_STAT(atomic_add_long(&pv_entry_allocs, NPTEPG - 1)); va_last = va + NBPDR - PAGE_SIZE; for (;;) { pc = TAILQ_FIRST(&pmap->pm_pvchunk); KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0 || pc->pc_map[2] != 0, ("pmap_pv_demote_pde: missing spare")); for (field = 0; field < _NPCM; field++) { while (pc->pc_map[field]) { bit = bsfq(pc->pc_map[field]); pc->pc_map[field] &= ~(1ul << bit); pv = &pc->pc_pventry[field * 64 + bit]; va += PAGE_SIZE; pv->pv_va = va; m++; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_pv_demote_pde: page %p is not managed", m)); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; if (va == va_last) goto out; } } TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); } out: if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); } PV_STAT(atomic_add_long(&pv_entry_count, NPTEPG - 1)); PV_STAT(atomic_subtract_int(&pv_entry_spare, NPTEPG - 1)); } #if VM_NRESERVLEVEL > 0 /* * After promotion from 512 4KB page mappings to a single 2MB page mapping, * replace the many pv entries for the 4KB page mappings by a single pv entry * for the 2MB page mapping. */ static void pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, struct rwlock **lockp) { struct md_page *pvh; pv_entry_t pv; vm_offset_t va_last; vm_page_t m; KASSERT((pa & PDRMASK) == 0, ("pmap_pv_promote_pde: pa is not 2mpage aligned")); CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); /* * Transfer the first page's pv entry for this mapping to the 2mpage's * pv list. Aside from avoiding the cost of a call to get_pv_entry(), * a transfer avoids the possibility that get_pv_entry() calls * reclaim_pv_chunk() and that reclaim_pv_chunk() removes one of the * mappings that is being promoted. */ m = PHYS_TO_VM_PAGE(pa); va = trunc_2mpage(va); pv = pmap_pvh_remove(&m->md, pmap, va); KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found")); pvh = pa_to_pvh(pa); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); pvh->pv_gen++; /* Free the remaining NPTEPG - 1 pv entries. */ va_last = va + NBPDR - PAGE_SIZE; do { m++; va += PAGE_SIZE; pmap_pvh_free(&m->md, pmap, va); } while (va < va_last); } #endif /* VM_NRESERVLEVEL > 0 */ /* * First find and then destroy the pv entry for the specified pmap and virtual * address. This operation can be performed on pv lists for either 4KB or 2MB * page mappings. */ static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) { pv_entry_t pv; pv = pmap_pvh_remove(pvh, pmap, va); KASSERT(pv != NULL, ("pmap_pvh_free: pv not found")); free_pv_entry(pmap, pv); } /* * Conditionally create the PV entry for a 4KB page mapping if the required * memory can be allocated without resorting to reclamation. */ static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m, struct rwlock **lockp) { pv_entry_t pv; PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* Pass NULL instead of the lock pointer to disable reclamation. */ if ((pv = get_pv_entry(pmap, NULL)) != NULL) { pv->pv_va = va; CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; return (TRUE); } else return (FALSE); } /* * Create the PV entry for a 2MB page mapping. Always returns true unless the * flag PMAP_ENTER_NORECLAIM is specified. If that flag is specified, returns * false if the PV entry cannot be allocated without resorting to reclamation. */ static bool pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, pd_entry_t pde, u_int flags, struct rwlock **lockp) { struct md_page *pvh; pv_entry_t pv; vm_paddr_t pa; PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* Pass NULL instead of the lock pointer to disable reclamation. */ if ((pv = get_pv_entry(pmap, (flags & PMAP_ENTER_NORECLAIM) != 0 ? NULL : lockp)) == NULL) return (false); pv->pv_va = va; pa = pde & PG_PS_FRAME; CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); pvh = pa_to_pvh(pa); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); pvh->pv_gen++; return (true); } /* * Fills a page table page with mappings to consecutive physical pages. */ static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte) { pt_entry_t *pte; for (pte = firstpte; pte < firstpte + NPTEPG; pte++) { *pte = newpte; newpte += PAGE_SIZE; } } /* * Tries to demote a 2MB page mapping. If demotion fails, the 2MB page * mapping is invalidated. */ static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) { struct rwlock *lock; boolean_t rv; lock = NULL; rv = pmap_demote_pde_locked(pmap, pde, va, &lock); if (lock != NULL) rw_wunlock(lock); return (rv); } static void pmap_demote_pde_check(pt_entry_t *firstpte __unused, pt_entry_t newpte __unused) { #ifdef INVARIANTS #ifdef DIAGNOSTIC pt_entry_t *xpte, *ypte; for (xpte = firstpte; xpte < firstpte + NPTEPG; xpte++, newpte += PAGE_SIZE) { if ((*xpte & PG_FRAME) != (newpte & PG_FRAME)) { printf("pmap_demote_pde: xpte %zd and newpte map " "different pages: found %#lx, expected %#lx\n", xpte - firstpte, *xpte, newpte); printf("page table dump\n"); for (ypte = firstpte; ypte < firstpte + NPTEPG; ypte++) printf("%zd %#lx\n", ypte - firstpte, *ypte); panic("firstpte"); } } #else KASSERT((*firstpte & PG_FRAME) == (newpte & PG_FRAME), ("pmap_demote_pde: firstpte and newpte map different physical" " addresses")); #endif #endif } static void pmap_demote_pde_abort(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t oldpde, struct rwlock **lockp) { struct spglist free; vm_offset_t sva; SLIST_INIT(&free); sva = trunc_2mpage(va); pmap_remove_pde(pmap, pde, sva, &free, lockp); if ((oldpde & pmap_global_bit(pmap)) == 0) pmap_invalidate_pde_page(pmap, sva, oldpde); vm_page_free_pages_toq(&free, true); CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#lx in pmap %p", va, pmap); } static boolean_t pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, struct rwlock **lockp) { pd_entry_t newpde, oldpde; pt_entry_t *firstpte, newpte; pt_entry_t PG_A, PG_G, PG_M, PG_PKU_MASK, PG_RW, PG_V; vm_paddr_t mptepa; vm_page_t mpte; int PG_PTE_CACHE; bool in_kernel; PG_A = pmap_accessed_bit(pmap); PG_G = pmap_global_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); PG_V = pmap_valid_bit(pmap); PG_PTE_CACHE = pmap_cache_mask(pmap, 0); PG_PKU_MASK = pmap_pku_mask_bit(pmap); PMAP_LOCK_ASSERT(pmap, MA_OWNED); in_kernel = va >= VM_MAXUSER_ADDRESS; oldpde = *pde; KASSERT((oldpde & (PG_PS | PG_V)) == (PG_PS | PG_V), ("pmap_demote_pde: oldpde is missing PG_PS and/or PG_V")); /* * Invalidate the 2MB page mapping and return "failure" if the * mapping was never accessed. */ if ((oldpde & PG_A) == 0) { KASSERT((oldpde & PG_W) == 0, ("pmap_demote_pde: a wired mapping is missing PG_A")); pmap_demote_pde_abort(pmap, va, pde, oldpde, lockp); return (FALSE); } mpte = pmap_remove_pt_page(pmap, va); if (mpte == NULL) { KASSERT((oldpde & PG_W) == 0, ("pmap_demote_pde: page table page for a wired mapping" " is missing")); /* * If the page table page is missing and the mapping * is for a kernel address, the mapping must belong to * the direct map. Page table pages are preallocated * for every other part of the kernel address space, * so the direct map region is the only part of the * kernel address space that must be handled here. */ KASSERT(!in_kernel || (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS), ("pmap_demote_pde: No saved mpte for va %#lx", va)); /* * If the 2MB page mapping belongs to the direct map * region of the kernel's address space, then the page * allocation request specifies the highest possible * priority (VM_ALLOC_INTERRUPT). Otherwise, the * priority is normal. */ mpte = vm_page_alloc(NULL, pmap_pde_pindex(va), (in_kernel ? VM_ALLOC_INTERRUPT : VM_ALLOC_NORMAL) | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED); /* * If the allocation of the new page table page fails, * invalidate the 2MB page mapping and return "failure". */ if (mpte == NULL) { pmap_demote_pde_abort(pmap, va, pde, oldpde, lockp); return (FALSE); } if (!in_kernel) { mpte->ref_count = NPTEPG; pmap_resident_count_inc(pmap, 1); } } mptepa = VM_PAGE_TO_PHYS(mpte); firstpte = (pt_entry_t *)PHYS_TO_DMAP(mptepa); newpde = mptepa | PG_M | PG_A | (oldpde & PG_U) | PG_RW | PG_V; KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW, ("pmap_demote_pde: oldpde is missing PG_M")); newpte = oldpde & ~PG_PS; newpte = pmap_swap_pat(pmap, newpte); /* * If the page table page is not leftover from an earlier promotion, * initialize it. */ if (mpte->valid == 0) pmap_fill_ptp(firstpte, newpte); pmap_demote_pde_check(firstpte, newpte); /* * If the mapping has changed attributes, update the page table * entries. */ if ((*firstpte & PG_PTE_PROMOTE) != (newpte & PG_PTE_PROMOTE)) pmap_fill_ptp(firstpte, newpte); /* * The spare PV entries must be reserved prior to demoting the * mapping, that is, prior to changing the PDE. Otherwise, the state * of the PDE and the PV lists will be inconsistent, which can result * in reclaim_pv_chunk() attempting to remove a PV entry from the * wrong PV list and pmap_pv_demote_pde() failing to find the expected * PV entry for the 2MB page mapping that is being demoted. */ if ((oldpde & PG_MANAGED) != 0) reserve_pv_entries(pmap, NPTEPG - 1, lockp); /* * Demote the mapping. This pmap is locked. The old PDE has * PG_A set. If the old PDE has PG_RW set, it also has PG_M * set. Thus, there is no danger of a race with another * processor changing the setting of PG_A and/or PG_M between * the read above and the store below. */ if (workaround_erratum383) pmap_update_pde(pmap, va, pde, newpde); else pde_store(pde, newpde); /* * Invalidate a stale recursive mapping of the page table page. */ if (in_kernel) pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va)); /* * Demote the PV entry. */ if ((oldpde & PG_MANAGED) != 0) pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME, lockp); atomic_add_long(&pmap_pde_demotions, 1); CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#lx in pmap %p", va, pmap); return (TRUE); } /* * pmap_remove_kernel_pde: Remove a kernel superpage mapping. */ static void pmap_remove_kernel_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) { pd_entry_t newpde; vm_paddr_t mptepa; vm_page_t mpte; KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap)); PMAP_LOCK_ASSERT(pmap, MA_OWNED); mpte = pmap_remove_pt_page(pmap, va); if (mpte == NULL) panic("pmap_remove_kernel_pde: Missing pt page."); mptepa = VM_PAGE_TO_PHYS(mpte); newpde = mptepa | X86_PG_M | X86_PG_A | X86_PG_RW | X86_PG_V; /* * If this page table page was unmapped by a promotion, then it * contains valid mappings. Zero it to invalidate those mappings. */ if (mpte->valid != 0) pagezero((void *)PHYS_TO_DMAP(mptepa)); /* * Demote the mapping. */ if (workaround_erratum383) pmap_update_pde(pmap, va, pde, newpde); else pde_store(pde, newpde); /* * Invalidate a stale recursive mapping of the page table page. */ pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va)); } /* * pmap_remove_pde: do the things to unmap a superpage in a process */ static int pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, struct spglist *free, struct rwlock **lockp) { struct md_page *pvh; pd_entry_t oldpde; vm_offset_t eva, va; vm_page_t m, mpte; pt_entry_t PG_G, PG_A, PG_M, PG_RW; PG_G = pmap_global_bit(pmap); PG_A = pmap_accessed_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT((sva & PDRMASK) == 0, ("pmap_remove_pde: sva is not 2mpage aligned")); oldpde = pte_load_clear(pdq); if (oldpde & PG_W) pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE; if ((oldpde & PG_G) != 0) pmap_invalidate_pde_page(kernel_pmap, sva, oldpde); pmap_resident_count_dec(pmap, NBPDR / PAGE_SIZE); if (oldpde & PG_MANAGED) { CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, oldpde & PG_PS_FRAME); pvh = pa_to_pvh(oldpde & PG_PS_FRAME); pmap_pvh_free(pvh, pmap, sva); eva = sva + NBPDR; for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); va < eva; va += PAGE_SIZE, m++) { if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); if (oldpde & PG_A) vm_page_aflag_set(m, PGA_REFERENCED); if (TAILQ_EMPTY(&m->md.pv_list) && TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); pmap_delayed_invl_page(m); } } if (pmap == kernel_pmap) { pmap_remove_kernel_pde(pmap, pdq, sva); } else { mpte = pmap_remove_pt_page(pmap, sva); if (mpte != NULL) { KASSERT(mpte->valid == VM_PAGE_BITS_ALL, ("pmap_remove_pde: pte page not promoted")); pmap_resident_count_dec(pmap, 1); KASSERT(mpte->ref_count == NPTEPG, ("pmap_remove_pde: pte page ref count error")); mpte->ref_count = 0; pmap_add_delayed_free_list(mpte, free, FALSE); } } return (pmap_unuse_pt(pmap, sva, *pmap_pdpe(pmap, sva), free)); } /* * pmap_remove_pte: do the things to unmap a page in a process */ static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va, pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp) { struct md_page *pvh; pt_entry_t oldpte, PG_A, PG_M, PG_RW; vm_page_t m; PG_A = pmap_accessed_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); PMAP_LOCK_ASSERT(pmap, MA_OWNED); oldpte = pte_load_clear(ptq); if (oldpte & PG_W) pmap->pm_stats.wired_count -= 1; pmap_resident_count_dec(pmap, 1); if (oldpte & PG_MANAGED) { m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME); if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); if (oldpte & PG_A) vm_page_aflag_set(m, PGA_REFERENCED); CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); pmap_pvh_free(&m->md, pmap, va); if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); if (TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } pmap_delayed_invl_page(m); } return (pmap_unuse_pt(pmap, va, ptepde, free)); } /* * Remove a single page from a process address space */ static void pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, struct spglist *free) { struct rwlock *lock; pt_entry_t *pte, PG_V; PG_V = pmap_valid_bit(pmap); PMAP_LOCK_ASSERT(pmap, MA_OWNED); if ((*pde & PG_V) == 0) return; pte = pmap_pde_to_pte(pde, va); if ((*pte & PG_V) == 0) return; lock = NULL; pmap_remove_pte(pmap, pte, va, *pde, free, &lock); if (lock != NULL) rw_wunlock(lock); pmap_invalidate_page(pmap, va); } /* * Removes the specified range of addresses from the page table page. */ static bool pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, pd_entry_t *pde, struct spglist *free, struct rwlock **lockp) { pt_entry_t PG_G, *pte; vm_offset_t va; bool anyvalid; PMAP_LOCK_ASSERT(pmap, MA_OWNED); PG_G = pmap_global_bit(pmap); anyvalid = false; va = eva; for (pte = pmap_pde_to_pte(pde, sva); sva != eva; pte++, sva += PAGE_SIZE) { if (*pte == 0) { if (va != eva) { pmap_invalidate_range(pmap, va, sva); va = eva; } continue; } if ((*pte & PG_G) == 0) anyvalid = true; else if (va == eva) va = sva; if (pmap_remove_pte(pmap, pte, sva, *pde, free, lockp)) { sva += PAGE_SIZE; break; } } if (va != eva) pmap_invalidate_range(pmap, va, sva); return (anyvalid); } /* * Remove the given range of addresses from the specified map. * * It is assumed that the start and end are properly * rounded to the page size. */ void pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { struct rwlock *lock; vm_page_t mt; vm_offset_t va_next; pml5_entry_t *pml5e; pml4_entry_t *pml4e; pdp_entry_t *pdpe; pd_entry_t ptpaddr, *pde; pt_entry_t PG_G, PG_V; struct spglist free; int anyvalid; PG_G = pmap_global_bit(pmap); PG_V = pmap_valid_bit(pmap); /* * Perform an unsynchronized read. This is, however, safe. */ if (pmap->pm_stats.resident_count == 0) return; anyvalid = 0; SLIST_INIT(&free); pmap_delayed_invl_start(); PMAP_LOCK(pmap); pmap_pkru_on_remove(pmap, sva, eva); /* * special handling of removing one page. a very * common operation and easy to short circuit some * code. */ if (sva + PAGE_SIZE == eva) { pde = pmap_pde(pmap, sva); if (pde && (*pde & PG_PS) == 0) { pmap_remove_page(pmap, sva, pde, &free); goto out; } } lock = NULL; for (; sva < eva; sva = va_next) { if (pmap->pm_stats.resident_count == 0) break; if (pmap_is_la57(pmap)) { pml5e = pmap_pml5e(pmap, sva); if ((*pml5e & PG_V) == 0) { va_next = (sva + NBPML5) & ~PML5MASK; if (va_next < sva) va_next = eva; continue; } pml4e = pmap_pml5e_to_pml4e(pml5e, sva); } else { pml4e = pmap_pml4e(pmap, sva); } if ((*pml4e & PG_V) == 0) { va_next = (sva + NBPML4) & ~PML4MASK; if (va_next < sva) va_next = eva; continue; } va_next = (sva + NBPDP) & ~PDPMASK; if (va_next < sva) va_next = eva; pdpe = pmap_pml4e_to_pdpe(pml4e, sva); if ((*pdpe & PG_V) == 0) continue; if ((*pdpe & PG_PS) != 0) { KASSERT(va_next <= eva, ("partial update of non-transparent 1G mapping " "pdpe %#lx sva %#lx eva %#lx va_next %#lx", *pdpe, sva, eva, va_next)); MPASS(pmap != kernel_pmap); /* XXXKIB */ MPASS((*pdpe & (PG_MANAGED | PG_G)) == 0); anyvalid = 1; *pdpe = 0; pmap_resident_count_dec(pmap, NBPDP / PAGE_SIZE); mt = PHYS_TO_VM_PAGE(*pmap_pml4e(pmap, sva) & PG_FRAME); pmap_unwire_ptp(pmap, sva, mt, &free); continue; } /* * Calculate index for next page table. */ va_next = (sva + NBPDR) & ~PDRMASK; if (va_next < sva) va_next = eva; pde = pmap_pdpe_to_pde(pdpe, sva); ptpaddr = *pde; /* * Weed out invalid mappings. */ if (ptpaddr == 0) continue; /* * Check for large page. */ if ((ptpaddr & PG_PS) != 0) { /* * Are we removing the entire large page? If not, * demote the mapping and fall through. */ if (sva + NBPDR == va_next && eva >= va_next) { /* * The TLB entry for a PG_G mapping is * invalidated by pmap_remove_pde(). */ if ((ptpaddr & PG_G) == 0) anyvalid = 1; pmap_remove_pde(pmap, pde, sva, &free, &lock); continue; } else if (!pmap_demote_pde_locked(pmap, pde, sva, &lock)) { /* The large page mapping was destroyed. */ continue; } else ptpaddr = *pde; } /* * Limit our scan to either the end of the va represented * by the current page table page, or to the end of the * range being removed. */ if (va_next > eva) va_next = eva; if (pmap_remove_ptes(pmap, sva, va_next, pde, &free, &lock)) anyvalid = 1; } if (lock != NULL) rw_wunlock(lock); out: if (anyvalid) pmap_invalidate_all(pmap); PMAP_UNLOCK(pmap); pmap_delayed_invl_finish(); vm_page_free_pages_toq(&free, true); } /* * Routine: pmap_remove_all * Function: * Removes this physical page from * all physical maps in which it resides. * Reflects back modify bits to the pager. * * Notes: * Original versions of this routine were very * inefficient because they iteratively called * pmap_remove (slow...) */ void pmap_remove_all(vm_page_t m) { struct md_page *pvh; pv_entry_t pv; pmap_t pmap; struct rwlock *lock; pt_entry_t *pte, tpte, PG_A, PG_M, PG_RW; pd_entry_t *pde; vm_offset_t va; struct spglist free; int pvh_gen, md_gen; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_remove_all: page %p is not managed", m)); SLIST_INIT(&free); lock = VM_PAGE_TO_PV_LIST_LOCK(m); pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(VM_PAGE_TO_PHYS(m)); retry: rw_wlock(lock); while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen) { rw_wunlock(lock); PMAP_UNLOCK(pmap); goto retry; } } va = pv->pv_va; pde = pmap_pde(pmap, va); (void)pmap_demote_pde_locked(pmap, pde, va, &lock); PMAP_UNLOCK(pmap); } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; md_gen = m->md.pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { rw_wunlock(lock); PMAP_UNLOCK(pmap); goto retry; } } PG_A = pmap_accessed_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); pmap_resident_count_dec(pmap, 1); pde = pmap_pde(pmap, pv->pv_va); KASSERT((*pde & PG_PS) == 0, ("pmap_remove_all: found" " a 2mpage in page %p's pv list", m)); pte = pmap_pde_to_pte(pde, pv->pv_va); tpte = pte_load_clear(pte); if (tpte & PG_W) pmap->pm_stats.wired_count--; if (tpte & PG_A) vm_page_aflag_set(m, PGA_REFERENCED); /* * Update the vm_page_t clean and reference bits. */ if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); pmap_unuse_pt(pmap, pv->pv_va, *pde, &free); pmap_invalidate_page(pmap, pv->pv_va); TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; free_pv_entry(pmap, pv); PMAP_UNLOCK(pmap); } vm_page_aflag_clear(m, PGA_WRITEABLE); rw_wunlock(lock); pmap_delayed_invl_wait(m); vm_page_free_pages_toq(&free, true); } /* * pmap_protect_pde: do the things to protect a 2mpage in a process */ static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot) { pd_entry_t newpde, oldpde; vm_page_t m, mt; boolean_t anychanged; pt_entry_t PG_G, PG_M, PG_RW; PG_G = pmap_global_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT((sva & PDRMASK) == 0, ("pmap_protect_pde: sva is not 2mpage aligned")); anychanged = FALSE; retry: oldpde = newpde = *pde; if ((prot & VM_PROT_WRITE) == 0) { if ((oldpde & (PG_MANAGED | PG_M | PG_RW)) == (PG_MANAGED | PG_M | PG_RW)) { m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) vm_page_dirty(mt); } newpde &= ~(PG_RW | PG_M); } if ((prot & VM_PROT_EXECUTE) == 0) newpde |= pg_nx; if (newpde != oldpde) { /* * As an optimization to future operations on this PDE, clear * PG_PROMOTED. The impending invalidation will remove any * lingering 4KB page mappings from the TLB. */ if (!atomic_cmpset_long(pde, oldpde, newpde & ~PG_PROMOTED)) goto retry; if ((oldpde & PG_G) != 0) pmap_invalidate_pde_page(kernel_pmap, sva, oldpde); else anychanged = TRUE; } return (anychanged); } /* * Set the physical protection on the * specified range of this map as requested. */ void pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) { vm_page_t m; vm_offset_t va_next; pml4_entry_t *pml4e; pdp_entry_t *pdpe; pd_entry_t ptpaddr, *pde; pt_entry_t *pte, PG_G, PG_M, PG_RW, PG_V; pt_entry_t obits, pbits; boolean_t anychanged; KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot)); if (prot == VM_PROT_NONE) { pmap_remove(pmap, sva, eva); return; } if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) == (VM_PROT_WRITE|VM_PROT_EXECUTE)) return; PG_G = pmap_global_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_V = pmap_valid_bit(pmap); PG_RW = pmap_rw_bit(pmap); anychanged = FALSE; /* * Although this function delays and batches the invalidation * of stale TLB entries, it does not need to call * pmap_delayed_invl_start() and * pmap_delayed_invl_finish(), because it does not * ordinarily destroy mappings. Stale TLB entries from * protection-only changes need only be invalidated before the * pmap lock is released, because protection-only changes do * not destroy PV entries. Even operations that iterate over * a physical page's PV list of mappings, like * pmap_remove_write(), acquire the pmap lock for each * mapping. Consequently, for protection-only changes, the * pmap lock suffices to synchronize both page table and TLB * updates. * * This function only destroys a mapping if pmap_demote_pde() * fails. In that case, stale TLB entries are immediately * invalidated. */ PMAP_LOCK(pmap); for (; sva < eva; sva = va_next) { pml4e = pmap_pml4e(pmap, sva); if (pml4e == NULL || (*pml4e & PG_V) == 0) { va_next = (sva + NBPML4) & ~PML4MASK; if (va_next < sva) va_next = eva; continue; } va_next = (sva + NBPDP) & ~PDPMASK; if (va_next < sva) va_next = eva; pdpe = pmap_pml4e_to_pdpe(pml4e, sva); if ((*pdpe & PG_V) == 0) continue; if ((*pdpe & PG_PS) != 0) { KASSERT(va_next <= eva, ("partial update of non-transparent 1G mapping " "pdpe %#lx sva %#lx eva %#lx va_next %#lx", *pdpe, sva, eva, va_next)); retry_pdpe: obits = pbits = *pdpe; MPASS((pbits & (PG_MANAGED | PG_G)) == 0); MPASS(pmap != kernel_pmap); /* XXXKIB */ if ((prot & VM_PROT_WRITE) == 0) pbits &= ~(PG_RW | PG_M); if ((prot & VM_PROT_EXECUTE) == 0) pbits |= pg_nx; if (pbits != obits) { if (!atomic_cmpset_long(pdpe, obits, pbits)) /* PG_PS cannot be cleared under us, */ goto retry_pdpe; anychanged = TRUE; } continue; } va_next = (sva + NBPDR) & ~PDRMASK; if (va_next < sva) va_next = eva; pde = pmap_pdpe_to_pde(pdpe, sva); ptpaddr = *pde; /* * Weed out invalid mappings. */ if (ptpaddr == 0) continue; /* * Check for large page. */ if ((ptpaddr & PG_PS) != 0) { /* * Are we protecting the entire large page? If not, * demote the mapping and fall through. */ if (sva + NBPDR == va_next && eva >= va_next) { /* * The TLB entry for a PG_G mapping is * invalidated by pmap_protect_pde(). */ if (pmap_protect_pde(pmap, pde, sva, prot)) anychanged = TRUE; continue; } else if (!pmap_demote_pde(pmap, pde, sva)) { /* * The large page mapping was destroyed. */ continue; } } if (va_next > eva) va_next = eva; for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, sva += PAGE_SIZE) { retry: obits = pbits = *pte; if ((pbits & PG_V) == 0) continue; if ((prot & VM_PROT_WRITE) == 0) { if ((pbits & (PG_MANAGED | PG_M | PG_RW)) == (PG_MANAGED | PG_M | PG_RW)) { m = PHYS_TO_VM_PAGE(pbits & PG_FRAME); vm_page_dirty(m); } pbits &= ~(PG_RW | PG_M); } if ((prot & VM_PROT_EXECUTE) == 0) pbits |= pg_nx; if (pbits != obits) { if (!atomic_cmpset_long(pte, obits, pbits)) goto retry; if (obits & PG_G) pmap_invalidate_page(pmap, sva); else anychanged = TRUE; } } } if (anychanged) pmap_invalidate_all(pmap); PMAP_UNLOCK(pmap); } #if VM_NRESERVLEVEL > 0 static bool pmap_pde_ept_executable(pmap_t pmap, pd_entry_t pde) { if (pmap->pm_type != PT_EPT) return (false); return ((pde & EPT_PG_EXECUTE) != 0); } /* * Tries to promote the 512, contiguous 4KB page mappings that are within a * single page table page (PTP) to a single 2MB page mapping. For promotion * to occur, two conditions must be met: (1) the 4KB page mappings must map * aligned, contiguous physical memory and (2) the 4KB page mappings must have * identical characteristics. */ static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, struct rwlock **lockp) { pd_entry_t newpde; pt_entry_t *firstpte, oldpte, pa, *pte; pt_entry_t PG_G, PG_A, PG_M, PG_RW, PG_V, PG_PKU_MASK; vm_page_t mpte; int PG_PTE_CACHE; PG_A = pmap_accessed_bit(pmap); PG_G = pmap_global_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_V = pmap_valid_bit(pmap); PG_RW = pmap_rw_bit(pmap); PG_PKU_MASK = pmap_pku_mask_bit(pmap); PG_PTE_CACHE = pmap_cache_mask(pmap, 0); PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * Examine the first PTE in the specified PTP. Abort if this PTE is * either invalid, unused, or does not map the first 4KB physical page * within a 2MB page. */ firstpte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME); setpde: newpde = *firstpte; if ((newpde & ((PG_FRAME & PDRMASK) | PG_A | PG_V)) != (PG_A | PG_V) || !pmap_allow_2m_x_page(pmap, pmap_pde_ept_executable(pmap, newpde))) { atomic_add_long(&pmap_pde_p_failures, 1); CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx" " in pmap %p", va, pmap); return; } if ((newpde & (PG_M | PG_RW)) == PG_RW) { /* * When PG_M is already clear, PG_RW can be cleared without * a TLB invalidation. */ if (!atomic_cmpset_long(firstpte, newpde, newpde & ~PG_RW)) goto setpde; newpde &= ~PG_RW; } /* * Examine each of the other PTEs in the specified PTP. Abort if this * PTE maps an unexpected 4KB physical page or does not have identical * characteristics to the first PTE. */ pa = (newpde & (PG_PS_FRAME | PG_A | PG_V)) + NBPDR - PAGE_SIZE; for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) { setpte: oldpte = *pte; if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) { atomic_add_long(&pmap_pde_p_failures, 1); CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx" " in pmap %p", va, pmap); return; } if ((oldpte & (PG_M | PG_RW)) == PG_RW) { /* * When PG_M is already clear, PG_RW can be cleared * without a TLB invalidation. */ if (!atomic_cmpset_long(pte, oldpte, oldpte & ~PG_RW)) goto setpte; oldpte &= ~PG_RW; CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#lx" " in pmap %p", (oldpte & PG_FRAME & PDRMASK) | (va & ~PDRMASK), pmap); } if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) { atomic_add_long(&pmap_pde_p_failures, 1); CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx" " in pmap %p", va, pmap); return; } pa -= PAGE_SIZE; } /* * Save the page table page in its current state until the PDE * mapping the superpage is demoted by pmap_demote_pde() or * destroyed by pmap_remove_pde(). */ mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); KASSERT(mpte >= vm_page_array && mpte < &vm_page_array[vm_page_array_size], ("pmap_promote_pde: page table page is out of range")); KASSERT(mpte->pindex == pmap_pde_pindex(va), ("pmap_promote_pde: page table page's pindex is wrong")); if (pmap_insert_pt_page(pmap, mpte, true)) { atomic_add_long(&pmap_pde_p_failures, 1); CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx in pmap %p", va, pmap); return; } /* * Promote the pv entries. */ if ((newpde & PG_MANAGED) != 0) pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME, lockp); /* * Propagate the PAT index to its proper position. */ newpde = pmap_swap_pat(pmap, newpde); /* * Map the superpage. */ if (workaround_erratum383) pmap_update_pde(pmap, va, pde, PG_PS | newpde); else pde_store(pde, PG_PROMOTED | PG_PS | newpde); atomic_add_long(&pmap_pde_promotions, 1); CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#lx" " in pmap %p", va, pmap); } #endif /* VM_NRESERVLEVEL > 0 */ static int pmap_enter_largepage(pmap_t pmap, vm_offset_t va, pt_entry_t newpte, int flags, int psind) { vm_page_t mp; pt_entry_t origpte, *pml4e, *pdpe, *pde, pten, PG_V; PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT(psind > 0 && psind < MAXPAGESIZES && pagesizes[psind] != 0, ("psind %d unexpected", psind)); KASSERT(((newpte & PG_FRAME) & (pagesizes[psind] - 1)) == 0, ("unaligned phys address %#lx newpte %#lx psind %d", newpte & PG_FRAME, newpte, psind)); KASSERT((va & (pagesizes[psind] - 1)) == 0, ("unaligned va %#lx psind %d", va, psind)); KASSERT(va < VM_MAXUSER_ADDRESS, ("kernel mode non-transparent superpage")); /* XXXKIB */ KASSERT(va + pagesizes[psind] < VM_MAXUSER_ADDRESS, ("overflowing user map va %#lx psind %d", va, psind)); /* XXXKIB */ PG_V = pmap_valid_bit(pmap); restart: if (!pmap_pkru_same(pmap, va, va + pagesizes[psind])) return (KERN_PROTECTION_FAILURE); pten = newpte; if (va < VM_MAXUSER_ADDRESS && pmap->pm_type == PT_X86) pten |= pmap_pkru_get(pmap, va); if (psind == 2) { /* 1G */ pml4e = pmap_pml4e(pmap, va); if (pml4e == NULL || (*pml4e & PG_V) == 0) { mp = _pmap_allocpte(pmap, pmap_pml4e_pindex(va), NULL, va); if (mp == NULL) goto allocf; pdpe = (pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mp)); pdpe = &pdpe[pmap_pdpe_index(va)]; origpte = *pdpe; MPASS(origpte == 0); } else { pdpe = pmap_pml4e_to_pdpe(pml4e, va); KASSERT(pdpe != NULL, ("va %#lx lost pdpe", va)); origpte = *pdpe; if ((origpte & PG_V) == 0) { mp = PHYS_TO_VM_PAGE(*pml4e & PG_FRAME); mp->ref_count++; } } *pdpe = pten; } else /* (psind == 1) */ { /* 2M */ pde = pmap_pde(pmap, va); if (pde == NULL) { mp = _pmap_allocpte(pmap, pmap_pdpe_pindex(va), NULL, va); if (mp == NULL) goto allocf; pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mp)); pde = &pde[pmap_pde_index(va)]; origpte = *pde; MPASS(origpte == 0); } else { origpte = *pde; if ((origpte & PG_V) == 0) { pdpe = pmap_pdpe(pmap, va); MPASS(pdpe != NULL && (*pdpe & PG_V) != 0); mp = PHYS_TO_VM_PAGE(*pdpe & PG_FRAME); mp->ref_count++; } } *pde = pten; } KASSERT((origpte & PG_V) == 0 || ((origpte & PG_PS) != 0 && (origpte & PG_PS_FRAME) == (pten & PG_PS_FRAME)), ("va %#lx changing %s phys page origpte %#lx pten %#lx", va, psind == 2 ? "1G" : "2M", origpte, pten)); if ((pten & PG_W) != 0 && (origpte & PG_W) == 0) pmap->pm_stats.wired_count += pagesizes[psind] / PAGE_SIZE; else if ((pten & PG_W) == 0 && (origpte & PG_W) != 0) pmap->pm_stats.wired_count -= pagesizes[psind] / PAGE_SIZE; if ((origpte & PG_V) == 0) pmap_resident_count_inc(pmap, pagesizes[psind] / PAGE_SIZE); return (KERN_SUCCESS); allocf: if ((flags & PMAP_ENTER_NOSLEEP) != 0) return (KERN_RESOURCE_SHORTAGE); PMAP_UNLOCK(pmap); vm_wait(NULL); PMAP_LOCK(pmap); goto restart; } /* * Insert the given physical page (p) at * the specified virtual address (v) in the * target physical map with the protection requested. * * If specified, the page will be wired down, meaning * that the related pte can not be reclaimed. * * NB: This is the only routine which MAY NOT lazy-evaluate * or lose information. That is, this routine must actually * insert this page into the given map NOW. * * When destroying both a page table and PV entry, this function * performs the TLB invalidation before releasing the PV list * lock, so we do not need pmap_delayed_invl_page() calls here. */ int pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, u_int flags, int8_t psind) { struct rwlock *lock; pd_entry_t *pde; pt_entry_t *pte, PG_G, PG_A, PG_M, PG_RW, PG_V; pt_entry_t newpte, origpte; pv_entry_t pv; vm_paddr_t opa, pa; vm_page_t mpte, om; int rv; boolean_t nosleep; PG_A = pmap_accessed_bit(pmap); PG_G = pmap_global_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_V = pmap_valid_bit(pmap); PG_RW = pmap_rw_bit(pmap); va = trunc_page(va); KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig")); KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS, ("pmap_enter: invalid to pmap_enter page table pages (va: 0x%lx)", va)); KASSERT((m->oflags & VPO_UNMANAGED) != 0 || va < kmi.clean_sva || va >= kmi.clean_eva, ("pmap_enter: managed mapping within the clean submap")); if ((m->oflags & VPO_UNMANAGED) == 0) VM_PAGE_OBJECT_BUSY_ASSERT(m); KASSERT((flags & PMAP_ENTER_RESERVED) == 0, ("pmap_enter: flags %u has reserved bits set", flags)); pa = VM_PAGE_TO_PHYS(m); newpte = (pt_entry_t)(pa | PG_A | PG_V); if ((flags & VM_PROT_WRITE) != 0) newpte |= PG_M; if ((prot & VM_PROT_WRITE) != 0) newpte |= PG_RW; KASSERT((newpte & (PG_M | PG_RW)) != PG_M, ("pmap_enter: flags includes VM_PROT_WRITE but prot doesn't")); if ((prot & VM_PROT_EXECUTE) == 0) newpte |= pg_nx; if ((flags & PMAP_ENTER_WIRED) != 0) newpte |= PG_W; if (va < VM_MAXUSER_ADDRESS) newpte |= PG_U; if (pmap == kernel_pmap) newpte |= PG_G; newpte |= pmap_cache_bits(pmap, m->md.pat_mode, psind > 0); /* * Set modified bit gratuitously for writeable mappings if * the page is unmanaged. We do not want to take a fault * to do the dirty bit accounting for these mappings. */ if ((m->oflags & VPO_UNMANAGED) != 0) { if ((newpte & PG_RW) != 0) newpte |= PG_M; } else newpte |= PG_MANAGED; lock = NULL; PMAP_LOCK(pmap); if ((flags & PMAP_ENTER_LARGEPAGE) != 0) { KASSERT((m->oflags & VPO_UNMANAGED) != 0, ("managed largepage va %#lx flags %#x", va, flags)); rv = pmap_enter_largepage(pmap, va, newpte | PG_PS, flags, psind); goto out; } if (psind == 1) { /* Assert the required virtual and physical alignment. */ KASSERT((va & PDRMASK) == 0, ("pmap_enter: va unaligned")); KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind")); rv = pmap_enter_pde(pmap, va, newpte | PG_PS, flags, m, &lock); goto out; } mpte = NULL; /* * In the case that a page table page is not * resident, we are creating it here. */ retry: pde = pmap_pde(pmap, va); if (pde != NULL && (*pde & PG_V) != 0 && ((*pde & PG_PS) == 0 || pmap_demote_pde_locked(pmap, pde, va, &lock))) { pte = pmap_pde_to_pte(pde, va); if (va < VM_MAXUSER_ADDRESS && mpte == NULL) { mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); mpte->ref_count++; } } else if (va < VM_MAXUSER_ADDRESS) { /* * Here if the pte page isn't mapped, or if it has been * deallocated. */ nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0; mpte = _pmap_allocpte(pmap, pmap_pde_pindex(va), nosleep ? NULL : &lock, va); if (mpte == NULL && nosleep) { rv = KERN_RESOURCE_SHORTAGE; goto out; } goto retry; } else panic("pmap_enter: invalid page directory va=%#lx", va); origpte = *pte; pv = NULL; if (va < VM_MAXUSER_ADDRESS && pmap->pm_type == PT_X86) newpte |= pmap_pkru_get(pmap, va); /* * Is the specified virtual address already mapped? */ if ((origpte & PG_V) != 0) { /* * Wiring change, just update stats. We don't worry about * wiring PT pages as they remain resident as long as there * are valid mappings in them. Hence, if a user page is wired, * the PT page will be also. */ if ((newpte & PG_W) != 0 && (origpte & PG_W) == 0) pmap->pm_stats.wired_count++; else if ((newpte & PG_W) == 0 && (origpte & PG_W) != 0) pmap->pm_stats.wired_count--; /* * Remove the extra PT page reference. */ if (mpte != NULL) { mpte->ref_count--; KASSERT(mpte->ref_count > 0, ("pmap_enter: missing reference to page table page," " va: 0x%lx", va)); } /* * Has the physical page changed? */ opa = origpte & PG_FRAME; if (opa == pa) { /* * No, might be a protection or wiring change. */ if ((origpte & PG_MANAGED) != 0 && (newpte & PG_RW) != 0) vm_page_aflag_set(m, PGA_WRITEABLE); if (((origpte ^ newpte) & ~(PG_M | PG_A)) == 0) goto unchanged; goto validate; } /* * The physical page has changed. Temporarily invalidate * the mapping. This ensures that all threads sharing the * pmap keep a consistent view of the mapping, which is * necessary for the correct handling of COW faults. It * also permits reuse of the old mapping's PV entry, * avoiding an allocation. * * For consistency, handle unmanaged mappings the same way. */ origpte = pte_load_clear(pte); KASSERT((origpte & PG_FRAME) == opa, ("pmap_enter: unexpected pa update for %#lx", va)); if ((origpte & PG_MANAGED) != 0) { om = PHYS_TO_VM_PAGE(opa); /* * The pmap lock is sufficient to synchronize with * concurrent calls to pmap_page_test_mappings() and * pmap_ts_referenced(). */ if ((origpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(om); if ((origpte & PG_A) != 0) { pmap_invalidate_page(pmap, va); vm_page_aflag_set(om, PGA_REFERENCED); } CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa); pv = pmap_pvh_remove(&om->md, pmap, va); KASSERT(pv != NULL, ("pmap_enter: no PV entry for %#lx", va)); if ((newpte & PG_MANAGED) == 0) free_pv_entry(pmap, pv); if ((om->a.flags & PGA_WRITEABLE) != 0 && TAILQ_EMPTY(&om->md.pv_list) && ((om->flags & PG_FICTITIOUS) != 0 || TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list))) vm_page_aflag_clear(om, PGA_WRITEABLE); } else { /* * Since this mapping is unmanaged, assume that PG_A * is set. */ pmap_invalidate_page(pmap, va); } origpte = 0; } else { /* * Increment the counters. */ if ((newpte & PG_W) != 0) pmap->pm_stats.wired_count++; pmap_resident_count_inc(pmap, 1); } /* * Enter on the PV list if part of our managed memory. */ if ((newpte & PG_MANAGED) != 0) { if (pv == NULL) { pv = get_pv_entry(pmap, &lock); pv->pv_va = va; } CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; if ((newpte & PG_RW) != 0) vm_page_aflag_set(m, PGA_WRITEABLE); } /* * Update the PTE. */ if ((origpte & PG_V) != 0) { validate: origpte = pte_load_store(pte, newpte); KASSERT((origpte & PG_FRAME) == pa, ("pmap_enter: unexpected pa update for %#lx", va)); if ((newpte & PG_M) == 0 && (origpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { if ((origpte & PG_MANAGED) != 0) vm_page_dirty(m); /* * Although the PTE may still have PG_RW set, TLB * invalidation may nonetheless be required because * the PTE no longer has PG_M set. */ } else if ((origpte & PG_NX) != 0 || (newpte & PG_NX) == 0) { /* * This PTE change does not require TLB invalidation. */ goto unchanged; } if ((origpte & PG_A) != 0) pmap_invalidate_page(pmap, va); } else pte_store(pte, newpte); unchanged: #if VM_NRESERVLEVEL > 0 /* * If both the page table page and the reservation are fully * populated, then attempt promotion. */ if ((mpte == NULL || mpte->ref_count == NPTEPG) && pmap_ps_enabled(pmap) && (m->flags & PG_FICTITIOUS) == 0 && vm_reserv_level_iffullpop(m) == 0) pmap_promote_pde(pmap, pde, va, &lock); #endif rv = KERN_SUCCESS; out: if (lock != NULL) rw_wunlock(lock); PMAP_UNLOCK(pmap); return (rv); } /* * Tries to create a read- and/or execute-only 2MB page mapping. Returns true * if successful. Returns false if (1) a page table page cannot be allocated * without sleeping, (2) a mapping already exists at the specified virtual * address, or (3) a PV entry cannot be allocated without reclaiming another * PV entry. */ static bool pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, struct rwlock **lockp) { pd_entry_t newpde; pt_entry_t PG_V; PMAP_LOCK_ASSERT(pmap, MA_OWNED); PG_V = pmap_valid_bit(pmap); newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(pmap, m->md.pat_mode, 1) | PG_PS | PG_V; if ((m->oflags & VPO_UNMANAGED) == 0) newpde |= PG_MANAGED; if ((prot & VM_PROT_EXECUTE) == 0) newpde |= pg_nx; if (va < VM_MAXUSER_ADDRESS) newpde |= PG_U; return (pmap_enter_pde(pmap, va, newpde, PMAP_ENTER_NOSLEEP | PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, NULL, lockp) == KERN_SUCCESS); } /* * Returns true if every page table entry in the specified page table page is * zero. */ static bool pmap_every_pte_zero(vm_paddr_t pa) { pt_entry_t *pt_end, *pte; KASSERT((pa & PAGE_MASK) == 0, ("pa is misaligned")); pte = (pt_entry_t *)PHYS_TO_DMAP(pa); for (pt_end = pte + NPTEPG; pte < pt_end; pte++) { if (*pte != 0) return (false); } return (true); } /* * Tries to create the specified 2MB page mapping. Returns KERN_SUCCESS if * the mapping was created, and either KERN_FAILURE or KERN_RESOURCE_SHORTAGE * otherwise. Returns KERN_FAILURE if PMAP_ENTER_NOREPLACE was specified and * a mapping already exists at the specified virtual address. Returns * KERN_RESOURCE_SHORTAGE if PMAP_ENTER_NOSLEEP was specified and a page table * page allocation failed. Returns KERN_RESOURCE_SHORTAGE if * PMAP_ENTER_NORECLAIM was specified and a PV entry allocation failed. * * The parameter "m" is only used when creating a managed, writeable mapping. */ static int pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde, u_int flags, vm_page_t m, struct rwlock **lockp) { struct spglist free; pd_entry_t oldpde, *pde; pt_entry_t PG_G, PG_RW, PG_V; vm_page_t mt, pdpg; KASSERT(pmap == kernel_pmap || (newpde & PG_W) == 0, ("pmap_enter_pde: cannot create wired user mapping")); PG_G = pmap_global_bit(pmap); PG_RW = pmap_rw_bit(pmap); KASSERT((newpde & (pmap_modified_bit(pmap) | PG_RW)) != PG_RW, ("pmap_enter_pde: newpde is missing PG_M")); PG_V = pmap_valid_bit(pmap); PMAP_LOCK_ASSERT(pmap, MA_OWNED); if (!pmap_allow_2m_x_page(pmap, pmap_pde_ept_executable(pmap, newpde))) { CTR2(KTR_PMAP, "pmap_enter_pde: 2m x blocked for va %#lx" " in pmap %p", va, pmap); return (KERN_FAILURE); } if ((pde = pmap_alloc_pde(pmap, va, &pdpg, (flags & PMAP_ENTER_NOSLEEP) != 0 ? NULL : lockp)) == NULL) { CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" " in pmap %p", va, pmap); return (KERN_RESOURCE_SHORTAGE); } /* * If pkru is not same for the whole pde range, return failure * and let vm_fault() cope. Check after pde allocation, since * it could sleep. */ if (!pmap_pkru_same(pmap, va, va + NBPDR)) { pmap_abort_ptp(pmap, va, pdpg); return (KERN_FAILURE); } if (va < VM_MAXUSER_ADDRESS && pmap->pm_type == PT_X86) { newpde &= ~X86_PG_PKU_MASK; newpde |= pmap_pkru_get(pmap, va); } /* * If there are existing mappings, either abort or remove them. */ oldpde = *pde; if ((oldpde & PG_V) != 0) { KASSERT(pdpg == NULL || pdpg->ref_count > 1, ("pmap_enter_pde: pdpg's reference count is too low")); if ((flags & PMAP_ENTER_NOREPLACE) != 0 && (va < VM_MAXUSER_ADDRESS || (oldpde & PG_PS) != 0 || !pmap_every_pte_zero(oldpde & PG_FRAME))) { if (pdpg != NULL) pdpg->ref_count--; CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" " in pmap %p", va, pmap); return (KERN_FAILURE); } /* Break the existing mapping(s). */ SLIST_INIT(&free); if ((oldpde & PG_PS) != 0) { /* * The reference to the PD page that was acquired by * pmap_alloc_pde() ensures that it won't be freed. * However, if the PDE resulted from a promotion, then * a reserved PT page could be freed. */ (void)pmap_remove_pde(pmap, pde, va, &free, lockp); if ((oldpde & PG_G) == 0) pmap_invalidate_pde_page(pmap, va, oldpde); } else { pmap_delayed_invl_start(); if (pmap_remove_ptes(pmap, va, va + NBPDR, pde, &free, lockp)) pmap_invalidate_all(pmap); pmap_delayed_invl_finish(); } if (va < VM_MAXUSER_ADDRESS) { vm_page_free_pages_toq(&free, true); KASSERT(*pde == 0, ("pmap_enter_pde: non-zero pde %p", pde)); } else { KASSERT(SLIST_EMPTY(&free), ("pmap_enter_pde: freed kernel page table page")); /* * Both pmap_remove_pde() and pmap_remove_ptes() will * leave the kernel page table page zero filled. */ mt = PHYS_TO_VM_PAGE(*pde & PG_FRAME); if (pmap_insert_pt_page(pmap, mt, false)) panic("pmap_enter_pde: trie insert failed"); } } if ((newpde & PG_MANAGED) != 0) { /* * Abort this mapping if its PV entry could not be created. */ if (!pmap_pv_insert_pde(pmap, va, newpde, flags, lockp)) { if (pdpg != NULL) pmap_abort_ptp(pmap, va, pdpg); CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" " in pmap %p", va, pmap); return (KERN_RESOURCE_SHORTAGE); } if ((newpde & PG_RW) != 0) { for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) vm_page_aflag_set(mt, PGA_WRITEABLE); } } /* * Increment counters. */ if ((newpde & PG_W) != 0) pmap->pm_stats.wired_count += NBPDR / PAGE_SIZE; pmap_resident_count_inc(pmap, NBPDR / PAGE_SIZE); /* * Map the superpage. (This is not a promoted mapping; there will not * be any lingering 4KB page mappings in the TLB.) */ pde_store(pde, newpde); atomic_add_long(&pmap_pde_mappings, 1); CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx in pmap %p", va, pmap); return (KERN_SUCCESS); } /* * Maps a sequence of resident pages belonging to the same object. * The sequence begins with the given page m_start. This page is * mapped at the given virtual address start. Each subsequent page is * mapped at a virtual address that is offset from start by the same * amount as the page is offset from m_start within the object. The * last page in the sequence is the page with the largest offset from * m_start that can be mapped at a virtual address less than the given * virtual address end. Not every virtual page between start and end * is mapped; only those for which a resident page exists with the * corresponding offset from m_start are mapped. */ void pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, vm_page_t m_start, vm_prot_t prot) { struct rwlock *lock; vm_offset_t va; vm_page_t m, mpte; vm_pindex_t diff, psize; VM_OBJECT_ASSERT_LOCKED(m_start->object); psize = atop(end - start); mpte = NULL; m = m_start; lock = NULL; PMAP_LOCK(pmap); while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { va = start + ptoa(diff); if ((va & PDRMASK) == 0 && va + NBPDR <= end && m->psind == 1 && pmap_ps_enabled(pmap) && pmap_allow_2m_x_page(pmap, (prot & VM_PROT_EXECUTE) != 0) && pmap_enter_2mpage(pmap, va, m, prot, &lock)) m = &m[NBPDR / PAGE_SIZE - 1]; else mpte = pmap_enter_quick_locked(pmap, va, m, prot, mpte, &lock); m = TAILQ_NEXT(m, listq); } if (lock != NULL) rw_wunlock(lock); PMAP_UNLOCK(pmap); } /* * this code makes some *MAJOR* assumptions: * 1. Current pmap & pmap exists. * 2. Not wired. * 3. Read access. * 4. No page table pages. * but is *MUCH* faster than pmap_enter... */ void pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) { struct rwlock *lock; lock = NULL; PMAP_LOCK(pmap); (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock); if (lock != NULL) rw_wunlock(lock); PMAP_UNLOCK(pmap); } static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp) { pt_entry_t newpte, *pte, PG_V; KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva || (m->oflags & VPO_UNMANAGED) != 0, ("pmap_enter_quick_locked: managed mapping within the clean submap")); PG_V = pmap_valid_bit(pmap); PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * In the case that a page table page is not * resident, we are creating it here. */ if (va < VM_MAXUSER_ADDRESS) { vm_pindex_t ptepindex; pd_entry_t *ptepa; /* * Calculate pagetable page index */ ptepindex = pmap_pde_pindex(va); if (mpte && (mpte->pindex == ptepindex)) { mpte->ref_count++; } else { /* * Get the page directory entry */ ptepa = pmap_pde(pmap, va); /* * If the page table page is mapped, we just increment * the hold count, and activate it. Otherwise, we * attempt to allocate a page table page. If this * attempt fails, we don't retry. Instead, we give up. */ if (ptepa && (*ptepa & PG_V) != 0) { if (*ptepa & PG_PS) return (NULL); mpte = PHYS_TO_VM_PAGE(*ptepa & PG_FRAME); mpte->ref_count++; } else { /* * Pass NULL instead of the PV list lock * pointer, because we don't intend to sleep. */ mpte = _pmap_allocpte(pmap, ptepindex, NULL, va); if (mpte == NULL) return (mpte); } } pte = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte)); pte = &pte[pmap_pte_index(va)]; } else { mpte = NULL; pte = vtopte(va); } if (*pte) { if (mpte != NULL) mpte->ref_count--; return (NULL); } /* * Enter on the PV list if part of our managed memory. */ if ((m->oflags & VPO_UNMANAGED) == 0 && !pmap_try_insert_pv_entry(pmap, va, m, lockp)) { if (mpte != NULL) pmap_abort_ptp(pmap, va, mpte); return (NULL); } /* * Increment counters */ pmap_resident_count_inc(pmap, 1); newpte = VM_PAGE_TO_PHYS(m) | PG_V | pmap_cache_bits(pmap, m->md.pat_mode, 0); if ((m->oflags & VPO_UNMANAGED) == 0) newpte |= PG_MANAGED; if ((prot & VM_PROT_EXECUTE) == 0) newpte |= pg_nx; if (va < VM_MAXUSER_ADDRESS) newpte |= PG_U | pmap_pkru_get(pmap, va); pte_store(pte, newpte); return (mpte); } /* * Make a temporary mapping for a physical address. This is only intended * to be used for panic dumps. */ void * pmap_kenter_temporary(vm_paddr_t pa, int i) { vm_offset_t va; va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE); pmap_kenter(va, pa); invlpg(va); return ((void *)crashdumpmap); } /* * This code maps large physical mmap regions into the * processor address space. Note that some shortcuts * are taken, but the code works. */ void pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, vm_pindex_t pindex, vm_size_t size) { pd_entry_t *pde; pt_entry_t PG_A, PG_M, PG_RW, PG_V; vm_paddr_t pa, ptepa; vm_page_t p, pdpg; int pat_mode; PG_A = pmap_accessed_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_V = pmap_valid_bit(pmap); PG_RW = pmap_rw_bit(pmap); VM_OBJECT_ASSERT_WLOCKED(object); KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, ("pmap_object_init_pt: non-device object")); if ((addr & (NBPDR - 1)) == 0 && (size & (NBPDR - 1)) == 0) { if (!pmap_ps_enabled(pmap)) return; if (!vm_object_populate(object, pindex, pindex + atop(size))) return; p = vm_page_lookup(object, pindex); KASSERT(p->valid == VM_PAGE_BITS_ALL, ("pmap_object_init_pt: invalid page %p", p)); pat_mode = p->md.pat_mode; /* * Abort the mapping if the first page is not physically * aligned to a 2MB page boundary. */ ptepa = VM_PAGE_TO_PHYS(p); if (ptepa & (NBPDR - 1)) return; /* * Skip the first page. Abort the mapping if the rest of * the pages are not physically contiguous or have differing * memory attributes. */ p = TAILQ_NEXT(p, listq); for (pa = ptepa + PAGE_SIZE; pa < ptepa + size; pa += PAGE_SIZE) { KASSERT(p->valid == VM_PAGE_BITS_ALL, ("pmap_object_init_pt: invalid page %p", p)); if (pa != VM_PAGE_TO_PHYS(p) || pat_mode != p->md.pat_mode) return; p = TAILQ_NEXT(p, listq); } /* * Map using 2MB pages. Since "ptepa" is 2M aligned and * "size" is a multiple of 2M, adding the PAT setting to "pa" * will not affect the termination of this loop. */ PMAP_LOCK(pmap); for (pa = ptepa | pmap_cache_bits(pmap, pat_mode, 1); pa < ptepa + size; pa += NBPDR) { pde = pmap_alloc_pde(pmap, addr, &pdpg, NULL); if (pde == NULL) { /* * The creation of mappings below is only an * optimization. If a page directory page * cannot be allocated without blocking, * continue on to the next mapping rather than * blocking. */ addr += NBPDR; continue; } if ((*pde & PG_V) == 0) { pde_store(pde, pa | PG_PS | PG_M | PG_A | PG_U | PG_RW | PG_V); pmap_resident_count_inc(pmap, NBPDR / PAGE_SIZE); atomic_add_long(&pmap_pde_mappings, 1); } else { /* Continue on if the PDE is already valid. */ pdpg->ref_count--; KASSERT(pdpg->ref_count > 0, ("pmap_object_init_pt: missing reference " "to page directory page, va: 0x%lx", addr)); } addr += NBPDR; } PMAP_UNLOCK(pmap); } } /* * Clear the wired attribute from the mappings for the specified range of * addresses in the given pmap. Every valid mapping within that range * must have the wired attribute set. In contrast, invalid mappings * cannot have the wired attribute set, so they are ignored. * * The wired attribute of the page table entry is not a hardware * feature, so there is no need to invalidate any TLB entries. * Since pmap_demote_pde() for the wired entry must never fail, * pmap_delayed_invl_start()/finish() calls around the * function are not needed. */ void pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t va_next; pml4_entry_t *pml4e; pdp_entry_t *pdpe; pd_entry_t *pde; pt_entry_t *pte, PG_V, PG_G; PG_V = pmap_valid_bit(pmap); PG_G = pmap_global_bit(pmap); PMAP_LOCK(pmap); for (; sva < eva; sva = va_next) { pml4e = pmap_pml4e(pmap, sva); if (pml4e == NULL || (*pml4e & PG_V) == 0) { va_next = (sva + NBPML4) & ~PML4MASK; if (va_next < sva) va_next = eva; continue; } va_next = (sva + NBPDP) & ~PDPMASK; if (va_next < sva) va_next = eva; pdpe = pmap_pml4e_to_pdpe(pml4e, sva); if ((*pdpe & PG_V) == 0) continue; if ((*pdpe & PG_PS) != 0) { KASSERT(va_next <= eva, ("partial update of non-transparent 1G mapping " "pdpe %#lx sva %#lx eva %#lx va_next %#lx", *pdpe, sva, eva, va_next)); MPASS(pmap != kernel_pmap); /* XXXKIB */ MPASS((*pdpe & (PG_MANAGED | PG_G)) == 0); atomic_clear_long(pdpe, PG_W); pmap->pm_stats.wired_count -= NBPDP / PAGE_SIZE; continue; } va_next = (sva + NBPDR) & ~PDRMASK; if (va_next < sva) va_next = eva; pde = pmap_pdpe_to_pde(pdpe, sva); if ((*pde & PG_V) == 0) continue; if ((*pde & PG_PS) != 0) { if ((*pde & PG_W) == 0) panic("pmap_unwire: pde %#jx is missing PG_W", (uintmax_t)*pde); /* * Are we unwiring the entire large page? If not, * demote the mapping and fall through. */ if (sva + NBPDR == va_next && eva >= va_next) { atomic_clear_long(pde, PG_W); pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE; continue; } else if (!pmap_demote_pde(pmap, pde, sva)) panic("pmap_unwire: demotion failed"); } if (va_next > eva) va_next = eva; for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, sva += PAGE_SIZE) { if ((*pte & PG_V) == 0) continue; if ((*pte & PG_W) == 0) panic("pmap_unwire: pte %#jx is missing PG_W", (uintmax_t)*pte); /* * PG_W must be cleared atomically. Although the pmap * lock synchronizes access to PG_W, another processor * could be setting PG_M and/or PG_A concurrently. */ atomic_clear_long(pte, PG_W); pmap->pm_stats.wired_count--; } } PMAP_UNLOCK(pmap); } /* * Copy the range specified by src_addr/len * from the source map to the range dst_addr/len * in the destination map. * * This routine is only advisory and need not do anything. */ void pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, vm_offset_t src_addr) { struct rwlock *lock; pml4_entry_t *pml4e; pdp_entry_t *pdpe; pd_entry_t *pde, srcptepaddr; pt_entry_t *dst_pte, PG_A, PG_M, PG_V, ptetemp, *src_pte; vm_offset_t addr, end_addr, va_next; vm_page_t dst_pdpg, dstmpte, srcmpte; if (dst_addr != src_addr) return; if (dst_pmap->pm_type != src_pmap->pm_type) return; /* * EPT page table entries that require emulation of A/D bits are * sensitive to clearing the PG_A bit (aka EPT_PG_READ). Although * we clear PG_M (aka EPT_PG_WRITE) concomitantly, the PG_U bit * (aka EPT_PG_EXECUTE) could still be set. Since some EPT * implementations flag an EPT misconfiguration for exec-only * mappings we skip this function entirely for emulated pmaps. */ if (pmap_emulate_ad_bits(dst_pmap)) return; end_addr = src_addr + len; lock = NULL; if (dst_pmap < src_pmap) { PMAP_LOCK(dst_pmap); PMAP_LOCK(src_pmap); } else { PMAP_LOCK(src_pmap); PMAP_LOCK(dst_pmap); } PG_A = pmap_accessed_bit(dst_pmap); PG_M = pmap_modified_bit(dst_pmap); PG_V = pmap_valid_bit(dst_pmap); for (addr = src_addr; addr < end_addr; addr = va_next) { KASSERT(addr < UPT_MIN_ADDRESS, ("pmap_copy: invalid to pmap_copy page tables")); pml4e = pmap_pml4e(src_pmap, addr); if (pml4e == NULL || (*pml4e & PG_V) == 0) { va_next = (addr + NBPML4) & ~PML4MASK; if (va_next < addr) va_next = end_addr; continue; } va_next = (addr + NBPDP) & ~PDPMASK; if (va_next < addr) va_next = end_addr; pdpe = pmap_pml4e_to_pdpe(pml4e, addr); if ((*pdpe & PG_V) == 0) continue; if ((*pdpe & PG_PS) != 0) { KASSERT(va_next <= end_addr, ("partial update of non-transparent 1G mapping " "pdpe %#lx sva %#lx eva %#lx va_next %#lx", *pdpe, addr, end_addr, va_next)); MPASS((addr & PDPMASK) == 0); MPASS((*pdpe & PG_MANAGED) == 0); srcptepaddr = *pdpe; pdpe = pmap_pdpe(dst_pmap, addr); if (pdpe == NULL) { if (_pmap_allocpte(dst_pmap, pmap_pml4e_pindex(addr), NULL, addr) == NULL) break; pdpe = pmap_pdpe(dst_pmap, addr); } else { pml4e = pmap_pml4e(dst_pmap, addr); dst_pdpg = PHYS_TO_VM_PAGE(*pml4e & PG_FRAME); dst_pdpg->ref_count++; } KASSERT(*pdpe == 0, ("1G mapping present in dst pmap " "pdpe %#lx sva %#lx eva %#lx va_next %#lx", *pdpe, addr, end_addr, va_next)); *pdpe = srcptepaddr & ~PG_W; pmap_resident_count_inc(dst_pmap, NBPDP / PAGE_SIZE); continue; } va_next = (addr + NBPDR) & ~PDRMASK; if (va_next < addr) va_next = end_addr; pde = pmap_pdpe_to_pde(pdpe, addr); srcptepaddr = *pde; if (srcptepaddr == 0) continue; if (srcptepaddr & PG_PS) { if ((addr & PDRMASK) != 0 || addr + NBPDR > end_addr) continue; pde = pmap_alloc_pde(dst_pmap, addr, &dst_pdpg, NULL); if (pde == NULL) break; if (*pde == 0 && ((srcptepaddr & PG_MANAGED) == 0 || pmap_pv_insert_pde(dst_pmap, addr, srcptepaddr, PMAP_ENTER_NORECLAIM, &lock))) { *pde = srcptepaddr & ~PG_W; pmap_resident_count_inc(dst_pmap, NBPDR / PAGE_SIZE); atomic_add_long(&pmap_pde_mappings, 1); } else pmap_abort_ptp(dst_pmap, addr, dst_pdpg); continue; } srcptepaddr &= PG_FRAME; srcmpte = PHYS_TO_VM_PAGE(srcptepaddr); KASSERT(srcmpte->ref_count > 0, ("pmap_copy: source page table page is unused")); if (va_next > end_addr) va_next = end_addr; src_pte = (pt_entry_t *)PHYS_TO_DMAP(srcptepaddr); src_pte = &src_pte[pmap_pte_index(addr)]; dstmpte = NULL; for (; addr < va_next; addr += PAGE_SIZE, src_pte++) { ptetemp = *src_pte; /* * We only virtual copy managed pages. */ if ((ptetemp & PG_MANAGED) == 0) continue; if (dstmpte != NULL) { KASSERT(dstmpte->pindex == pmap_pde_pindex(addr), ("dstmpte pindex/addr mismatch")); dstmpte->ref_count++; } else if ((dstmpte = pmap_allocpte(dst_pmap, addr, NULL)) == NULL) goto out; dst_pte = (pt_entry_t *) PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte)); dst_pte = &dst_pte[pmap_pte_index(addr)]; if (*dst_pte == 0 && pmap_try_insert_pv_entry(dst_pmap, addr, PHYS_TO_VM_PAGE(ptetemp & PG_FRAME), &lock)) { /* * Clear the wired, modified, and accessed * (referenced) bits during the copy. */ *dst_pte = ptetemp & ~(PG_W | PG_M | PG_A); pmap_resident_count_inc(dst_pmap, 1); } else { pmap_abort_ptp(dst_pmap, addr, dstmpte); goto out; } /* Have we copied all of the valid mappings? */ if (dstmpte->ref_count >= srcmpte->ref_count) break; } } out: if (lock != NULL) rw_wunlock(lock); PMAP_UNLOCK(src_pmap); PMAP_UNLOCK(dst_pmap); } int pmap_vmspace_copy(pmap_t dst_pmap, pmap_t src_pmap) { int error; if (dst_pmap->pm_type != src_pmap->pm_type || dst_pmap->pm_type != PT_X86 || (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0) return (0); for (;;) { if (dst_pmap < src_pmap) { PMAP_LOCK(dst_pmap); PMAP_LOCK(src_pmap); } else { PMAP_LOCK(src_pmap); PMAP_LOCK(dst_pmap); } error = pmap_pkru_copy(dst_pmap, src_pmap); /* Clean up partial copy on failure due to no memory. */ if (error == ENOMEM) pmap_pkru_deassign_all(dst_pmap); PMAP_UNLOCK(src_pmap); PMAP_UNLOCK(dst_pmap); if (error != ENOMEM) break; vm_wait(NULL); } return (error); } /* * Zero the specified hardware page. */ void pmap_zero_page(vm_page_t m) { vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); pagezero((void *)va); } /* * Zero an an area within a single hardware page. off and size must not * cover an area beyond a single hardware page. */ void pmap_zero_page_area(vm_page_t m, int off, int size) { vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); if (off == 0 && size == PAGE_SIZE) pagezero((void *)va); else bzero((char *)va + off, size); } /* * Copy 1 specified hardware page to another. */ void pmap_copy_page(vm_page_t msrc, vm_page_t mdst) { vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc)); vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst)); pagecopy((void *)src, (void *)dst); } int unmapped_buf_allowed = 1; void pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], vm_offset_t b_offset, int xfersize) { void *a_cp, *b_cp; vm_page_t pages[2]; vm_offset_t vaddr[2], a_pg_offset, b_pg_offset; int cnt; boolean_t mapped; while (xfersize > 0) { a_pg_offset = a_offset & PAGE_MASK; pages[0] = ma[a_offset >> PAGE_SHIFT]; b_pg_offset = b_offset & PAGE_MASK; pages[1] = mb[b_offset >> PAGE_SHIFT]; cnt = min(xfersize, PAGE_SIZE - a_pg_offset); cnt = min(cnt, PAGE_SIZE - b_pg_offset); mapped = pmap_map_io_transient(pages, vaddr, 2, FALSE); a_cp = (char *)vaddr[0] + a_pg_offset; b_cp = (char *)vaddr[1] + b_pg_offset; bcopy(a_cp, b_cp, cnt); if (__predict_false(mapped)) pmap_unmap_io_transient(pages, vaddr, 2, FALSE); a_offset += cnt; b_offset += cnt; xfersize -= cnt; } } /* * Returns true if the pmap's pv is one of the first * 16 pvs linked to from this page. This count may * be changed upwards or downwards in the future; it * is only necessary that true be returned for a small * subset of pmaps for proper page aging. */ boolean_t pmap_page_exists_quick(pmap_t pmap, vm_page_t m) { struct md_page *pvh; struct rwlock *lock; pv_entry_t pv; int loops = 0; boolean_t rv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_page_exists_quick: page %p is not managed", m)); rv = FALSE; lock = VM_PAGE_TO_PV_LIST_LOCK(m); rw_rlock(lock); TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { if (PV_PMAP(pv) == pmap) { rv = TRUE; break; } loops++; if (loops >= 16) break; } if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { if (PV_PMAP(pv) == pmap) { rv = TRUE; break; } loops++; if (loops >= 16) break; } } rw_runlock(lock); return (rv); } /* * pmap_page_wired_mappings: * * Return the number of managed mappings to the given physical page * that are wired. */ int pmap_page_wired_mappings(vm_page_t m) { struct rwlock *lock; struct md_page *pvh; pmap_t pmap; pt_entry_t *pte; pv_entry_t pv; int count, md_gen, pvh_gen; if ((m->oflags & VPO_UNMANAGED) != 0) return (0); lock = VM_PAGE_TO_PV_LIST_LOCK(m); rw_rlock(lock); restart: count = 0; TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; rw_runlock(lock); PMAP_LOCK(pmap); rw_rlock(lock); if (md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } pte = pmap_pte(pmap, pv->pv_va); if ((*pte & PG_W) != 0) count++; PMAP_UNLOCK(pmap); } if ((m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; pvh_gen = pvh->pv_gen; rw_runlock(lock); PMAP_LOCK(pmap); rw_rlock(lock); if (md_gen != m->md.pv_gen || pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } pte = pmap_pde(pmap, pv->pv_va); if ((*pte & PG_W) != 0) count++; PMAP_UNLOCK(pmap); } } rw_runlock(lock); return (count); } /* * Returns TRUE if the given page is mapped individually or as part of * a 2mpage. Otherwise, returns FALSE. */ boolean_t pmap_page_is_mapped(vm_page_t m) { struct rwlock *lock; boolean_t rv; if ((m->oflags & VPO_UNMANAGED) != 0) return (FALSE); lock = VM_PAGE_TO_PV_LIST_LOCK(m); rw_rlock(lock); rv = !TAILQ_EMPTY(&m->md.pv_list) || ((m->flags & PG_FICTITIOUS) == 0 && !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list)); rw_runlock(lock); return (rv); } /* * Destroy all managed, non-wired mappings in the given user-space * pmap. This pmap cannot be active on any processor besides the * caller. * * This function cannot be applied to the kernel pmap. Moreover, it * is not intended for general use. It is only to be used during * process termination. Consequently, it can be implemented in ways * that make it faster than pmap_remove(). First, it can more quickly * destroy mappings by iterating over the pmap's collection of PV * entries, rather than searching the page table. Second, it doesn't * have to test and clear the page table entries atomically, because * no processor is currently accessing the user address space. In * particular, a page table entry's dirty bit won't change state once * this function starts. * * Although this function destroys all of the pmap's managed, * non-wired mappings, it can delay and batch the invalidation of TLB * entries without calling pmap_delayed_invl_start() and * pmap_delayed_invl_finish(). Because the pmap is not active on * any other processor, none of these TLB entries will ever be used * before their eventual invalidation. Consequently, there is no need * for either pmap_remove_all() or pmap_remove_write() to wait for * that eventual TLB invalidation. */ void pmap_remove_pages(pmap_t pmap) { pd_entry_t ptepde; pt_entry_t *pte, tpte; pt_entry_t PG_M, PG_RW, PG_V; struct spglist free; struct pv_chunklist free_chunks[PMAP_MEMDOM]; vm_page_t m, mpte, mt; pv_entry_t pv; struct md_page *pvh; struct pv_chunk *pc, *npc; struct rwlock *lock; int64_t bit; uint64_t inuse, bitmask; int allfree, field, freed, i, idx; boolean_t superpage; vm_paddr_t pa; /* * Assert that the given pmap is only active on the current * CPU. Unfortunately, we cannot block another CPU from * activating the pmap while this function is executing. */ KASSERT(pmap == PCPU_GET(curpmap), ("non-current pmap %p", pmap)); #ifdef INVARIANTS { cpuset_t other_cpus; other_cpus = all_cpus; critical_enter(); CPU_CLR(PCPU_GET(cpuid), &other_cpus); CPU_AND(&other_cpus, &pmap->pm_active); critical_exit(); KASSERT(CPU_EMPTY(&other_cpus), ("pmap active %p", pmap)); } #endif lock = NULL; PG_M = pmap_modified_bit(pmap); PG_V = pmap_valid_bit(pmap); PG_RW = pmap_rw_bit(pmap); for (i = 0; i < PMAP_MEMDOM; i++) TAILQ_INIT(&free_chunks[i]); SLIST_INIT(&free); PMAP_LOCK(pmap); TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { allfree = 1; freed = 0; for (field = 0; field < _NPCM; field++) { inuse = ~pc->pc_map[field] & pc_freemask[field]; while (inuse != 0) { bit = bsfq(inuse); bitmask = 1UL << bit; idx = field * 64 + bit; pv = &pc->pc_pventry[idx]; inuse &= ~bitmask; pte = pmap_pdpe(pmap, pv->pv_va); ptepde = *pte; pte = pmap_pdpe_to_pde(pte, pv->pv_va); tpte = *pte; if ((tpte & (PG_PS | PG_V)) == PG_V) { superpage = FALSE; ptepde = tpte; pte = (pt_entry_t *)PHYS_TO_DMAP(tpte & PG_FRAME); pte = &pte[pmap_pte_index(pv->pv_va)]; tpte = *pte; } else { /* * Keep track whether 'tpte' is a * superpage explicitly instead of * relying on PG_PS being set. * * This is because PG_PS is numerically * identical to PG_PTE_PAT and thus a * regular page could be mistaken for * a superpage. */ superpage = TRUE; } if ((tpte & PG_V) == 0) { panic("bad pte va %lx pte %lx", pv->pv_va, tpte); } /* * We cannot remove wired pages from a process' mapping at this time */ if (tpte & PG_W) { allfree = 0; continue; } if (superpage) pa = tpte & PG_PS_FRAME; else pa = tpte & PG_FRAME; m = PHYS_TO_VM_PAGE(pa); KASSERT(m->phys_addr == pa, ("vm_page_t %p phys_addr mismatch %016jx %016jx", m, (uintmax_t)m->phys_addr, (uintmax_t)tpte)); KASSERT((m->flags & PG_FICTITIOUS) != 0 || m < &vm_page_array[vm_page_array_size], ("pmap_remove_pages: bad tpte %#jx", (uintmax_t)tpte)); pte_clear(pte); /* * Update the vm_page_t clean/reference bits. */ if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { if (superpage) { for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) vm_page_dirty(mt); } else vm_page_dirty(m); } CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m); /* Mark free */ pc->pc_map[field] |= bitmask; if (superpage) { pmap_resident_count_dec(pmap, NBPDR / PAGE_SIZE); pvh = pa_to_pvh(tpte & PG_PS_FRAME); TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); pvh->pv_gen++; if (TAILQ_EMPTY(&pvh->pv_list)) { for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) if ((mt->a.flags & PGA_WRITEABLE) != 0 && TAILQ_EMPTY(&mt->md.pv_list)) vm_page_aflag_clear(mt, PGA_WRITEABLE); } mpte = pmap_remove_pt_page(pmap, pv->pv_va); if (mpte != NULL) { KASSERT(mpte->valid == VM_PAGE_BITS_ALL, ("pmap_remove_pages: pte page not promoted")); pmap_resident_count_dec(pmap, 1); KASSERT(mpte->ref_count == NPTEPG, ("pmap_remove_pages: pte page reference count error")); mpte->ref_count = 0; pmap_add_delayed_free_list(mpte, &free, FALSE); } } else { pmap_resident_count_dec(pmap, 1); TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; if ((m->a.flags & PGA_WRITEABLE) != 0 && TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); if (TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } } pmap_unuse_pt(pmap, pv->pv_va, ptepde, &free); freed++; } } PV_STAT(atomic_add_long(&pv_entry_frees, freed)); PV_STAT(atomic_add_int(&pv_entry_spare, freed)); PV_STAT(atomic_subtract_long(&pv_entry_count, freed)); if (allfree) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&free_chunks[pc_to_domain(pc)], pc, pc_list); } } if (lock != NULL) rw_wunlock(lock); pmap_invalidate_all(pmap); pmap_pkru_deassign_all(pmap); free_pv_chunk_batch((struct pv_chunklist *)&free_chunks); PMAP_UNLOCK(pmap); vm_page_free_pages_toq(&free, true); } static boolean_t pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified) { struct rwlock *lock; pv_entry_t pv; struct md_page *pvh; pt_entry_t *pte, mask; pt_entry_t PG_A, PG_M, PG_RW, PG_V; pmap_t pmap; int md_gen, pvh_gen; boolean_t rv; rv = FALSE; lock = VM_PAGE_TO_PV_LIST_LOCK(m); rw_rlock(lock); restart: TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; rw_runlock(lock); PMAP_LOCK(pmap); rw_rlock(lock); if (md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } pte = pmap_pte(pmap, pv->pv_va); mask = 0; if (modified) { PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); mask |= PG_RW | PG_M; } if (accessed) { PG_A = pmap_accessed_bit(pmap); PG_V = pmap_valid_bit(pmap); mask |= PG_V | PG_A; } rv = (*pte & mask) == mask; PMAP_UNLOCK(pmap); if (rv) goto out; } if ((m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; pvh_gen = pvh->pv_gen; rw_runlock(lock); PMAP_LOCK(pmap); rw_rlock(lock); if (md_gen != m->md.pv_gen || pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } pte = pmap_pde(pmap, pv->pv_va); mask = 0; if (modified) { PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); mask |= PG_RW | PG_M; } if (accessed) { PG_A = pmap_accessed_bit(pmap); PG_V = pmap_valid_bit(pmap); mask |= PG_V | PG_A; } rv = (*pte & mask) == mask; PMAP_UNLOCK(pmap); if (rv) goto out; } } out: rw_runlock(lock); return (rv); } /* * pmap_is_modified: * * Return whether or not the specified physical page was modified * in any physical maps. */ boolean_t pmap_is_modified(vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_is_modified: page %p is not managed", m)); /* * If the page is not busied then this check is racy. */ if (!pmap_page_is_write_mapped(m)) return (FALSE); return (pmap_page_test_mappings(m, FALSE, TRUE)); } /* * pmap_is_prefaultable: * * Return whether or not the specified virtual address is eligible * for prefault. */ boolean_t pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) { pd_entry_t *pde; pt_entry_t *pte, PG_V; boolean_t rv; PG_V = pmap_valid_bit(pmap); rv = FALSE; PMAP_LOCK(pmap); pde = pmap_pde(pmap, addr); if (pde != NULL && (*pde & (PG_PS | PG_V)) == PG_V) { pte = pmap_pde_to_pte(pde, addr); rv = (*pte & PG_V) == 0; } PMAP_UNLOCK(pmap); return (rv); } /* * pmap_is_referenced: * * Return whether or not the specified physical page was referenced * in any physical maps. */ boolean_t pmap_is_referenced(vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_is_referenced: page %p is not managed", m)); return (pmap_page_test_mappings(m, TRUE, FALSE)); } /* * Clear the write and modified bits in each of the given page's mappings. */ void pmap_remove_write(vm_page_t m) { struct md_page *pvh; pmap_t pmap; struct rwlock *lock; pv_entry_t next_pv, pv; pd_entry_t *pde; pt_entry_t oldpte, *pte, PG_M, PG_RW; vm_offset_t va; int pvh_gen, md_gen; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_remove_write: page %p is not managed", m)); vm_page_assert_busied(m); if (!pmap_page_is_write_mapped(m)) return; lock = VM_PAGE_TO_PV_LIST_LOCK(m); pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(VM_PAGE_TO_PHYS(m)); retry_pv_loop: rw_wlock(lock); TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); rw_wunlock(lock); goto retry_pv_loop; } } PG_RW = pmap_rw_bit(pmap); va = pv->pv_va; pde = pmap_pde(pmap, va); if ((*pde & PG_RW) != 0) (void)pmap_demote_pde_locked(pmap, pde, va, &lock); KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), ("inconsistent pv lock %p %p for page %p", lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); PMAP_UNLOCK(pmap); } TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; md_gen = m->md.pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); rw_wunlock(lock); goto retry_pv_loop; } } PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); pde = pmap_pde(pmap, pv->pv_va); KASSERT((*pde & PG_PS) == 0, ("pmap_remove_write: found a 2mpage in page %p's pv list", m)); pte = pmap_pde_to_pte(pde, pv->pv_va); retry: oldpte = *pte; if (oldpte & PG_RW) { if (!atomic_cmpset_long(pte, oldpte, oldpte & ~(PG_RW | PG_M))) goto retry; if ((oldpte & PG_M) != 0) vm_page_dirty(m); pmap_invalidate_page(pmap, pv->pv_va); } PMAP_UNLOCK(pmap); } rw_wunlock(lock); vm_page_aflag_clear(m, PGA_WRITEABLE); pmap_delayed_invl_wait(m); } static __inline boolean_t safe_to_clear_referenced(pmap_t pmap, pt_entry_t pte) { if (!pmap_emulate_ad_bits(pmap)) return (TRUE); KASSERT(pmap->pm_type == PT_EPT, ("invalid pm_type %d", pmap->pm_type)); /* * XWR = 010 or 110 will cause an unconditional EPT misconfiguration * so we don't let the referenced (aka EPT_PG_READ) bit to be cleared * if the EPT_PG_WRITE bit is set. */ if ((pte & EPT_PG_WRITE) != 0) return (FALSE); /* * XWR = 100 is allowed only if the PMAP_SUPPORTS_EXEC_ONLY is set. */ if ((pte & EPT_PG_EXECUTE) == 0 || ((pmap->pm_flags & PMAP_SUPPORTS_EXEC_ONLY) != 0)) return (TRUE); else return (FALSE); } /* * pmap_ts_referenced: * * Return a count of reference bits for a page, clearing those bits. * It is not necessary for every reference bit to be cleared, but it * is necessary that 0 only be returned when there are truly no * reference bits set. * * As an optimization, update the page's dirty field if a modified bit is * found while counting reference bits. This opportunistic update can be * performed at low cost and can eliminate the need for some future calls * to pmap_is_modified(). However, since this function stops after * finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some * dirty pages. Those dirty pages will only be detected by a future call * to pmap_is_modified(). * * A DI block is not needed within this function, because * invalidations are performed before the PV list lock is * released. */ int pmap_ts_referenced(vm_page_t m) { struct md_page *pvh; pv_entry_t pv, pvf; pmap_t pmap; struct rwlock *lock; pd_entry_t oldpde, *pde; pt_entry_t *pte, PG_A, PG_M, PG_RW; vm_offset_t va; vm_paddr_t pa; int cleared, md_gen, not_cleared, pvh_gen; struct spglist free; boolean_t demoted; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_ts_referenced: page %p is not managed", m)); SLIST_INIT(&free); cleared = 0; pa = VM_PAGE_TO_PHYS(m); lock = PHYS_TO_PV_LIST_LOCK(pa); pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(pa); rw_wlock(lock); retry: not_cleared = 0; if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL) goto small_mappings; pv = pvf; do { if (pvf == NULL) pvf = pv; pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); goto retry; } } PG_A = pmap_accessed_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); va = pv->pv_va; pde = pmap_pde(pmap, pv->pv_va); oldpde = *pde; if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) { /* * Although "oldpde" is mapping a 2MB page, because * this function is called at a 4KB page granularity, * we only update the 4KB page under test. */ vm_page_dirty(m); } if ((oldpde & PG_A) != 0) { /* * Since this reference bit is shared by 512 4KB * pages, it should not be cleared every time it is * tested. Apply a simple "hash" function on the * physical page number, the virtual superpage number, * and the pmap address to select one 4KB page out of * the 512 on which testing the reference bit will * result in clearing that reference bit. This * function is designed to avoid the selection of the * same 4KB page for every 2MB page mapping. * * On demotion, a mapping that hasn't been referenced * is simply destroyed. To avoid the possibility of a * subsequent page fault on a demoted wired mapping, * always leave its reference bit set. Moreover, * since the superpage is wired, the current state of * its reference bit won't affect page replacement. */ if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> PDRSHIFT) ^ (uintptr_t)pmap) & (NPTEPG - 1)) == 0 && (oldpde & PG_W) == 0) { if (safe_to_clear_referenced(pmap, oldpde)) { atomic_clear_long(pde, PG_A); pmap_invalidate_page(pmap, pv->pv_va); demoted = FALSE; } else if (pmap_demote_pde_locked(pmap, pde, pv->pv_va, &lock)) { /* * Remove the mapping to a single page * so that a subsequent access may * repromote. Since the underlying * page table page is fully populated, * this removal never frees a page * table page. */ demoted = TRUE; va += VM_PAGE_TO_PHYS(m) - (oldpde & PG_PS_FRAME); pte = pmap_pde_to_pte(pde, va); pmap_remove_pte(pmap, pte, va, *pde, NULL, &lock); pmap_invalidate_page(pmap, va); } else demoted = TRUE; if (demoted) { /* * The superpage mapping was removed * entirely and therefore 'pv' is no * longer valid. */ if (pvf == pv) pvf = NULL; pv = NULL; } cleared++; KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), ("inconsistent pv lock %p %p for page %p", lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); } else not_cleared++; } PMAP_UNLOCK(pmap); /* Rotate the PV list if it has more than one entry. */ if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) { TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); pvh->pv_gen++; } if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX) goto out; } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf); small_mappings: if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL) goto out; pv = pvf; do { if (pvf == NULL) pvf = pv; pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; md_gen = m->md.pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); goto retry; } } PG_A = pmap_accessed_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); pde = pmap_pde(pmap, pv->pv_va); KASSERT((*pde & PG_PS) == 0, ("pmap_ts_referenced: found a 2mpage in page %p's pv list", m)); pte = pmap_pde_to_pte(pde, pv->pv_va); if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); if ((*pte & PG_A) != 0) { if (safe_to_clear_referenced(pmap, *pte)) { atomic_clear_long(pte, PG_A); pmap_invalidate_page(pmap, pv->pv_va); cleared++; } else if ((*pte & PG_W) == 0) { /* * Wired pages cannot be paged out so * doing accessed bit emulation for * them is wasted effort. We do the * hard work for unwired pages only. */ pmap_remove_pte(pmap, pte, pv->pv_va, *pde, &free, &lock); pmap_invalidate_page(pmap, pv->pv_va); cleared++; if (pvf == pv) pvf = NULL; pv = NULL; KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), ("inconsistent pv lock %p %p for page %p", lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); } else not_cleared++; } PMAP_UNLOCK(pmap); /* Rotate the PV list if it has more than one entry. */ if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) { TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; } } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared + not_cleared < PMAP_TS_REFERENCED_MAX); out: rw_wunlock(lock); vm_page_free_pages_toq(&free, true); return (cleared + not_cleared); } /* * Apply the given advice to the specified range of addresses within the * given pmap. Depending on the advice, clear the referenced and/or * modified flags in each mapping and set the mapped page's dirty field. */ void pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice) { struct rwlock *lock; pml4_entry_t *pml4e; pdp_entry_t *pdpe; pd_entry_t oldpde, *pde; pt_entry_t *pte, PG_A, PG_G, PG_M, PG_RW, PG_V; vm_offset_t va, va_next; vm_page_t m; bool anychanged; if (advice != MADV_DONTNEED && advice != MADV_FREE) return; /* * A/D bit emulation requires an alternate code path when clearing * the modified and accessed bits below. Since this function is * advisory in nature we skip it entirely for pmaps that require * A/D bit emulation. */ if (pmap_emulate_ad_bits(pmap)) return; PG_A = pmap_accessed_bit(pmap); PG_G = pmap_global_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_V = pmap_valid_bit(pmap); PG_RW = pmap_rw_bit(pmap); anychanged = false; pmap_delayed_invl_start(); PMAP_LOCK(pmap); for (; sva < eva; sva = va_next) { pml4e = pmap_pml4e(pmap, sva); if (pml4e == NULL || (*pml4e & PG_V) == 0) { va_next = (sva + NBPML4) & ~PML4MASK; if (va_next < sva) va_next = eva; continue; } va_next = (sva + NBPDP) & ~PDPMASK; if (va_next < sva) va_next = eva; pdpe = pmap_pml4e_to_pdpe(pml4e, sva); if ((*pdpe & PG_V) == 0) continue; if ((*pdpe & PG_PS) != 0) { KASSERT(va_next <= eva, ("partial update of non-transparent 1G mapping " "pdpe %#lx sva %#lx eva %#lx va_next %#lx", *pdpe, sva, eva, va_next)); continue; } va_next = (sva + NBPDR) & ~PDRMASK; if (va_next < sva) va_next = eva; pde = pmap_pdpe_to_pde(pdpe, sva); oldpde = *pde; if ((oldpde & PG_V) == 0) continue; else if ((oldpde & PG_PS) != 0) { if ((oldpde & PG_MANAGED) == 0) continue; lock = NULL; if (!pmap_demote_pde_locked(pmap, pde, sva, &lock)) { if (lock != NULL) rw_wunlock(lock); /* * The large page mapping was destroyed. */ continue; } /* * Unless the page mappings are wired, remove the * mapping to a single page so that a subsequent * access may repromote. Choosing the last page * within the address range [sva, min(va_next, eva)) * generally results in more repromotions. Since the * underlying page table page is fully populated, this * removal never frees a page table page. */ if ((oldpde & PG_W) == 0) { va = eva; if (va > va_next) va = va_next; va -= PAGE_SIZE; KASSERT(va >= sva, ("pmap_advise: no address gap")); pte = pmap_pde_to_pte(pde, va); KASSERT((*pte & PG_V) != 0, ("pmap_advise: invalid PTE")); pmap_remove_pte(pmap, pte, va, *pde, NULL, &lock); anychanged = true; } if (lock != NULL) rw_wunlock(lock); } if (va_next > eva) va_next = eva; va = va_next; for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, sva += PAGE_SIZE) { if ((*pte & (PG_MANAGED | PG_V)) != (PG_MANAGED | PG_V)) goto maybe_invlrng; else if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { if (advice == MADV_DONTNEED) { /* * Future calls to pmap_is_modified() * can be avoided by making the page * dirty now. */ m = PHYS_TO_VM_PAGE(*pte & PG_FRAME); vm_page_dirty(m); } atomic_clear_long(pte, PG_M | PG_A); } else if ((*pte & PG_A) != 0) atomic_clear_long(pte, PG_A); else goto maybe_invlrng; if ((*pte & PG_G) != 0) { if (va == va_next) va = sva; } else anychanged = true; continue; maybe_invlrng: if (va != va_next) { pmap_invalidate_range(pmap, va, sva); va = va_next; } } if (va != va_next) pmap_invalidate_range(pmap, va, sva); } if (anychanged) pmap_invalidate_all(pmap); PMAP_UNLOCK(pmap); pmap_delayed_invl_finish(); } /* * Clear the modify bits on the specified physical page. */ void pmap_clear_modify(vm_page_t m) { struct md_page *pvh; pmap_t pmap; pv_entry_t next_pv, pv; pd_entry_t oldpde, *pde; pt_entry_t *pte, PG_M, PG_RW; struct rwlock *lock; vm_offset_t va; int md_gen, pvh_gen; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_clear_modify: page %p is not managed", m)); vm_page_assert_busied(m); if (!pmap_page_is_write_mapped(m)) return; pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(VM_PAGE_TO_PHYS(m)); lock = VM_PAGE_TO_PV_LIST_LOCK(m); rw_wlock(lock); restart: TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); va = pv->pv_va; pde = pmap_pde(pmap, va); oldpde = *pde; /* If oldpde has PG_RW set, then it also has PG_M set. */ if ((oldpde & PG_RW) != 0 && pmap_demote_pde_locked(pmap, pde, va, &lock) && (oldpde & PG_W) == 0) { /* * Write protect the mapping to a single page so that * a subsequent write access may repromote. */ va += VM_PAGE_TO_PHYS(m) - (oldpde & PG_PS_FRAME); pte = pmap_pde_to_pte(pde, va); atomic_clear_long(pte, PG_M | PG_RW); vm_page_dirty(m); pmap_invalidate_page(pmap, va); } PMAP_UNLOCK(pmap); } TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; pvh_gen = pvh->pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); pde = pmap_pde(pmap, pv->pv_va); KASSERT((*pde & PG_PS) == 0, ("pmap_clear_modify: found" " a 2mpage in page %p's pv list", m)); pte = pmap_pde_to_pte(pde, pv->pv_va); if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { atomic_clear_long(pte, PG_M); pmap_invalidate_page(pmap, pv->pv_va); } PMAP_UNLOCK(pmap); } rw_wunlock(lock); } /* * Miscellaneous support routines follow */ /* Adjust the properties for a leaf page table entry. */ static __inline void pmap_pte_props(pt_entry_t *pte, u_long bits, u_long mask) { u_long opte, npte; opte = *(u_long *)pte; do { npte = opte & ~mask; npte |= bits; } while (npte != opte && !atomic_fcmpset_long((u_long *)pte, &opte, npte)); } /* * Map a set of physical memory pages into the kernel virtual * address space. Return a pointer to where it is mapped. This * routine is intended to be used for mapping device memory, * NOT real memory. */ static void * pmap_mapdev_internal(vm_paddr_t pa, vm_size_t size, int mode, int flags) { struct pmap_preinit_mapping *ppim; vm_offset_t va, offset; vm_size_t tmpsize; int i; offset = pa & PAGE_MASK; size = round_page(offset + size); pa = trunc_page(pa); if (!pmap_initialized) { va = 0; for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { ppim = pmap_preinit_mapping + i; if (ppim->va == 0) { ppim->pa = pa; ppim->sz = size; ppim->mode = mode; ppim->va = virtual_avail; virtual_avail += size; va = ppim->va; break; } } if (va == 0) panic("%s: too many preinit mappings", __func__); } else { /* * If we have a preinit mapping, re-use it. */ for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { ppim = pmap_preinit_mapping + i; if (ppim->pa == pa && ppim->sz == size && (ppim->mode == mode || (flags & MAPDEV_SETATTR) == 0)) return ((void *)(ppim->va + offset)); } /* * If the specified range of physical addresses fits within * the direct map window, use the direct map. */ if (pa < dmaplimit && pa + size <= dmaplimit) { va = PHYS_TO_DMAP(pa); if ((flags & MAPDEV_SETATTR) != 0) { PMAP_LOCK(kernel_pmap); i = pmap_change_props_locked(va, size, PROT_NONE, mode, flags); PMAP_UNLOCK(kernel_pmap); } else i = 0; if (!i) return ((void *)(va + offset)); } va = kva_alloc(size); if (va == 0) panic("%s: Couldn't allocate KVA", __func__); } for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE) pmap_kenter_attr(va + tmpsize, pa + tmpsize, mode); pmap_invalidate_range(kernel_pmap, va, va + tmpsize); if ((flags & MAPDEV_FLUSHCACHE) != 0) pmap_invalidate_cache_range(va, va + tmpsize); return ((void *)(va + offset)); } void * pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode) { return (pmap_mapdev_internal(pa, size, mode, MAPDEV_FLUSHCACHE | MAPDEV_SETATTR)); } void * pmap_mapdev(vm_paddr_t pa, vm_size_t size) { return (pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE)); } void * pmap_mapdev_pciecfg(vm_paddr_t pa, vm_size_t size) { return (pmap_mapdev_internal(pa, size, PAT_UNCACHEABLE, MAPDEV_SETATTR)); } void * pmap_mapbios(vm_paddr_t pa, vm_size_t size) { return (pmap_mapdev_internal(pa, size, PAT_WRITE_BACK, MAPDEV_FLUSHCACHE)); } void pmap_unmapdev(vm_offset_t va, vm_size_t size) { struct pmap_preinit_mapping *ppim; vm_offset_t offset; int i; /* If we gave a direct map region in pmap_mapdev, do nothing */ if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) return; offset = va & PAGE_MASK; size = round_page(offset + size); va = trunc_page(va); for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { ppim = pmap_preinit_mapping + i; if (ppim->va == va && ppim->sz == size) { if (pmap_initialized) return; ppim->pa = 0; ppim->va = 0; ppim->sz = 0; ppim->mode = 0; if (va + size == virtual_avail) virtual_avail = va; return; } } if (pmap_initialized) { pmap_qremove(va, atop(size)); kva_free(va, size); } } /* * Tries to demote a 1GB page mapping. */ static boolean_t pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe, vm_offset_t va) { pdp_entry_t newpdpe, oldpdpe; pd_entry_t *firstpde, newpde, *pde; pt_entry_t PG_A, PG_M, PG_RW, PG_V; vm_paddr_t pdpgpa; vm_page_t pdpg; PG_A = pmap_accessed_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_V = pmap_valid_bit(pmap); PG_RW = pmap_rw_bit(pmap); PMAP_LOCK_ASSERT(pmap, MA_OWNED); oldpdpe = *pdpe; KASSERT((oldpdpe & (PG_PS | PG_V)) == (PG_PS | PG_V), ("pmap_demote_pdpe: oldpdpe is missing PG_PS and/or PG_V")); if ((pdpg = vm_page_alloc(NULL, va >> PDPSHIFT, VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) { CTR2(KTR_PMAP, "pmap_demote_pdpe: failure for va %#lx" " in pmap %p", va, pmap); return (FALSE); } pdpgpa = VM_PAGE_TO_PHYS(pdpg); firstpde = (pd_entry_t *)PHYS_TO_DMAP(pdpgpa); newpdpe = pdpgpa | PG_M | PG_A | (oldpdpe & PG_U) | PG_RW | PG_V; KASSERT((oldpdpe & PG_A) != 0, ("pmap_demote_pdpe: oldpdpe is missing PG_A")); KASSERT((oldpdpe & (PG_M | PG_RW)) != PG_RW, ("pmap_demote_pdpe: oldpdpe is missing PG_M")); newpde = oldpdpe; /* * Initialize the page directory page. */ for (pde = firstpde; pde < firstpde + NPDEPG; pde++) { *pde = newpde; newpde += NBPDR; } /* * Demote the mapping. */ *pdpe = newpdpe; /* * Invalidate a stale recursive mapping of the page directory page. */ pmap_invalidate_page(pmap, (vm_offset_t)vtopde(va)); pmap_pdpe_demotions++; CTR2(KTR_PMAP, "pmap_demote_pdpe: success for va %#lx" " in pmap %p", va, pmap); return (TRUE); } /* * Sets the memory attribute for the specified page. */ void pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) { m->md.pat_mode = ma; /* * If "m" is a normal page, update its direct mapping. This update * can be relied upon to perform any cache operations that are * required for data coherence. */ if ((m->flags & PG_FICTITIOUS) == 0 && pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), PAGE_SIZE, m->md.pat_mode)) panic("memory attribute change on the direct map failed"); } /* * Changes the specified virtual address range's memory type to that given by * the parameter "mode". The specified virtual address range must be * completely contained within either the direct map or the kernel map. If * the virtual address range is contained within the kernel map, then the * memory type for each of the corresponding ranges of the direct map is also * changed. (The corresponding ranges of the direct map are those ranges that * map the same physical pages as the specified virtual address range.) These * changes to the direct map are necessary because Intel describes the * behavior of their processors as "undefined" if two or more mappings to the * same physical page have different memory types. * * Returns zero if the change completed successfully, and either EINVAL or * ENOMEM if the change failed. Specifically, EINVAL is returned if some part * of the virtual address range was not mapped, and ENOMEM is returned if * there was insufficient memory available to complete the change. In the * latter case, the memory type may have been changed on some part of the * virtual address range or the direct map. */ int pmap_change_attr(vm_offset_t va, vm_size_t size, int mode) { int error; PMAP_LOCK(kernel_pmap); error = pmap_change_props_locked(va, size, PROT_NONE, mode, MAPDEV_FLUSHCACHE); PMAP_UNLOCK(kernel_pmap); return (error); } /* * Changes the specified virtual address range's protections to those * specified by "prot". Like pmap_change_attr(), protections for aliases * in the direct map are updated as well. Protections on aliasing mappings may * be a subset of the requested protections; for example, mappings in the direct * map are never executable. */ int pmap_change_prot(vm_offset_t va, vm_size_t size, vm_prot_t prot) { int error; /* Only supported within the kernel map. */ if (va < VM_MIN_KERNEL_ADDRESS) return (EINVAL); PMAP_LOCK(kernel_pmap); error = pmap_change_props_locked(va, size, prot, -1, MAPDEV_ASSERTVALID); PMAP_UNLOCK(kernel_pmap); return (error); } static int pmap_change_props_locked(vm_offset_t va, vm_size_t size, vm_prot_t prot, int mode, int flags) { vm_offset_t base, offset, tmpva; vm_paddr_t pa_start, pa_end, pa_end1; pdp_entry_t *pdpe; pd_entry_t *pde, pde_bits, pde_mask; pt_entry_t *pte, pte_bits, pte_mask; int error; bool changed; PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED); base = trunc_page(va); offset = va & PAGE_MASK; size = round_page(offset + size); /* * Only supported on kernel virtual addresses, including the direct * map but excluding the recursive map. */ if (base < DMAP_MIN_ADDRESS) return (EINVAL); /* * Construct our flag sets and masks. "bits" is the subset of * "mask" that will be set in each modified PTE. * * Mappings in the direct map are never allowed to be executable. */ pde_bits = pte_bits = 0; pde_mask = pte_mask = 0; if (mode != -1) { pde_bits |= pmap_cache_bits(kernel_pmap, mode, true); pde_mask |= X86_PG_PDE_CACHE; pte_bits |= pmap_cache_bits(kernel_pmap, mode, false); pte_mask |= X86_PG_PTE_CACHE; } if (prot != VM_PROT_NONE) { if ((prot & VM_PROT_WRITE) != 0) { pde_bits |= X86_PG_RW; pte_bits |= X86_PG_RW; } if ((prot & VM_PROT_EXECUTE) == 0 || va < VM_MIN_KERNEL_ADDRESS) { pde_bits |= pg_nx; pte_bits |= pg_nx; } pde_mask |= X86_PG_RW | pg_nx; pte_mask |= X86_PG_RW | pg_nx; } /* * Pages that aren't mapped aren't supported. Also break down 2MB pages * into 4KB pages if required. */ for (tmpva = base; tmpva < base + size; ) { pdpe = pmap_pdpe(kernel_pmap, tmpva); if (pdpe == NULL || *pdpe == 0) { KASSERT((flags & MAPDEV_ASSERTVALID) == 0, ("%s: addr %#lx is not mapped", __func__, tmpva)); return (EINVAL); } if (*pdpe & PG_PS) { /* * If the current 1GB page already has the required * properties, then we need not demote this page. Just * increment tmpva to the next 1GB page frame. */ if ((*pdpe & pde_mask) == pde_bits) { tmpva = trunc_1gpage(tmpva) + NBPDP; continue; } /* * If the current offset aligns with a 1GB page frame * and there is at least 1GB left within the range, then * we need not break down this page into 2MB pages. */ if ((tmpva & PDPMASK) == 0 && tmpva + PDPMASK < base + size) { tmpva += NBPDP; continue; } if (!pmap_demote_pdpe(kernel_pmap, pdpe, tmpva)) return (ENOMEM); } pde = pmap_pdpe_to_pde(pdpe, tmpva); if (*pde == 0) { KASSERT((flags & MAPDEV_ASSERTVALID) == 0, ("%s: addr %#lx is not mapped", __func__, tmpva)); return (EINVAL); } if (*pde & PG_PS) { /* * If the current 2MB page already has the required * properties, then we need not demote this page. Just * increment tmpva to the next 2MB page frame. */ if ((*pde & pde_mask) == pde_bits) { tmpva = trunc_2mpage(tmpva) + NBPDR; continue; } /* * If the current offset aligns with a 2MB page frame * and there is at least 2MB left within the range, then * we need not break down this page into 4KB pages. */ if ((tmpva & PDRMASK) == 0 && tmpva + PDRMASK < base + size) { tmpva += NBPDR; continue; } if (!pmap_demote_pde(kernel_pmap, pde, tmpva)) return (ENOMEM); } pte = pmap_pde_to_pte(pde, tmpva); if (*pte == 0) { KASSERT((flags & MAPDEV_ASSERTVALID) == 0, ("%s: addr %#lx is not mapped", __func__, tmpva)); return (EINVAL); } tmpva += PAGE_SIZE; } error = 0; /* * Ok, all the pages exist, so run through them updating their * properties if required. */ changed = false; pa_start = pa_end = 0; for (tmpva = base; tmpva < base + size; ) { pdpe = pmap_pdpe(kernel_pmap, tmpva); if (*pdpe & PG_PS) { if ((*pdpe & pde_mask) != pde_bits) { pmap_pte_props(pdpe, pde_bits, pde_mask); changed = true; } if (tmpva >= VM_MIN_KERNEL_ADDRESS && (*pdpe & PG_PS_FRAME) < dmaplimit) { if (pa_start == pa_end) { /* Start physical address run. */ pa_start = *pdpe & PG_PS_FRAME; pa_end = pa_start + NBPDP; } else if (pa_end == (*pdpe & PG_PS_FRAME)) pa_end += NBPDP; else { /* Run ended, update direct map. */ error = pmap_change_props_locked( PHYS_TO_DMAP(pa_start), pa_end - pa_start, prot, mode, flags); if (error != 0) break; /* Start physical address run. */ pa_start = *pdpe & PG_PS_FRAME; pa_end = pa_start + NBPDP; } } tmpva = trunc_1gpage(tmpva) + NBPDP; continue; } pde = pmap_pdpe_to_pde(pdpe, tmpva); if (*pde & PG_PS) { if ((*pde & pde_mask) != pde_bits) { pmap_pte_props(pde, pde_bits, pde_mask); changed = true; } if (tmpva >= VM_MIN_KERNEL_ADDRESS && (*pde & PG_PS_FRAME) < dmaplimit) { if (pa_start == pa_end) { /* Start physical address run. */ pa_start = *pde & PG_PS_FRAME; pa_end = pa_start + NBPDR; } else if (pa_end == (*pde & PG_PS_FRAME)) pa_end += NBPDR; else { /* Run ended, update direct map. */ error = pmap_change_props_locked( PHYS_TO_DMAP(pa_start), pa_end - pa_start, prot, mode, flags); if (error != 0) break; /* Start physical address run. */ pa_start = *pde & PG_PS_FRAME; pa_end = pa_start + NBPDR; } } tmpva = trunc_2mpage(tmpva) + NBPDR; } else { pte = pmap_pde_to_pte(pde, tmpva); if ((*pte & pte_mask) != pte_bits) { pmap_pte_props(pte, pte_bits, pte_mask); changed = true; } if (tmpva >= VM_MIN_KERNEL_ADDRESS && (*pte & PG_FRAME) < dmaplimit) { if (pa_start == pa_end) { /* Start physical address run. */ pa_start = *pte & PG_FRAME; pa_end = pa_start + PAGE_SIZE; } else if (pa_end == (*pte & PG_FRAME)) pa_end += PAGE_SIZE; else { /* Run ended, update direct map. */ error = pmap_change_props_locked( PHYS_TO_DMAP(pa_start), pa_end - pa_start, prot, mode, flags); if (error != 0) break; /* Start physical address run. */ pa_start = *pte & PG_FRAME; pa_end = pa_start + PAGE_SIZE; } } tmpva += PAGE_SIZE; } } if (error == 0 && pa_start != pa_end && pa_start < dmaplimit) { pa_end1 = MIN(pa_end, dmaplimit); if (pa_start != pa_end1) error = pmap_change_props_locked(PHYS_TO_DMAP(pa_start), pa_end1 - pa_start, prot, mode, flags); } /* * Flush CPU caches if required to make sure any data isn't cached that * shouldn't be, etc. */ if (changed) { pmap_invalidate_range(kernel_pmap, base, tmpva); if ((flags & MAPDEV_FLUSHCACHE) != 0) pmap_invalidate_cache_range(base, tmpva); } return (error); } /* * Demotes any mapping within the direct map region that covers more than the * specified range of physical addresses. This range's size must be a power * of two and its starting address must be a multiple of its size. Since the * demotion does not change any attributes of the mapping, a TLB invalidation * is not mandatory. The caller may, however, request a TLB invalidation. */ void pmap_demote_DMAP(vm_paddr_t base, vm_size_t len, boolean_t invalidate) { pdp_entry_t *pdpe; pd_entry_t *pde; vm_offset_t va; boolean_t changed; if (len == 0) return; KASSERT(powerof2(len), ("pmap_demote_DMAP: len is not a power of 2")); KASSERT((base & (len - 1)) == 0, ("pmap_demote_DMAP: base is not a multiple of len")); if (len < NBPDP && base < dmaplimit) { va = PHYS_TO_DMAP(base); changed = FALSE; PMAP_LOCK(kernel_pmap); pdpe = pmap_pdpe(kernel_pmap, va); if ((*pdpe & X86_PG_V) == 0) panic("pmap_demote_DMAP: invalid PDPE"); if ((*pdpe & PG_PS) != 0) { if (!pmap_demote_pdpe(kernel_pmap, pdpe, va)) panic("pmap_demote_DMAP: PDPE failed"); changed = TRUE; } if (len < NBPDR) { pde = pmap_pdpe_to_pde(pdpe, va); if ((*pde & X86_PG_V) == 0) panic("pmap_demote_DMAP: invalid PDE"); if ((*pde & PG_PS) != 0) { if (!pmap_demote_pde(kernel_pmap, pde, va)) panic("pmap_demote_DMAP: PDE failed"); changed = TRUE; } } if (changed && invalidate) pmap_invalidate_page(kernel_pmap, va); PMAP_UNLOCK(kernel_pmap); } } /* * Perform the pmap work for mincore(2). If the page is not both referenced and * modified by this pmap, returns its physical address so that the caller can * find other mappings. */ int pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *pap) { pdp_entry_t *pdpe; pd_entry_t *pdep; pt_entry_t pte, PG_A, PG_M, PG_RW, PG_V; vm_paddr_t pa; int val; PG_A = pmap_accessed_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_V = pmap_valid_bit(pmap); PG_RW = pmap_rw_bit(pmap); PMAP_LOCK(pmap); pte = 0; pa = 0; val = 0; pdpe = pmap_pdpe(pmap, addr); if ((*pdpe & PG_V) != 0) { if ((*pdpe & PG_PS) != 0) { pte = *pdpe; pa = ((pte & PG_PS_PDP_FRAME) | (addr & PDPMASK)) & PG_FRAME; val = MINCORE_PSIND(2); } else { pdep = pmap_pde(pmap, addr); if (pdep != NULL && (*pdep & PG_V) != 0) { if ((*pdep & PG_PS) != 0) { pte = *pdep; /* Compute the physical address of the 4KB page. */ pa = ((pte & PG_PS_FRAME) | (addr & PDRMASK)) & PG_FRAME; val = MINCORE_PSIND(1); } else { pte = *pmap_pde_to_pte(pdep, addr); pa = pte & PG_FRAME; val = 0; } } } } if ((pte & PG_V) != 0) { val |= MINCORE_INCORE; if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; if ((pte & PG_A) != 0) val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; } if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && (pte & (PG_MANAGED | PG_V)) == (PG_MANAGED | PG_V)) { *pap = pa; } PMAP_UNLOCK(pmap); return (val); } static uint64_t pmap_pcid_alloc(pmap_t pmap, u_int cpuid) { uint32_t gen, new_gen, pcid_next; CRITICAL_ASSERT(curthread); gen = PCPU_GET(pcid_gen); if (pmap->pm_pcids[cpuid].pm_pcid == PMAP_PCID_KERN) return (pti ? 0 : CR3_PCID_SAVE); if (pmap->pm_pcids[cpuid].pm_gen == gen) return (CR3_PCID_SAVE); pcid_next = PCPU_GET(pcid_next); KASSERT((!pti && pcid_next <= PMAP_PCID_OVERMAX) || (pti && pcid_next <= PMAP_PCID_OVERMAX_KERN), ("cpu %d pcid_next %#x", cpuid, pcid_next)); if ((!pti && pcid_next == PMAP_PCID_OVERMAX) || (pti && pcid_next == PMAP_PCID_OVERMAX_KERN)) { new_gen = gen + 1; if (new_gen == 0) new_gen = 1; PCPU_SET(pcid_gen, new_gen); pcid_next = PMAP_PCID_KERN + 1; } else { new_gen = gen; } pmap->pm_pcids[cpuid].pm_pcid = pcid_next; pmap->pm_pcids[cpuid].pm_gen = new_gen; PCPU_SET(pcid_next, pcid_next + 1); return (0); } static uint64_t pmap_pcid_alloc_checked(pmap_t pmap, u_int cpuid) { uint64_t cached; cached = pmap_pcid_alloc(pmap, cpuid); KASSERT(pmap->pm_pcids[cpuid].pm_pcid < PMAP_PCID_OVERMAX, ("pmap %p cpu %d pcid %#x", pmap, cpuid, pmap->pm_pcids[cpuid].pm_pcid)); KASSERT(pmap->pm_pcids[cpuid].pm_pcid != PMAP_PCID_KERN || pmap == kernel_pmap, ("non-kernel pmap pmap %p cpu %d pcid %#x", pmap, cpuid, pmap->pm_pcids[cpuid].pm_pcid)); return (cached); } static void pmap_activate_sw_pti_post(struct thread *td, pmap_t pmap) { PCPU_GET(tssp)->tss_rsp0 = pmap->pm_ucr3 != PMAP_NO_CR3 ? PCPU_GET(pti_rsp0) : (uintptr_t)td->td_md.md_stack_base; } static void pmap_activate_sw_pcid_pti(struct thread *td, pmap_t pmap, u_int cpuid) { pmap_t old_pmap; uint64_t cached, cr3, kcr3, ucr3; KASSERT((read_rflags() & PSL_I) == 0, ("PCID needs interrupts disabled in pmap_activate_sw()")); /* See the comment in pmap_invalidate_page_pcid(). */ if (PCPU_GET(ucr3_load_mask) != PMAP_UCR3_NOMASK) { PCPU_SET(ucr3_load_mask, PMAP_UCR3_NOMASK); old_pmap = PCPU_GET(curpmap); MPASS(old_pmap->pm_ucr3 != PMAP_NO_CR3); old_pmap->pm_pcids[cpuid].pm_gen = 0; } cached = pmap_pcid_alloc_checked(pmap, cpuid); cr3 = rcr3(); if ((cr3 & ~CR3_PCID_MASK) != pmap->pm_cr3) load_cr3(pmap->pm_cr3 | pmap->pm_pcids[cpuid].pm_pcid); PCPU_SET(curpmap, pmap); kcr3 = pmap->pm_cr3 | pmap->pm_pcids[cpuid].pm_pcid; ucr3 = pmap->pm_ucr3 | pmap->pm_pcids[cpuid].pm_pcid | PMAP_PCID_USER_PT; if (!cached && pmap->pm_ucr3 != PMAP_NO_CR3) PCPU_SET(ucr3_load_mask, ~CR3_PCID_SAVE); PCPU_SET(kcr3, kcr3 | CR3_PCID_SAVE); PCPU_SET(ucr3, ucr3 | CR3_PCID_SAVE); if (cached) PCPU_INC(pm_save_cnt); pmap_activate_sw_pti_post(td, pmap); } static void pmap_activate_sw_pcid_nopti(struct thread *td __unused, pmap_t pmap, u_int cpuid) { uint64_t cached, cr3; KASSERT((read_rflags() & PSL_I) == 0, ("PCID needs interrupts disabled in pmap_activate_sw()")); cached = pmap_pcid_alloc_checked(pmap, cpuid); cr3 = rcr3(); if (!cached || (cr3 & ~CR3_PCID_MASK) != pmap->pm_cr3) load_cr3(pmap->pm_cr3 | pmap->pm_pcids[cpuid].pm_pcid | cached); PCPU_SET(curpmap, pmap); if (cached) PCPU_INC(pm_save_cnt); } static void pmap_activate_sw_nopcid_nopti(struct thread *td __unused, pmap_t pmap, u_int cpuid __unused) { load_cr3(pmap->pm_cr3); PCPU_SET(curpmap, pmap); } static void pmap_activate_sw_nopcid_pti(struct thread *td, pmap_t pmap, u_int cpuid __unused) { pmap_activate_sw_nopcid_nopti(td, pmap, cpuid); PCPU_SET(kcr3, pmap->pm_cr3); PCPU_SET(ucr3, pmap->pm_ucr3); pmap_activate_sw_pti_post(td, pmap); } DEFINE_IFUNC(static, void, pmap_activate_sw_mode, (struct thread *, pmap_t, u_int)) { if (pmap_pcid_enabled && pti) return (pmap_activate_sw_pcid_pti); else if (pmap_pcid_enabled && !pti) return (pmap_activate_sw_pcid_nopti); else if (!pmap_pcid_enabled && pti) return (pmap_activate_sw_nopcid_pti); else /* if (!pmap_pcid_enabled && !pti) */ return (pmap_activate_sw_nopcid_nopti); } void pmap_activate_sw(struct thread *td) { pmap_t oldpmap, pmap; u_int cpuid; oldpmap = PCPU_GET(curpmap); pmap = vmspace_pmap(td->td_proc->p_vmspace); if (oldpmap == pmap) { if (cpu_vendor_id != CPU_VENDOR_INTEL) mfence(); return; } cpuid = PCPU_GET(cpuid); #ifdef SMP CPU_SET_ATOMIC(cpuid, &pmap->pm_active); #else CPU_SET(cpuid, &pmap->pm_active); #endif pmap_activate_sw_mode(td, pmap, cpuid); #ifdef SMP CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active); #else CPU_CLR(cpuid, &oldpmap->pm_active); #endif } void pmap_activate(struct thread *td) { /* * invltlb_{invpcid,}_pcid_handler() is used to handle an * invalidate_all IPI, which checks for curpmap == * smp_tlb_pmap. The below sequence of operations has a * window where %CR3 is loaded with the new pmap's PML4 * address, but the curpmap value has not yet been updated. * This causes the invltlb IPI handler, which is called * between the updates, to execute as a NOP, which leaves * stale TLB entries. * * Note that the most common use of pmap_activate_sw(), from * a context switch, is immune to this race, because * interrupts are disabled (while the thread lock is owned), * so the IPI is delayed until after curpmap is updated. Protect * other callers in a similar way, by disabling interrupts * around the %cr3 register reload and curpmap assignment. */ spinlock_enter(); pmap_activate_sw(td); spinlock_exit(); } void pmap_activate_boot(pmap_t pmap) { uint64_t kcr3; u_int cpuid; /* * kernel_pmap must be never deactivated, and we ensure that * by never activating it at all. */ MPASS(pmap != kernel_pmap); cpuid = PCPU_GET(cpuid); #ifdef SMP CPU_SET_ATOMIC(cpuid, &pmap->pm_active); #else CPU_SET(cpuid, &pmap->pm_active); #endif PCPU_SET(curpmap, pmap); if (pti) { kcr3 = pmap->pm_cr3; if (pmap_pcid_enabled) kcr3 |= pmap->pm_pcids[cpuid].pm_pcid | CR3_PCID_SAVE; } else { kcr3 = PMAP_NO_CR3; } PCPU_SET(kcr3, kcr3); PCPU_SET(ucr3, PMAP_NO_CR3); } void pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz) { } /* * Increase the starting virtual address of the given mapping if a * different alignment might result in more superpage mappings. */ void pmap_align_superpage(vm_object_t object, vm_ooffset_t offset, vm_offset_t *addr, vm_size_t size) { vm_offset_t superpage_offset; if (size < NBPDR) return; if (object != NULL && (object->flags & OBJ_COLORED) != 0) offset += ptoa(object->pg_color); superpage_offset = offset & PDRMASK; if (size - ((NBPDR - superpage_offset) & PDRMASK) < NBPDR || (*addr & PDRMASK) == superpage_offset) return; if ((*addr & PDRMASK) < superpage_offset) *addr = (*addr & ~PDRMASK) + superpage_offset; else *addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset; } #ifdef INVARIANTS static unsigned long num_dirty_emulations; SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_dirty_emulations, CTLFLAG_RW, &num_dirty_emulations, 0, NULL); static unsigned long num_accessed_emulations; SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_accessed_emulations, CTLFLAG_RW, &num_accessed_emulations, 0, NULL); static unsigned long num_superpage_accessed_emulations; SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_superpage_accessed_emulations, CTLFLAG_RW, &num_superpage_accessed_emulations, 0, NULL); static unsigned long ad_emulation_superpage_promotions; SYSCTL_ULONG(_vm_pmap, OID_AUTO, ad_emulation_superpage_promotions, CTLFLAG_RW, &ad_emulation_superpage_promotions, 0, NULL); #endif /* INVARIANTS */ int pmap_emulate_accessed_dirty(pmap_t pmap, vm_offset_t va, int ftype) { int rv; struct rwlock *lock; #if VM_NRESERVLEVEL > 0 vm_page_t m, mpte; #endif pd_entry_t *pde; pt_entry_t *pte, PG_A, PG_M, PG_RW, PG_V; KASSERT(ftype == VM_PROT_READ || ftype == VM_PROT_WRITE, ("pmap_emulate_accessed_dirty: invalid fault type %d", ftype)); if (!pmap_emulate_ad_bits(pmap)) return (-1); PG_A = pmap_accessed_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_V = pmap_valid_bit(pmap); PG_RW = pmap_rw_bit(pmap); rv = -1; lock = NULL; PMAP_LOCK(pmap); pde = pmap_pde(pmap, va); if (pde == NULL || (*pde & PG_V) == 0) goto done; if ((*pde & PG_PS) != 0) { if (ftype == VM_PROT_READ) { #ifdef INVARIANTS atomic_add_long(&num_superpage_accessed_emulations, 1); #endif *pde |= PG_A; rv = 0; } goto done; } pte = pmap_pde_to_pte(pde, va); if ((*pte & PG_V) == 0) goto done; if (ftype == VM_PROT_WRITE) { if ((*pte & PG_RW) == 0) goto done; /* * Set the modified and accessed bits simultaneously. * * Intel EPT PTEs that do software emulation of A/D bits map * PG_A and PG_M to EPT_PG_READ and EPT_PG_WRITE respectively. * An EPT misconfiguration is triggered if the PTE is writable * but not readable (WR=10). This is avoided by setting PG_A * and PG_M simultaneously. */ *pte |= PG_M | PG_A; } else { *pte |= PG_A; } #if VM_NRESERVLEVEL > 0 /* try to promote the mapping */ if (va < VM_MAXUSER_ADDRESS) mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); else mpte = NULL; m = PHYS_TO_VM_PAGE(*pte & PG_FRAME); if ((mpte == NULL || mpte->ref_count == NPTEPG) && pmap_ps_enabled(pmap) && (m->flags & PG_FICTITIOUS) == 0 && vm_reserv_level_iffullpop(m) == 0) { pmap_promote_pde(pmap, pde, va, &lock); #ifdef INVARIANTS atomic_add_long(&ad_emulation_superpage_promotions, 1); #endif } #endif #ifdef INVARIANTS if (ftype == VM_PROT_WRITE) atomic_add_long(&num_dirty_emulations, 1); else atomic_add_long(&num_accessed_emulations, 1); #endif rv = 0; /* success */ done: if (lock != NULL) rw_wunlock(lock); PMAP_UNLOCK(pmap); return (rv); } void pmap_get_mapping(pmap_t pmap, vm_offset_t va, uint64_t *ptr, int *num) { pml4_entry_t *pml4; pdp_entry_t *pdp; pd_entry_t *pde; pt_entry_t *pte, PG_V; int idx; idx = 0; PG_V = pmap_valid_bit(pmap); PMAP_LOCK(pmap); pml4 = pmap_pml4e(pmap, va); if (pml4 == NULL) goto done; ptr[idx++] = *pml4; if ((*pml4 & PG_V) == 0) goto done; pdp = pmap_pml4e_to_pdpe(pml4, va); ptr[idx++] = *pdp; if ((*pdp & PG_V) == 0 || (*pdp & PG_PS) != 0) goto done; pde = pmap_pdpe_to_pde(pdp, va); ptr[idx++] = *pde; if ((*pde & PG_V) == 0 || (*pde & PG_PS) != 0) goto done; pte = pmap_pde_to_pte(pde, va); ptr[idx++] = *pte; done: PMAP_UNLOCK(pmap); *num = idx; } /** * Get the kernel virtual address of a set of physical pages. If there are * physical addresses not covered by the DMAP perform a transient mapping * that will be removed when calling pmap_unmap_io_transient. * * \param page The pages the caller wishes to obtain the virtual * address on the kernel memory map. * \param vaddr On return contains the kernel virtual memory address * of the pages passed in the page parameter. * \param count Number of pages passed in. * \param can_fault TRUE if the thread using the mapped pages can take * page faults, FALSE otherwise. * * \returns TRUE if the caller must call pmap_unmap_io_transient when * finished or FALSE otherwise. * */ boolean_t pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, boolean_t can_fault) { vm_paddr_t paddr; boolean_t needs_mapping; pt_entry_t *pte; int cache_bits, error __unused, i; /* * Allocate any KVA space that we need, this is done in a separate * loop to prevent calling vmem_alloc while pinned. */ needs_mapping = FALSE; for (i = 0; i < count; i++) { paddr = VM_PAGE_TO_PHYS(page[i]); if (__predict_false(paddr >= dmaplimit)) { error = vmem_alloc(kernel_arena, PAGE_SIZE, M_BESTFIT | M_WAITOK, &vaddr[i]); KASSERT(error == 0, ("vmem_alloc failed: %d", error)); needs_mapping = TRUE; } else { vaddr[i] = PHYS_TO_DMAP(paddr); } } /* Exit early if everything is covered by the DMAP */ if (!needs_mapping) return (FALSE); /* * NB: The sequence of updating a page table followed by accesses * to the corresponding pages used in the !DMAP case is subject to * the situation described in the "AMD64 Architecture Programmer's * Manual Volume 2: System Programming" rev. 3.23, "7.3.1 Special * Coherency Considerations". Therefore, issuing the INVLPG right * after modifying the PTE bits is crucial. */ if (!can_fault) sched_pin(); for (i = 0; i < count; i++) { paddr = VM_PAGE_TO_PHYS(page[i]); if (paddr >= dmaplimit) { if (can_fault) { /* * Slow path, since we can get page faults * while mappings are active don't pin the * thread to the CPU and instead add a global * mapping visible to all CPUs. */ pmap_qenter(vaddr[i], &page[i], 1); } else { pte = vtopte(vaddr[i]); cache_bits = pmap_cache_bits(kernel_pmap, page[i]->md.pat_mode, 0); pte_store(pte, paddr | X86_PG_RW | X86_PG_V | cache_bits); invlpg(vaddr[i]); } } } return (needs_mapping); } void pmap_unmap_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, boolean_t can_fault) { vm_paddr_t paddr; int i; if (!can_fault) sched_unpin(); for (i = 0; i < count; i++) { paddr = VM_PAGE_TO_PHYS(page[i]); if (paddr >= dmaplimit) { if (can_fault) pmap_qremove(vaddr[i], 1); vmem_free(kernel_arena, vaddr[i], PAGE_SIZE); } } } vm_offset_t pmap_quick_enter_page(vm_page_t m) { vm_paddr_t paddr; paddr = VM_PAGE_TO_PHYS(m); if (paddr < dmaplimit) return (PHYS_TO_DMAP(paddr)); mtx_lock_spin(&qframe_mtx); KASSERT(*vtopte(qframe) == 0, ("qframe busy")); pte_store(vtopte(qframe), paddr | X86_PG_RW | X86_PG_V | X86_PG_A | X86_PG_M | pmap_cache_bits(kernel_pmap, m->md.pat_mode, 0)); return (qframe); } void pmap_quick_remove_page(vm_offset_t addr) { if (addr != qframe) return; pte_store(vtopte(qframe), 0); invlpg(qframe); mtx_unlock_spin(&qframe_mtx); } /* * Pdp pages from the large map are managed differently from either * kernel or user page table pages. They are permanently allocated at * initialization time, and their reference count is permanently set to * zero. The pml4 entries pointing to those pages are copied into * each allocated pmap. * * In contrast, pd and pt pages are managed like user page table * pages. They are dynamically allocated, and their reference count * represents the number of valid entries within the page. */ static vm_page_t pmap_large_map_getptp_unlocked(void) { vm_page_t m; m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_ZERO); if (m != NULL && (m->flags & PG_ZERO) == 0) pmap_zero_page(m); return (m); } static vm_page_t pmap_large_map_getptp(void) { vm_page_t m; PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED); m = pmap_large_map_getptp_unlocked(); if (m == NULL) { PMAP_UNLOCK(kernel_pmap); vm_wait(NULL); PMAP_LOCK(kernel_pmap); /* Callers retry. */ } return (m); } static pdp_entry_t * pmap_large_map_pdpe(vm_offset_t va) { vm_pindex_t pml4_idx; vm_paddr_t mphys; pml4_idx = pmap_pml4e_index(va); KASSERT(LMSPML4I <= pml4_idx && pml4_idx < LMSPML4I + lm_ents, ("pmap_large_map_pdpe: va %#jx out of range idx %#jx LMSPML4I " "%#jx lm_ents %d", (uintmax_t)va, (uintmax_t)pml4_idx, LMSPML4I, lm_ents)); KASSERT((kernel_pml4[pml4_idx] & X86_PG_V) != 0, ("pmap_large_map_pdpe: invalid pml4 for va %#jx idx %#jx " "LMSPML4I %#jx lm_ents %d", (uintmax_t)va, (uintmax_t)pml4_idx, LMSPML4I, lm_ents)); mphys = kernel_pml4[pml4_idx] & PG_FRAME; return ((pdp_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pdpe_index(va)); } static pd_entry_t * pmap_large_map_pde(vm_offset_t va) { pdp_entry_t *pdpe; vm_page_t m; vm_paddr_t mphys; retry: pdpe = pmap_large_map_pdpe(va); if (*pdpe == 0) { m = pmap_large_map_getptp(); if (m == NULL) goto retry; mphys = VM_PAGE_TO_PHYS(m); *pdpe = mphys | X86_PG_A | X86_PG_RW | X86_PG_V | pg_nx; } else { MPASS((*pdpe & X86_PG_PS) == 0); mphys = *pdpe & PG_FRAME; } return ((pd_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pde_index(va)); } static pt_entry_t * pmap_large_map_pte(vm_offset_t va) { pd_entry_t *pde; vm_page_t m; vm_paddr_t mphys; retry: pde = pmap_large_map_pde(va); if (*pde == 0) { m = pmap_large_map_getptp(); if (m == NULL) goto retry; mphys = VM_PAGE_TO_PHYS(m); *pde = mphys | X86_PG_A | X86_PG_RW | X86_PG_V | pg_nx; PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pde))->ref_count++; } else { MPASS((*pde & X86_PG_PS) == 0); mphys = *pde & PG_FRAME; } return ((pt_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pte_index(va)); } static vm_paddr_t pmap_large_map_kextract(vm_offset_t va) { pdp_entry_t *pdpe, pdp; pd_entry_t *pde, pd; pt_entry_t *pte, pt; KASSERT(PMAP_ADDRESS_IN_LARGEMAP(va), ("not largemap range %#lx", (u_long)va)); pdpe = pmap_large_map_pdpe(va); pdp = *pdpe; KASSERT((pdp & X86_PG_V) != 0, ("invalid pdp va %#lx pdpe %#lx pdp %#lx", va, (u_long)pdpe, pdp)); if ((pdp & X86_PG_PS) != 0) { KASSERT((amd_feature & AMDID_PAGE1GB) != 0, ("no 1G pages, va %#lx pdpe %#lx pdp %#lx", va, (u_long)pdpe, pdp)); return ((pdp & PG_PS_PDP_FRAME) | (va & PDPMASK)); } pde = pmap_pdpe_to_pde(pdpe, va); pd = *pde; KASSERT((pd & X86_PG_V) != 0, ("invalid pd va %#lx pde %#lx pd %#lx", va, (u_long)pde, pd)); if ((pd & X86_PG_PS) != 0) return ((pd & PG_PS_FRAME) | (va & PDRMASK)); pte = pmap_pde_to_pte(pde, va); pt = *pte; KASSERT((pt & X86_PG_V) != 0, ("invalid pte va %#lx pte %#lx pt %#lx", va, (u_long)pte, pt)); return ((pt & PG_FRAME) | (va & PAGE_MASK)); } static int pmap_large_map_getva(vm_size_t len, vm_offset_t align, vm_offset_t phase, vmem_addr_t *vmem_res) { /* * Large mappings are all but static. Consequently, there * is no point in waiting for an earlier allocation to be * freed. */ return (vmem_xalloc(large_vmem, len, align, phase, 0, VMEM_ADDR_MIN, VMEM_ADDR_MAX, M_NOWAIT | M_BESTFIT, vmem_res)); } int pmap_large_map(vm_paddr_t spa, vm_size_t len, void **addr, vm_memattr_t mattr) { pdp_entry_t *pdpe; pd_entry_t *pde; pt_entry_t *pte; vm_offset_t va, inc; vmem_addr_t vmem_res; vm_paddr_t pa; int error; if (len == 0 || spa + len < spa) return (EINVAL); /* See if DMAP can serve. */ if (spa + len <= dmaplimit) { va = PHYS_TO_DMAP(spa); *addr = (void *)va; return (pmap_change_attr(va, len, mattr)); } /* * No, allocate KVA. Fit the address with best possible * alignment for superpages. Fall back to worse align if * failed. */ error = ENOMEM; if ((amd_feature & AMDID_PAGE1GB) != 0 && rounddown2(spa + len, NBPDP) >= roundup2(spa, NBPDP) + NBPDP) error = pmap_large_map_getva(len, NBPDP, spa & PDPMASK, &vmem_res); if (error != 0 && rounddown2(spa + len, NBPDR) >= roundup2(spa, NBPDR) + NBPDR) error = pmap_large_map_getva(len, NBPDR, spa & PDRMASK, &vmem_res); if (error != 0) error = pmap_large_map_getva(len, PAGE_SIZE, 0, &vmem_res); if (error != 0) return (error); /* * Fill pagetable. PG_M is not pre-set, we scan modified bits * in the pagetable to minimize flushing. No need to * invalidate TLB, since we only update invalid entries. */ PMAP_LOCK(kernel_pmap); for (pa = spa, va = vmem_res; len > 0; pa += inc, va += inc, len -= inc) { if ((amd_feature & AMDID_PAGE1GB) != 0 && len >= NBPDP && (pa & PDPMASK) == 0 && (va & PDPMASK) == 0) { pdpe = pmap_large_map_pdpe(va); MPASS(*pdpe == 0); *pdpe = pa | pg_g | X86_PG_PS | X86_PG_RW | X86_PG_V | X86_PG_A | pg_nx | pmap_cache_bits(kernel_pmap, mattr, TRUE); inc = NBPDP; } else if (len >= NBPDR && (pa & PDRMASK) == 0 && (va & PDRMASK) == 0) { pde = pmap_large_map_pde(va); MPASS(*pde == 0); *pde = pa | pg_g | X86_PG_PS | X86_PG_RW | X86_PG_V | X86_PG_A | pg_nx | pmap_cache_bits(kernel_pmap, mattr, TRUE); PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pde))-> ref_count++; inc = NBPDR; } else { pte = pmap_large_map_pte(va); MPASS(*pte == 0); *pte = pa | pg_g | X86_PG_RW | X86_PG_V | X86_PG_A | pg_nx | pmap_cache_bits(kernel_pmap, mattr, FALSE); PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pte))-> ref_count++; inc = PAGE_SIZE; } } PMAP_UNLOCK(kernel_pmap); MPASS(len == 0); *addr = (void *)vmem_res; return (0); } void pmap_large_unmap(void *svaa, vm_size_t len) { vm_offset_t sva, va; vm_size_t inc; pdp_entry_t *pdpe, pdp; pd_entry_t *pde, pd; pt_entry_t *pte; vm_page_t m; struct spglist spgf; sva = (vm_offset_t)svaa; if (len == 0 || sva + len < sva || (sva >= DMAP_MIN_ADDRESS && sva + len <= DMAP_MIN_ADDRESS + dmaplimit)) return; SLIST_INIT(&spgf); KASSERT(PMAP_ADDRESS_IN_LARGEMAP(sva) && PMAP_ADDRESS_IN_LARGEMAP(sva + len - 1), ("not largemap range %#lx %#lx", (u_long)svaa, (u_long)svaa + len)); PMAP_LOCK(kernel_pmap); for (va = sva; va < sva + len; va += inc) { pdpe = pmap_large_map_pdpe(va); pdp = *pdpe; KASSERT((pdp & X86_PG_V) != 0, ("invalid pdp va %#lx pdpe %#lx pdp %#lx", va, (u_long)pdpe, pdp)); if ((pdp & X86_PG_PS) != 0) { KASSERT((amd_feature & AMDID_PAGE1GB) != 0, ("no 1G pages, va %#lx pdpe %#lx pdp %#lx", va, (u_long)pdpe, pdp)); KASSERT((va & PDPMASK) == 0, ("PDPMASK bit set, va %#lx pdpe %#lx pdp %#lx", va, (u_long)pdpe, pdp)); KASSERT(va + NBPDP <= sva + len, ("unmap covers partial 1GB page, sva %#lx va %#lx " "pdpe %#lx pdp %#lx len %#lx", sva, va, (u_long)pdpe, pdp, len)); *pdpe = 0; inc = NBPDP; continue; } pde = pmap_pdpe_to_pde(pdpe, va); pd = *pde; KASSERT((pd & X86_PG_V) != 0, ("invalid pd va %#lx pde %#lx pd %#lx", va, (u_long)pde, pd)); if ((pd & X86_PG_PS) != 0) { KASSERT((va & PDRMASK) == 0, ("PDRMASK bit set, va %#lx pde %#lx pd %#lx", va, (u_long)pde, pd)); KASSERT(va + NBPDR <= sva + len, ("unmap covers partial 2MB page, sva %#lx va %#lx " "pde %#lx pd %#lx len %#lx", sva, va, (u_long)pde, pd, len)); pde_store(pde, 0); inc = NBPDR; m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pde)); m->ref_count--; if (m->ref_count == 0) { *pdpe = 0; SLIST_INSERT_HEAD(&spgf, m, plinks.s.ss); } continue; } pte = pmap_pde_to_pte(pde, va); KASSERT((*pte & X86_PG_V) != 0, ("invalid pte va %#lx pte %#lx pt %#lx", va, (u_long)pte, *pte)); pte_clear(pte); inc = PAGE_SIZE; m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pte)); m->ref_count--; if (m->ref_count == 0) { *pde = 0; SLIST_INSERT_HEAD(&spgf, m, plinks.s.ss); m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pde)); m->ref_count--; if (m->ref_count == 0) { *pdpe = 0; SLIST_INSERT_HEAD(&spgf, m, plinks.s.ss); } } } pmap_invalidate_range(kernel_pmap, sva, sva + len); PMAP_UNLOCK(kernel_pmap); vm_page_free_pages_toq(&spgf, false); vmem_free(large_vmem, sva, len); } static void pmap_large_map_wb_fence_mfence(void) { mfence(); } static void pmap_large_map_wb_fence_atomic(void) { atomic_thread_fence_seq_cst(); } static void pmap_large_map_wb_fence_nop(void) { } DEFINE_IFUNC(static, void, pmap_large_map_wb_fence, (void)) { if (cpu_vendor_id != CPU_VENDOR_INTEL) return (pmap_large_map_wb_fence_mfence); else if ((cpu_stdext_feature & (CPUID_STDEXT_CLWB | CPUID_STDEXT_CLFLUSHOPT)) == 0) return (pmap_large_map_wb_fence_atomic); else /* clflush is strongly enough ordered */ return (pmap_large_map_wb_fence_nop); } static void pmap_large_map_flush_range_clwb(vm_offset_t va, vm_size_t len) { for (; len > 0; len -= cpu_clflush_line_size, va += cpu_clflush_line_size) clwb(va); } static void pmap_large_map_flush_range_clflushopt(vm_offset_t va, vm_size_t len) { for (; len > 0; len -= cpu_clflush_line_size, va += cpu_clflush_line_size) clflushopt(va); } static void pmap_large_map_flush_range_clflush(vm_offset_t va, vm_size_t len) { for (; len > 0; len -= cpu_clflush_line_size, va += cpu_clflush_line_size) clflush(va); } static void pmap_large_map_flush_range_nop(vm_offset_t sva __unused, vm_size_t len __unused) { } DEFINE_IFUNC(static, void, pmap_large_map_flush_range, (vm_offset_t, vm_size_t)) { if ((cpu_stdext_feature & CPUID_STDEXT_CLWB) != 0) return (pmap_large_map_flush_range_clwb); else if ((cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0) return (pmap_large_map_flush_range_clflushopt); else if ((cpu_feature & CPUID_CLFSH) != 0) return (pmap_large_map_flush_range_clflush); else return (pmap_large_map_flush_range_nop); } static void pmap_large_map_wb_large(vm_offset_t sva, vm_offset_t eva) { volatile u_long *pe; u_long p; vm_offset_t va; vm_size_t inc; bool seen_other; for (va = sva; va < eva; va += inc) { inc = 0; if ((amd_feature & AMDID_PAGE1GB) != 0) { pe = (volatile u_long *)pmap_large_map_pdpe(va); p = *pe; if ((p & X86_PG_PS) != 0) inc = NBPDP; } if (inc == 0) { pe = (volatile u_long *)pmap_large_map_pde(va); p = *pe; if ((p & X86_PG_PS) != 0) inc = NBPDR; } if (inc == 0) { pe = (volatile u_long *)pmap_large_map_pte(va); p = *pe; inc = PAGE_SIZE; } seen_other = false; for (;;) { if ((p & X86_PG_AVAIL1) != 0) { /* * Spin-wait for the end of a parallel * write-back. */ cpu_spinwait(); p = *pe; /* * If we saw other write-back * occuring, we cannot rely on PG_M to * indicate state of the cache. The * PG_M bit is cleared before the * flush to avoid ignoring new writes, * and writes which are relevant for * us might happen after. */ seen_other = true; continue; } if ((p & X86_PG_M) != 0 || seen_other) { if (!atomic_fcmpset_long(pe, &p, (p & ~X86_PG_M) | X86_PG_AVAIL1)) /* * If we saw PG_M without * PG_AVAIL1, and then on the * next attempt we do not * observe either PG_M or * PG_AVAIL1, the other * write-back started after us * and finished before us. We * can rely on it doing our * work. */ continue; pmap_large_map_flush_range(va, inc); atomic_clear_long(pe, X86_PG_AVAIL1); } break; } maybe_yield(); } } /* * Write-back cache lines for the given address range. * * Must be called only on the range or sub-range returned from * pmap_large_map(). Must not be called on the coalesced ranges. * * Does nothing on CPUs without CLWB, CLFLUSHOPT, or CLFLUSH * instructions support. */ void pmap_large_map_wb(void *svap, vm_size_t len) { vm_offset_t eva, sva; sva = (vm_offset_t)svap; eva = sva + len; pmap_large_map_wb_fence(); if (sva >= DMAP_MIN_ADDRESS && eva <= DMAP_MIN_ADDRESS + dmaplimit) { pmap_large_map_flush_range(sva, len); } else { KASSERT(sva >= LARGEMAP_MIN_ADDRESS && eva <= LARGEMAP_MIN_ADDRESS + lm_ents * NBPML4, ("pmap_large_map_wb: not largemap %#lx %#lx", sva, len)); pmap_large_map_wb_large(sva, eva); } pmap_large_map_wb_fence(); } static vm_page_t pmap_pti_alloc_page(void) { vm_page_t m; VM_OBJECT_ASSERT_WLOCKED(pti_obj); m = vm_page_grab(pti_obj, pti_pg_idx++, VM_ALLOC_NOBUSY | VM_ALLOC_WIRED | VM_ALLOC_ZERO); return (m); } static bool pmap_pti_free_page(vm_page_t m) { KASSERT(m->ref_count > 0, ("page %p not referenced", m)); if (!vm_page_unwire_noq(m)) return (false); vm_page_free_zero(m); return (true); } static void pmap_pti_init(void) { vm_page_t pml4_pg; pdp_entry_t *pdpe; vm_offset_t va; int i; if (!pti) return; pti_obj = vm_pager_allocate(OBJT_PHYS, NULL, 0, VM_PROT_ALL, 0, NULL); VM_OBJECT_WLOCK(pti_obj); pml4_pg = pmap_pti_alloc_page(); pti_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4_pg)); for (va = VM_MIN_KERNEL_ADDRESS; va <= VM_MAX_KERNEL_ADDRESS && va >= VM_MIN_KERNEL_ADDRESS && va > NBPML4; va += NBPML4) { pdpe = pmap_pti_pdpe(va); pmap_pti_wire_pte(pdpe); } pmap_pti_add_kva_locked((vm_offset_t)&__pcpu[0], (vm_offset_t)&__pcpu[0] + sizeof(__pcpu[0]) * MAXCPU, false); pmap_pti_add_kva_locked((vm_offset_t)idt, (vm_offset_t)idt + sizeof(struct gate_descriptor) * NIDT, false); CPU_FOREACH(i) { /* Doublefault stack IST 1 */ va = __pcpu[i].pc_common_tss.tss_ist1 + sizeof(struct nmi_pcpu); pmap_pti_add_kva_locked(va - PAGE_SIZE, va, false); /* NMI stack IST 2 */ va = __pcpu[i].pc_common_tss.tss_ist2 + sizeof(struct nmi_pcpu); pmap_pti_add_kva_locked(va - PAGE_SIZE, va, false); /* MC# stack IST 3 */ va = __pcpu[i].pc_common_tss.tss_ist3 + sizeof(struct nmi_pcpu); pmap_pti_add_kva_locked(va - PAGE_SIZE, va, false); /* DB# stack IST 4 */ va = __pcpu[i].pc_common_tss.tss_ist4 + sizeof(struct nmi_pcpu); pmap_pti_add_kva_locked(va - PAGE_SIZE, va, false); } pmap_pti_add_kva_locked((vm_offset_t)kernphys + KERNBASE, (vm_offset_t)etext, true); pti_finalized = true; VM_OBJECT_WUNLOCK(pti_obj); } SYSINIT(pmap_pti, SI_SUB_CPU + 1, SI_ORDER_ANY, pmap_pti_init, NULL); static pdp_entry_t * pmap_pti_pdpe(vm_offset_t va) { pml4_entry_t *pml4e; pdp_entry_t *pdpe; vm_page_t m; vm_pindex_t pml4_idx; vm_paddr_t mphys; VM_OBJECT_ASSERT_WLOCKED(pti_obj); pml4_idx = pmap_pml4e_index(va); pml4e = &pti_pml4[pml4_idx]; m = NULL; if (*pml4e == 0) { if (pti_finalized) panic("pml4 alloc after finalization\n"); m = pmap_pti_alloc_page(); if (*pml4e != 0) { pmap_pti_free_page(m); mphys = *pml4e & ~PAGE_MASK; } else { mphys = VM_PAGE_TO_PHYS(m); *pml4e = mphys | X86_PG_RW | X86_PG_V; } } else { mphys = *pml4e & ~PAGE_MASK; } pdpe = (pdp_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pdpe_index(va); return (pdpe); } static void pmap_pti_wire_pte(void *pte) { vm_page_t m; VM_OBJECT_ASSERT_WLOCKED(pti_obj); m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pte)); m->ref_count++; } static void pmap_pti_unwire_pde(void *pde, bool only_ref) { vm_page_t m; VM_OBJECT_ASSERT_WLOCKED(pti_obj); m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pde)); MPASS(m->ref_count > 0); MPASS(only_ref || m->ref_count > 1); pmap_pti_free_page(m); } static void pmap_pti_unwire_pte(void *pte, vm_offset_t va) { vm_page_t m; pd_entry_t *pde; VM_OBJECT_ASSERT_WLOCKED(pti_obj); m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pte)); MPASS(m->ref_count > 0); if (pmap_pti_free_page(m)) { pde = pmap_pti_pde(va); MPASS((*pde & (X86_PG_PS | X86_PG_V)) == X86_PG_V); *pde = 0; pmap_pti_unwire_pde(pde, false); } } static pd_entry_t * pmap_pti_pde(vm_offset_t va) { pdp_entry_t *pdpe; pd_entry_t *pde; vm_page_t m; vm_pindex_t pd_idx; vm_paddr_t mphys; VM_OBJECT_ASSERT_WLOCKED(pti_obj); pdpe = pmap_pti_pdpe(va); if (*pdpe == 0) { m = pmap_pti_alloc_page(); if (*pdpe != 0) { pmap_pti_free_page(m); MPASS((*pdpe & X86_PG_PS) == 0); mphys = *pdpe & ~PAGE_MASK; } else { mphys = VM_PAGE_TO_PHYS(m); *pdpe = mphys | X86_PG_RW | X86_PG_V; } } else { MPASS((*pdpe & X86_PG_PS) == 0); mphys = *pdpe & ~PAGE_MASK; } pde = (pd_entry_t *)PHYS_TO_DMAP(mphys); pd_idx = pmap_pde_index(va); pde += pd_idx; return (pde); } static pt_entry_t * pmap_pti_pte(vm_offset_t va, bool *unwire_pde) { pd_entry_t *pde; pt_entry_t *pte; vm_page_t m; vm_paddr_t mphys; VM_OBJECT_ASSERT_WLOCKED(pti_obj); pde = pmap_pti_pde(va); if (unwire_pde != NULL) { *unwire_pde = true; pmap_pti_wire_pte(pde); } if (*pde == 0) { m = pmap_pti_alloc_page(); if (*pde != 0) { pmap_pti_free_page(m); MPASS((*pde & X86_PG_PS) == 0); mphys = *pde & ~(PAGE_MASK | pg_nx); } else { mphys = VM_PAGE_TO_PHYS(m); *pde = mphys | X86_PG_RW | X86_PG_V; if (unwire_pde != NULL) *unwire_pde = false; } } else { MPASS((*pde & X86_PG_PS) == 0); mphys = *pde & ~(PAGE_MASK | pg_nx); } pte = (pt_entry_t *)PHYS_TO_DMAP(mphys); pte += pmap_pte_index(va); return (pte); } static void pmap_pti_add_kva_locked(vm_offset_t sva, vm_offset_t eva, bool exec) { vm_paddr_t pa; pd_entry_t *pde; pt_entry_t *pte, ptev; bool unwire_pde; VM_OBJECT_ASSERT_WLOCKED(pti_obj); sva = trunc_page(sva); MPASS(sva > VM_MAXUSER_ADDRESS); eva = round_page(eva); MPASS(sva < eva); for (; sva < eva; sva += PAGE_SIZE) { pte = pmap_pti_pte(sva, &unwire_pde); pa = pmap_kextract(sva); ptev = pa | X86_PG_RW | X86_PG_V | X86_PG_A | X86_PG_G | (exec ? 0 : pg_nx) | pmap_cache_bits(kernel_pmap, VM_MEMATTR_DEFAULT, FALSE); if (*pte == 0) { pte_store(pte, ptev); pmap_pti_wire_pte(pte); } else { KASSERT(!pti_finalized, ("pti overlap after fin %#lx %#lx %#lx", sva, *pte, ptev)); KASSERT(*pte == ptev, ("pti non-identical pte after fin %#lx %#lx %#lx", sva, *pte, ptev)); } if (unwire_pde) { pde = pmap_pti_pde(sva); pmap_pti_unwire_pde(pde, true); } } } void pmap_pti_add_kva(vm_offset_t sva, vm_offset_t eva, bool exec) { if (!pti) return; VM_OBJECT_WLOCK(pti_obj); pmap_pti_add_kva_locked(sva, eva, exec); VM_OBJECT_WUNLOCK(pti_obj); } void pmap_pti_remove_kva(vm_offset_t sva, vm_offset_t eva) { pt_entry_t *pte; vm_offset_t va; if (!pti) return; sva = rounddown2(sva, PAGE_SIZE); MPASS(sva > VM_MAXUSER_ADDRESS); eva = roundup2(eva, PAGE_SIZE); MPASS(sva < eva); VM_OBJECT_WLOCK(pti_obj); for (va = sva; va < eva; va += PAGE_SIZE) { pte = pmap_pti_pte(va, NULL); KASSERT((*pte & X86_PG_V) != 0, ("invalid pte va %#lx pte %#lx pt %#lx", va, (u_long)pte, *pte)); pte_clear(pte); pmap_pti_unwire_pte(pte, va); } pmap_invalidate_range(kernel_pmap, sva, eva); VM_OBJECT_WUNLOCK(pti_obj); } static void * pkru_dup_range(void *ctx __unused, void *data) { struct pmap_pkru_range *node, *new_node; new_node = uma_zalloc(pmap_pkru_ranges_zone, M_NOWAIT); if (new_node == NULL) return (NULL); node = data; memcpy(new_node, node, sizeof(*node)); return (new_node); } static void pkru_free_range(void *ctx __unused, void *node) { uma_zfree(pmap_pkru_ranges_zone, node); } static int pmap_pkru_assign(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, u_int keyidx, int flags) { struct pmap_pkru_range *ppr; int error; PMAP_LOCK_ASSERT(pmap, MA_OWNED); MPASS(pmap->pm_type == PT_X86); MPASS((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0); if ((flags & AMD64_PKRU_EXCL) != 0 && !rangeset_check_empty(&pmap->pm_pkru, sva, eva)) return (EBUSY); ppr = uma_zalloc(pmap_pkru_ranges_zone, M_NOWAIT); if (ppr == NULL) return (ENOMEM); ppr->pkru_keyidx = keyidx; ppr->pkru_flags = flags & AMD64_PKRU_PERSIST; error = rangeset_insert(&pmap->pm_pkru, sva, eva, ppr); if (error != 0) uma_zfree(pmap_pkru_ranges_zone, ppr); return (error); } static int pmap_pkru_deassign(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); MPASS(pmap->pm_type == PT_X86); MPASS((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0); return (rangeset_remove(&pmap->pm_pkru, sva, eva)); } static void pmap_pkru_deassign_all(pmap_t pmap) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); if (pmap->pm_type == PT_X86 && (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) rangeset_remove_all(&pmap->pm_pkru); } static bool pmap_pkru_same(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { struct pmap_pkru_range *ppr, *prev_ppr; vm_offset_t va; PMAP_LOCK_ASSERT(pmap, MA_OWNED); if (pmap->pm_type != PT_X86 || (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0 || sva >= VM_MAXUSER_ADDRESS) return (true); MPASS(eva <= VM_MAXUSER_ADDRESS); for (va = sva; va < eva; prev_ppr = ppr) { ppr = rangeset_lookup(&pmap->pm_pkru, va); if (va == sva) prev_ppr = ppr; else if ((ppr == NULL) ^ (prev_ppr == NULL)) return (false); if (ppr == NULL) { va += PAGE_SIZE; continue; } if (prev_ppr->pkru_keyidx != ppr->pkru_keyidx) return (false); va = ppr->pkru_rs_el.re_end; } return (true); } static pt_entry_t pmap_pkru_get(pmap_t pmap, vm_offset_t va) { struct pmap_pkru_range *ppr; PMAP_LOCK_ASSERT(pmap, MA_OWNED); if (pmap->pm_type != PT_X86 || (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0 || va >= VM_MAXUSER_ADDRESS) return (0); ppr = rangeset_lookup(&pmap->pm_pkru, va); if (ppr != NULL) return (X86_PG_PKU(ppr->pkru_keyidx)); return (0); } static bool pred_pkru_on_remove(void *ctx __unused, void *r) { struct pmap_pkru_range *ppr; ppr = r; return ((ppr->pkru_flags & AMD64_PKRU_PERSIST) == 0); } static void pmap_pkru_on_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); if (pmap->pm_type == PT_X86 && (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) { rangeset_remove_pred(&pmap->pm_pkru, sva, eva, pred_pkru_on_remove); } } static int pmap_pkru_copy(pmap_t dst_pmap, pmap_t src_pmap) { PMAP_LOCK_ASSERT(dst_pmap, MA_OWNED); PMAP_LOCK_ASSERT(src_pmap, MA_OWNED); MPASS(dst_pmap->pm_type == PT_X86); MPASS(src_pmap->pm_type == PT_X86); MPASS((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0); if (src_pmap->pm_pkru.rs_data_ctx == NULL) return (0); return (rangeset_copy(&dst_pmap->pm_pkru, &src_pmap->pm_pkru)); } static void pmap_pkru_update_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, u_int keyidx) { pml4_entry_t *pml4e; pdp_entry_t *pdpe; pd_entry_t newpde, ptpaddr, *pde; pt_entry_t newpte, *ptep, pte; vm_offset_t va, va_next; bool changed; PMAP_LOCK_ASSERT(pmap, MA_OWNED); MPASS(pmap->pm_type == PT_X86); MPASS(keyidx <= PMAP_MAX_PKRU_IDX); for (changed = false, va = sva; va < eva; va = va_next) { pml4e = pmap_pml4e(pmap, va); if (pml4e == NULL || (*pml4e & X86_PG_V) == 0) { va_next = (va + NBPML4) & ~PML4MASK; if (va_next < va) va_next = eva; continue; } pdpe = pmap_pml4e_to_pdpe(pml4e, va); if ((*pdpe & X86_PG_V) == 0) { va_next = (va + NBPDP) & ~PDPMASK; if (va_next < va) va_next = eva; continue; } va_next = (va + NBPDR) & ~PDRMASK; if (va_next < va) va_next = eva; pde = pmap_pdpe_to_pde(pdpe, va); ptpaddr = *pde; if (ptpaddr == 0) continue; MPASS((ptpaddr & X86_PG_V) != 0); if ((ptpaddr & PG_PS) != 0) { if (va + NBPDR == va_next && eva >= va_next) { newpde = (ptpaddr & ~X86_PG_PKU_MASK) | X86_PG_PKU(keyidx); if (newpde != ptpaddr) { *pde = newpde; changed = true; } continue; } else if (!pmap_demote_pde(pmap, pde, va)) { continue; } } if (va_next > eva) va_next = eva; for (ptep = pmap_pde_to_pte(pde, va); va != va_next; ptep++, va += PAGE_SIZE) { pte = *ptep; if ((pte & X86_PG_V) == 0) continue; newpte = (pte & ~X86_PG_PKU_MASK) | X86_PG_PKU(keyidx); if (newpte != pte) { *ptep = newpte; changed = true; } } } if (changed) pmap_invalidate_range(pmap, sva, eva); } static int pmap_pkru_check_uargs(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, u_int keyidx, int flags) { if (pmap->pm_type != PT_X86 || keyidx > PMAP_MAX_PKRU_IDX || (flags & ~(AMD64_PKRU_PERSIST | AMD64_PKRU_EXCL)) != 0) return (EINVAL); if (eva <= sva || eva > VM_MAXUSER_ADDRESS) return (EFAULT); if ((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0) return (ENOTSUP); return (0); } int pmap_pkru_set(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, u_int keyidx, int flags) { int error; sva = trunc_page(sva); eva = round_page(eva); error = pmap_pkru_check_uargs(pmap, sva, eva, keyidx, flags); if (error != 0) return (error); for (;;) { PMAP_LOCK(pmap); error = pmap_pkru_assign(pmap, sva, eva, keyidx, flags); if (error == 0) pmap_pkru_update_range(pmap, sva, eva, keyidx); PMAP_UNLOCK(pmap); if (error != ENOMEM) break; vm_wait(NULL); } return (error); } int pmap_pkru_clear(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { int error; sva = trunc_page(sva); eva = round_page(eva); error = pmap_pkru_check_uargs(pmap, sva, eva, 0, 0); if (error != 0) return (error); for (;;) { PMAP_LOCK(pmap); error = pmap_pkru_deassign(pmap, sva, eva); if (error == 0) pmap_pkru_update_range(pmap, sva, eva, 0); PMAP_UNLOCK(pmap); if (error != ENOMEM) break; vm_wait(NULL); } return (error); } /* * Track a range of the kernel's virtual address space that is contiguous * in various mapping attributes. */ struct pmap_kernel_map_range { vm_offset_t sva; pt_entry_t attrs; int ptes; int pdes; int pdpes; }; static void sysctl_kmaps_dump(struct sbuf *sb, struct pmap_kernel_map_range *range, vm_offset_t eva) { const char *mode; int i, pat_idx; if (eva <= range->sva) return; pat_idx = pmap_pat_index(kernel_pmap, range->attrs, true); for (i = 0; i < PAT_INDEX_SIZE; i++) if (pat_index[i] == pat_idx) break; switch (i) { case PAT_WRITE_BACK: mode = "WB"; break; case PAT_WRITE_THROUGH: mode = "WT"; break; case PAT_UNCACHEABLE: mode = "UC"; break; case PAT_UNCACHED: mode = "U-"; break; case PAT_WRITE_PROTECTED: mode = "WP"; break; case PAT_WRITE_COMBINING: mode = "WC"; break; default: printf("%s: unknown PAT mode %#x for range 0x%016lx-0x%016lx\n", __func__, pat_idx, range->sva, eva); mode = "??"; break; } sbuf_printf(sb, "0x%016lx-0x%016lx r%c%c%c%c %s %d %d %d\n", range->sva, eva, (range->attrs & X86_PG_RW) != 0 ? 'w' : '-', (range->attrs & pg_nx) != 0 ? '-' : 'x', (range->attrs & X86_PG_U) != 0 ? 'u' : 's', (range->attrs & X86_PG_G) != 0 ? 'g' : '-', mode, range->pdpes, range->pdes, range->ptes); /* Reset to sentinel value. */ range->sva = la57 ? KV5ADDR(NPML5EPG - 1, NPML4EPG - 1, NPDPEPG - 1, NPDEPG - 1, NPTEPG - 1) : KV4ADDR(NPML4EPG - 1, NPDPEPG - 1, NPDEPG - 1, NPTEPG - 1); } /* * Determine whether the attributes specified by a page table entry match those * being tracked by the current range. This is not quite as simple as a direct * flag comparison since some PAT modes have multiple representations. */ static bool sysctl_kmaps_match(struct pmap_kernel_map_range *range, pt_entry_t attrs) { pt_entry_t diff, mask; mask = X86_PG_G | X86_PG_RW | X86_PG_U | X86_PG_PDE_CACHE | pg_nx; diff = (range->attrs ^ attrs) & mask; if (diff == 0) return (true); if ((diff & ~X86_PG_PDE_PAT) == 0 && pmap_pat_index(kernel_pmap, range->attrs, true) == pmap_pat_index(kernel_pmap, attrs, true)) return (true); return (false); } static void sysctl_kmaps_reinit(struct pmap_kernel_map_range *range, vm_offset_t va, pt_entry_t attrs) { memset(range, 0, sizeof(*range)); range->sva = va; range->attrs = attrs; } /* * Given a leaf PTE, derive the mapping's attributes. If they do not match * those of the current run, dump the address range and its attributes, and * begin a new run. */ static void sysctl_kmaps_check(struct sbuf *sb, struct pmap_kernel_map_range *range, vm_offset_t va, pml4_entry_t pml4e, pdp_entry_t pdpe, pd_entry_t pde, pt_entry_t pte) { pt_entry_t attrs; attrs = pml4e & (X86_PG_RW | X86_PG_U | pg_nx); attrs |= pdpe & pg_nx; attrs &= pg_nx | (pdpe & (X86_PG_RW | X86_PG_U)); if ((pdpe & PG_PS) != 0) { attrs |= pdpe & (X86_PG_G | X86_PG_PDE_CACHE); } else if (pde != 0) { attrs |= pde & pg_nx; attrs &= pg_nx | (pde & (X86_PG_RW | X86_PG_U)); } if ((pde & PG_PS) != 0) { attrs |= pde & (X86_PG_G | X86_PG_PDE_CACHE); } else if (pte != 0) { attrs |= pte & pg_nx; attrs &= pg_nx | (pte & (X86_PG_RW | X86_PG_U)); attrs |= pte & (X86_PG_G | X86_PG_PTE_CACHE); /* Canonicalize by always using the PDE PAT bit. */ if ((attrs & X86_PG_PTE_PAT) != 0) attrs ^= X86_PG_PDE_PAT | X86_PG_PTE_PAT; } if (range->sva > va || !sysctl_kmaps_match(range, attrs)) { sysctl_kmaps_dump(sb, range, va); sysctl_kmaps_reinit(range, va, attrs); } } static int sysctl_kmaps(SYSCTL_HANDLER_ARGS) { struct pmap_kernel_map_range range; struct sbuf sbuf, *sb; pml4_entry_t pml4e; pdp_entry_t *pdp, pdpe; pd_entry_t *pd, pde; pt_entry_t *pt, pte; vm_offset_t sva; vm_paddr_t pa; int error, i, j, k, l; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); sb = &sbuf; sbuf_new_for_sysctl(sb, NULL, PAGE_SIZE, req); /* Sentinel value. */ range.sva = la57 ? KV5ADDR(NPML5EPG - 1, NPML4EPG - 1, NPDPEPG - 1, NPDEPG - 1, NPTEPG - 1) : KV4ADDR(NPML4EPG - 1, NPDPEPG - 1, NPDEPG - 1, NPTEPG - 1); /* * Iterate over the kernel page tables without holding the kernel pmap * lock. Outside of the large map, kernel page table pages are never * freed, so at worst we will observe inconsistencies in the output. * Within the large map, ensure that PDP and PD page addresses are * valid before descending. */ for (sva = 0, i = pmap_pml4e_index(sva); i < NPML4EPG; i++) { switch (i) { case PML4PML4I: sbuf_printf(sb, "\nRecursive map:\n"); break; case DMPML4I: sbuf_printf(sb, "\nDirect map:\n"); break; case KPML4BASE: sbuf_printf(sb, "\nKernel map:\n"); break; case LMSPML4I: sbuf_printf(sb, "\nLarge map:\n"); break; } /* Convert to canonical form. */ if (sva == 1ul << 47) sva |= -1ul << 48; restart: pml4e = kernel_pml4[i]; if ((pml4e & X86_PG_V) == 0) { sva = rounddown2(sva, NBPML4); sysctl_kmaps_dump(sb, &range, sva); sva += NBPML4; continue; } pa = pml4e & PG_FRAME; pdp = (pdp_entry_t *)PHYS_TO_DMAP(pa); for (j = pmap_pdpe_index(sva); j < NPDPEPG; j++) { pdpe = pdp[j]; if ((pdpe & X86_PG_V) == 0) { sva = rounddown2(sva, NBPDP); sysctl_kmaps_dump(sb, &range, sva); sva += NBPDP; continue; } pa = pdpe & PG_FRAME; if (PMAP_ADDRESS_IN_LARGEMAP(sva) && vm_phys_paddr_to_vm_page(pa) == NULL) goto restart; if ((pdpe & PG_PS) != 0) { sva = rounddown2(sva, NBPDP); sysctl_kmaps_check(sb, &range, sva, pml4e, pdpe, 0, 0); range.pdpes++; sva += NBPDP; continue; } pd = (pd_entry_t *)PHYS_TO_DMAP(pa); for (k = pmap_pde_index(sva); k < NPDEPG; k++) { pde = pd[k]; if ((pde & X86_PG_V) == 0) { sva = rounddown2(sva, NBPDR); sysctl_kmaps_dump(sb, &range, sva); sva += NBPDR; continue; } pa = pde & PG_FRAME; if (PMAP_ADDRESS_IN_LARGEMAP(sva) && vm_phys_paddr_to_vm_page(pa) == NULL) goto restart; if ((pde & PG_PS) != 0) { sva = rounddown2(sva, NBPDR); sysctl_kmaps_check(sb, &range, sva, pml4e, pdpe, pde, 0); range.pdes++; sva += NBPDR; continue; } pt = (pt_entry_t *)PHYS_TO_DMAP(pa); for (l = pmap_pte_index(sva); l < NPTEPG; l++, sva += PAGE_SIZE) { pte = pt[l]; if ((pte & X86_PG_V) == 0) { sysctl_kmaps_dump(sb, &range, sva); continue; } sysctl_kmaps_check(sb, &range, sva, pml4e, pdpe, pde, pte); range.ptes++; } } } } error = sbuf_finish(sb); sbuf_delete(sb); return (error); } SYSCTL_OID(_vm_pmap, OID_AUTO, kernel_maps, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_kmaps, "A", "Dump kernel address layout"); #ifdef DDB DB_SHOW_COMMAND(pte, pmap_print_pte) { pmap_t pmap; pml5_entry_t *pml5; pml4_entry_t *pml4; pdp_entry_t *pdp; pd_entry_t *pde; pt_entry_t *pte, PG_V; vm_offset_t va; if (!have_addr) { db_printf("show pte addr\n"); return; } va = (vm_offset_t)addr; if (kdb_thread != NULL) pmap = vmspace_pmap(kdb_thread->td_proc->p_vmspace); else pmap = PCPU_GET(curpmap); PG_V = pmap_valid_bit(pmap); db_printf("VA 0x%016lx", va); if (pmap_is_la57(pmap)) { pml5 = pmap_pml5e(pmap, va); db_printf(" pml5e 0x%016lx", *pml5); if ((*pml5 & PG_V) == 0) { db_printf("\n"); return; } pml4 = pmap_pml5e_to_pml4e(pml5, va); } else { pml4 = pmap_pml4e(pmap, va); } db_printf(" pml4e 0x%016lx", *pml4); if ((*pml4 & PG_V) == 0) { db_printf("\n"); return; } pdp = pmap_pml4e_to_pdpe(pml4, va); db_printf(" pdpe 0x%016lx", *pdp); if ((*pdp & PG_V) == 0 || (*pdp & PG_PS) != 0) { db_printf("\n"); return; } pde = pmap_pdpe_to_pde(pdp, va); db_printf(" pde 0x%016lx", *pde); if ((*pde & PG_V) == 0 || (*pde & PG_PS) != 0) { db_printf("\n"); return; } pte = pmap_pde_to_pte(pde, va); db_printf(" pte 0x%016lx\n", *pte); } DB_SHOW_COMMAND(phys2dmap, pmap_phys2dmap) { vm_paddr_t a; if (have_addr) { a = (vm_paddr_t)addr; db_printf("0x%jx\n", (uintmax_t)PHYS_TO_DMAP(a)); } else { db_printf("show phys2dmap addr\n"); } } static void ptpages_show_page(int level, int idx, vm_page_t pg) { db_printf("l %d i %d pg %p phys %#lx ref %x\n", level, idx, pg, VM_PAGE_TO_PHYS(pg), pg->ref_count); } static void ptpages_show_complain(int level, int idx, uint64_t pte) { db_printf("l %d i %d pte %#lx\n", level, idx, pte); } static void ptpages_show_pml4(vm_page_t pg4, int num_entries, uint64_t PG_V) { vm_page_t pg3, pg2, pg1; pml4_entry_t *pml4; pdp_entry_t *pdp; pd_entry_t *pd; int i4, i3, i2; pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pg4)); for (i4 = 0; i4 < num_entries; i4++) { if ((pml4[i4] & PG_V) == 0) continue; pg3 = PHYS_TO_VM_PAGE(pml4[i4] & PG_FRAME); if (pg3 == NULL) { ptpages_show_complain(3, i4, pml4[i4]); continue; } ptpages_show_page(3, i4, pg3); pdp = (pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pg3)); for (i3 = 0; i3 < NPDPEPG; i3++) { if ((pdp[i3] & PG_V) == 0) continue; pg2 = PHYS_TO_VM_PAGE(pdp[i3] & PG_FRAME); if (pg3 == NULL) { ptpages_show_complain(2, i3, pdp[i3]); continue; } ptpages_show_page(2, i3, pg2); pd = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pg2)); for (i2 = 0; i2 < NPDEPG; i2++) { if ((pd[i2] & PG_V) == 0) continue; pg1 = PHYS_TO_VM_PAGE(pd[i2] & PG_FRAME); if (pg1 == NULL) { ptpages_show_complain(1, i2, pd[i2]); continue; } ptpages_show_page(1, i2, pg1); } } } } DB_SHOW_COMMAND(ptpages, pmap_ptpages) { pmap_t pmap; vm_page_t pg; pml5_entry_t *pml5; uint64_t PG_V; int i5; if (have_addr) pmap = (pmap_t)addr; else pmap = PCPU_GET(curpmap); PG_V = pmap_valid_bit(pmap); if (pmap_is_la57(pmap)) { pml5 = pmap->pm_pmltop; for (i5 = 0; i5 < NUPML5E; i5++) { if ((pml5[i5] & PG_V) == 0) continue; pg = PHYS_TO_VM_PAGE(pml5[i5] & PG_FRAME); if (pg == NULL) { ptpages_show_complain(4, i5, pml5[i5]); continue; } ptpages_show_page(4, i5, pg); ptpages_show_pml4(pg, NPML4EPG, PG_V); } } else { ptpages_show_pml4(PHYS_TO_VM_PAGE(DMAP_TO_PHYS( (vm_offset_t)pmap->pm_pmltop)), NUP4ML4E, PG_V); } } #endif Index: head/sys/amd64/amd64/uma_machdep.c =================================================================== --- head/sys/amd64/amd64/uma_machdep.c (revision 366710) +++ head/sys/amd64/amd64/uma_machdep.c (revision 366711) @@ -1,79 +1,80 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2003 Alan L. Cox * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include +#include #include #include +#include #include #include #include -#include void * uma_small_alloc(uma_zone_t zone, vm_size_t bytes, int domain, u_int8_t *flags, int wait) { vm_page_t m; vm_paddr_t pa; void *va; *flags = UMA_SLAB_PRIV; m = vm_page_alloc_domain(NULL, 0, domain, malloc2vm_flags(wait) | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED); if (m == NULL) return (NULL); pa = m->phys_addr; if ((wait & M_NODUMP) == 0) dump_add_page(pa); va = (void *)PHYS_TO_DMAP(pa); if ((wait & M_ZERO) && (m->flags & PG_ZERO) == 0) pagezero(va); return (va); } void uma_small_free(void *mem, vm_size_t size, u_int8_t flags) { vm_page_t m; vm_paddr_t pa; pa = DMAP_TO_PHYS((vm_offset_t)mem); dump_drop_page(pa); m = PHYS_TO_VM_PAGE(pa); vm_page_unwire_noq(m); vm_page_free(m); } Index: head/sys/arm/arm/mem.c =================================================================== --- head/sys/arm/arm/mem.c (revision 366710) +++ head/sys/arm/arm/mem.c (revision 366711) @@ -1,181 +1,181 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1988 University of Utah. * Copyright (c) 1982, 1986, 1990 The Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department, and code derived from software contributed to * Berkeley by William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: Utah $Hdr: mem.c 1.13 89/10/08$ * from: @(#)mem.c 7.2 (Berkeley) 5/9/91 */ #include __FBSDID("$FreeBSD$"); /* * Memory special file */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include -#include /* * Used in /dev/mem drivers and elsewhere */ MALLOC_DEFINE(M_MEMDESC, "memdesc", "memory range descriptors"); struct mem_range_softc mem_range_softc; static struct sx tmppt_lock; SX_SYSINIT(tmppt, &tmppt_lock, "mem4map"); /* ARGSUSED */ int memrw(struct cdev *dev, struct uio *uio, int flags) { int o; u_int c = 0, v; struct iovec *iov; int error = 0; vm_offset_t addr, eaddr; while (uio->uio_resid > 0 && error == 0) { iov = uio->uio_iov; if (iov->iov_len == 0) { uio->uio_iov++; uio->uio_iovcnt--; if (uio->uio_iovcnt < 0) panic("memrw"); continue; } if (dev2unit(dev) == CDEV_MINOR_MEM) { int i; int address_valid = 0; v = uio->uio_offset; v &= ~PAGE_MASK; for (i = 0; dump_avail[i] || dump_avail[i + 1]; i += 2) { if (v >= dump_avail[i] && v < dump_avail[i + 1]) { address_valid = 1; break; } } if (!address_valid) return (EINVAL); sx_xlock(&tmppt_lock); pmap_kenter((vm_offset_t)_tmppt, v); #if __ARM_ARCH >= 6 pmap_tlb_flush(kernel_pmap, (vm_offset_t)_tmppt); #endif o = (int)uio->uio_offset & PAGE_MASK; c = (u_int)(PAGE_SIZE - ((int)iov->iov_base & PAGE_MASK)); c = min(c, (u_int)(PAGE_SIZE - o)); c = min(c, (u_int)iov->iov_len); error = uiomove((caddr_t)&_tmppt[o], (int)c, uio); pmap_qremove((vm_offset_t)_tmppt, 1); sx_xunlock(&tmppt_lock); continue; } else if (dev2unit(dev) == CDEV_MINOR_KMEM) { c = iov->iov_len; /* * Make sure that all of the pages are currently * resident so that we don't create any zero-fill * pages. */ addr = trunc_page(uio->uio_offset); eaddr = round_page(uio->uio_offset + c); for (; addr < eaddr; addr += PAGE_SIZE) if (pmap_extract(kernel_pmap, addr) == 0) return (EFAULT); if (!kernacc((caddr_t)(int)uio->uio_offset, c, uio->uio_rw == UIO_READ ? VM_PROT_READ : VM_PROT_WRITE)) return (EFAULT); error = uiomove((caddr_t)(int)uio->uio_offset, (int)c, uio); continue; } /* else panic! */ } return (error); } /* * allow user processes to MMAP some memory sections * instead of going through read/write */ /* ARGSUSED */ int memmmap(struct cdev *dev, vm_ooffset_t offset, vm_paddr_t *paddr, int prot __unused, vm_memattr_t *memattr __unused) { if (dev2unit(dev) == CDEV_MINOR_MEM) { *paddr = offset; return (0); } return (-1); } int memioctl_md(struct cdev *dev __unused, u_long cmd __unused, caddr_t data __unused, int flags __unused, struct thread *td __unused) { return (ENOTTY); } Index: head/sys/arm/arm/minidump_machdep.c =================================================================== --- head/sys/arm/arm/minidump_machdep.c (revision 366710) +++ head/sys/arm/arm/minidump_machdep.c (revision 366711) @@ -1,342 +1,343 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2006 Peter Wemm * Copyright (c) 2008 Semihalf, Grzegorz Bernacki * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * from: FreeBSD: src/sys/i386/i386/minidump_machdep.c,v 1.6 2008/08/17 23:27:27 */ #include __FBSDID("$FreeBSD$"); #include "opt_watchdog.h" #include #include #include #include #include #include #include #ifdef SW_WATCHDOG #include #endif #include #include #include #include +#include #include #include #include #include #include #include CTASSERT(sizeof(struct kerneldumpheader) == 512); static struct kerneldumpheader kdh; /* Handle chunked writes. */ static size_t fragsz; static void *dump_va; static uint64_t counter, progress; static int is_dumpable(vm_paddr_t pa) { int i; for (i = 0; dump_avail[i] != 0 || dump_avail[i + 1] != 0; i += 2) { if (pa >= dump_avail[i] && pa < dump_avail[i + 1]) return (1); } return (0); } #define PG2MB(pgs) (((pgs) + (1 << 8) - 1) >> 8) static int blk_flush(struct dumperinfo *di) { int error; if (fragsz == 0) return (0); error = dump_append(di, dump_va, 0, fragsz); fragsz = 0; return (error); } static int blk_write(struct dumperinfo *di, char *ptr, vm_paddr_t pa, size_t sz) { size_t len; int error, i, c; u_int maxdumpsz; maxdumpsz = min(di->maxiosize, MAXDUMPPGS * PAGE_SIZE); if (maxdumpsz == 0) /* seatbelt */ maxdumpsz = PAGE_SIZE; error = 0; if (ptr != NULL && pa != 0) { printf("cant have both va and pa!\n"); return (EINVAL); } if (pa != 0) { if ((sz % PAGE_SIZE) != 0) { printf("size not page aligned\n"); return (EINVAL); } if ((pa & PAGE_MASK) != 0) { printf("address not page aligned\n"); return (EINVAL); } } if (ptr != NULL) { /* Flush any pre-existing pa pages before a virtual dump. */ error = blk_flush(di); if (error) return (error); } while (sz) { len = maxdumpsz - fragsz; if (len > sz) len = sz; counter += len; progress -= len; if (counter >> 22) { printf(" %lld", PG2MB(progress >> PAGE_SHIFT)); counter &= (1<<22) - 1; } #ifdef SW_WATCHDOG wdog_kern_pat(WD_LASTVAL); #endif if (ptr) { error = dump_append(di, ptr, 0, len); if (error) return (error); ptr += len; sz -= len; } else { for (i = 0; i < len; i += PAGE_SIZE) dump_va = pmap_kenter_temporary(pa + i, (i + fragsz) >> PAGE_SHIFT); fragsz += len; pa += len; sz -= len; if (fragsz == maxdumpsz) { error = blk_flush(di); if (error) return (error); } } /* Check for user abort. */ c = cncheckc(); if (c == 0x03) return (ECANCELED); if (c != -1) printf(" (CTRL-C to abort) "); } return (0); } /* A buffer for general use. Its size must be one page at least. */ static char dumpbuf[PAGE_SIZE]; CTASSERT(sizeof(dumpbuf) % sizeof(pt2_entry_t) == 0); int minidumpsys(struct dumperinfo *di) { struct minidumphdr mdhdr; uint64_t dumpsize; uint32_t ptesize; uint32_t pa, prev_pa = 0, count = 0; vm_offset_t va; int error; char *addr; /* * Flush caches. Note that in the SMP case this operates only on the * current CPU's L1 cache. Before we reach this point, code in either * the system shutdown or kernel debugger has called stop_cpus() to stop * all cores other than this one. Part of the ARM handling of * stop_cpus() is to call wbinv_all() on that core's local L1 cache. So * by time we get to here, all that remains is to flush the L1 for the * current CPU, then the L2. */ dcache_wbinv_poc_all(); counter = 0; /* Walk page table pages, set bits in vm_page_dump */ ptesize = 0; for (va = KERNBASE; va < kernel_vm_end; va += PAGE_SIZE) { pa = pmap_dump_kextract(va, NULL); if (pa != 0 && is_dumpable(pa)) dump_add_page(pa); ptesize += sizeof(pt2_entry_t); } /* Calculate dump size. */ dumpsize = ptesize; dumpsize += round_page(msgbufp->msg_size); dumpsize += round_page(sizeof(dump_avail)); dumpsize += round_page(BITSET_SIZE(vm_page_dump_pages)); VM_PAGE_DUMP_FOREACH(pa) { /* Clear out undumpable pages now if needed */ if (is_dumpable(pa)) dumpsize += PAGE_SIZE; else dump_drop_page(pa); } dumpsize += PAGE_SIZE; progress = dumpsize; /* Initialize mdhdr */ bzero(&mdhdr, sizeof(mdhdr)); strcpy(mdhdr.magic, MINIDUMP_MAGIC); mdhdr.version = MINIDUMP_VERSION; mdhdr.msgbufsize = msgbufp->msg_size; mdhdr.bitmapsize = round_page(BITSET_SIZE(vm_page_dump_pages)); mdhdr.ptesize = ptesize; mdhdr.kernbase = KERNBASE; mdhdr.arch = __ARM_ARCH; #if __ARM_ARCH >= 6 mdhdr.mmuformat = MINIDUMP_MMU_FORMAT_V6; #else mdhdr.mmuformat = MINIDUMP_MMU_FORMAT_V4; #endif mdhdr.dumpavailsize = round_page(sizeof(dump_avail)); dump_init_header(di, &kdh, KERNELDUMPMAGIC, KERNELDUMP_ARM_VERSION, dumpsize); error = dump_start(di, &kdh); if (error != 0) goto fail; printf("Physical memory: %u MB\n", ptoa((uintmax_t)physmem) / 1048576); printf("Dumping %llu MB:", (long long)dumpsize >> 20); /* Dump my header */ bzero(dumpbuf, sizeof(dumpbuf)); bcopy(&mdhdr, dumpbuf, sizeof(mdhdr)); error = blk_write(di, dumpbuf, 0, PAGE_SIZE); if (error) goto fail; /* Dump msgbuf up front */ error = blk_write(di, (char *)msgbufp->msg_ptr, 0, round_page(msgbufp->msg_size)); if (error) goto fail; /* Dump dump_avail */ _Static_assert(sizeof(dump_avail) <= sizeof(dumpbuf), "Large dump_avail not handled"); bzero(dumpbuf, sizeof(dumpbuf)); memcpy(dumpbuf, dump_avail, sizeof(dump_avail)); error = blk_write(di, dumpbuf, 0, PAGE_SIZE); if (error) goto fail; /* Dump bitmap */ error = blk_write(di, (char *)vm_page_dump, 0, round_page(BITSET_SIZE(vm_page_dump_pages))); if (error) goto fail; /* Dump kernel page table pages */ addr = dumpbuf; for (va = KERNBASE; va < kernel_vm_end; va += PAGE_SIZE) { pmap_dump_kextract(va, (pt2_entry_t *)addr); addr += sizeof(pt2_entry_t); if (addr == dumpbuf + sizeof(dumpbuf)) { error = blk_write(di, dumpbuf, 0, sizeof(dumpbuf)); if (error != 0) goto fail; addr = dumpbuf; } } if (addr != dumpbuf) { error = blk_write(di, dumpbuf, 0, addr - dumpbuf); if (error != 0) goto fail; } /* Dump memory chunks */ VM_PAGE_DUMP_FOREACH(pa) { if (!count) { prev_pa = pa; count++; } else { if (pa == (prev_pa + count * PAGE_SIZE)) count++; else { error = blk_write(di, NULL, prev_pa, count * PAGE_SIZE); if (error) goto fail; count = 1; prev_pa = pa; } } } if (count) { error = blk_write(di, NULL, prev_pa, count * PAGE_SIZE); if (error) goto fail; count = 0; prev_pa = 0; } error = blk_flush(di); if (error) goto fail; error = dump_finish(di, &kdh); if (error != 0) goto fail; printf("\nDump complete\n"); return (0); fail: if (error < 0) error = -error; if (error == ECANCELED) printf("\nDump aborted\n"); else if (error == E2BIG || error == ENOSPC) { printf("\nDump failed. Partition too small (about %lluMB were " "needed this time).\n", (long long)dumpsize >> 20); } else printf("\n** DUMP FAILED (ERROR %d) **\n", error); return (error); } Index: head/sys/arm64/arm64/minidump_machdep.c =================================================================== --- head/sys/arm64/arm64/minidump_machdep.c (revision 366710) +++ head/sys/arm64/arm64/minidump_machdep.c (revision 366711) @@ -1,416 +1,417 @@ /*- * Copyright (c) 2006 Peter Wemm * Copyright (c) 2015 The FreeBSD Foundation * All rights reserved. * * This software was developed by Andrew Turner under * sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_watchdog.h" #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include CTASSERT(sizeof(struct kerneldumpheader) == 512); static struct kerneldumpheader kdh; /* Handle chunked writes. */ static size_t fragsz; static void *dump_va; static size_t counter, progress, dumpsize; static uint64_t tmpbuffer[Ln_ENTRIES]; static int is_dumpable(vm_paddr_t pa) { vm_page_t m; int i; if ((m = vm_phys_paddr_to_vm_page(pa)) != NULL) return ((m->flags & PG_NODUMP) == 0); for (i = 0; dump_avail[i] != 0 || dump_avail[i + 1] != 0; i += 2) { if (pa >= dump_avail[i] && pa < dump_avail[i + 1]) return (1); } return (0); } static int blk_flush(struct dumperinfo *di) { int error; if (fragsz == 0) return (0); error = dump_append(di, dump_va, 0, fragsz); fragsz = 0; return (error); } static struct { int min_per; int max_per; int visited; } progress_track[10] = { { 0, 10, 0}, { 10, 20, 0}, { 20, 30, 0}, { 30, 40, 0}, { 40, 50, 0}, { 50, 60, 0}, { 60, 70, 0}, { 70, 80, 0}, { 80, 90, 0}, { 90, 100, 0} }; static void report_progress(size_t progress, size_t dumpsize) { int sofar, i; sofar = 100 - ((progress * 100) / dumpsize); for (i = 0; i < nitems(progress_track); i++) { if (sofar < progress_track[i].min_per || sofar > progress_track[i].max_per) continue; if (progress_track[i].visited) return; progress_track[i].visited = 1; printf("..%d%%", sofar); return; } } static int blk_write(struct dumperinfo *di, char *ptr, vm_paddr_t pa, size_t sz) { size_t len; int error, c; u_int maxdumpsz; maxdumpsz = min(di->maxiosize, MAXDUMPPGS * PAGE_SIZE); if (maxdumpsz == 0) /* seatbelt */ maxdumpsz = PAGE_SIZE; error = 0; if ((sz % PAGE_SIZE) != 0) { printf("size not page aligned\n"); return (EINVAL); } if (ptr != NULL && pa != 0) { printf("cant have both va and pa!\n"); return (EINVAL); } if ((((uintptr_t)pa) % PAGE_SIZE) != 0) { printf("address not page aligned %p\n", ptr); return (EINVAL); } if (ptr != NULL) { /* * If we're doing a virtual dump, flush any * pre-existing pa pages. */ error = blk_flush(di); if (error) return (error); } while (sz) { len = maxdumpsz - fragsz; if (len > sz) len = sz; counter += len; progress -= len; if (counter >> 22) { report_progress(progress, dumpsize); counter &= (1 << 22) - 1; } wdog_kern_pat(WD_LASTVAL); if (ptr) { error = dump_append(di, ptr, 0, len); if (error) return (error); ptr += len; sz -= len; } else { dump_va = (void *)PHYS_TO_DMAP(pa); fragsz += len; pa += len; sz -= len; error = blk_flush(di); if (error) return (error); } /* Check for user abort. */ c = cncheckc(); if (c == 0x03) return (ECANCELED); if (c != -1) printf(" (CTRL-C to abort) "); } return (0); } int minidumpsys(struct dumperinfo *di) { struct minidumphdr mdhdr; pd_entry_t *l0, *l1, *l2; pt_entry_t *l3; vm_offset_t va; vm_paddr_t pa; uint32_t pmapsize; int error, i, j, retry_count; retry_count = 0; retry: retry_count++; error = 0; pmapsize = 0; for (va = VM_MIN_KERNEL_ADDRESS; va < kernel_vm_end; va += L2_SIZE) { pmapsize += PAGE_SIZE; if (!pmap_get_tables(pmap_kernel(), va, &l0, &l1, &l2, &l3)) continue; if ((*l1 & ATTR_DESCR_MASK) == L1_BLOCK) { pa = *l1 & ~ATTR_MASK; for (i = 0; i < Ln_ENTRIES * Ln_ENTRIES; i++, pa += PAGE_SIZE) if (is_dumpable(pa)) dump_add_page(pa); pmapsize += (Ln_ENTRIES - 1) * PAGE_SIZE; va += L1_SIZE - L2_SIZE; } else if ((*l2 & ATTR_DESCR_MASK) == L2_BLOCK) { pa = *l2 & ~ATTR_MASK; for (i = 0; i < Ln_ENTRIES; i++, pa += PAGE_SIZE) { if (is_dumpable(pa)) dump_add_page(pa); } } else if ((*l2 & ATTR_DESCR_MASK) == L2_TABLE) { for (i = 0; i < Ln_ENTRIES; i++) { if ((l3[i] & ATTR_DESCR_MASK) != L3_PAGE) continue; pa = l3[i] & ~ATTR_MASK; if (is_dumpable(pa)) dump_add_page(pa); } } } /* Calculate dump size. */ dumpsize = pmapsize; dumpsize += round_page(msgbufp->msg_size); dumpsize += round_page(sizeof(dump_avail)); dumpsize += round_page(BITSET_SIZE(vm_page_dump_pages)); VM_PAGE_DUMP_FOREACH(pa) { if (is_dumpable(pa)) dumpsize += PAGE_SIZE; else dump_drop_page(pa); } dumpsize += PAGE_SIZE; progress = dumpsize; /* Initialize mdhdr */ bzero(&mdhdr, sizeof(mdhdr)); strcpy(mdhdr.magic, MINIDUMP_MAGIC); mdhdr.version = MINIDUMP_VERSION; mdhdr.msgbufsize = msgbufp->msg_size; mdhdr.bitmapsize = round_page(BITSET_SIZE(vm_page_dump_pages)); mdhdr.pmapsize = pmapsize; mdhdr.kernbase = VM_MIN_KERNEL_ADDRESS; mdhdr.dmapphys = DMAP_MIN_PHYSADDR; mdhdr.dmapbase = DMAP_MIN_ADDRESS; mdhdr.dmapend = DMAP_MAX_ADDRESS; mdhdr.dumpavailsize = round_page(sizeof(dump_avail)); dump_init_header(di, &kdh, KERNELDUMPMAGIC, KERNELDUMP_AARCH64_VERSION, dumpsize); error = dump_start(di, &kdh); if (error != 0) goto fail; printf("Dumping %llu out of %ju MB:", (long long)dumpsize >> 20, ptoa((uintmax_t)physmem) / 1048576); /* Dump my header */ bzero(&tmpbuffer, sizeof(tmpbuffer)); bcopy(&mdhdr, &tmpbuffer, sizeof(mdhdr)); error = blk_write(di, (char *)&tmpbuffer, 0, PAGE_SIZE); if (error) goto fail; /* Dump msgbuf up front */ error = blk_write(di, (char *)msgbufp->msg_ptr, 0, round_page(msgbufp->msg_size)); if (error) goto fail; /* Dump dump_avail */ _Static_assert(sizeof(dump_avail) <= sizeof(tmpbuffer), "Large dump_avail not handled"); bzero(tmpbuffer, sizeof(tmpbuffer)); memcpy(tmpbuffer, dump_avail, sizeof(dump_avail)); error = blk_write(di, (char *)&tmpbuffer, 0, PAGE_SIZE); if (error) goto fail; /* Dump bitmap */ error = blk_write(di, (char *)vm_page_dump, 0, round_page(BITSET_SIZE(vm_page_dump_pages))); if (error) goto fail; /* Dump kernel page directory pages */ bzero(&tmpbuffer, sizeof(tmpbuffer)); for (va = VM_MIN_KERNEL_ADDRESS; va < kernel_vm_end; va += L2_SIZE) { if (!pmap_get_tables(pmap_kernel(), va, &l0, &l1, &l2, &l3)) { /* We always write a page, even if it is zero */ error = blk_write(di, (char *)&tmpbuffer, 0, PAGE_SIZE); if (error) goto fail; /* flush, in case we reuse tmpbuffer in the same block*/ error = blk_flush(di); if (error) goto fail; } else if ((*l1 & ATTR_DESCR_MASK) == L1_BLOCK) { /* * Handle a 1GB block mapping: write out 512 fake L2 * pages. */ pa = (*l1 & ~ATTR_MASK) | (va & L1_OFFSET); for (i = 0; i < Ln_ENTRIES; i++) { for (j = 0; j < Ln_ENTRIES; j++) { tmpbuffer[j] = pa + i * L2_SIZE + j * PAGE_SIZE | ATTR_DEFAULT | L3_PAGE; } error = blk_write(di, (char *)&tmpbuffer, 0, PAGE_SIZE); if (error) goto fail; } /* flush, in case we reuse tmpbuffer in the same block*/ error = blk_flush(di); if (error) goto fail; bzero(&tmpbuffer, sizeof(tmpbuffer)); va += L1_SIZE - L2_SIZE; } else if ((*l2 & ATTR_DESCR_MASK) == L2_BLOCK) { pa = (*l2 & ~ATTR_MASK) | (va & L2_OFFSET); /* Generate fake l3 entries based upon the l1 entry */ for (i = 0; i < Ln_ENTRIES; i++) { tmpbuffer[i] = pa + (i * PAGE_SIZE) | ATTR_DEFAULT | L3_PAGE; } error = blk_write(di, (char *)&tmpbuffer, 0, PAGE_SIZE); if (error) goto fail; /* flush, in case we reuse fakepd in the same block */ error = blk_flush(di); if (error) goto fail; bzero(&tmpbuffer, sizeof(tmpbuffer)); continue; } else { pa = *l2 & ~ATTR_MASK; error = blk_write(di, NULL, pa, PAGE_SIZE); if (error) goto fail; } } /* Dump memory chunks */ VM_PAGE_DUMP_FOREACH(pa) { error = blk_write(di, 0, pa, PAGE_SIZE); if (error) goto fail; } error = blk_flush(di); if (error) goto fail; error = dump_finish(di, &kdh); if (error != 0) goto fail; printf("\nDump complete\n"); return (0); fail: if (error < 0) error = -error; printf("\n"); if (error == ENOSPC) { printf("Dump map grown while dumping. "); if (retry_count < 5) { printf("Retrying...\n"); goto retry; } printf("Dump failed.\n"); } else if (error == ECANCELED) printf("Dump aborted\n"); else if (error == E2BIG) { printf("Dump failed. Partition too small (about %lluMB were " "needed this time).\n", (long long)dumpsize >> 20); } else printf("** DUMP FAILED (ERROR %d) **\n", error); return (error); } Index: head/sys/arm64/arm64/pmap.c =================================================================== --- head/sys/arm64/arm64/pmap.c (revision 366710) +++ head/sys/arm64/arm64/pmap.c (revision 366711) @@ -1,6959 +1,6960 @@ /*- * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. * Copyright (c) 1994 David Greenman * All rights reserved. * Copyright (c) 2003 Peter Wemm * All rights reserved. * Copyright (c) 2005-2010 Alan L. Cox * All rights reserved. * Copyright (c) 2014 Andrew Turner * All rights reserved. * Copyright (c) 2014-2016 The FreeBSD Foundation * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department and William Jolitz of UUNET Technologies Inc. * * This software was developed by Andrew Turner under sponsorship from * the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 */ /*- * Copyright (c) 2003 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Jake Burkholder, * Safeport Network Services, and Network Associates Laboratories, the * Security Research Division of Network Associates, Inc. under * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA * CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * Manages physical address maps. * * Since the information managed by this module is * also stored by the logical address mapping module, * this module may throw away valid virtual-to-physical * mappings at almost any time. However, invalidations * of virtual-to-physical mappings must be done as * requested. * * In order to cope with hardware architectures which * make virtual-to-physical map invalidates expensive, * this module may delay invalidate or reduced protection * operations until such time as they are actually * necessary. This module is given full information as * to which processors are currently using which maps, * and to when physical maps must be made correct. */ #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #define PMAP_ASSERT_STAGE1(pmap) MPASS((pmap)->pm_stage == PM_STAGE1) #define PMAP_ASSERT_STAGE2(pmap) MPASS((pmap)->pm_stage == PM_STAGE2) #define NL0PG (PAGE_SIZE/(sizeof (pd_entry_t))) #define NL1PG (PAGE_SIZE/(sizeof (pd_entry_t))) #define NL2PG (PAGE_SIZE/(sizeof (pd_entry_t))) #define NL3PG (PAGE_SIZE/(sizeof (pt_entry_t))) #define NUL0E L0_ENTRIES #define NUL1E (NUL0E * NL1PG) #define NUL2E (NUL1E * NL2PG) #if !defined(DIAGNOSTIC) #ifdef __GNUC_GNU_INLINE__ #define PMAP_INLINE __attribute__((__gnu_inline__)) inline #else #define PMAP_INLINE extern inline #endif #else #define PMAP_INLINE #endif #ifdef PV_STATS #define PV_STAT(x) do { x ; } while (0) #else #define PV_STAT(x) do { } while (0) #endif #define pmap_l0_pindex(v) (NUL2E + NUL1E + ((v) >> L0_SHIFT)) #define pmap_l1_pindex(v) (NUL2E + ((v) >> L1_SHIFT)) #define pmap_l2_pindex(v) ((v) >> L2_SHIFT) static struct md_page * pa_to_pvh(vm_paddr_t pa) { struct vm_phys_seg *seg; int segind; for (segind = 0; segind < vm_phys_nsegs; segind++) { seg = &vm_phys_segs[segind]; if (pa >= seg->start && pa < seg->end) return ((struct md_page *)seg->md_first + pmap_l2_pindex(pa) - pmap_l2_pindex(seg->start)); } panic("pa 0x%jx not within vm_phys_segs", (uintmax_t)pa); } static struct md_page * page_to_pvh(vm_page_t m) { struct vm_phys_seg *seg; seg = &vm_phys_segs[m->segind]; return ((struct md_page *)seg->md_first + pmap_l2_pindex(VM_PAGE_TO_PHYS(m)) - pmap_l2_pindex(seg->start)); } #define NPV_LIST_LOCKS MAXCPU #define PHYS_TO_PV_LIST_LOCK(pa) \ (&pv_list_locks[pa_index(pa) % NPV_LIST_LOCKS]) #define CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa) do { \ struct rwlock **_lockp = (lockp); \ struct rwlock *_new_lock; \ \ _new_lock = PHYS_TO_PV_LIST_LOCK(pa); \ if (_new_lock != *_lockp) { \ if (*_lockp != NULL) \ rw_wunlock(*_lockp); \ *_lockp = _new_lock; \ rw_wlock(*_lockp); \ } \ } while (0) #define CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m) \ CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m)) #define RELEASE_PV_LIST_LOCK(lockp) do { \ struct rwlock **_lockp = (lockp); \ \ if (*_lockp != NULL) { \ rw_wunlock(*_lockp); \ *_lockp = NULL; \ } \ } while (0) #define VM_PAGE_TO_PV_LIST_LOCK(m) \ PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m)) /* * The presence of this flag indicates that the mapping is writeable. * If the ATTR_S1_AP_RO bit is also set, then the mapping is clean, otherwise * it is dirty. This flag may only be set on managed mappings. * * The DBM bit is reserved on ARMv8.0 but it seems we can safely treat it * as a software managed bit. */ #define ATTR_SW_DBM ATTR_DBM struct pmap kernel_pmap_store; /* Used for mapping ACPI memory before VM is initialized */ #define PMAP_PREINIT_MAPPING_COUNT 32 #define PMAP_PREINIT_MAPPING_SIZE (PMAP_PREINIT_MAPPING_COUNT * L2_SIZE) static vm_offset_t preinit_map_va; /* Start VA of pre-init mapping space */ static int vm_initialized = 0; /* No need to use pre-init maps when set */ /* * Reserve a few L2 blocks starting from 'preinit_map_va' pointer. * Always map entire L2 block for simplicity. * VA of L2 block = preinit_map_va + i * L2_SIZE */ static struct pmap_preinit_mapping { vm_paddr_t pa; vm_offset_t va; vm_size_t size; } pmap_preinit_mapping[PMAP_PREINIT_MAPPING_COUNT]; vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ vm_offset_t kernel_vm_end = 0; /* * Data for the pv entry allocation mechanism. */ static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks); static struct mtx pv_chunks_mutex; static struct rwlock pv_list_locks[NPV_LIST_LOCKS]; static struct md_page *pv_table; static struct md_page pv_dummy; vm_paddr_t dmap_phys_base; /* The start of the dmap region */ vm_paddr_t dmap_phys_max; /* The limit of the dmap region */ vm_offset_t dmap_max_addr; /* The virtual address limit of the dmap */ /* This code assumes all L1 DMAP entries will be used */ CTASSERT((DMAP_MIN_ADDRESS & ~L0_OFFSET) == DMAP_MIN_ADDRESS); CTASSERT((DMAP_MAX_ADDRESS & ~L0_OFFSET) == DMAP_MAX_ADDRESS); #define DMAP_TABLES ((DMAP_MAX_ADDRESS - DMAP_MIN_ADDRESS) >> L0_SHIFT) extern pt_entry_t pagetable_dmap[]; #define PHYSMAP_SIZE (2 * (VM_PHYSSEG_MAX - 1)) static vm_paddr_t physmap[PHYSMAP_SIZE]; static u_int physmap_idx; static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "VM/pmap parameters"); /* * This ASID allocator uses a bit vector ("asid_set") to remember which ASIDs * that it has currently allocated to a pmap, a cursor ("asid_next") to * optimize its search for a free ASID in the bit vector, and an epoch number * ("asid_epoch") to indicate when it has reclaimed all previously allocated * ASIDs that are not currently active on a processor. * * The current epoch number is always in the range [0, INT_MAX). Negative * numbers and INT_MAX are reserved for special cases that are described * below. */ struct asid_set { int asid_bits; bitstr_t *asid_set; int asid_set_size; int asid_next; int asid_epoch; struct mtx asid_set_mutex; }; static struct asid_set asids; static struct asid_set vmids; static SYSCTL_NODE(_vm_pmap, OID_AUTO, asid, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "ASID allocator"); SYSCTL_INT(_vm_pmap_asid, OID_AUTO, bits, CTLFLAG_RD, &asids.asid_bits, 0, "The number of bits in an ASID"); SYSCTL_INT(_vm_pmap_asid, OID_AUTO, next, CTLFLAG_RD, &asids.asid_next, 0, "The last allocated ASID plus one"); SYSCTL_INT(_vm_pmap_asid, OID_AUTO, epoch, CTLFLAG_RD, &asids.asid_epoch, 0, "The current epoch number"); static SYSCTL_NODE(_vm_pmap, OID_AUTO, vmid, CTLFLAG_RD, 0, "VMID allocator"); SYSCTL_INT(_vm_pmap_vmid, OID_AUTO, bits, CTLFLAG_RD, &vmids.asid_bits, 0, "The number of bits in an VMID"); SYSCTL_INT(_vm_pmap_vmid, OID_AUTO, next, CTLFLAG_RD, &vmids.asid_next, 0, "The last allocated VMID plus one"); SYSCTL_INT(_vm_pmap_vmid, OID_AUTO, epoch, CTLFLAG_RD, &vmids.asid_epoch, 0, "The current epoch number"); void (*pmap_clean_stage2_tlbi)(void); void (*pmap_invalidate_vpipt_icache)(void); /* * A pmap's cookie encodes an ASID and epoch number. Cookies for reserved * ASIDs have a negative epoch number, specifically, INT_MIN. Cookies for * dynamically allocated ASIDs have a non-negative epoch number. * * An invalid ASID is represented by -1. * * There are two special-case cookie values: (1) COOKIE_FROM(-1, INT_MIN), * which indicates that an ASID should never be allocated to the pmap, and * (2) COOKIE_FROM(-1, INT_MAX), which indicates that an ASID should be * allocated when the pmap is next activated. */ #define COOKIE_FROM(asid, epoch) ((long)((u_int)(asid) | \ ((u_long)(epoch) << 32))) #define COOKIE_TO_ASID(cookie) ((int)(cookie)) #define COOKIE_TO_EPOCH(cookie) ((int)((u_long)(cookie) >> 32)) static int superpages_enabled = 1; SYSCTL_INT(_vm_pmap, OID_AUTO, superpages_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &superpages_enabled, 0, "Are large page mappings enabled?"); /* * Internal flags for pmap_enter()'s helper functions. */ #define PMAP_ENTER_NORECLAIM 0x1000000 /* Don't reclaim PV entries. */ #define PMAP_ENTER_NOREPLACE 0x2000000 /* Don't replace mappings. */ static void free_pv_chunk(struct pv_chunk *pc); static void free_pv_entry(pmap_t pmap, pv_entry_t pv); static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp); static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp); static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va); static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va); static void pmap_abort_ptp(pmap_t pmap, vm_offset_t va, vm_page_t mpte); static bool pmap_activate_int(pmap_t pmap); static void pmap_alloc_asid(pmap_t pmap); static int pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode); static pt_entry_t *pmap_demote_l1(pmap_t pmap, pt_entry_t *l1, vm_offset_t va); static pt_entry_t *pmap_demote_l2_locked(pmap_t pmap, pt_entry_t *l2, vm_offset_t va, struct rwlock **lockp); static pt_entry_t *pmap_demote_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va); static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp); static int pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2, u_int flags, vm_page_t m, struct rwlock **lockp); static int pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva, pd_entry_t l1e, struct spglist *free, struct rwlock **lockp); static int pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t sva, pd_entry_t l2e, struct spglist *free, struct rwlock **lockp); static void pmap_reset_asid_set(pmap_t pmap); static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m, struct rwlock **lockp); static vm_page_t _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp); static void _pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free); static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *); static __inline vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va); /* * These load the old table data and store the new value. * They need to be atomic as the System MMU may write to the table at * the same time as the CPU. */ #define pmap_clear(table) atomic_store_64(table, 0) #define pmap_clear_bits(table, bits) atomic_clear_64(table, bits) #define pmap_load(table) (*table) #define pmap_load_clear(table) atomic_swap_64(table, 0) #define pmap_load_store(table, entry) atomic_swap_64(table, entry) #define pmap_set_bits(table, bits) atomic_set_64(table, bits) #define pmap_store(table, entry) atomic_store_64(table, entry) /********************/ /* Inline functions */ /********************/ static __inline void pagecopy(void *s, void *d) { memcpy(d, s, PAGE_SIZE); } static __inline pd_entry_t * pmap_l0(pmap_t pmap, vm_offset_t va) { return (&pmap->pm_l0[pmap_l0_index(va)]); } static __inline pd_entry_t * pmap_l0_to_l1(pd_entry_t *l0, vm_offset_t va) { pd_entry_t *l1; l1 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l0) & ~ATTR_MASK); return (&l1[pmap_l1_index(va)]); } static __inline pd_entry_t * pmap_l1(pmap_t pmap, vm_offset_t va) { pd_entry_t *l0; l0 = pmap_l0(pmap, va); if ((pmap_load(l0) & ATTR_DESCR_MASK) != L0_TABLE) return (NULL); return (pmap_l0_to_l1(l0, va)); } static __inline pd_entry_t * pmap_l1_to_l2(pd_entry_t *l1p, vm_offset_t va) { pd_entry_t l1, *l2p; l1 = pmap_load(l1p); /* * The valid bit may be clear if pmap_update_entry() is concurrently * modifying the entry, so for KVA only the entry type may be checked. */ KASSERT(va >= VM_MAX_USER_ADDRESS || (l1 & ATTR_DESCR_VALID) != 0, ("%s: L1 entry %#lx for %#lx is invalid", __func__, l1, va)); KASSERT((l1 & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_TABLE, ("%s: L1 entry %#lx for %#lx is a leaf", __func__, l1, va)); l2p = (pd_entry_t *)PHYS_TO_DMAP(l1 & ~ATTR_MASK); return (&l2p[pmap_l2_index(va)]); } static __inline pd_entry_t * pmap_l2(pmap_t pmap, vm_offset_t va) { pd_entry_t *l1; l1 = pmap_l1(pmap, va); if ((pmap_load(l1) & ATTR_DESCR_MASK) != L1_TABLE) return (NULL); return (pmap_l1_to_l2(l1, va)); } static __inline pt_entry_t * pmap_l2_to_l3(pd_entry_t *l2p, vm_offset_t va) { pd_entry_t l2; pt_entry_t *l3p; l2 = pmap_load(l2p); /* * The valid bit may be clear if pmap_update_entry() is concurrently * modifying the entry, so for KVA only the entry type may be checked. */ KASSERT(va >= VM_MAX_USER_ADDRESS || (l2 & ATTR_DESCR_VALID) != 0, ("%s: L2 entry %#lx for %#lx is invalid", __func__, l2, va)); KASSERT((l2 & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_TABLE, ("%s: L2 entry %#lx for %#lx is a leaf", __func__, l2, va)); l3p = (pt_entry_t *)PHYS_TO_DMAP(l2 & ~ATTR_MASK); return (&l3p[pmap_l3_index(va)]); } /* * Returns the lowest valid pde for a given virtual address. * The next level may or may not point to a valid page or block. */ static __inline pd_entry_t * pmap_pde(pmap_t pmap, vm_offset_t va, int *level) { pd_entry_t *l0, *l1, *l2, desc; l0 = pmap_l0(pmap, va); desc = pmap_load(l0) & ATTR_DESCR_MASK; if (desc != L0_TABLE) { *level = -1; return (NULL); } l1 = pmap_l0_to_l1(l0, va); desc = pmap_load(l1) & ATTR_DESCR_MASK; if (desc != L1_TABLE) { *level = 0; return (l0); } l2 = pmap_l1_to_l2(l1, va); desc = pmap_load(l2) & ATTR_DESCR_MASK; if (desc != L2_TABLE) { *level = 1; return (l1); } *level = 2; return (l2); } /* * Returns the lowest valid pte block or table entry for a given virtual * address. If there are no valid entries return NULL and set the level to * the first invalid level. */ static __inline pt_entry_t * pmap_pte(pmap_t pmap, vm_offset_t va, int *level) { pd_entry_t *l1, *l2, desc; pt_entry_t *l3; l1 = pmap_l1(pmap, va); if (l1 == NULL) { *level = 0; return (NULL); } desc = pmap_load(l1) & ATTR_DESCR_MASK; if (desc == L1_BLOCK) { *level = 1; return (l1); } if (desc != L1_TABLE) { *level = 1; return (NULL); } l2 = pmap_l1_to_l2(l1, va); desc = pmap_load(l2) & ATTR_DESCR_MASK; if (desc == L2_BLOCK) { *level = 2; return (l2); } if (desc != L2_TABLE) { *level = 2; return (NULL); } *level = 3; l3 = pmap_l2_to_l3(l2, va); if ((pmap_load(l3) & ATTR_DESCR_MASK) != L3_PAGE) return (NULL); return (l3); } bool pmap_ps_enabled(pmap_t pmap __unused) { return (superpages_enabled != 0); } bool pmap_get_tables(pmap_t pmap, vm_offset_t va, pd_entry_t **l0, pd_entry_t **l1, pd_entry_t **l2, pt_entry_t **l3) { pd_entry_t *l0p, *l1p, *l2p; if (pmap->pm_l0 == NULL) return (false); l0p = pmap_l0(pmap, va); *l0 = l0p; if ((pmap_load(l0p) & ATTR_DESCR_MASK) != L0_TABLE) return (false); l1p = pmap_l0_to_l1(l0p, va); *l1 = l1p; if ((pmap_load(l1p) & ATTR_DESCR_MASK) == L1_BLOCK) { *l2 = NULL; *l3 = NULL; return (true); } if ((pmap_load(l1p) & ATTR_DESCR_MASK) != L1_TABLE) return (false); l2p = pmap_l1_to_l2(l1p, va); *l2 = l2p; if ((pmap_load(l2p) & ATTR_DESCR_MASK) == L2_BLOCK) { *l3 = NULL; return (true); } if ((pmap_load(l2p) & ATTR_DESCR_MASK) != L2_TABLE) return (false); *l3 = pmap_l2_to_l3(l2p, va); return (true); } static __inline int pmap_l3_valid(pt_entry_t l3) { return ((l3 & ATTR_DESCR_MASK) == L3_PAGE); } CTASSERT(L1_BLOCK == L2_BLOCK); static pt_entry_t pmap_pte_memattr(pmap_t pmap, vm_memattr_t memattr) { pt_entry_t val; if (pmap->pm_stage == PM_STAGE1) { val = ATTR_S1_IDX(memattr); if (memattr == VM_MEMATTR_DEVICE) val |= ATTR_S1_XN; return (val); } val = 0; switch (memattr) { case VM_MEMATTR_DEVICE: return (ATTR_S2_MEMATTR(ATTR_S2_MEMATTR_DEVICE_nGnRnE) | ATTR_S2_XN(ATTR_S2_XN_ALL)); case VM_MEMATTR_UNCACHEABLE: return (ATTR_S2_MEMATTR(ATTR_S2_MEMATTR_NC)); case VM_MEMATTR_WRITE_BACK: return (ATTR_S2_MEMATTR(ATTR_S2_MEMATTR_WB)); case VM_MEMATTR_WRITE_THROUGH: return (ATTR_S2_MEMATTR(ATTR_S2_MEMATTR_WT)); default: panic("%s: invalid memory attribute %x", __func__, memattr); } } static pt_entry_t pmap_pte_prot(pmap_t pmap, vm_prot_t prot) { pt_entry_t val; val = 0; if (pmap->pm_stage == PM_STAGE1) { if ((prot & VM_PROT_EXECUTE) == 0) val |= ATTR_S1_XN; if ((prot & VM_PROT_WRITE) == 0) val |= ATTR_S1_AP(ATTR_S1_AP_RO); } else { if ((prot & VM_PROT_WRITE) != 0) val |= ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE); if ((prot & VM_PROT_READ) != 0) val |= ATTR_S2_S2AP(ATTR_S2_S2AP_READ); if ((prot & VM_PROT_EXECUTE) == 0) val |= ATTR_S2_XN(ATTR_S2_XN_ALL); } return (val); } /* * Checks if the PTE is dirty. */ static inline int pmap_pte_dirty(pmap_t pmap, pt_entry_t pte) { KASSERT((pte & ATTR_SW_MANAGED) != 0, ("pte %#lx is unmanaged", pte)); if (pmap->pm_stage == PM_STAGE1) { KASSERT((pte & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) != 0, ("pte %#lx is writeable and missing ATTR_SW_DBM", pte)); return ((pte & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) == (ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_SW_DBM)); } return ((pte & ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE)) == ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE)); } static __inline void pmap_resident_count_inc(pmap_t pmap, int count) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); pmap->pm_stats.resident_count += count; } static __inline void pmap_resident_count_dec(pmap_t pmap, int count) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT(pmap->pm_stats.resident_count >= count, ("pmap %p resident count underflow %ld %d", pmap, pmap->pm_stats.resident_count, count)); pmap->pm_stats.resident_count -= count; } static pt_entry_t * pmap_early_page_idx(vm_offset_t l1pt, vm_offset_t va, u_int *l1_slot, u_int *l2_slot) { pt_entry_t *l2; pd_entry_t *l1; l1 = (pd_entry_t *)l1pt; *l1_slot = (va >> L1_SHIFT) & Ln_ADDR_MASK; /* Check locore has used a table L1 map */ KASSERT((l1[*l1_slot] & ATTR_DESCR_MASK) == L1_TABLE, ("Invalid bootstrap L1 table")); /* Find the address of the L2 table */ l2 = (pt_entry_t *)init_pt_va; *l2_slot = pmap_l2_index(va); return (l2); } static vm_paddr_t pmap_early_vtophys(vm_offset_t l1pt, vm_offset_t va) { u_int l1_slot, l2_slot; pt_entry_t *l2; l2 = pmap_early_page_idx(l1pt, va, &l1_slot, &l2_slot); return ((l2[l2_slot] & ~ATTR_MASK) + (va & L2_OFFSET)); } static vm_offset_t pmap_bootstrap_dmap(vm_offset_t kern_l1, vm_paddr_t min_pa, vm_offset_t freemempos) { pt_entry_t *l2; vm_offset_t va; vm_paddr_t l2_pa, pa; u_int l1_slot, l2_slot, prev_l1_slot; int i; dmap_phys_base = min_pa & ~L1_OFFSET; dmap_phys_max = 0; dmap_max_addr = 0; l2 = NULL; prev_l1_slot = -1; #define DMAP_TABLES ((DMAP_MAX_ADDRESS - DMAP_MIN_ADDRESS) >> L0_SHIFT) memset(pagetable_dmap, 0, PAGE_SIZE * DMAP_TABLES); for (i = 0; i < (physmap_idx * 2); i += 2) { pa = physmap[i] & ~L2_OFFSET; va = pa - dmap_phys_base + DMAP_MIN_ADDRESS; /* Create L2 mappings at the start of the region */ if ((pa & L1_OFFSET) != 0) { l1_slot = ((va - DMAP_MIN_ADDRESS) >> L1_SHIFT); if (l1_slot != prev_l1_slot) { prev_l1_slot = l1_slot; l2 = (pt_entry_t *)freemempos; l2_pa = pmap_early_vtophys(kern_l1, (vm_offset_t)l2); freemempos += PAGE_SIZE; pmap_store(&pagetable_dmap[l1_slot], (l2_pa & ~Ln_TABLE_MASK) | L1_TABLE); memset(l2, 0, PAGE_SIZE); } KASSERT(l2 != NULL, ("pmap_bootstrap_dmap: NULL l2 map")); for (; va < DMAP_MAX_ADDRESS && pa < physmap[i + 1]; pa += L2_SIZE, va += L2_SIZE) { /* * We are on a boundary, stop to * create a level 1 block */ if ((pa & L1_OFFSET) == 0) break; l2_slot = pmap_l2_index(va); KASSERT(l2_slot != 0, ("...")); pmap_store(&l2[l2_slot], (pa & ~L2_OFFSET) | ATTR_DEFAULT | ATTR_S1_XN | ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) | L2_BLOCK); } KASSERT(va == (pa - dmap_phys_base + DMAP_MIN_ADDRESS), ("...")); } for (; va < DMAP_MAX_ADDRESS && pa < physmap[i + 1] && (physmap[i + 1] - pa) >= L1_SIZE; pa += L1_SIZE, va += L1_SIZE) { l1_slot = ((va - DMAP_MIN_ADDRESS) >> L1_SHIFT); pmap_store(&pagetable_dmap[l1_slot], (pa & ~L1_OFFSET) | ATTR_DEFAULT | ATTR_S1_XN | ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) | L1_BLOCK); } /* Create L2 mappings at the end of the region */ if (pa < physmap[i + 1]) { l1_slot = ((va - DMAP_MIN_ADDRESS) >> L1_SHIFT); if (l1_slot != prev_l1_slot) { prev_l1_slot = l1_slot; l2 = (pt_entry_t *)freemempos; l2_pa = pmap_early_vtophys(kern_l1, (vm_offset_t)l2); freemempos += PAGE_SIZE; pmap_store(&pagetable_dmap[l1_slot], (l2_pa & ~Ln_TABLE_MASK) | L1_TABLE); memset(l2, 0, PAGE_SIZE); } KASSERT(l2 != NULL, ("pmap_bootstrap_dmap: NULL l2 map")); for (; va < DMAP_MAX_ADDRESS && pa < physmap[i + 1]; pa += L2_SIZE, va += L2_SIZE) { l2_slot = pmap_l2_index(va); pmap_store(&l2[l2_slot], (pa & ~L2_OFFSET) | ATTR_DEFAULT | ATTR_S1_XN | ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) | L2_BLOCK); } } if (pa > dmap_phys_max) { dmap_phys_max = pa; dmap_max_addr = va; } } cpu_tlb_flushID(); return (freemempos); } static vm_offset_t pmap_bootstrap_l2(vm_offset_t l1pt, vm_offset_t va, vm_offset_t l2_start) { vm_offset_t l2pt; vm_paddr_t pa; pd_entry_t *l1; u_int l1_slot; KASSERT((va & L1_OFFSET) == 0, ("Invalid virtual address")); l1 = (pd_entry_t *)l1pt; l1_slot = pmap_l1_index(va); l2pt = l2_start; for (; va < VM_MAX_KERNEL_ADDRESS; l1_slot++, va += L1_SIZE) { KASSERT(l1_slot < Ln_ENTRIES, ("Invalid L1 index")); pa = pmap_early_vtophys(l1pt, l2pt); pmap_store(&l1[l1_slot], (pa & ~Ln_TABLE_MASK) | L1_TABLE); l2pt += PAGE_SIZE; } /* Clean the L2 page table */ memset((void *)l2_start, 0, l2pt - l2_start); return l2pt; } static vm_offset_t pmap_bootstrap_l3(vm_offset_t l1pt, vm_offset_t va, vm_offset_t l3_start) { vm_offset_t l3pt; vm_paddr_t pa; pd_entry_t *l2; u_int l2_slot; KASSERT((va & L2_OFFSET) == 0, ("Invalid virtual address")); l2 = pmap_l2(kernel_pmap, va); l2 = (pd_entry_t *)rounddown2((uintptr_t)l2, PAGE_SIZE); l2_slot = pmap_l2_index(va); l3pt = l3_start; for (; va < VM_MAX_KERNEL_ADDRESS; l2_slot++, va += L2_SIZE) { KASSERT(l2_slot < Ln_ENTRIES, ("Invalid L2 index")); pa = pmap_early_vtophys(l1pt, l3pt); pmap_store(&l2[l2_slot], (pa & ~Ln_TABLE_MASK) | ATTR_S1_UXN | L2_TABLE); l3pt += PAGE_SIZE; } /* Clean the L2 page table */ memset((void *)l3_start, 0, l3pt - l3_start); return l3pt; } /* * Bootstrap the system enough to run with virtual memory. */ void pmap_bootstrap(vm_offset_t l0pt, vm_offset_t l1pt, vm_paddr_t kernstart, vm_size_t kernlen) { vm_offset_t freemempos; vm_offset_t dpcpu, msgbufpv; vm_paddr_t start_pa, pa, min_pa; uint64_t kern_delta; int i; /* Verify that the ASID is set through TTBR0. */ KASSERT((READ_SPECIALREG(tcr_el1) & TCR_A1) == 0, ("pmap_bootstrap: TCR_EL1.A1 != 0")); kern_delta = KERNBASE - kernstart; printf("pmap_bootstrap %lx %lx %lx\n", l1pt, kernstart, kernlen); printf("%lx\n", l1pt); printf("%lx\n", (KERNBASE >> L1_SHIFT) & Ln_ADDR_MASK); /* Set this early so we can use the pagetable walking functions */ kernel_pmap_store.pm_l0 = (pd_entry_t *)l0pt; PMAP_LOCK_INIT(kernel_pmap); kernel_pmap->pm_l0_paddr = l0pt - kern_delta; kernel_pmap->pm_cookie = COOKIE_FROM(-1, INT_MIN); kernel_pmap->pm_stage = PM_STAGE1; kernel_pmap->pm_asid_set = &asids; /* Assume the address we were loaded to is a valid physical address */ min_pa = KERNBASE - kern_delta; physmap_idx = physmem_avail(physmap, nitems(physmap)); physmap_idx /= 2; /* * Find the minimum physical address. physmap is sorted, * but may contain empty ranges. */ for (i = 0; i < physmap_idx * 2; i += 2) { if (physmap[i] == physmap[i + 1]) continue; if (physmap[i] <= min_pa) min_pa = physmap[i]; } freemempos = KERNBASE + kernlen; freemempos = roundup2(freemempos, PAGE_SIZE); /* Create a direct map region early so we can use it for pa -> va */ freemempos = pmap_bootstrap_dmap(l1pt, min_pa, freemempos); start_pa = pa = KERNBASE - kern_delta; /* * Create the l2 tables up to VM_MAX_KERNEL_ADDRESS. We assume that the * loader allocated the first and only l2 page table page used to map * the kernel, preloaded files and module metadata. */ freemempos = pmap_bootstrap_l2(l1pt, KERNBASE + L1_SIZE, freemempos); /* And the l3 tables for the early devmap */ freemempos = pmap_bootstrap_l3(l1pt, VM_MAX_KERNEL_ADDRESS - (PMAP_MAPDEV_EARLY_SIZE), freemempos); cpu_tlb_flushID(); #define alloc_pages(var, np) \ (var) = freemempos; \ freemempos += (np * PAGE_SIZE); \ memset((char *)(var), 0, ((np) * PAGE_SIZE)); /* Allocate dynamic per-cpu area. */ alloc_pages(dpcpu, DPCPU_SIZE / PAGE_SIZE); dpcpu_init((void *)dpcpu, 0); /* Allocate memory for the msgbuf, e.g. for /sbin/dmesg */ alloc_pages(msgbufpv, round_page(msgbufsize) / PAGE_SIZE); msgbufp = (void *)msgbufpv; /* Reserve some VA space for early BIOS/ACPI mapping */ preinit_map_va = roundup2(freemempos, L2_SIZE); virtual_avail = preinit_map_va + PMAP_PREINIT_MAPPING_SIZE; virtual_avail = roundup2(virtual_avail, L1_SIZE); virtual_end = VM_MAX_KERNEL_ADDRESS - (PMAP_MAPDEV_EARLY_SIZE); kernel_vm_end = virtual_avail; pa = pmap_early_vtophys(l1pt, freemempos); physmem_exclude_region(start_pa, pa - start_pa, EXFLAG_NOALLOC); cpu_tlb_flushID(); } /* * Initialize a vm_page's machine-dependent fields. */ void pmap_page_init(vm_page_t m) { TAILQ_INIT(&m->md.pv_list); m->md.pv_memattr = VM_MEMATTR_WRITE_BACK; } static void pmap_init_asids(struct asid_set *set, int bits) { int i; set->asid_bits = bits; /* * We may be too early in the overall initialization process to use * bit_alloc(). */ set->asid_set_size = 1 << set->asid_bits; set->asid_set = (bitstr_t *)kmem_malloc(bitstr_size(set->asid_set_size), M_WAITOK | M_ZERO); for (i = 0; i < ASID_FIRST_AVAILABLE; i++) bit_set(set->asid_set, i); set->asid_next = ASID_FIRST_AVAILABLE; mtx_init(&set->asid_set_mutex, "asid set", NULL, MTX_SPIN); } /* * Initialize the pmap module. * Called by vm_init, to initialize any structures that the pmap * system needs to map virtual memory. */ void pmap_init(void) { struct vm_phys_seg *seg, *next_seg; struct md_page *pvh; vm_size_t s; uint64_t mmfr1; int i, pv_npg, vmid_bits; /* * Are large page mappings enabled? */ TUNABLE_INT_FETCH("vm.pmap.superpages_enabled", &superpages_enabled); if (superpages_enabled) { KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0, ("pmap_init: can't assign to pagesizes[1]")); pagesizes[1] = L2_SIZE; KASSERT(MAXPAGESIZES > 2 && pagesizes[2] == 0, ("pmap_init: can't assign to pagesizes[2]")); pagesizes[2] = L1_SIZE; } /* * Initialize the ASID allocator. */ pmap_init_asids(&asids, (READ_SPECIALREG(tcr_el1) & TCR_ASID_16) != 0 ? 16 : 8); if (has_hyp()) { mmfr1 = READ_SPECIALREG(id_aa64mmfr1_el1); vmid_bits = 8; if (ID_AA64MMFR1_VMIDBits_VAL(mmfr1) == ID_AA64MMFR1_VMIDBits_16) vmid_bits = 16; pmap_init_asids(&vmids, vmid_bits); } /* * Initialize the pv chunk list mutex. */ mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF); /* * Initialize the pool of pv list locks. */ for (i = 0; i < NPV_LIST_LOCKS; i++) rw_init(&pv_list_locks[i], "pmap pv list"); /* * Calculate the size of the pv head table for superpages. */ pv_npg = 0; for (i = 0; i < vm_phys_nsegs; i++) { seg = &vm_phys_segs[i]; pv_npg += pmap_l2_pindex(roundup2(seg->end, L2_SIZE)) - pmap_l2_pindex(seg->start); } /* * Allocate memory for the pv head table for superpages. */ s = (vm_size_t)(pv_npg * sizeof(struct md_page)); s = round_page(s); pv_table = (struct md_page *)kmem_malloc(s, M_WAITOK | M_ZERO); for (i = 0; i < pv_npg; i++) TAILQ_INIT(&pv_table[i].pv_list); TAILQ_INIT(&pv_dummy.pv_list); /* * Set pointers from vm_phys_segs to pv_table. */ for (i = 0, pvh = pv_table; i < vm_phys_nsegs; i++) { seg = &vm_phys_segs[i]; seg->md_first = pvh; pvh += pmap_l2_pindex(roundup2(seg->end, L2_SIZE)) - pmap_l2_pindex(seg->start); /* * If there is a following segment, and the final * superpage of this segment and the initial superpage * of the next segment are the same then adjust the * pv_table entry for that next segment down by one so * that the pv_table entries will be shared. */ if (i + 1 < vm_phys_nsegs) { next_seg = &vm_phys_segs[i + 1]; if (pmap_l2_pindex(roundup2(seg->end, L2_SIZE)) - 1 == pmap_l2_pindex(next_seg->start)) { pvh--; } } } vm_initialized = 1; } static SYSCTL_NODE(_vm_pmap, OID_AUTO, l2, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "2MB page mapping counters"); static u_long pmap_l2_demotions; SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, demotions, CTLFLAG_RD, &pmap_l2_demotions, 0, "2MB page demotions"); static u_long pmap_l2_mappings; SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, mappings, CTLFLAG_RD, &pmap_l2_mappings, 0, "2MB page mappings"); static u_long pmap_l2_p_failures; SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, p_failures, CTLFLAG_RD, &pmap_l2_p_failures, 0, "2MB page promotion failures"); static u_long pmap_l2_promotions; SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, promotions, CTLFLAG_RD, &pmap_l2_promotions, 0, "2MB page promotions"); /* * Invalidate a single TLB entry. */ static __inline void pmap_invalidate_page(pmap_t pmap, vm_offset_t va) { uint64_t r; PMAP_ASSERT_STAGE1(pmap); dsb(ishst); if (pmap == kernel_pmap) { r = atop(va); __asm __volatile("tlbi vaae1is, %0" : : "r" (r)); } else { r = ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie)) | atop(va); __asm __volatile("tlbi vae1is, %0" : : "r" (r)); } dsb(ish); isb(); } static __inline void pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { uint64_t end, r, start; PMAP_ASSERT_STAGE1(pmap); dsb(ishst); if (pmap == kernel_pmap) { start = atop(sva); end = atop(eva); for (r = start; r < end; r++) __asm __volatile("tlbi vaae1is, %0" : : "r" (r)); } else { start = end = ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie)); start |= atop(sva); end |= atop(eva); for (r = start; r < end; r++) __asm __volatile("tlbi vae1is, %0" : : "r" (r)); } dsb(ish); isb(); } static __inline void pmap_invalidate_all(pmap_t pmap) { uint64_t r; PMAP_ASSERT_STAGE1(pmap); dsb(ishst); if (pmap == kernel_pmap) { __asm __volatile("tlbi vmalle1is"); } else { r = ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie)); __asm __volatile("tlbi aside1is, %0" : : "r" (r)); } dsb(ish); isb(); } /* * Routine: pmap_extract * Function: * Extract the physical page address associated * with the given map/virtual_address pair. */ vm_paddr_t pmap_extract(pmap_t pmap, vm_offset_t va) { pt_entry_t *pte, tpte; vm_paddr_t pa; int lvl; pa = 0; PMAP_LOCK(pmap); /* * Find the block or page map for this virtual address. pmap_pte * will return either a valid block/page entry, or NULL. */ pte = pmap_pte(pmap, va, &lvl); if (pte != NULL) { tpte = pmap_load(pte); pa = tpte & ~ATTR_MASK; switch(lvl) { case 1: KASSERT((tpte & ATTR_DESCR_MASK) == L1_BLOCK, ("pmap_extract: Invalid L1 pte found: %lx", tpte & ATTR_DESCR_MASK)); pa |= (va & L1_OFFSET); break; case 2: KASSERT((tpte & ATTR_DESCR_MASK) == L2_BLOCK, ("pmap_extract: Invalid L2 pte found: %lx", tpte & ATTR_DESCR_MASK)); pa |= (va & L2_OFFSET); break; case 3: KASSERT((tpte & ATTR_DESCR_MASK) == L3_PAGE, ("pmap_extract: Invalid L3 pte found: %lx", tpte & ATTR_DESCR_MASK)); pa |= (va & L3_OFFSET); break; } } PMAP_UNLOCK(pmap); return (pa); } /* * Routine: pmap_extract_and_hold * Function: * Atomically extract and hold the physical page * with the given pmap and virtual address pair * if that mapping permits the given protection. */ vm_page_t pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) { pt_entry_t *pte, tpte; vm_offset_t off; vm_page_t m; int lvl; bool use; m = NULL; PMAP_LOCK(pmap); pte = pmap_pte(pmap, va, &lvl); if (pte != NULL) { tpte = pmap_load(pte); KASSERT(lvl > 0 && lvl <= 3, ("pmap_extract_and_hold: Invalid level %d", lvl)); CTASSERT(L1_BLOCK == L2_BLOCK); KASSERT((lvl == 3 && (tpte & ATTR_DESCR_MASK) == L3_PAGE) || (lvl < 3 && (tpte & ATTR_DESCR_MASK) == L1_BLOCK), ("pmap_extract_and_hold: Invalid pte at L%d: %lx", lvl, tpte & ATTR_DESCR_MASK)); use = false; if ((prot & VM_PROT_WRITE) == 0) use = true; else if (pmap->pm_stage == PM_STAGE1 && (tpte & ATTR_S1_AP_RW_BIT) == ATTR_S1_AP(ATTR_S1_AP_RW)) use = true; else if (pmap->pm_stage == PM_STAGE2 && ((tpte & ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE)) == ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE))) use = true; if (use) { switch (lvl) { case 1: off = va & L1_OFFSET; break; case 2: off = va & L2_OFFSET; break; case 3: default: off = 0; } m = PHYS_TO_VM_PAGE((tpte & ~ATTR_MASK) | off); if (m != NULL && !vm_page_wire_mapped(m)) m = NULL; } } PMAP_UNLOCK(pmap); return (m); } vm_paddr_t pmap_kextract(vm_offset_t va) { pt_entry_t *pte, tpte; if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) return (DMAP_TO_PHYS(va)); pte = pmap_l1(kernel_pmap, va); if (pte == NULL) return (0); /* * A concurrent pmap_update_entry() will clear the entry's valid bit * but leave the rest of the entry unchanged. Therefore, we treat a * non-zero entry as being valid, and we ignore the valid bit when * determining whether the entry maps a block, page, or table. */ tpte = pmap_load(pte); if (tpte == 0) return (0); if ((tpte & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) return ((tpte & ~ATTR_MASK) | (va & L1_OFFSET)); pte = pmap_l1_to_l2(&tpte, va); tpte = pmap_load(pte); if (tpte == 0) return (0); if ((tpte & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) return ((tpte & ~ATTR_MASK) | (va & L2_OFFSET)); pte = pmap_l2_to_l3(&tpte, va); tpte = pmap_load(pte); if (tpte == 0) return (0); return ((tpte & ~ATTR_MASK) | (va & L3_OFFSET)); } /*************************************************** * Low level mapping routines..... ***************************************************/ void pmap_kenter(vm_offset_t sva, vm_size_t size, vm_paddr_t pa, int mode) { pd_entry_t *pde; pt_entry_t *pte, attr; vm_offset_t va; int lvl; KASSERT((pa & L3_OFFSET) == 0, ("pmap_kenter: Invalid physical address")); KASSERT((sva & L3_OFFSET) == 0, ("pmap_kenter: Invalid virtual address")); KASSERT((size & PAGE_MASK) == 0, ("pmap_kenter: Mapping is not page-sized")); attr = ATTR_DEFAULT | ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_S1_XN | ATTR_S1_IDX(mode) | L3_PAGE; va = sva; while (size != 0) { pde = pmap_pde(kernel_pmap, va, &lvl); KASSERT(pde != NULL, ("pmap_kenter: Invalid page entry, va: 0x%lx", va)); KASSERT(lvl == 2, ("pmap_kenter: Invalid level %d", lvl)); pte = pmap_l2_to_l3(pde, va); pmap_load_store(pte, (pa & ~L3_OFFSET) | attr); va += PAGE_SIZE; pa += PAGE_SIZE; size -= PAGE_SIZE; } pmap_invalidate_range(kernel_pmap, sva, va); } void pmap_kenter_device(vm_offset_t sva, vm_size_t size, vm_paddr_t pa) { pmap_kenter(sva, size, pa, VM_MEMATTR_DEVICE); } /* * Remove a page from the kernel pagetables. */ PMAP_INLINE void pmap_kremove(vm_offset_t va) { pt_entry_t *pte; int lvl; pte = pmap_pte(kernel_pmap, va, &lvl); KASSERT(pte != NULL, ("pmap_kremove: Invalid address")); KASSERT(lvl == 3, ("pmap_kremove: Invalid pte level %d", lvl)); pmap_clear(pte); pmap_invalidate_page(kernel_pmap, va); } void pmap_kremove_device(vm_offset_t sva, vm_size_t size) { pt_entry_t *pte; vm_offset_t va; int lvl; KASSERT((sva & L3_OFFSET) == 0, ("pmap_kremove_device: Invalid virtual address")); KASSERT((size & PAGE_MASK) == 0, ("pmap_kremove_device: Mapping is not page-sized")); va = sva; while (size != 0) { pte = pmap_pte(kernel_pmap, va, &lvl); KASSERT(pte != NULL, ("Invalid page table, va: 0x%lx", va)); KASSERT(lvl == 3, ("Invalid device pagetable level: %d != 3", lvl)); pmap_clear(pte); va += PAGE_SIZE; size -= PAGE_SIZE; } pmap_invalidate_range(kernel_pmap, sva, va); } /* * Used to map a range of physical addresses into kernel * virtual address space. * * The value passed in '*virt' is a suggested virtual address for * the mapping. Architectures which can support a direct-mapped * physical to virtual region can return the appropriate address * within that region, leaving '*virt' unchanged. Other * architectures should map the pages starting at '*virt' and * update '*virt' with the first usable address after the mapped * region. */ vm_offset_t pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) { return PHYS_TO_DMAP(start); } /* * Add a list of wired pages to the kva * this routine is only used for temporary * kernel mappings that do not need to have * page modification or references recorded. * Note that old mappings are simply written * over. The page *must* be wired. * Note: SMP coherent. Uses a ranged shootdown IPI. */ void pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count) { pd_entry_t *pde; pt_entry_t *pte, pa; vm_offset_t va; vm_page_t m; int i, lvl; va = sva; for (i = 0; i < count; i++) { pde = pmap_pde(kernel_pmap, va, &lvl); KASSERT(pde != NULL, ("pmap_qenter: Invalid page entry, va: 0x%lx", va)); KASSERT(lvl == 2, ("pmap_qenter: Invalid level %d", lvl)); m = ma[i]; pa = VM_PAGE_TO_PHYS(m) | ATTR_DEFAULT | ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_S1_XN | ATTR_S1_IDX(m->md.pv_memattr) | L3_PAGE; pte = pmap_l2_to_l3(pde, va); pmap_load_store(pte, pa); va += L3_SIZE; } pmap_invalidate_range(kernel_pmap, sva, va); } /* * This routine tears out page mappings from the * kernel -- it is meant only for temporary mappings. */ void pmap_qremove(vm_offset_t sva, int count) { pt_entry_t *pte; vm_offset_t va; int lvl; KASSERT(sva >= VM_MIN_KERNEL_ADDRESS, ("usermode va %lx", sva)); va = sva; while (count-- > 0) { pte = pmap_pte(kernel_pmap, va, &lvl); KASSERT(lvl == 3, ("Invalid device pagetable level: %d != 3", lvl)); if (pte != NULL) { pmap_clear(pte); } va += PAGE_SIZE; } pmap_invalidate_range(kernel_pmap, sva, va); } /*************************************************** * Page table page management routines..... ***************************************************/ /* * Schedule the specified unused page table page to be freed. Specifically, * add the page to the specified list of pages that will be released to the * physical memory manager after the TLB has been updated. */ static __inline void pmap_add_delayed_free_list(vm_page_t m, struct spglist *free, boolean_t set_PG_ZERO) { if (set_PG_ZERO) m->flags |= PG_ZERO; else m->flags &= ~PG_ZERO; SLIST_INSERT_HEAD(free, m, plinks.s.ss); } /* * Decrements a page table page's reference count, which is used to record the * number of valid page table entries within the page. If the reference count * drops to zero, then the page table page is unmapped. Returns TRUE if the * page table page was unmapped and FALSE otherwise. */ static inline boolean_t pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) { --m->ref_count; if (m->ref_count == 0) { _pmap_unwire_l3(pmap, va, m, free); return (TRUE); } else return (FALSE); } static void _pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * unmap the page table page */ if (m->pindex >= (NUL2E + NUL1E)) { /* l1 page */ pd_entry_t *l0; l0 = pmap_l0(pmap, va); pmap_clear(l0); } else if (m->pindex >= NUL2E) { /* l2 page */ pd_entry_t *l1; l1 = pmap_l1(pmap, va); pmap_clear(l1); } else { /* l3 page */ pd_entry_t *l2; l2 = pmap_l2(pmap, va); pmap_clear(l2); } pmap_resident_count_dec(pmap, 1); if (m->pindex < NUL2E) { /* We just released an l3, unhold the matching l2 */ pd_entry_t *l1, tl1; vm_page_t l2pg; l1 = pmap_l1(pmap, va); tl1 = pmap_load(l1); l2pg = PHYS_TO_VM_PAGE(tl1 & ~ATTR_MASK); pmap_unwire_l3(pmap, va, l2pg, free); } else if (m->pindex < (NUL2E + NUL1E)) { /* We just released an l2, unhold the matching l1 */ pd_entry_t *l0, tl0; vm_page_t l1pg; l0 = pmap_l0(pmap, va); tl0 = pmap_load(l0); l1pg = PHYS_TO_VM_PAGE(tl0 & ~ATTR_MASK); pmap_unwire_l3(pmap, va, l1pg, free); } pmap_invalidate_page(pmap, va); /* * Put page on a list so that it is released after * *ALL* TLB shootdown is done */ pmap_add_delayed_free_list(m, free, TRUE); } /* * After removing a page table entry, this routine is used to * conditionally free the page, and manage the reference count. */ static int pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde, struct spglist *free) { vm_page_t mpte; if (va >= VM_MAXUSER_ADDRESS) return (0); KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0")); mpte = PHYS_TO_VM_PAGE(ptepde & ~ATTR_MASK); return (pmap_unwire_l3(pmap, va, mpte, free)); } /* * Release a page table page reference after a failed attempt to create a * mapping. */ static void pmap_abort_ptp(pmap_t pmap, vm_offset_t va, vm_page_t mpte) { struct spglist free; SLIST_INIT(&free); if (pmap_unwire_l3(pmap, va, mpte, &free)) { /* * Although "va" was never mapped, the TLB could nonetheless * have intermediate entries that refer to the freed page * table pages. Invalidate those entries. * * XXX redundant invalidation (See _pmap_unwire_l3().) */ pmap_invalidate_page(pmap, va); vm_page_free_pages_toq(&free, true); } } void pmap_pinit0(pmap_t pmap) { PMAP_LOCK_INIT(pmap); bzero(&pmap->pm_stats, sizeof(pmap->pm_stats)); pmap->pm_l0_paddr = READ_SPECIALREG(ttbr0_el1); pmap->pm_l0 = (pd_entry_t *)PHYS_TO_DMAP(pmap->pm_l0_paddr); pmap->pm_root.rt_root = 0; pmap->pm_cookie = COOKIE_FROM(ASID_RESERVED_FOR_PID_0, INT_MIN); pmap->pm_stage = PM_STAGE1; pmap->pm_asid_set = &asids; PCPU_SET(curpmap, pmap); } int pmap_pinit_stage(pmap_t pmap, enum pmap_stage stage) { vm_page_t l0pt; /* * allocate the l0 page */ while ((l0pt = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) vm_wait(NULL); pmap->pm_l0_paddr = VM_PAGE_TO_PHYS(l0pt); pmap->pm_l0 = (pd_entry_t *)PHYS_TO_DMAP(pmap->pm_l0_paddr); if ((l0pt->flags & PG_ZERO) == 0) pagezero(pmap->pm_l0); pmap->pm_root.rt_root = 0; bzero(&pmap->pm_stats, sizeof(pmap->pm_stats)); pmap->pm_cookie = COOKIE_FROM(-1, INT_MAX); pmap->pm_stage = stage; switch (stage) { case PM_STAGE1: pmap->pm_asid_set = &asids; break; case PM_STAGE2: pmap->pm_asid_set = &vmids; break; default: panic("%s: Invalid pmap type %d", __func__, stage); break; } /* XXX Temporarily disable deferred ASID allocation. */ pmap_alloc_asid(pmap); return (1); } int pmap_pinit(pmap_t pmap) { return (pmap_pinit_stage(pmap, PM_STAGE1)); } /* * This routine is called if the desired page table page does not exist. * * If page table page allocation fails, this routine may sleep before * returning NULL. It sleeps only if a lock pointer was given. * * Note: If a page allocation fails at page table level two or three, * one or two pages may be held during the wait, only to be released * afterwards. This conservative approach is easily argued to avoid * race conditions. */ static vm_page_t _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp) { vm_page_t m, l1pg, l2pg; PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * Allocate a page table page. */ if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) { if (lockp != NULL) { RELEASE_PV_LIST_LOCK(lockp); PMAP_UNLOCK(pmap); vm_wait(NULL); PMAP_LOCK(pmap); } /* * Indicate the need to retry. While waiting, the page table * page may have been allocated. */ return (NULL); } if ((m->flags & PG_ZERO) == 0) pmap_zero_page(m); /* * Because of AArch64's weak memory consistency model, we must have a * barrier here to ensure that the stores for zeroing "m", whether by * pmap_zero_page() or an earlier function, are visible before adding * "m" to the page table. Otherwise, a page table walk by another * processor's MMU could see the mapping to "m" and a stale, non-zero * PTE within "m". */ dmb(ishst); /* * Map the pagetable page into the process address space, if * it isn't already there. */ if (ptepindex >= (NUL2E + NUL1E)) { pd_entry_t *l0; vm_pindex_t l0index; l0index = ptepindex - (NUL2E + NUL1E); l0 = &pmap->pm_l0[l0index]; pmap_store(l0, VM_PAGE_TO_PHYS(m) | L0_TABLE); } else if (ptepindex >= NUL2E) { vm_pindex_t l0index, l1index; pd_entry_t *l0, *l1; pd_entry_t tl0; l1index = ptepindex - NUL2E; l0index = l1index >> L0_ENTRIES_SHIFT; l0 = &pmap->pm_l0[l0index]; tl0 = pmap_load(l0); if (tl0 == 0) { /* recurse for allocating page dir */ if (_pmap_alloc_l3(pmap, NUL2E + NUL1E + l0index, lockp) == NULL) { vm_page_unwire_noq(m); vm_page_free_zero(m); return (NULL); } } else { l1pg = PHYS_TO_VM_PAGE(tl0 & ~ATTR_MASK); l1pg->ref_count++; } l1 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l0) & ~ATTR_MASK); l1 = &l1[ptepindex & Ln_ADDR_MASK]; pmap_store(l1, VM_PAGE_TO_PHYS(m) | L1_TABLE); } else { vm_pindex_t l0index, l1index; pd_entry_t *l0, *l1, *l2; pd_entry_t tl0, tl1; l1index = ptepindex >> Ln_ENTRIES_SHIFT; l0index = l1index >> L0_ENTRIES_SHIFT; l0 = &pmap->pm_l0[l0index]; tl0 = pmap_load(l0); if (tl0 == 0) { /* recurse for allocating page dir */ if (_pmap_alloc_l3(pmap, NUL2E + l1index, lockp) == NULL) { vm_page_unwire_noq(m); vm_page_free_zero(m); return (NULL); } tl0 = pmap_load(l0); l1 = (pd_entry_t *)PHYS_TO_DMAP(tl0 & ~ATTR_MASK); l1 = &l1[l1index & Ln_ADDR_MASK]; } else { l1 = (pd_entry_t *)PHYS_TO_DMAP(tl0 & ~ATTR_MASK); l1 = &l1[l1index & Ln_ADDR_MASK]; tl1 = pmap_load(l1); if (tl1 == 0) { /* recurse for allocating page dir */ if (_pmap_alloc_l3(pmap, NUL2E + l1index, lockp) == NULL) { vm_page_unwire_noq(m); vm_page_free_zero(m); return (NULL); } } else { l2pg = PHYS_TO_VM_PAGE(tl1 & ~ATTR_MASK); l2pg->ref_count++; } } l2 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l1) & ~ATTR_MASK); l2 = &l2[ptepindex & Ln_ADDR_MASK]; pmap_store(l2, VM_PAGE_TO_PHYS(m) | L2_TABLE); } pmap_resident_count_inc(pmap, 1); return (m); } static pd_entry_t * pmap_alloc_l2(pmap_t pmap, vm_offset_t va, vm_page_t *l2pgp, struct rwlock **lockp) { pd_entry_t *l1, *l2; vm_page_t l2pg; vm_pindex_t l2pindex; retry: l1 = pmap_l1(pmap, va); if (l1 != NULL && (pmap_load(l1) & ATTR_DESCR_MASK) == L1_TABLE) { l2 = pmap_l1_to_l2(l1, va); if (va < VM_MAXUSER_ADDRESS) { /* Add a reference to the L2 page. */ l2pg = PHYS_TO_VM_PAGE(pmap_load(l1) & ~ATTR_MASK); l2pg->ref_count++; } else l2pg = NULL; } else if (va < VM_MAXUSER_ADDRESS) { /* Allocate a L2 page. */ l2pindex = pmap_l2_pindex(va) >> Ln_ENTRIES_SHIFT; l2pg = _pmap_alloc_l3(pmap, NUL2E + l2pindex, lockp); if (l2pg == NULL) { if (lockp != NULL) goto retry; else return (NULL); } l2 = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(l2pg)); l2 = &l2[pmap_l2_index(va)]; } else panic("pmap_alloc_l2: missing page table page for va %#lx", va); *l2pgp = l2pg; return (l2); } static vm_page_t pmap_alloc_l3(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) { vm_pindex_t ptepindex; pd_entry_t *pde, tpde; #ifdef INVARIANTS pt_entry_t *pte; #endif vm_page_t m; int lvl; /* * Calculate pagetable page index */ ptepindex = pmap_l2_pindex(va); retry: /* * Get the page directory entry */ pde = pmap_pde(pmap, va, &lvl); /* * If the page table page is mapped, we just increment the hold count, * and activate it. If we get a level 2 pde it will point to a level 3 * table. */ switch (lvl) { case -1: break; case 0: #ifdef INVARIANTS pte = pmap_l0_to_l1(pde, va); KASSERT(pmap_load(pte) == 0, ("pmap_alloc_l3: TODO: l0 superpages")); #endif break; case 1: #ifdef INVARIANTS pte = pmap_l1_to_l2(pde, va); KASSERT(pmap_load(pte) == 0, ("pmap_alloc_l3: TODO: l1 superpages")); #endif break; case 2: tpde = pmap_load(pde); if (tpde != 0) { m = PHYS_TO_VM_PAGE(tpde & ~ATTR_MASK); m->ref_count++; return (m); } break; default: panic("pmap_alloc_l3: Invalid level %d", lvl); } /* * Here if the pte page isn't mapped, or if it has been deallocated. */ m = _pmap_alloc_l3(pmap, ptepindex, lockp); if (m == NULL && lockp != NULL) goto retry; return (m); } /*************************************************** * Pmap allocation/deallocation routines. ***************************************************/ /* * Release any resources held by the given physical map. * Called when a pmap initialized by pmap_pinit is being released. * Should only be called if the map contains no valid mappings. */ void pmap_release(pmap_t pmap) { struct asid_set *set; vm_page_t m; int asid; KASSERT(pmap->pm_stats.resident_count == 0, ("pmap_release: pmap resident count %ld != 0", pmap->pm_stats.resident_count)); KASSERT(vm_radix_is_empty(&pmap->pm_root), ("pmap_release: pmap has reserved page table page(s)")); set = pmap->pm_asid_set; KASSERT(set != NULL, ("%s: NULL asid set", __func__)); /* * Allow the ASID to be reused. In stage 2 VMIDs we don't invalidate * the entries when removing them so rely on a later tlb invalidation. * this will happen when updating the VMID generation. Because of this * we don't reuse VMIDs within a generation. */ if (pmap->pm_stage == PM_STAGE1) { mtx_lock_spin(&set->asid_set_mutex); if (COOKIE_TO_EPOCH(pmap->pm_cookie) == set->asid_epoch) { asid = COOKIE_TO_ASID(pmap->pm_cookie); KASSERT(asid >= ASID_FIRST_AVAILABLE && asid < set->asid_set_size, ("pmap_release: pmap cookie has out-of-range asid")); bit_clear(set->asid_set, asid); } mtx_unlock_spin(&set->asid_set_mutex); } m = PHYS_TO_VM_PAGE(pmap->pm_l0_paddr); vm_page_unwire_noq(m); vm_page_free_zero(m); } static int kvm_size(SYSCTL_HANDLER_ARGS) { unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS; return sysctl_handle_long(oidp, &ksize, 0, req); } SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0, kvm_size, "LU", "Size of KVM"); static int kvm_free(SYSCTL_HANDLER_ARGS) { unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end; return sysctl_handle_long(oidp, &kfree, 0, req); } SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0, kvm_free, "LU", "Amount of KVM free"); /* * grow the number of kernel page table entries, if needed */ void pmap_growkernel(vm_offset_t addr) { vm_paddr_t paddr; vm_page_t nkpg; pd_entry_t *l0, *l1, *l2; mtx_assert(&kernel_map->system_mtx, MA_OWNED); addr = roundup2(addr, L2_SIZE); if (addr - 1 >= vm_map_max(kernel_map)) addr = vm_map_max(kernel_map); while (kernel_vm_end < addr) { l0 = pmap_l0(kernel_pmap, kernel_vm_end); KASSERT(pmap_load(l0) != 0, ("pmap_growkernel: No level 0 kernel entry")); l1 = pmap_l0_to_l1(l0, kernel_vm_end); if (pmap_load(l1) == 0) { /* We need a new PDP entry */ nkpg = vm_page_alloc(NULL, kernel_vm_end >> L1_SHIFT, VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); if (nkpg == NULL) panic("pmap_growkernel: no memory to grow kernel"); if ((nkpg->flags & PG_ZERO) == 0) pmap_zero_page(nkpg); /* See the dmb() in _pmap_alloc_l3(). */ dmb(ishst); paddr = VM_PAGE_TO_PHYS(nkpg); pmap_store(l1, paddr | L1_TABLE); continue; /* try again */ } l2 = pmap_l1_to_l2(l1, kernel_vm_end); if (pmap_load(l2) != 0) { kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET; if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { kernel_vm_end = vm_map_max(kernel_map); break; } continue; } nkpg = vm_page_alloc(NULL, kernel_vm_end >> L2_SHIFT, VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); if (nkpg == NULL) panic("pmap_growkernel: no memory to grow kernel"); if ((nkpg->flags & PG_ZERO) == 0) pmap_zero_page(nkpg); /* See the dmb() in _pmap_alloc_l3(). */ dmb(ishst); paddr = VM_PAGE_TO_PHYS(nkpg); pmap_store(l2, paddr | L2_TABLE); kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET; if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { kernel_vm_end = vm_map_max(kernel_map); break; } } } /*************************************************** * page management routines. ***************************************************/ CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE); CTASSERT(_NPCM == 3); CTASSERT(_NPCPV == 168); static __inline struct pv_chunk * pv_to_chunk(pv_entry_t pv) { return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK)); } #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap) #define PC_FREE0 0xfffffffffffffffful #define PC_FREE1 0xfffffffffffffffful #define PC_FREE2 0x000000fffffffffful static const uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1, PC_FREE2 }; #if 0 #ifdef PV_STATS static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail; SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0, "Current number of pv entry chunks"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0, "Current number of pv entry chunks allocated"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0, "Current number of pv entry chunks frees"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0, "Number of times tried to get a chunk page but failed."); static long pv_entry_frees, pv_entry_allocs, pv_entry_count; static int pv_entry_spare; SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0, "Current number of pv entry frees"); SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0, "Current number of pv entry allocs"); SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0, "Current number of pv entries"); SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0, "Current number of spare pv entries"); #endif #endif /* 0 */ /* * We are in a serious low memory condition. Resort to * drastic measures to free some pages so we can allocate * another pv entry chunk. * * Returns NULL if PV entries were reclaimed from the specified pmap. * * We do not, however, unmap 2mpages because subsequent accesses will * allocate per-page pv entries until repromotion occurs, thereby * exacerbating the shortage of free pv entries. */ static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp) { struct pv_chunk *pc, *pc_marker, *pc_marker_end; struct pv_chunk_header pc_marker_b, pc_marker_end_b; struct md_page *pvh; pd_entry_t *pde; pmap_t next_pmap, pmap; pt_entry_t *pte, tpte; pv_entry_t pv; vm_offset_t va; vm_page_t m, m_pc; struct spglist free; uint64_t inuse; int bit, field, freed, lvl; static int active_reclaims = 0; PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED); KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL")); pmap = NULL; m_pc = NULL; SLIST_INIT(&free); bzero(&pc_marker_b, sizeof(pc_marker_b)); bzero(&pc_marker_end_b, sizeof(pc_marker_end_b)); pc_marker = (struct pv_chunk *)&pc_marker_b; pc_marker_end = (struct pv_chunk *)&pc_marker_end_b; mtx_lock(&pv_chunks_mutex); active_reclaims++; TAILQ_INSERT_HEAD(&pv_chunks, pc_marker, pc_lru); TAILQ_INSERT_TAIL(&pv_chunks, pc_marker_end, pc_lru); while ((pc = TAILQ_NEXT(pc_marker, pc_lru)) != pc_marker_end && SLIST_EMPTY(&free)) { next_pmap = pc->pc_pmap; if (next_pmap == NULL) { /* * The next chunk is a marker. However, it is * not our marker, so active_reclaims must be * > 1. Consequently, the next_chunk code * will not rotate the pv_chunks list. */ goto next_chunk; } mtx_unlock(&pv_chunks_mutex); /* * A pv_chunk can only be removed from the pc_lru list * when both pv_chunks_mutex is owned and the * corresponding pmap is locked. */ if (pmap != next_pmap) { if (pmap != NULL && pmap != locked_pmap) PMAP_UNLOCK(pmap); pmap = next_pmap; /* Avoid deadlock and lock recursion. */ if (pmap > locked_pmap) { RELEASE_PV_LIST_LOCK(lockp); PMAP_LOCK(pmap); mtx_lock(&pv_chunks_mutex); continue; } else if (pmap != locked_pmap) { if (PMAP_TRYLOCK(pmap)) { mtx_lock(&pv_chunks_mutex); continue; } else { pmap = NULL; /* pmap is not locked */ mtx_lock(&pv_chunks_mutex); pc = TAILQ_NEXT(pc_marker, pc_lru); if (pc == NULL || pc->pc_pmap != next_pmap) continue; goto next_chunk; } } } /* * Destroy every non-wired, 4 KB page mapping in the chunk. */ freed = 0; for (field = 0; field < _NPCM; field++) { for (inuse = ~pc->pc_map[field] & pc_freemask[field]; inuse != 0; inuse &= ~(1UL << bit)) { bit = ffsl(inuse) - 1; pv = &pc->pc_pventry[field * 64 + bit]; va = pv->pv_va; pde = pmap_pde(pmap, va, &lvl); if (lvl != 2) continue; pte = pmap_l2_to_l3(pde, va); tpte = pmap_load(pte); if ((tpte & ATTR_SW_WIRED) != 0) continue; tpte = pmap_load_clear(pte); m = PHYS_TO_VM_PAGE(tpte & ~ATTR_MASK); if (pmap_pte_dirty(pmap, tpte)) vm_page_dirty(m); if ((tpte & ATTR_AF) != 0) { pmap_invalidate_page(pmap, va); vm_page_aflag_set(m, PGA_REFERENCED); } CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = page_to_pvh(m); if (TAILQ_EMPTY(&pvh->pv_list)) { vm_page_aflag_clear(m, PGA_WRITEABLE); } } pc->pc_map[field] |= 1UL << bit; pmap_unuse_pt(pmap, va, pmap_load(pde), &free); freed++; } } if (freed == 0) { mtx_lock(&pv_chunks_mutex); goto next_chunk; } /* Every freed mapping is for a 4 KB page. */ pmap_resident_count_dec(pmap, freed); PV_STAT(atomic_add_long(&pv_entry_frees, freed)); PV_STAT(atomic_add_int(&pv_entry_spare, freed)); PV_STAT(atomic_subtract_long(&pv_entry_count, freed)); TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); if (pc->pc_map[0] == PC_FREE0 && pc->pc_map[1] == PC_FREE1 && pc->pc_map[2] == PC_FREE2) { PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV)); PV_STAT(atomic_subtract_int(&pc_chunk_count, 1)); PV_STAT(atomic_add_int(&pc_chunk_frees, 1)); /* Entire chunk is free; return it. */ m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); dump_drop_page(m_pc->phys_addr); mtx_lock(&pv_chunks_mutex); TAILQ_REMOVE(&pv_chunks, pc, pc_lru); break; } TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); mtx_lock(&pv_chunks_mutex); /* One freed pv entry in locked_pmap is sufficient. */ if (pmap == locked_pmap) break; next_chunk: TAILQ_REMOVE(&pv_chunks, pc_marker, pc_lru); TAILQ_INSERT_AFTER(&pv_chunks, pc, pc_marker, pc_lru); if (active_reclaims == 1 && pmap != NULL) { /* * Rotate the pv chunks list so that we do not * scan the same pv chunks that could not be * freed (because they contained a wired * and/or superpage mapping) on every * invocation of reclaim_pv_chunk(). */ while ((pc = TAILQ_FIRST(&pv_chunks)) != pc_marker) { MPASS(pc->pc_pmap != NULL); TAILQ_REMOVE(&pv_chunks, pc, pc_lru); TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); } } } TAILQ_REMOVE(&pv_chunks, pc_marker, pc_lru); TAILQ_REMOVE(&pv_chunks, pc_marker_end, pc_lru); active_reclaims--; mtx_unlock(&pv_chunks_mutex); if (pmap != NULL && pmap != locked_pmap) PMAP_UNLOCK(pmap); if (m_pc == NULL && !SLIST_EMPTY(&free)) { m_pc = SLIST_FIRST(&free); SLIST_REMOVE_HEAD(&free, plinks.s.ss); /* Recycle a freed page table page. */ m_pc->ref_count = 1; } vm_page_free_pages_toq(&free, true); return (m_pc); } /* * free the pv_entry back to the free list */ static void free_pv_entry(pmap_t pmap, pv_entry_t pv) { struct pv_chunk *pc; int idx, field, bit; PMAP_LOCK_ASSERT(pmap, MA_OWNED); PV_STAT(atomic_add_long(&pv_entry_frees, 1)); PV_STAT(atomic_add_int(&pv_entry_spare, 1)); PV_STAT(atomic_subtract_long(&pv_entry_count, 1)); pc = pv_to_chunk(pv); idx = pv - &pc->pc_pventry[0]; field = idx / 64; bit = idx % 64; pc->pc_map[field] |= 1ul << bit; if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1 || pc->pc_map[2] != PC_FREE2) { /* 98% of the time, pc is already at the head of the list. */ if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); } return; } TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); free_pv_chunk(pc); } static void free_pv_chunk(struct pv_chunk *pc) { vm_page_t m; mtx_lock(&pv_chunks_mutex); TAILQ_REMOVE(&pv_chunks, pc, pc_lru); mtx_unlock(&pv_chunks_mutex); PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV)); PV_STAT(atomic_subtract_int(&pc_chunk_count, 1)); PV_STAT(atomic_add_int(&pc_chunk_frees, 1)); /* entire chunk is free, return it */ m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); dump_drop_page(m->phys_addr); vm_page_unwire_noq(m); vm_page_free(m); } /* * Returns a new PV entry, allocating a new PV chunk from the system when * needed. If this PV chunk allocation fails and a PV list lock pointer was * given, a PV chunk is reclaimed from an arbitrary pmap. Otherwise, NULL is * returned. * * The given PV list lock may be released. */ static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp) { int bit, field; pv_entry_t pv; struct pv_chunk *pc; vm_page_t m; PMAP_LOCK_ASSERT(pmap, MA_OWNED); PV_STAT(atomic_add_long(&pv_entry_allocs, 1)); retry: pc = TAILQ_FIRST(&pmap->pm_pvchunk); if (pc != NULL) { for (field = 0; field < _NPCM; field++) { if (pc->pc_map[field]) { bit = ffsl(pc->pc_map[field]) - 1; break; } } if (field < _NPCM) { pv = &pc->pc_pventry[field * 64 + bit]; pc->pc_map[field] &= ~(1ul << bit); /* If this was the last item, move it to tail */ if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); } PV_STAT(atomic_add_long(&pv_entry_count, 1)); PV_STAT(atomic_subtract_int(&pv_entry_spare, 1)); return (pv); } } /* No free items, allocate another chunk */ m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED); if (m == NULL) { if (lockp == NULL) { PV_STAT(pc_chunk_tryfail++); return (NULL); } m = reclaim_pv_chunk(pmap, lockp); if (m == NULL) goto retry; } PV_STAT(atomic_add_int(&pc_chunk_count, 1)); PV_STAT(atomic_add_int(&pc_chunk_allocs, 1)); dump_add_page(m->phys_addr); pc = (void *)PHYS_TO_DMAP(m->phys_addr); pc->pc_pmap = pmap; pc->pc_map[0] = PC_FREE0 & ~1ul; /* preallocated bit 0 */ pc->pc_map[1] = PC_FREE1; pc->pc_map[2] = PC_FREE2; mtx_lock(&pv_chunks_mutex); TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); mtx_unlock(&pv_chunks_mutex); pv = &pc->pc_pventry[0]; TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); PV_STAT(atomic_add_long(&pv_entry_count, 1)); PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1)); return (pv); } /* * Ensure that the number of spare PV entries in the specified pmap meets or * exceeds the given count, "needed". * * The given PV list lock may be released. */ static void reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp) { struct pch new_tail; struct pv_chunk *pc; vm_page_t m; int avail, free; bool reclaimed; PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL")); /* * Newly allocated PV chunks must be stored in a private list until * the required number of PV chunks have been allocated. Otherwise, * reclaim_pv_chunk() could recycle one of these chunks. In * contrast, these chunks must be added to the pmap upon allocation. */ TAILQ_INIT(&new_tail); retry: avail = 0; TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) { bit_count((bitstr_t *)pc->pc_map, 0, sizeof(pc->pc_map) * NBBY, &free); if (free == 0) break; avail += free; if (avail >= needed) break; } for (reclaimed = false; avail < needed; avail += _NPCPV) { m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED); if (m == NULL) { m = reclaim_pv_chunk(pmap, lockp); if (m == NULL) goto retry; reclaimed = true; } PV_STAT(atomic_add_int(&pc_chunk_count, 1)); PV_STAT(atomic_add_int(&pc_chunk_allocs, 1)); dump_add_page(m->phys_addr); pc = (void *)PHYS_TO_DMAP(m->phys_addr); pc->pc_pmap = pmap; pc->pc_map[0] = PC_FREE0; pc->pc_map[1] = PC_FREE1; pc->pc_map[2] = PC_FREE2; TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru); PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV)); /* * The reclaim might have freed a chunk from the current pmap. * If that chunk contained available entries, we need to * re-count the number of available entries. */ if (reclaimed) goto retry; } if (!TAILQ_EMPTY(&new_tail)) { mtx_lock(&pv_chunks_mutex); TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru); mtx_unlock(&pv_chunks_mutex); } } /* * First find and then remove the pv entry for the specified pmap and virtual * address from the specified pv list. Returns the pv entry if found and NULL * otherwise. This operation can be performed on pv lists for either 4KB or * 2MB page mappings. */ static __inline pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) { pv_entry_t pv; TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { if (pmap == PV_PMAP(pv) && va == pv->pv_va) { TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); pvh->pv_gen++; break; } } return (pv); } /* * After demotion from a 2MB page mapping to 512 4KB page mappings, * destroy the pv entry for the 2MB page mapping and reinstantiate the pv * entries for each of the 4KB page mappings. */ static void pmap_pv_demote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, struct rwlock **lockp) { struct md_page *pvh; struct pv_chunk *pc; pv_entry_t pv; vm_offset_t va_last; vm_page_t m; int bit, field; PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT((va & L2_OFFSET) == 0, ("pmap_pv_demote_l2: va is not 2mpage aligned")); KASSERT((pa & L2_OFFSET) == 0, ("pmap_pv_demote_l2: pa is not 2mpage aligned")); CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); /* * Transfer the 2mpage's pv entry for this mapping to the first * page's pv list. Once this transfer begins, the pv list lock * must not be released until the last pv entry is reinstantiated. */ pvh = pa_to_pvh(pa); pv = pmap_pvh_remove(pvh, pmap, va); KASSERT(pv != NULL, ("pmap_pv_demote_l2: pv not found")); m = PHYS_TO_VM_PAGE(pa); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; /* Instantiate the remaining Ln_ENTRIES - 1 pv entries. */ PV_STAT(atomic_add_long(&pv_entry_allocs, Ln_ENTRIES - 1)); va_last = va + L2_SIZE - PAGE_SIZE; for (;;) { pc = TAILQ_FIRST(&pmap->pm_pvchunk); KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0 || pc->pc_map[2] != 0, ("pmap_pv_demote_l2: missing spare")); for (field = 0; field < _NPCM; field++) { while (pc->pc_map[field]) { bit = ffsl(pc->pc_map[field]) - 1; pc->pc_map[field] &= ~(1ul << bit); pv = &pc->pc_pventry[field * 64 + bit]; va += PAGE_SIZE; pv->pv_va = va; m++; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_pv_demote_l2: page %p is not managed", m)); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; if (va == va_last) goto out; } } TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); } out: if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); } PV_STAT(atomic_add_long(&pv_entry_count, Ln_ENTRIES - 1)); PV_STAT(atomic_subtract_int(&pv_entry_spare, Ln_ENTRIES - 1)); } /* * First find and then destroy the pv entry for the specified pmap and virtual * address. This operation can be performed on pv lists for either 4KB or 2MB * page mappings. */ static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) { pv_entry_t pv; pv = pmap_pvh_remove(pvh, pmap, va); KASSERT(pv != NULL, ("pmap_pvh_free: pv not found")); free_pv_entry(pmap, pv); } /* * Conditionally create the PV entry for a 4KB page mapping if the required * memory can be allocated without resorting to reclamation. */ static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m, struct rwlock **lockp) { pv_entry_t pv; PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* Pass NULL instead of the lock pointer to disable reclamation. */ if ((pv = get_pv_entry(pmap, NULL)) != NULL) { pv->pv_va = va; CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; return (TRUE); } else return (FALSE); } /* * Create the PV entry for a 2MB page mapping. Always returns true unless the * flag PMAP_ENTER_NORECLAIM is specified. If that flag is specified, returns * false if the PV entry cannot be allocated without resorting to reclamation. */ static bool pmap_pv_insert_l2(pmap_t pmap, vm_offset_t va, pd_entry_t l2e, u_int flags, struct rwlock **lockp) { struct md_page *pvh; pv_entry_t pv; vm_paddr_t pa; PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* Pass NULL instead of the lock pointer to disable reclamation. */ if ((pv = get_pv_entry(pmap, (flags & PMAP_ENTER_NORECLAIM) != 0 ? NULL : lockp)) == NULL) return (false); pv->pv_va = va; pa = l2e & ~ATTR_MASK; CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); pvh = pa_to_pvh(pa); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); pvh->pv_gen++; return (true); } static void pmap_remove_kernel_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va) { pt_entry_t newl2, oldl2; vm_page_t ml3; vm_paddr_t ml3pa; KASSERT(!VIRT_IN_DMAP(va), ("removing direct mapping of %#lx", va)); KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap)); PMAP_LOCK_ASSERT(pmap, MA_OWNED); ml3 = pmap_remove_pt_page(pmap, va); if (ml3 == NULL) panic("pmap_remove_kernel_l2: Missing pt page"); ml3pa = VM_PAGE_TO_PHYS(ml3); newl2 = ml3pa | L2_TABLE; /* * If this page table page was unmapped by a promotion, then it * contains valid mappings. Zero it to invalidate those mappings. */ if (ml3->valid != 0) pagezero((void *)PHYS_TO_DMAP(ml3pa)); /* * Demote the mapping. The caller must have already invalidated the * mapping (i.e., the "break" in break-before-make). */ oldl2 = pmap_load_store(l2, newl2); KASSERT(oldl2 == 0, ("%s: found existing mapping at %p: %#lx", __func__, l2, oldl2)); } /* * pmap_remove_l2: Do the things to unmap a level 2 superpage. */ static int pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva, pd_entry_t l1e, struct spglist *free, struct rwlock **lockp) { struct md_page *pvh; pt_entry_t old_l2; vm_offset_t eva, va; vm_page_t m, ml3; PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT((sva & L2_OFFSET) == 0, ("pmap_remove_l2: sva is not aligned")); old_l2 = pmap_load_clear(l2); KASSERT((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK, ("pmap_remove_l2: L2e %lx is not a block mapping", old_l2)); /* * Since a promotion must break the 4KB page mappings before making * the 2MB page mapping, a pmap_invalidate_page() suffices. */ pmap_invalidate_page(pmap, sva); if (old_l2 & ATTR_SW_WIRED) pmap->pm_stats.wired_count -= L2_SIZE / PAGE_SIZE; pmap_resident_count_dec(pmap, L2_SIZE / PAGE_SIZE); if (old_l2 & ATTR_SW_MANAGED) { CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, old_l2 & ~ATTR_MASK); pvh = pa_to_pvh(old_l2 & ~ATTR_MASK); pmap_pvh_free(pvh, pmap, sva); eva = sva + L2_SIZE; for (va = sva, m = PHYS_TO_VM_PAGE(old_l2 & ~ATTR_MASK); va < eva; va += PAGE_SIZE, m++) { if (pmap_pte_dirty(pmap, old_l2)) vm_page_dirty(m); if (old_l2 & ATTR_AF) vm_page_aflag_set(m, PGA_REFERENCED); if (TAILQ_EMPTY(&m->md.pv_list) && TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } } if (pmap == kernel_pmap) { pmap_remove_kernel_l2(pmap, l2, sva); } else { ml3 = pmap_remove_pt_page(pmap, sva); if (ml3 != NULL) { KASSERT(ml3->valid == VM_PAGE_BITS_ALL, ("pmap_remove_l2: l3 page not promoted")); pmap_resident_count_dec(pmap, 1); KASSERT(ml3->ref_count == NL3PG, ("pmap_remove_l2: l3 page ref count error")); ml3->ref_count = 0; pmap_add_delayed_free_list(ml3, free, FALSE); } } return (pmap_unuse_pt(pmap, sva, l1e, free)); } /* * pmap_remove_l3: do the things to unmap a page in a process */ static int pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t va, pd_entry_t l2e, struct spglist *free, struct rwlock **lockp) { struct md_page *pvh; pt_entry_t old_l3; vm_page_t m; PMAP_LOCK_ASSERT(pmap, MA_OWNED); old_l3 = pmap_load_clear(l3); pmap_invalidate_page(pmap, va); if (old_l3 & ATTR_SW_WIRED) pmap->pm_stats.wired_count -= 1; pmap_resident_count_dec(pmap, 1); if (old_l3 & ATTR_SW_MANAGED) { m = PHYS_TO_VM_PAGE(old_l3 & ~ATTR_MASK); if (pmap_pte_dirty(pmap, old_l3)) vm_page_dirty(m); if (old_l3 & ATTR_AF) vm_page_aflag_set(m, PGA_REFERENCED); CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); pmap_pvh_free(&m->md, pmap, va); if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = page_to_pvh(m); if (TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } } return (pmap_unuse_pt(pmap, va, l2e, free)); } /* * Remove the specified range of addresses from the L3 page table that is * identified by the given L2 entry. */ static void pmap_remove_l3_range(pmap_t pmap, pd_entry_t l2e, vm_offset_t sva, vm_offset_t eva, struct spglist *free, struct rwlock **lockp) { struct md_page *pvh; struct rwlock *new_lock; pt_entry_t *l3, old_l3; vm_offset_t va; vm_page_t l3pg, m; PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT(rounddown2(sva, L2_SIZE) + L2_SIZE == roundup2(eva, L2_SIZE), ("pmap_remove_l3_range: range crosses an L3 page table boundary")); l3pg = sva < VM_MAXUSER_ADDRESS ? PHYS_TO_VM_PAGE(l2e & ~ATTR_MASK) : NULL; va = eva; for (l3 = pmap_l2_to_l3(&l2e, sva); sva != eva; l3++, sva += L3_SIZE) { if (!pmap_l3_valid(pmap_load(l3))) { if (va != eva) { pmap_invalidate_range(pmap, va, sva); va = eva; } continue; } old_l3 = pmap_load_clear(l3); if ((old_l3 & ATTR_SW_WIRED) != 0) pmap->pm_stats.wired_count--; pmap_resident_count_dec(pmap, 1); if ((old_l3 & ATTR_SW_MANAGED) != 0) { m = PHYS_TO_VM_PAGE(old_l3 & ~ATTR_MASK); if (pmap_pte_dirty(pmap, old_l3)) vm_page_dirty(m); if ((old_l3 & ATTR_AF) != 0) vm_page_aflag_set(m, PGA_REFERENCED); new_lock = PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m)); if (new_lock != *lockp) { if (*lockp != NULL) { /* * Pending TLB invalidations must be * performed before the PV list lock is * released. Otherwise, a concurrent * pmap_remove_all() on a physical page * could return while a stale TLB entry * still provides access to that page. */ if (va != eva) { pmap_invalidate_range(pmap, va, sva); va = eva; } rw_wunlock(*lockp); } *lockp = new_lock; rw_wlock(*lockp); } pmap_pvh_free(&m->md, pmap, sva); if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = page_to_pvh(m); if (TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } } if (va == eva) va = sva; if (l3pg != NULL && pmap_unwire_l3(pmap, sva, l3pg, free)) { sva += L3_SIZE; break; } } if (va != eva) pmap_invalidate_range(pmap, va, sva); } /* * Remove the given range of addresses from the specified map. * * It is assumed that the start and end are properly * rounded to the page size. */ void pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { struct rwlock *lock; vm_offset_t va_next; pd_entry_t *l0, *l1, *l2; pt_entry_t l3_paddr; struct spglist free; /* * Perform an unsynchronized read. This is, however, safe. */ if (pmap->pm_stats.resident_count == 0) return; SLIST_INIT(&free); PMAP_LOCK(pmap); lock = NULL; for (; sva < eva; sva = va_next) { if (pmap->pm_stats.resident_count == 0) break; l0 = pmap_l0(pmap, sva); if (pmap_load(l0) == 0) { va_next = (sva + L0_SIZE) & ~L0_OFFSET; if (va_next < sva) va_next = eva; continue; } va_next = (sva + L1_SIZE) & ~L1_OFFSET; if (va_next < sva) va_next = eva; l1 = pmap_l0_to_l1(l0, sva); if (pmap_load(l1) == 0) continue; if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) { KASSERT(va_next <= eva, ("partial update of non-transparent 1G page " "l1 %#lx sva %#lx eva %#lx va_next %#lx", pmap_load(l1), sva, eva, va_next)); MPASS(pmap != kernel_pmap); MPASS((pmap_load(l1) & ATTR_SW_MANAGED) == 0); pmap_clear(l1); pmap_invalidate_page(pmap, sva); pmap_resident_count_dec(pmap, L1_SIZE / PAGE_SIZE); pmap_unuse_pt(pmap, sva, pmap_load(l0), &free); continue; } /* * Calculate index for next page table. */ va_next = (sva + L2_SIZE) & ~L2_OFFSET; if (va_next < sva) va_next = eva; l2 = pmap_l1_to_l2(l1, sva); if (l2 == NULL) continue; l3_paddr = pmap_load(l2); if ((l3_paddr & ATTR_DESCR_MASK) == L2_BLOCK) { if (sva + L2_SIZE == va_next && eva >= va_next) { pmap_remove_l2(pmap, l2, sva, pmap_load(l1), &free, &lock); continue; } else if (pmap_demote_l2_locked(pmap, l2, sva, &lock) == NULL) continue; l3_paddr = pmap_load(l2); } /* * Weed out invalid mappings. */ if ((l3_paddr & ATTR_DESCR_MASK) != L2_TABLE) continue; /* * Limit our scan to either the end of the va represented * by the current page table page, or to the end of the * range being removed. */ if (va_next > eva) va_next = eva; pmap_remove_l3_range(pmap, l3_paddr, sva, va_next, &free, &lock); } if (lock != NULL) rw_wunlock(lock); PMAP_UNLOCK(pmap); vm_page_free_pages_toq(&free, true); } /* * Routine: pmap_remove_all * Function: * Removes this physical page from * all physical maps in which it resides. * Reflects back modify bits to the pager. * * Notes: * Original versions of this routine were very * inefficient because they iteratively called * pmap_remove (slow...) */ void pmap_remove_all(vm_page_t m) { struct md_page *pvh; pv_entry_t pv; pmap_t pmap; struct rwlock *lock; pd_entry_t *pde, tpde; pt_entry_t *pte, tpte; vm_offset_t va; struct spglist free; int lvl, pvh_gen, md_gen; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_remove_all: page %p is not managed", m)); SLIST_INIT(&free); lock = VM_PAGE_TO_PV_LIST_LOCK(m); pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : page_to_pvh(m); retry: rw_wlock(lock); while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen) { rw_wunlock(lock); PMAP_UNLOCK(pmap); goto retry; } } va = pv->pv_va; pte = pmap_pte(pmap, va, &lvl); KASSERT(pte != NULL, ("pmap_remove_all: no page table entry found")); KASSERT(lvl == 2, ("pmap_remove_all: invalid pte level %d", lvl)); pmap_demote_l2_locked(pmap, pte, va, &lock); PMAP_UNLOCK(pmap); } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { pmap = PV_PMAP(pv); PMAP_ASSERT_STAGE1(pmap); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; md_gen = m->md.pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { rw_wunlock(lock); PMAP_UNLOCK(pmap); goto retry; } } pmap_resident_count_dec(pmap, 1); pde = pmap_pde(pmap, pv->pv_va, &lvl); KASSERT(pde != NULL, ("pmap_remove_all: no page directory entry found")); KASSERT(lvl == 2, ("pmap_remove_all: invalid pde level %d", lvl)); tpde = pmap_load(pde); pte = pmap_l2_to_l3(pde, pv->pv_va); tpte = pmap_load_clear(pte); if (tpte & ATTR_SW_WIRED) pmap->pm_stats.wired_count--; if ((tpte & ATTR_AF) != 0) { pmap_invalidate_page(pmap, pv->pv_va); vm_page_aflag_set(m, PGA_REFERENCED); } /* * Update the vm_page_t clean and reference bits. */ if (pmap_pte_dirty(pmap, tpte)) vm_page_dirty(m); pmap_unuse_pt(pmap, pv->pv_va, tpde, &free); TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; free_pv_entry(pmap, pv); PMAP_UNLOCK(pmap); } vm_page_aflag_clear(m, PGA_WRITEABLE); rw_wunlock(lock); vm_page_free_pages_toq(&free, true); } /* * pmap_protect_l2: do the things to protect a 2MB page in a pmap */ static void pmap_protect_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva, pt_entry_t mask, pt_entry_t nbits) { pd_entry_t old_l2; vm_page_t m, mt; PMAP_LOCK_ASSERT(pmap, MA_OWNED); PMAP_ASSERT_STAGE1(pmap); KASSERT((sva & L2_OFFSET) == 0, ("pmap_protect_l2: sva is not 2mpage aligned")); old_l2 = pmap_load(l2); KASSERT((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK, ("pmap_protect_l2: L2e %lx is not a block mapping", old_l2)); /* * Return if the L2 entry already has the desired access restrictions * in place. */ retry: if ((old_l2 & mask) == nbits) return; /* * When a dirty read/write superpage mapping is write protected, * update the dirty field of each of the superpage's constituent 4KB * pages. */ if ((old_l2 & ATTR_SW_MANAGED) != 0 && (nbits & ATTR_S1_AP(ATTR_S1_AP_RO)) != 0 && pmap_pte_dirty(pmap, old_l2)) { m = PHYS_TO_VM_PAGE(old_l2 & ~ATTR_MASK); for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) vm_page_dirty(mt); } if (!atomic_fcmpset_64(l2, &old_l2, (old_l2 & ~mask) | nbits)) goto retry; /* * Since a promotion must break the 4KB page mappings before making * the 2MB page mapping, a pmap_invalidate_page() suffices. */ pmap_invalidate_page(pmap, sva); } /* * Set the physical protection on the * specified range of this map as requested. */ void pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) { vm_offset_t va, va_next; pd_entry_t *l0, *l1, *l2; pt_entry_t *l3p, l3, mask, nbits; PMAP_ASSERT_STAGE1(pmap); KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot)); if (prot == VM_PROT_NONE) { pmap_remove(pmap, sva, eva); return; } mask = nbits = 0; if ((prot & VM_PROT_WRITE) == 0) { mask |= ATTR_S1_AP_RW_BIT | ATTR_SW_DBM; nbits |= ATTR_S1_AP(ATTR_S1_AP_RO); } if ((prot & VM_PROT_EXECUTE) == 0) { mask |= ATTR_S1_XN; nbits |= ATTR_S1_XN; } if (mask == 0) return; PMAP_LOCK(pmap); for (; sva < eva; sva = va_next) { l0 = pmap_l0(pmap, sva); if (pmap_load(l0) == 0) { va_next = (sva + L0_SIZE) & ~L0_OFFSET; if (va_next < sva) va_next = eva; continue; } va_next = (sva + L1_SIZE) & ~L1_OFFSET; if (va_next < sva) va_next = eva; l1 = pmap_l0_to_l1(l0, sva); if (pmap_load(l1) == 0) continue; if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) { KASSERT(va_next <= eva, ("partial update of non-transparent 1G page " "l1 %#lx sva %#lx eva %#lx va_next %#lx", pmap_load(l1), sva, eva, va_next)); MPASS((pmap_load(l1) & ATTR_SW_MANAGED) == 0); if ((pmap_load(l1) & mask) != nbits) { pmap_store(l1, (pmap_load(l1) & ~mask) | nbits); pmap_invalidate_page(pmap, sva); } continue; } va_next = (sva + L2_SIZE) & ~L2_OFFSET; if (va_next < sva) va_next = eva; l2 = pmap_l1_to_l2(l1, sva); if (pmap_load(l2) == 0) continue; if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK) { if (sva + L2_SIZE == va_next && eva >= va_next) { pmap_protect_l2(pmap, l2, sva, mask, nbits); continue; } else if (pmap_demote_l2(pmap, l2, sva) == NULL) continue; } KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE, ("pmap_protect: Invalid L2 entry after demotion")); if (va_next > eva) va_next = eva; va = va_next; for (l3p = pmap_l2_to_l3(l2, sva); sva != va_next; l3p++, sva += L3_SIZE) { l3 = pmap_load(l3p); retry: /* * Go to the next L3 entry if the current one is * invalid or already has the desired access * restrictions in place. (The latter case occurs * frequently. For example, in a "buildworld" * workload, almost 1 out of 4 L3 entries already * have the desired restrictions.) */ if (!pmap_l3_valid(l3) || (l3 & mask) == nbits) { if (va != va_next) { pmap_invalidate_range(pmap, va, sva); va = va_next; } continue; } /* * When a dirty read/write mapping is write protected, * update the page's dirty field. */ if ((l3 & ATTR_SW_MANAGED) != 0 && (nbits & ATTR_S1_AP(ATTR_S1_AP_RO)) != 0 && pmap_pte_dirty(pmap, l3)) vm_page_dirty(PHYS_TO_VM_PAGE(l3 & ~ATTR_MASK)); if (!atomic_fcmpset_64(l3p, &l3, (l3 & ~mask) | nbits)) goto retry; if (va == va_next) va = sva; } if (va != va_next) pmap_invalidate_range(pmap, va, sva); } PMAP_UNLOCK(pmap); } /* * Inserts the specified page table page into the specified pmap's collection * of idle page table pages. Each of a pmap's page table pages is responsible * for mapping a distinct range of virtual addresses. The pmap's collection is * ordered by this virtual address range. * * If "promoted" is false, then the page table page "mpte" must be zero filled. */ static __inline int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); mpte->valid = promoted ? VM_PAGE_BITS_ALL : 0; return (vm_radix_insert(&pmap->pm_root, mpte)); } /* * Removes the page table page mapping the specified virtual address from the * specified pmap's collection of idle page table pages, and returns it. * Otherwise, returns NULL if there is no page table page corresponding to the * specified virtual address. */ static __inline vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); return (vm_radix_remove(&pmap->pm_root, pmap_l2_pindex(va))); } /* * Performs a break-before-make update of a pmap entry. This is needed when * either promoting or demoting pages to ensure the TLB doesn't get into an * inconsistent state. */ static void pmap_update_entry(pmap_t pmap, pd_entry_t *pte, pd_entry_t newpte, vm_offset_t va, vm_size_t size) { register_t intr; PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * Ensure we don't get switched out with the page table in an * inconsistent state. We also need to ensure no interrupts fire * as they may make use of an address we are about to invalidate. */ intr = intr_disable(); /* * Clear the old mapping's valid bit, but leave the rest of the entry * unchanged, so that a lockless, concurrent pmap_kextract() can still * lookup the physical address. */ pmap_clear_bits(pte, ATTR_DESCR_VALID); pmap_invalidate_range(pmap, va, va + size); /* Create the new mapping */ pmap_store(pte, newpte); dsb(ishst); intr_restore(intr); } #if VM_NRESERVLEVEL > 0 /* * After promotion from 512 4KB page mappings to a single 2MB page mapping, * replace the many pv entries for the 4KB page mappings by a single pv entry * for the 2MB page mapping. */ static void pmap_pv_promote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, struct rwlock **lockp) { struct md_page *pvh; pv_entry_t pv; vm_offset_t va_last; vm_page_t m; KASSERT((pa & L2_OFFSET) == 0, ("pmap_pv_promote_l2: pa is not 2mpage aligned")); CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); /* * Transfer the first page's pv entry for this mapping to the 2mpage's * pv list. Aside from avoiding the cost of a call to get_pv_entry(), * a transfer avoids the possibility that get_pv_entry() calls * reclaim_pv_chunk() and that reclaim_pv_chunk() removes one of the * mappings that is being promoted. */ m = PHYS_TO_VM_PAGE(pa); va = va & ~L2_OFFSET; pv = pmap_pvh_remove(&m->md, pmap, va); KASSERT(pv != NULL, ("pmap_pv_promote_l2: pv not found")); pvh = pa_to_pvh(pa); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); pvh->pv_gen++; /* Free the remaining NPTEPG - 1 pv entries. */ va_last = va + L2_SIZE - PAGE_SIZE; do { m++; va += PAGE_SIZE; pmap_pvh_free(&m->md, pmap, va); } while (va < va_last); } /* * Tries to promote the 512, contiguous 4KB page mappings that are within a * single level 2 table entry to a single 2MB page mapping. For promotion * to occur, two conditions must be met: (1) the 4KB page mappings must map * aligned, contiguous physical memory and (2) the 4KB page mappings must have * identical characteristics. */ static void pmap_promote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va, struct rwlock **lockp) { pt_entry_t *firstl3, *l3, newl2, oldl3, pa; vm_page_t mpte; vm_offset_t sva; PMAP_LOCK_ASSERT(pmap, MA_OWNED); PMAP_ASSERT_STAGE1(pmap); sva = va & ~L2_OFFSET; firstl3 = pmap_l2_to_l3(l2, sva); newl2 = pmap_load(firstl3); setl2: if (((newl2 & (~ATTR_MASK | ATTR_AF)) & L2_OFFSET) != ATTR_AF) { atomic_add_long(&pmap_l2_p_failures, 1); CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx" " in pmap %p", va, pmap); return; } if ((newl2 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) == (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM)) { if (!atomic_fcmpset_64(l2, &newl2, newl2 & ~ATTR_SW_DBM)) goto setl2; newl2 &= ~ATTR_SW_DBM; } pa = newl2 + L2_SIZE - PAGE_SIZE; for (l3 = firstl3 + NL3PG - 1; l3 > firstl3; l3--) { oldl3 = pmap_load(l3); setl3: if ((oldl3 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) == (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM)) { if (!atomic_fcmpset_64(l3, &oldl3, oldl3 & ~ATTR_SW_DBM)) goto setl3; oldl3 &= ~ATTR_SW_DBM; } if (oldl3 != pa) { atomic_add_long(&pmap_l2_p_failures, 1); CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx" " in pmap %p", va, pmap); return; } pa -= PAGE_SIZE; } /* * Save the page table page in its current state until the L2 * mapping the superpage is demoted by pmap_demote_l2() or * destroyed by pmap_remove_l3(). */ mpte = PHYS_TO_VM_PAGE(pmap_load(l2) & ~ATTR_MASK); KASSERT(mpte >= vm_page_array && mpte < &vm_page_array[vm_page_array_size], ("pmap_promote_l2: page table page is out of range")); KASSERT(mpte->pindex == pmap_l2_pindex(va), ("pmap_promote_l2: page table page's pindex is wrong")); if (pmap_insert_pt_page(pmap, mpte, true)) { atomic_add_long(&pmap_l2_p_failures, 1); CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx in pmap %p", va, pmap); return; } if ((newl2 & ATTR_SW_MANAGED) != 0) pmap_pv_promote_l2(pmap, va, newl2 & ~ATTR_MASK, lockp); newl2 &= ~ATTR_DESCR_MASK; newl2 |= L2_BLOCK; pmap_update_entry(pmap, l2, newl2, sva, L2_SIZE); atomic_add_long(&pmap_l2_promotions, 1); CTR2(KTR_PMAP, "pmap_promote_l2: success for va %#lx in pmap %p", va, pmap); } #endif /* VM_NRESERVLEVEL > 0 */ static int pmap_enter_largepage(pmap_t pmap, vm_offset_t va, pt_entry_t newpte, int flags, int psind) { pd_entry_t *l0p, *l1p, *l2p, origpte; vm_page_t mp; PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT(psind > 0 && psind < MAXPAGESIZES, ("psind %d unexpected", psind)); KASSERT(((newpte & ~ATTR_MASK) & (pagesizes[psind] - 1)) == 0, ("unaligned phys address %#lx newpte %#lx psind %d", (newpte & ~ATTR_MASK), newpte, psind)); restart: if (psind == 2) { l0p = pmap_l0(pmap, va); if ((pmap_load(l0p) & ATTR_DESCR_VALID) == 0) { mp = _pmap_alloc_l3(pmap, pmap_l0_pindex(va), NULL); if (mp == NULL) { if ((flags & PMAP_ENTER_NOSLEEP) != 0) return (KERN_RESOURCE_SHORTAGE); PMAP_UNLOCK(pmap); vm_wait(NULL); PMAP_LOCK(pmap); goto restart; } l1p = pmap_l0_to_l1(l0p, va); KASSERT(l1p != NULL, ("va %#lx lost l1 entry", va)); origpte = pmap_load(l1p); } else { l1p = pmap_l0_to_l1(l0p, va); KASSERT(l1p != NULL, ("va %#lx lost l1 entry", va)); origpte = pmap_load(l1p); if ((origpte & ATTR_DESCR_VALID) == 0) { mp = PHYS_TO_VM_PAGE(pmap_load(l0p) & ~ATTR_MASK); mp->ref_count++; } } KASSERT((origpte & ATTR_DESCR_VALID) == 0 || ((origpte & ATTR_DESCR_MASK) == L1_BLOCK && (origpte & ~ATTR_MASK) == (newpte & ~ATTR_MASK)), ("va %#lx changing 1G phys page l1 %#lx newpte %#lx", va, origpte, newpte)); pmap_store(l1p, newpte); } else /* (psind == 1) */ { l2p = pmap_l2(pmap, va); if (l2p == NULL) { mp = _pmap_alloc_l3(pmap, pmap_l1_pindex(va), NULL); if (mp == NULL) { if ((flags & PMAP_ENTER_NOSLEEP) != 0) return (KERN_RESOURCE_SHORTAGE); PMAP_UNLOCK(pmap); vm_wait(NULL); PMAP_LOCK(pmap); goto restart; } l2p = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mp)); l2p = &l2p[pmap_l2_index(va)]; origpte = pmap_load(l2p); } else { l1p = pmap_l1(pmap, va); origpte = pmap_load(l2p); if ((origpte & ATTR_DESCR_VALID) == 0) { mp = PHYS_TO_VM_PAGE(pmap_load(l1p) & ~ATTR_MASK); mp->ref_count++; } } KASSERT((origpte & ATTR_DESCR_VALID) == 0 || ((origpte & ATTR_DESCR_MASK) == L2_BLOCK && (origpte & ~ATTR_MASK) == (newpte & ~ATTR_MASK)), ("va %#lx changing 2M phys page l2 %#lx newpte %#lx", va, origpte, newpte)); pmap_store(l2p, newpte); } dsb(ishst); if ((origpte & ATTR_DESCR_VALID) == 0) pmap_resident_count_inc(pmap, pagesizes[psind] / PAGE_SIZE); if ((newpte & ATTR_SW_WIRED) != 0 && (origpte & ATTR_SW_WIRED) == 0) pmap->pm_stats.wired_count += pagesizes[psind] / PAGE_SIZE; else if ((newpte & ATTR_SW_WIRED) == 0 && (origpte & ATTR_SW_WIRED) != 0) pmap->pm_stats.wired_count -= pagesizes[psind] / PAGE_SIZE; return (KERN_SUCCESS); } /* * Insert the given physical page (p) at * the specified virtual address (v) in the * target physical map with the protection requested. * * If specified, the page will be wired down, meaning * that the related pte can not be reclaimed. * * NB: This is the only routine which MAY NOT lazy-evaluate * or lose information. That is, this routine must actually * insert this page into the given map NOW. */ int pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, u_int flags, int8_t psind) { struct rwlock *lock; pd_entry_t *pde; pt_entry_t new_l3, orig_l3; pt_entry_t *l2, *l3; pv_entry_t pv; vm_paddr_t opa, pa; vm_page_t mpte, om; boolean_t nosleep; int lvl, rv; va = trunc_page(va); if ((m->oflags & VPO_UNMANAGED) == 0) VM_PAGE_OBJECT_BUSY_ASSERT(m); pa = VM_PAGE_TO_PHYS(m); new_l3 = (pt_entry_t)(pa | ATTR_DEFAULT | L3_PAGE); new_l3 |= pmap_pte_memattr(pmap, m->md.pv_memattr); new_l3 |= pmap_pte_prot(pmap, prot); if ((flags & PMAP_ENTER_WIRED) != 0) new_l3 |= ATTR_SW_WIRED; if (pmap->pm_stage == PM_STAGE1) { if (va < VM_MAXUSER_ADDRESS) new_l3 |= ATTR_S1_AP(ATTR_S1_AP_USER) | ATTR_S1_PXN; else new_l3 |= ATTR_S1_UXN; if (pmap != kernel_pmap) new_l3 |= ATTR_S1_nG; } else { /* * Clear the access flag on executable mappings, this will be * set later when the page is accessed. The fault handler is * required to invalidate the I-cache. * * TODO: Switch to the valid flag to allow hardware management * of the access flag. Much of the pmap code assumes the * valid flag is set and fails to destroy the old page tables * correctly if it is clear. */ if (prot & VM_PROT_EXECUTE) new_l3 &= ~ATTR_AF; } if ((m->oflags & VPO_UNMANAGED) == 0) { new_l3 |= ATTR_SW_MANAGED; if ((prot & VM_PROT_WRITE) != 0) { new_l3 |= ATTR_SW_DBM; if ((flags & VM_PROT_WRITE) == 0) { if (pmap->pm_stage == PM_STAGE1) new_l3 |= ATTR_S1_AP(ATTR_S1_AP_RO); else new_l3 &= ~ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE); } } } CTR2(KTR_PMAP, "pmap_enter: %.16lx -> %.16lx", va, pa); lock = NULL; PMAP_LOCK(pmap); if ((flags & PMAP_ENTER_LARGEPAGE) != 0) { KASSERT((m->oflags & VPO_UNMANAGED) != 0, ("managed largepage va %#lx flags %#x", va, flags)); new_l3 &= ~L3_PAGE; if (psind == 2) new_l3 |= L1_BLOCK; else /* (psind == 1) */ new_l3 |= L2_BLOCK; rv = pmap_enter_largepage(pmap, va, new_l3, flags, psind); goto out; } if (psind == 1) { /* Assert the required virtual and physical alignment. */ KASSERT((va & L2_OFFSET) == 0, ("pmap_enter: va unaligned")); KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind")); rv = pmap_enter_l2(pmap, va, (new_l3 & ~L3_PAGE) | L2_BLOCK, flags, m, &lock); goto out; } mpte = NULL; /* * In the case that a page table page is not * resident, we are creating it here. */ retry: pde = pmap_pde(pmap, va, &lvl); if (pde != NULL && lvl == 2) { l3 = pmap_l2_to_l3(pde, va); if (va < VM_MAXUSER_ADDRESS && mpte == NULL) { mpte = PHYS_TO_VM_PAGE(pmap_load(pde) & ~ATTR_MASK); mpte->ref_count++; } goto havel3; } else if (pde != NULL && lvl == 1) { l2 = pmap_l1_to_l2(pde, va); if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK && (l3 = pmap_demote_l2_locked(pmap, l2, va, &lock)) != NULL) { l3 = &l3[pmap_l3_index(va)]; if (va < VM_MAXUSER_ADDRESS) { mpte = PHYS_TO_VM_PAGE( pmap_load(l2) & ~ATTR_MASK); mpte->ref_count++; } goto havel3; } /* We need to allocate an L3 table. */ } if (va < VM_MAXUSER_ADDRESS) { nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0; /* * We use _pmap_alloc_l3() instead of pmap_alloc_l3() in order * to handle the possibility that a superpage mapping for "va" * was created while we slept. */ mpte = _pmap_alloc_l3(pmap, pmap_l2_pindex(va), nosleep ? NULL : &lock); if (mpte == NULL && nosleep) { CTR0(KTR_PMAP, "pmap_enter: mpte == NULL"); rv = KERN_RESOURCE_SHORTAGE; goto out; } goto retry; } else panic("pmap_enter: missing L3 table for kernel va %#lx", va); havel3: orig_l3 = pmap_load(l3); opa = orig_l3 & ~ATTR_MASK; pv = NULL; /* * Is the specified virtual address already mapped? */ if (pmap_l3_valid(orig_l3)) { /* * Only allow adding new entries on stage 2 tables for now. * This simplifies cache invalidation as we may need to call * into EL2 to perform such actions. */ PMAP_ASSERT_STAGE1(pmap); /* * Wiring change, just update stats. We don't worry about * wiring PT pages as they remain resident as long as there * are valid mappings in them. Hence, if a user page is wired, * the PT page will be also. */ if ((flags & PMAP_ENTER_WIRED) != 0 && (orig_l3 & ATTR_SW_WIRED) == 0) pmap->pm_stats.wired_count++; else if ((flags & PMAP_ENTER_WIRED) == 0 && (orig_l3 & ATTR_SW_WIRED) != 0) pmap->pm_stats.wired_count--; /* * Remove the extra PT page reference. */ if (mpte != NULL) { mpte->ref_count--; KASSERT(mpte->ref_count > 0, ("pmap_enter: missing reference to page table page," " va: 0x%lx", va)); } /* * Has the physical page changed? */ if (opa == pa) { /* * No, might be a protection or wiring change. */ if ((orig_l3 & ATTR_SW_MANAGED) != 0 && (new_l3 & ATTR_SW_DBM) != 0) vm_page_aflag_set(m, PGA_WRITEABLE); goto validate; } /* * The physical page has changed. Temporarily invalidate * the mapping. */ orig_l3 = pmap_load_clear(l3); KASSERT((orig_l3 & ~ATTR_MASK) == opa, ("pmap_enter: unexpected pa update for %#lx", va)); if ((orig_l3 & ATTR_SW_MANAGED) != 0) { om = PHYS_TO_VM_PAGE(opa); /* * The pmap lock is sufficient to synchronize with * concurrent calls to pmap_page_test_mappings() and * pmap_ts_referenced(). */ if (pmap_pte_dirty(pmap, orig_l3)) vm_page_dirty(om); if ((orig_l3 & ATTR_AF) != 0) { pmap_invalidate_page(pmap, va); vm_page_aflag_set(om, PGA_REFERENCED); } CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa); pv = pmap_pvh_remove(&om->md, pmap, va); if ((m->oflags & VPO_UNMANAGED) != 0) free_pv_entry(pmap, pv); if ((om->a.flags & PGA_WRITEABLE) != 0 && TAILQ_EMPTY(&om->md.pv_list) && ((om->flags & PG_FICTITIOUS) != 0 || TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list))) vm_page_aflag_clear(om, PGA_WRITEABLE); } else { KASSERT((orig_l3 & ATTR_AF) != 0, ("pmap_enter: unmanaged mapping lacks ATTR_AF")); pmap_invalidate_page(pmap, va); } orig_l3 = 0; } else { /* * Increment the counters. */ if ((new_l3 & ATTR_SW_WIRED) != 0) pmap->pm_stats.wired_count++; pmap_resident_count_inc(pmap, 1); } /* * Enter on the PV list if part of our managed memory. */ if ((m->oflags & VPO_UNMANAGED) == 0) { if (pv == NULL) { pv = get_pv_entry(pmap, &lock); pv->pv_va = va; } CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; if ((new_l3 & ATTR_SW_DBM) != 0) vm_page_aflag_set(m, PGA_WRITEABLE); } validate: if (pmap->pm_stage == PM_STAGE1) { /* * Sync icache if exec permission and attribute * VM_MEMATTR_WRITE_BACK is set. Do it now, before the mapping * is stored and made valid for hardware table walk. If done * later, then other can access this page before caches are * properly synced. Don't do it for kernel memory which is * mapped with exec permission even if the memory isn't going * to hold executable code. The only time when icache sync is * needed is after kernel module is loaded and the relocation * info is processed. And it's done in elf_cpu_load_file(). */ if ((prot & VM_PROT_EXECUTE) && pmap != kernel_pmap && m->md.pv_memattr == VM_MEMATTR_WRITE_BACK && (opa != pa || (orig_l3 & ATTR_S1_XN))) { PMAP_ASSERT_STAGE1(pmap); cpu_icache_sync_range(PHYS_TO_DMAP(pa), PAGE_SIZE); } } else { cpu_dcache_wb_range(PHYS_TO_DMAP(pa), PAGE_SIZE); } /* * Update the L3 entry */ if (pmap_l3_valid(orig_l3)) { PMAP_ASSERT_STAGE1(pmap); KASSERT(opa == pa, ("pmap_enter: invalid update")); if ((orig_l3 & ~ATTR_AF) != (new_l3 & ~ATTR_AF)) { /* same PA, different attributes */ orig_l3 = pmap_load_store(l3, new_l3); pmap_invalidate_page(pmap, va); if ((orig_l3 & ATTR_SW_MANAGED) != 0 && pmap_pte_dirty(pmap, orig_l3)) vm_page_dirty(m); } else { /* * orig_l3 == new_l3 * This can happens if multiple threads simultaneously * access not yet mapped page. This bad for performance * since this can cause full demotion-NOP-promotion * cycle. * Another possible reasons are: * - VM and pmap memory layout are diverged * - tlb flush is missing somewhere and CPU doesn't see * actual mapping. */ CTR4(KTR_PMAP, "%s: already mapped page - " "pmap %p va 0x%#lx pte 0x%lx", __func__, pmap, va, new_l3); } } else { /* New mapping */ pmap_store(l3, new_l3); dsb(ishst); } #if VM_NRESERVLEVEL > 0 /* * Try to promote from level 3 pages to a level 2 superpage. This * currently only works on stage 1 pmaps as pmap_promote_l2 looks at * stage 1 specific fields and performs a break-before-make sequence * that is incorrect a stage 2 pmap. */ if ((mpte == NULL || mpte->ref_count == NL3PG) && pmap_ps_enabled(pmap) && pmap->pm_stage == PM_STAGE1 && (m->flags & PG_FICTITIOUS) == 0 && vm_reserv_level_iffullpop(m) == 0) { pmap_promote_l2(pmap, pde, va, &lock); } #endif rv = KERN_SUCCESS; out: if (lock != NULL) rw_wunlock(lock); PMAP_UNLOCK(pmap); return (rv); } /* * Tries to create a read- and/or execute-only 2MB page mapping. Returns true * if successful. Returns false if (1) a page table page cannot be allocated * without sleeping, (2) a mapping already exists at the specified virtual * address, or (3) a PV entry cannot be allocated without reclaiming another * PV entry. */ static bool pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, struct rwlock **lockp) { pd_entry_t new_l2; PMAP_LOCK_ASSERT(pmap, MA_OWNED); PMAP_ASSERT_STAGE1(pmap); new_l2 = (pd_entry_t)(VM_PAGE_TO_PHYS(m) | ATTR_DEFAULT | ATTR_S1_IDX(m->md.pv_memattr) | ATTR_S1_AP(ATTR_S1_AP_RO) | L2_BLOCK); if ((m->oflags & VPO_UNMANAGED) == 0) { new_l2 |= ATTR_SW_MANAGED; new_l2 &= ~ATTR_AF; } if ((prot & VM_PROT_EXECUTE) == 0 || m->md.pv_memattr == VM_MEMATTR_DEVICE) new_l2 |= ATTR_S1_XN; if (va < VM_MAXUSER_ADDRESS) new_l2 |= ATTR_S1_AP(ATTR_S1_AP_USER) | ATTR_S1_PXN; else new_l2 |= ATTR_S1_UXN; if (pmap != kernel_pmap) new_l2 |= ATTR_S1_nG; return (pmap_enter_l2(pmap, va, new_l2, PMAP_ENTER_NOSLEEP | PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, NULL, lockp) == KERN_SUCCESS); } /* * Returns true if every page table entry in the specified page table is * zero. */ static bool pmap_every_pte_zero(vm_paddr_t pa) { pt_entry_t *pt_end, *pte; KASSERT((pa & PAGE_MASK) == 0, ("pa is misaligned")); pte = (pt_entry_t *)PHYS_TO_DMAP(pa); for (pt_end = pte + Ln_ENTRIES; pte < pt_end; pte++) { if (*pte != 0) return (false); } return (true); } /* * Tries to create the specified 2MB page mapping. Returns KERN_SUCCESS if * the mapping was created, and either KERN_FAILURE or KERN_RESOURCE_SHORTAGE * otherwise. Returns KERN_FAILURE if PMAP_ENTER_NOREPLACE was specified and * a mapping already exists at the specified virtual address. Returns * KERN_RESOURCE_SHORTAGE if PMAP_ENTER_NOSLEEP was specified and a page table * page allocation failed. Returns KERN_RESOURCE_SHORTAGE if * PMAP_ENTER_NORECLAIM was specified and a PV entry allocation failed. * * The parameter "m" is only used when creating a managed, writeable mapping. */ static int pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2, u_int flags, vm_page_t m, struct rwlock **lockp) { struct spglist free; pd_entry_t *l2, old_l2; vm_page_t l2pg, mt; PMAP_LOCK_ASSERT(pmap, MA_OWNED); if ((l2 = pmap_alloc_l2(pmap, va, &l2pg, (flags & PMAP_ENTER_NOSLEEP) != 0 ? NULL : lockp)) == NULL) { CTR2(KTR_PMAP, "pmap_enter_l2: failure for va %#lx in pmap %p", va, pmap); return (KERN_RESOURCE_SHORTAGE); } /* * If there are existing mappings, either abort or remove them. */ if ((old_l2 = pmap_load(l2)) != 0) { KASSERT(l2pg == NULL || l2pg->ref_count > 1, ("pmap_enter_l2: l2pg's ref count is too low")); if ((flags & PMAP_ENTER_NOREPLACE) != 0 && (va < VM_MAXUSER_ADDRESS || (old_l2 & ATTR_DESCR_MASK) == L2_BLOCK || !pmap_every_pte_zero(old_l2 & ~ATTR_MASK))) { if (l2pg != NULL) l2pg->ref_count--; CTR2(KTR_PMAP, "pmap_enter_l2: failure for va %#lx" " in pmap %p", va, pmap); return (KERN_FAILURE); } SLIST_INIT(&free); if ((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK) (void)pmap_remove_l2(pmap, l2, va, pmap_load(pmap_l1(pmap, va)), &free, lockp); else pmap_remove_l3_range(pmap, old_l2, va, va + L2_SIZE, &free, lockp); if (va < VM_MAXUSER_ADDRESS) { vm_page_free_pages_toq(&free, true); KASSERT(pmap_load(l2) == 0, ("pmap_enter_l2: non-zero L2 entry %p", l2)); } else { KASSERT(SLIST_EMPTY(&free), ("pmap_enter_l2: freed kernel page table page")); /* * Both pmap_remove_l2() and pmap_remove_l3_range() * will leave the kernel page table page zero filled. * Nonetheless, the TLB could have an intermediate * entry for the kernel page table page. */ mt = PHYS_TO_VM_PAGE(pmap_load(l2) & ~ATTR_MASK); if (pmap_insert_pt_page(pmap, mt, false)) panic("pmap_enter_l2: trie insert failed"); pmap_clear(l2); pmap_invalidate_page(pmap, va); } } if ((new_l2 & ATTR_SW_MANAGED) != 0) { /* * Abort this mapping if its PV entry could not be created. */ if (!pmap_pv_insert_l2(pmap, va, new_l2, flags, lockp)) { if (l2pg != NULL) pmap_abort_ptp(pmap, va, l2pg); CTR2(KTR_PMAP, "pmap_enter_l2: failure for va %#lx in pmap %p", va, pmap); return (KERN_RESOURCE_SHORTAGE); } if ((new_l2 & ATTR_SW_DBM) != 0) for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) vm_page_aflag_set(mt, PGA_WRITEABLE); } /* * Increment counters. */ if ((new_l2 & ATTR_SW_WIRED) != 0) pmap->pm_stats.wired_count += L2_SIZE / PAGE_SIZE; pmap->pm_stats.resident_count += L2_SIZE / PAGE_SIZE; /* * Map the superpage. */ pmap_store(l2, new_l2); dsb(ishst); atomic_add_long(&pmap_l2_mappings, 1); CTR2(KTR_PMAP, "pmap_enter_l2: success for va %#lx in pmap %p", va, pmap); return (KERN_SUCCESS); } /* * Maps a sequence of resident pages belonging to the same object. * The sequence begins with the given page m_start. This page is * mapped at the given virtual address start. Each subsequent page is * mapped at a virtual address that is offset from start by the same * amount as the page is offset from m_start within the object. The * last page in the sequence is the page with the largest offset from * m_start that can be mapped at a virtual address less than the given * virtual address end. Not every virtual page between start and end * is mapped; only those for which a resident page exists with the * corresponding offset from m_start are mapped. */ void pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, vm_page_t m_start, vm_prot_t prot) { struct rwlock *lock; vm_offset_t va; vm_page_t m, mpte; vm_pindex_t diff, psize; VM_OBJECT_ASSERT_LOCKED(m_start->object); psize = atop(end - start); mpte = NULL; m = m_start; lock = NULL; PMAP_LOCK(pmap); while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { va = start + ptoa(diff); if ((va & L2_OFFSET) == 0 && va + L2_SIZE <= end && m->psind == 1 && pmap_ps_enabled(pmap) && pmap_enter_2mpage(pmap, va, m, prot, &lock)) m = &m[L2_SIZE / PAGE_SIZE - 1]; else mpte = pmap_enter_quick_locked(pmap, va, m, prot, mpte, &lock); m = TAILQ_NEXT(m, listq); } if (lock != NULL) rw_wunlock(lock); PMAP_UNLOCK(pmap); } /* * this code makes some *MAJOR* assumptions: * 1. Current pmap & pmap exists. * 2. Not wired. * 3. Read access. * 4. No page table pages. * but is *MUCH* faster than pmap_enter... */ void pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) { struct rwlock *lock; lock = NULL; PMAP_LOCK(pmap); (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock); if (lock != NULL) rw_wunlock(lock); PMAP_UNLOCK(pmap); } static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp) { pd_entry_t *pde; pt_entry_t *l2, *l3, l3_val; vm_paddr_t pa; int lvl; KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva || (m->oflags & VPO_UNMANAGED) != 0, ("pmap_enter_quick_locked: managed mapping within the clean submap")); PMAP_LOCK_ASSERT(pmap, MA_OWNED); PMAP_ASSERT_STAGE1(pmap); CTR2(KTR_PMAP, "pmap_enter_quick_locked: %p %lx", pmap, va); /* * In the case that a page table page is not * resident, we are creating it here. */ if (va < VM_MAXUSER_ADDRESS) { vm_pindex_t l2pindex; /* * Calculate pagetable page index */ l2pindex = pmap_l2_pindex(va); if (mpte && (mpte->pindex == l2pindex)) { mpte->ref_count++; } else { /* * Get the l2 entry */ pde = pmap_pde(pmap, va, &lvl); /* * If the page table page is mapped, we just increment * the hold count, and activate it. Otherwise, we * attempt to allocate a page table page. If this * attempt fails, we don't retry. Instead, we give up. */ if (lvl == 1) { l2 = pmap_l1_to_l2(pde, va); if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK) return (NULL); } if (lvl == 2 && pmap_load(pde) != 0) { mpte = PHYS_TO_VM_PAGE(pmap_load(pde) & ~ATTR_MASK); mpte->ref_count++; } else { /* * Pass NULL instead of the PV list lock * pointer, because we don't intend to sleep. */ mpte = _pmap_alloc_l3(pmap, l2pindex, NULL); if (mpte == NULL) return (mpte); } } l3 = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte)); l3 = &l3[pmap_l3_index(va)]; } else { mpte = NULL; pde = pmap_pde(kernel_pmap, va, &lvl); KASSERT(pde != NULL, ("pmap_enter_quick_locked: Invalid page entry, va: 0x%lx", va)); KASSERT(lvl == 2, ("pmap_enter_quick_locked: Invalid level %d", lvl)); l3 = pmap_l2_to_l3(pde, va); } /* * Abort if a mapping already exists. */ if (pmap_load(l3) != 0) { if (mpte != NULL) mpte->ref_count--; return (NULL); } /* * Enter on the PV list if part of our managed memory. */ if ((m->oflags & VPO_UNMANAGED) == 0 && !pmap_try_insert_pv_entry(pmap, va, m, lockp)) { if (mpte != NULL) pmap_abort_ptp(pmap, va, mpte); return (NULL); } /* * Increment counters */ pmap_resident_count_inc(pmap, 1); pa = VM_PAGE_TO_PHYS(m); l3_val = pa | ATTR_DEFAULT | ATTR_S1_IDX(m->md.pv_memattr) | ATTR_S1_AP(ATTR_S1_AP_RO) | L3_PAGE; if ((prot & VM_PROT_EXECUTE) == 0 || m->md.pv_memattr == VM_MEMATTR_DEVICE) l3_val |= ATTR_S1_XN; if (va < VM_MAXUSER_ADDRESS) l3_val |= ATTR_S1_AP(ATTR_S1_AP_USER) | ATTR_S1_PXN; else l3_val |= ATTR_S1_UXN; if (pmap != kernel_pmap) l3_val |= ATTR_S1_nG; /* * Now validate mapping with RO protection */ if ((m->oflags & VPO_UNMANAGED) == 0) { l3_val |= ATTR_SW_MANAGED; l3_val &= ~ATTR_AF; } /* Sync icache before the mapping is stored to PTE */ if ((prot & VM_PROT_EXECUTE) && pmap != kernel_pmap && m->md.pv_memattr == VM_MEMATTR_WRITE_BACK) cpu_icache_sync_range(PHYS_TO_DMAP(pa), PAGE_SIZE); pmap_store(l3, l3_val); dsb(ishst); return (mpte); } /* * This code maps large physical mmap regions into the * processor address space. Note that some shortcuts * are taken, but the code works. */ void pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, vm_pindex_t pindex, vm_size_t size) { VM_OBJECT_ASSERT_WLOCKED(object); KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, ("pmap_object_init_pt: non-device object")); } /* * Clear the wired attribute from the mappings for the specified range of * addresses in the given pmap. Every valid mapping within that range * must have the wired attribute set. In contrast, invalid mappings * cannot have the wired attribute set, so they are ignored. * * The wired attribute of the page table entry is not a hardware feature, * so there is no need to invalidate any TLB entries. */ void pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t va_next; pd_entry_t *l0, *l1, *l2; pt_entry_t *l3; PMAP_LOCK(pmap); for (; sva < eva; sva = va_next) { l0 = pmap_l0(pmap, sva); if (pmap_load(l0) == 0) { va_next = (sva + L0_SIZE) & ~L0_OFFSET; if (va_next < sva) va_next = eva; continue; } l1 = pmap_l0_to_l1(l0, sva); va_next = (sva + L1_SIZE) & ~L1_OFFSET; if (va_next < sva) va_next = eva; if (pmap_load(l1) == 0) continue; if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) { KASSERT(va_next <= eva, ("partial update of non-transparent 1G page " "l1 %#lx sva %#lx eva %#lx va_next %#lx", pmap_load(l1), sva, eva, va_next)); MPASS(pmap != kernel_pmap); MPASS((pmap_load(l1) & (ATTR_SW_MANAGED | ATTR_SW_WIRED)) == ATTR_SW_WIRED); pmap_clear_bits(l1, ATTR_SW_WIRED); pmap->pm_stats.wired_count -= L1_SIZE / PAGE_SIZE; continue; } va_next = (sva + L2_SIZE) & ~L2_OFFSET; if (va_next < sva) va_next = eva; l2 = pmap_l1_to_l2(l1, sva); if (pmap_load(l2) == 0) continue; if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK) { if ((pmap_load(l2) & ATTR_SW_WIRED) == 0) panic("pmap_unwire: l2 %#jx is missing " "ATTR_SW_WIRED", (uintmax_t)pmap_load(l2)); /* * Are we unwiring the entire large page? If not, * demote the mapping and fall through. */ if (sva + L2_SIZE == va_next && eva >= va_next) { pmap_clear_bits(l2, ATTR_SW_WIRED); pmap->pm_stats.wired_count -= L2_SIZE / PAGE_SIZE; continue; } else if (pmap_demote_l2(pmap, l2, sva) == NULL) panic("pmap_unwire: demotion failed"); } KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE, ("pmap_unwire: Invalid l2 entry after demotion")); if (va_next > eva) va_next = eva; for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++, sva += L3_SIZE) { if (pmap_load(l3) == 0) continue; if ((pmap_load(l3) & ATTR_SW_WIRED) == 0) panic("pmap_unwire: l3 %#jx is missing " "ATTR_SW_WIRED", (uintmax_t)pmap_load(l3)); /* * ATTR_SW_WIRED must be cleared atomically. Although * the pmap lock synchronizes access to ATTR_SW_WIRED, * the System MMU may write to the entry concurrently. */ pmap_clear_bits(l3, ATTR_SW_WIRED); pmap->pm_stats.wired_count--; } } PMAP_UNLOCK(pmap); } /* * Copy the range specified by src_addr/len * from the source map to the range dst_addr/len * in the destination map. * * This routine is only advisory and need not do anything. * * Because the executable mappings created by this routine are copied, * it should not have to flush the instruction cache. */ void pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, vm_offset_t src_addr) { struct rwlock *lock; pd_entry_t *l0, *l1, *l2, srcptepaddr; pt_entry_t *dst_pte, mask, nbits, ptetemp, *src_pte; vm_offset_t addr, end_addr, va_next; vm_page_t dst_m, dstmpte, srcmpte; PMAP_ASSERT_STAGE1(dst_pmap); PMAP_ASSERT_STAGE1(src_pmap); if (dst_addr != src_addr) return; end_addr = src_addr + len; lock = NULL; if (dst_pmap < src_pmap) { PMAP_LOCK(dst_pmap); PMAP_LOCK(src_pmap); } else { PMAP_LOCK(src_pmap); PMAP_LOCK(dst_pmap); } for (addr = src_addr; addr < end_addr; addr = va_next) { l0 = pmap_l0(src_pmap, addr); if (pmap_load(l0) == 0) { va_next = (addr + L0_SIZE) & ~L0_OFFSET; if (va_next < addr) va_next = end_addr; continue; } va_next = (addr + L1_SIZE) & ~L1_OFFSET; if (va_next < addr) va_next = end_addr; l1 = pmap_l0_to_l1(l0, addr); if (pmap_load(l1) == 0) continue; if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) { KASSERT(va_next <= end_addr, ("partial update of non-transparent 1G page " "l1 %#lx addr %#lx end_addr %#lx va_next %#lx", pmap_load(l1), addr, end_addr, va_next)); srcptepaddr = pmap_load(l1); l1 = pmap_l1(dst_pmap, addr); if (l1 == NULL) { if (_pmap_alloc_l3(dst_pmap, pmap_l0_pindex(addr), NULL) == NULL) break; l1 = pmap_l1(dst_pmap, addr); } else { l0 = pmap_l0(dst_pmap, addr); dst_m = PHYS_TO_VM_PAGE(pmap_load(l0) & ~ATTR_MASK); dst_m->ref_count++; } KASSERT(pmap_load(l1) == 0, ("1G mapping present in dst pmap " "l1 %#lx addr %#lx end_addr %#lx va_next %#lx", pmap_load(l1), addr, end_addr, va_next)); pmap_store(l1, srcptepaddr & ~ATTR_SW_WIRED); pmap_resident_count_inc(dst_pmap, L1_SIZE / PAGE_SIZE); continue; } va_next = (addr + L2_SIZE) & ~L2_OFFSET; if (va_next < addr) va_next = end_addr; l2 = pmap_l1_to_l2(l1, addr); srcptepaddr = pmap_load(l2); if (srcptepaddr == 0) continue; if ((srcptepaddr & ATTR_DESCR_MASK) == L2_BLOCK) { if ((addr & L2_OFFSET) != 0 || addr + L2_SIZE > end_addr) continue; l2 = pmap_alloc_l2(dst_pmap, addr, &dst_m, NULL); if (l2 == NULL) break; if (pmap_load(l2) == 0 && ((srcptepaddr & ATTR_SW_MANAGED) == 0 || pmap_pv_insert_l2(dst_pmap, addr, srcptepaddr, PMAP_ENTER_NORECLAIM, &lock))) { mask = ATTR_AF | ATTR_SW_WIRED; nbits = 0; if ((srcptepaddr & ATTR_SW_DBM) != 0) nbits |= ATTR_S1_AP_RW_BIT; pmap_store(l2, (srcptepaddr & ~mask) | nbits); pmap_resident_count_inc(dst_pmap, L2_SIZE / PAGE_SIZE); atomic_add_long(&pmap_l2_mappings, 1); } else pmap_abort_ptp(dst_pmap, addr, dst_m); continue; } KASSERT((srcptepaddr & ATTR_DESCR_MASK) == L2_TABLE, ("pmap_copy: invalid L2 entry")); srcptepaddr &= ~ATTR_MASK; srcmpte = PHYS_TO_VM_PAGE(srcptepaddr); KASSERT(srcmpte->ref_count > 0, ("pmap_copy: source page table page is unused")); if (va_next > end_addr) va_next = end_addr; src_pte = (pt_entry_t *)PHYS_TO_DMAP(srcptepaddr); src_pte = &src_pte[pmap_l3_index(addr)]; dstmpte = NULL; for (; addr < va_next; addr += PAGE_SIZE, src_pte++) { ptetemp = pmap_load(src_pte); /* * We only virtual copy managed pages. */ if ((ptetemp & ATTR_SW_MANAGED) == 0) continue; if (dstmpte != NULL) { KASSERT(dstmpte->pindex == pmap_l2_pindex(addr), ("dstmpte pindex/addr mismatch")); dstmpte->ref_count++; } else if ((dstmpte = pmap_alloc_l3(dst_pmap, addr, NULL)) == NULL) goto out; dst_pte = (pt_entry_t *) PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte)); dst_pte = &dst_pte[pmap_l3_index(addr)]; if (pmap_load(dst_pte) == 0 && pmap_try_insert_pv_entry(dst_pmap, addr, PHYS_TO_VM_PAGE(ptetemp & ~ATTR_MASK), &lock)) { /* * Clear the wired, modified, and accessed * (referenced) bits during the copy. */ mask = ATTR_AF | ATTR_SW_WIRED; nbits = 0; if ((ptetemp & ATTR_SW_DBM) != 0) nbits |= ATTR_S1_AP_RW_BIT; pmap_store(dst_pte, (ptetemp & ~mask) | nbits); pmap_resident_count_inc(dst_pmap, 1); } else { pmap_abort_ptp(dst_pmap, addr, dstmpte); goto out; } /* Have we copied all of the valid mappings? */ if (dstmpte->ref_count >= srcmpte->ref_count) break; } } out: /* * XXX This barrier may not be needed because the destination pmap is * not active. */ dsb(ishst); if (lock != NULL) rw_wunlock(lock); PMAP_UNLOCK(src_pmap); PMAP_UNLOCK(dst_pmap); } /* * pmap_zero_page zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. */ void pmap_zero_page(vm_page_t m) { vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); pagezero((void *)va); } /* * pmap_zero_page_area zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. * * off and size may not cover an area beyond a single hardware page. */ void pmap_zero_page_area(vm_page_t m, int off, int size) { vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); if (off == 0 && size == PAGE_SIZE) pagezero((void *)va); else bzero((char *)va + off, size); } /* * pmap_copy_page copies the specified (machine independent) * page by mapping the page into virtual memory and using * bcopy to copy the page, one machine dependent page at a * time. */ void pmap_copy_page(vm_page_t msrc, vm_page_t mdst) { vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc)); vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst)); pagecopy((void *)src, (void *)dst); } int unmapped_buf_allowed = 1; void pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], vm_offset_t b_offset, int xfersize) { void *a_cp, *b_cp; vm_page_t m_a, m_b; vm_paddr_t p_a, p_b; vm_offset_t a_pg_offset, b_pg_offset; int cnt; while (xfersize > 0) { a_pg_offset = a_offset & PAGE_MASK; m_a = ma[a_offset >> PAGE_SHIFT]; p_a = m_a->phys_addr; b_pg_offset = b_offset & PAGE_MASK; m_b = mb[b_offset >> PAGE_SHIFT]; p_b = m_b->phys_addr; cnt = min(xfersize, PAGE_SIZE - a_pg_offset); cnt = min(cnt, PAGE_SIZE - b_pg_offset); if (__predict_false(!PHYS_IN_DMAP(p_a))) { panic("!DMAP a %lx", p_a); } else { a_cp = (char *)PHYS_TO_DMAP(p_a) + a_pg_offset; } if (__predict_false(!PHYS_IN_DMAP(p_b))) { panic("!DMAP b %lx", p_b); } else { b_cp = (char *)PHYS_TO_DMAP(p_b) + b_pg_offset; } bcopy(a_cp, b_cp, cnt); a_offset += cnt; b_offset += cnt; xfersize -= cnt; } } vm_offset_t pmap_quick_enter_page(vm_page_t m) { return (PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m))); } void pmap_quick_remove_page(vm_offset_t addr) { } /* * Returns true if the pmap's pv is one of the first * 16 pvs linked to from this page. This count may * be changed upwards or downwards in the future; it * is only necessary that true be returned for a small * subset of pmaps for proper page aging. */ boolean_t pmap_page_exists_quick(pmap_t pmap, vm_page_t m) { struct md_page *pvh; struct rwlock *lock; pv_entry_t pv; int loops = 0; boolean_t rv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_page_exists_quick: page %p is not managed", m)); rv = FALSE; lock = VM_PAGE_TO_PV_LIST_LOCK(m); rw_rlock(lock); TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { if (PV_PMAP(pv) == pmap) { rv = TRUE; break; } loops++; if (loops >= 16) break; } if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) { pvh = page_to_pvh(m); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { if (PV_PMAP(pv) == pmap) { rv = TRUE; break; } loops++; if (loops >= 16) break; } } rw_runlock(lock); return (rv); } /* * pmap_page_wired_mappings: * * Return the number of managed mappings to the given physical page * that are wired. */ int pmap_page_wired_mappings(vm_page_t m) { struct rwlock *lock; struct md_page *pvh; pmap_t pmap; pt_entry_t *pte; pv_entry_t pv; int count, lvl, md_gen, pvh_gen; if ((m->oflags & VPO_UNMANAGED) != 0) return (0); lock = VM_PAGE_TO_PV_LIST_LOCK(m); rw_rlock(lock); restart: count = 0; TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; rw_runlock(lock); PMAP_LOCK(pmap); rw_rlock(lock); if (md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } pte = pmap_pte(pmap, pv->pv_va, &lvl); if (pte != NULL && (pmap_load(pte) & ATTR_SW_WIRED) != 0) count++; PMAP_UNLOCK(pmap); } if ((m->flags & PG_FICTITIOUS) == 0) { pvh = page_to_pvh(m); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; pvh_gen = pvh->pv_gen; rw_runlock(lock); PMAP_LOCK(pmap); rw_rlock(lock); if (md_gen != m->md.pv_gen || pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } pte = pmap_pte(pmap, pv->pv_va, &lvl); if (pte != NULL && (pmap_load(pte) & ATTR_SW_WIRED) != 0) count++; PMAP_UNLOCK(pmap); } } rw_runlock(lock); return (count); } /* * Returns true if the given page is mapped individually or as part of * a 2mpage. Otherwise, returns false. */ bool pmap_page_is_mapped(vm_page_t m) { struct rwlock *lock; bool rv; if ((m->oflags & VPO_UNMANAGED) != 0) return (false); lock = VM_PAGE_TO_PV_LIST_LOCK(m); rw_rlock(lock); rv = !TAILQ_EMPTY(&m->md.pv_list) || ((m->flags & PG_FICTITIOUS) == 0 && !TAILQ_EMPTY(&page_to_pvh(m)->pv_list)); rw_runlock(lock); return (rv); } /* * Destroy all managed, non-wired mappings in the given user-space * pmap. This pmap cannot be active on any processor besides the * caller. * * This function cannot be applied to the kernel pmap. Moreover, it * is not intended for general use. It is only to be used during * process termination. Consequently, it can be implemented in ways * that make it faster than pmap_remove(). First, it can more quickly * destroy mappings by iterating over the pmap's collection of PV * entries, rather than searching the page table. Second, it doesn't * have to test and clear the page table entries atomically, because * no processor is currently accessing the user address space. In * particular, a page table entry's dirty bit won't change state once * this function starts. */ void pmap_remove_pages(pmap_t pmap) { pd_entry_t *pde; pt_entry_t *pte, tpte; struct spglist free; vm_page_t m, ml3, mt; pv_entry_t pv; struct md_page *pvh; struct pv_chunk *pc, *npc; struct rwlock *lock; int64_t bit; uint64_t inuse, bitmask; int allfree, field, freed, idx, lvl; vm_paddr_t pa; KASSERT(pmap == PCPU_GET(curpmap), ("non-current pmap %p", pmap)); lock = NULL; SLIST_INIT(&free); PMAP_LOCK(pmap); TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { allfree = 1; freed = 0; for (field = 0; field < _NPCM; field++) { inuse = ~pc->pc_map[field] & pc_freemask[field]; while (inuse != 0) { bit = ffsl(inuse) - 1; bitmask = 1UL << bit; idx = field * 64 + bit; pv = &pc->pc_pventry[idx]; inuse &= ~bitmask; pde = pmap_pde(pmap, pv->pv_va, &lvl); KASSERT(pde != NULL, ("Attempting to remove an unmapped page")); switch(lvl) { case 1: pte = pmap_l1_to_l2(pde, pv->pv_va); tpte = pmap_load(pte); KASSERT((tpte & ATTR_DESCR_MASK) == L2_BLOCK, ("Attempting to remove an invalid " "block: %lx", tpte)); break; case 2: pte = pmap_l2_to_l3(pde, pv->pv_va); tpte = pmap_load(pte); KASSERT((tpte & ATTR_DESCR_MASK) == L3_PAGE, ("Attempting to remove an invalid " "page: %lx", tpte)); break; default: panic( "Invalid page directory level: %d", lvl); } /* * We cannot remove wired pages from a process' mapping at this time */ if (tpte & ATTR_SW_WIRED) { allfree = 0; continue; } pa = tpte & ~ATTR_MASK; m = PHYS_TO_VM_PAGE(pa); KASSERT(m->phys_addr == pa, ("vm_page_t %p phys_addr mismatch %016jx %016jx", m, (uintmax_t)m->phys_addr, (uintmax_t)tpte)); KASSERT((m->flags & PG_FICTITIOUS) != 0 || m < &vm_page_array[vm_page_array_size], ("pmap_remove_pages: bad pte %#jx", (uintmax_t)tpte)); /* * Because this pmap is not active on other * processors, the dirty bit cannot have * changed state since we last loaded pte. */ pmap_clear(pte); /* * Update the vm_page_t clean/reference bits. */ if (pmap_pte_dirty(pmap, tpte)) { switch (lvl) { case 1: for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) vm_page_dirty(mt); break; case 2: vm_page_dirty(m); break; } } CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m); /* Mark free */ pc->pc_map[field] |= bitmask; switch (lvl) { case 1: pmap_resident_count_dec(pmap, L2_SIZE / PAGE_SIZE); pvh = pa_to_pvh(tpte & ~ATTR_MASK); TAILQ_REMOVE(&pvh->pv_list, pv,pv_next); pvh->pv_gen++; if (TAILQ_EMPTY(&pvh->pv_list)) { for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) if ((mt->a.flags & PGA_WRITEABLE) != 0 && TAILQ_EMPTY(&mt->md.pv_list)) vm_page_aflag_clear(mt, PGA_WRITEABLE); } ml3 = pmap_remove_pt_page(pmap, pv->pv_va); if (ml3 != NULL) { KASSERT(ml3->valid == VM_PAGE_BITS_ALL, ("pmap_remove_pages: l3 page not promoted")); pmap_resident_count_dec(pmap,1); KASSERT(ml3->ref_count == NL3PG, ("pmap_remove_pages: l3 page ref count error")); ml3->ref_count = 0; pmap_add_delayed_free_list(ml3, &free, FALSE); } break; case 2: pmap_resident_count_dec(pmap, 1); TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; if ((m->a.flags & PGA_WRITEABLE) != 0 && TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = page_to_pvh(m); if (TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } break; } pmap_unuse_pt(pmap, pv->pv_va, pmap_load(pde), &free); freed++; } } PV_STAT(atomic_add_long(&pv_entry_frees, freed)); PV_STAT(atomic_add_int(&pv_entry_spare, freed)); PV_STAT(atomic_subtract_long(&pv_entry_count, freed)); if (allfree) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); free_pv_chunk(pc); } } if (lock != NULL) rw_wunlock(lock); pmap_invalidate_all(pmap); PMAP_UNLOCK(pmap); vm_page_free_pages_toq(&free, true); } /* * This is used to check if a page has been accessed or modified. */ static boolean_t pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified) { struct rwlock *lock; pv_entry_t pv; struct md_page *pvh; pt_entry_t *pte, mask, value; pmap_t pmap; int lvl, md_gen, pvh_gen; boolean_t rv; rv = FALSE; lock = VM_PAGE_TO_PV_LIST_LOCK(m); rw_rlock(lock); restart: TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); PMAP_ASSERT_STAGE1(pmap); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; rw_runlock(lock); PMAP_LOCK(pmap); rw_rlock(lock); if (md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } pte = pmap_pte(pmap, pv->pv_va, &lvl); KASSERT(lvl == 3, ("pmap_page_test_mappings: Invalid level %d", lvl)); mask = 0; value = 0; if (modified) { mask |= ATTR_S1_AP_RW_BIT; value |= ATTR_S1_AP(ATTR_S1_AP_RW); } if (accessed) { mask |= ATTR_AF | ATTR_DESCR_MASK; value |= ATTR_AF | L3_PAGE; } rv = (pmap_load(pte) & mask) == value; PMAP_UNLOCK(pmap); if (rv) goto out; } if ((m->flags & PG_FICTITIOUS) == 0) { pvh = page_to_pvh(m); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { pmap = PV_PMAP(pv); PMAP_ASSERT_STAGE1(pmap); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; pvh_gen = pvh->pv_gen; rw_runlock(lock); PMAP_LOCK(pmap); rw_rlock(lock); if (md_gen != m->md.pv_gen || pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } pte = pmap_pte(pmap, pv->pv_va, &lvl); KASSERT(lvl == 2, ("pmap_page_test_mappings: Invalid level %d", lvl)); mask = 0; value = 0; if (modified) { mask |= ATTR_S1_AP_RW_BIT; value |= ATTR_S1_AP(ATTR_S1_AP_RW); } if (accessed) { mask |= ATTR_AF | ATTR_DESCR_MASK; value |= ATTR_AF | L2_BLOCK; } rv = (pmap_load(pte) & mask) == value; PMAP_UNLOCK(pmap); if (rv) goto out; } } out: rw_runlock(lock); return (rv); } /* * pmap_is_modified: * * Return whether or not the specified physical page was modified * in any physical maps. */ boolean_t pmap_is_modified(vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_is_modified: page %p is not managed", m)); /* * If the page is not busied then this check is racy. */ if (!pmap_page_is_write_mapped(m)) return (FALSE); return (pmap_page_test_mappings(m, FALSE, TRUE)); } /* * pmap_is_prefaultable: * * Return whether or not the specified virtual address is eligible * for prefault. */ boolean_t pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) { pt_entry_t *pte; boolean_t rv; int lvl; rv = FALSE; PMAP_LOCK(pmap); pte = pmap_pte(pmap, addr, &lvl); if (pte != NULL && pmap_load(pte) != 0) { rv = TRUE; } PMAP_UNLOCK(pmap); return (rv); } /* * pmap_is_referenced: * * Return whether or not the specified physical page was referenced * in any physical maps. */ boolean_t pmap_is_referenced(vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_is_referenced: page %p is not managed", m)); return (pmap_page_test_mappings(m, TRUE, FALSE)); } /* * Clear the write and modified bits in each of the given page's mappings. */ void pmap_remove_write(vm_page_t m) { struct md_page *pvh; pmap_t pmap; struct rwlock *lock; pv_entry_t next_pv, pv; pt_entry_t oldpte, *pte; vm_offset_t va; int lvl, md_gen, pvh_gen; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_remove_write: page %p is not managed", m)); vm_page_assert_busied(m); if (!pmap_page_is_write_mapped(m)) return; lock = VM_PAGE_TO_PV_LIST_LOCK(m); pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : page_to_pvh(m); retry_pv_loop: rw_wlock(lock); TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { pmap = PV_PMAP(pv); PMAP_ASSERT_STAGE1(pmap); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); rw_wunlock(lock); goto retry_pv_loop; } } va = pv->pv_va; pte = pmap_pte(pmap, pv->pv_va, &lvl); if ((pmap_load(pte) & ATTR_SW_DBM) != 0) (void)pmap_demote_l2_locked(pmap, pte, va, &lock); KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), ("inconsistent pv lock %p %p for page %p", lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); PMAP_UNLOCK(pmap); } TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); PMAP_ASSERT_STAGE1(pmap); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; md_gen = m->md.pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); rw_wunlock(lock); goto retry_pv_loop; } } pte = pmap_pte(pmap, pv->pv_va, &lvl); oldpte = pmap_load(pte); retry: if ((oldpte & ATTR_SW_DBM) != 0) { if (!atomic_fcmpset_long(pte, &oldpte, (oldpte | ATTR_S1_AP_RW_BIT) & ~ATTR_SW_DBM)) goto retry; if ((oldpte & ATTR_S1_AP_RW_BIT) == ATTR_S1_AP(ATTR_S1_AP_RW)) vm_page_dirty(m); pmap_invalidate_page(pmap, pv->pv_va); } PMAP_UNLOCK(pmap); } rw_wunlock(lock); vm_page_aflag_clear(m, PGA_WRITEABLE); } /* * pmap_ts_referenced: * * Return a count of reference bits for a page, clearing those bits. * It is not necessary for every reference bit to be cleared, but it * is necessary that 0 only be returned when there are truly no * reference bits set. * * As an optimization, update the page's dirty field if a modified bit is * found while counting reference bits. This opportunistic update can be * performed at low cost and can eliminate the need for some future calls * to pmap_is_modified(). However, since this function stops after * finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some * dirty pages. Those dirty pages will only be detected by a future call * to pmap_is_modified(). */ int pmap_ts_referenced(vm_page_t m) { struct md_page *pvh; pv_entry_t pv, pvf; pmap_t pmap; struct rwlock *lock; pd_entry_t *pde, tpde; pt_entry_t *pte, tpte; vm_offset_t va; vm_paddr_t pa; int cleared, lvl, md_gen, not_cleared, pvh_gen; struct spglist free; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_ts_referenced: page %p is not managed", m)); SLIST_INIT(&free); cleared = 0; pa = VM_PAGE_TO_PHYS(m); lock = PHYS_TO_PV_LIST_LOCK(pa); pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : page_to_pvh(m); rw_wlock(lock); retry: not_cleared = 0; if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL) goto small_mappings; pv = pvf; do { if (pvf == NULL) pvf = pv; pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); goto retry; } } va = pv->pv_va; pde = pmap_pde(pmap, pv->pv_va, &lvl); KASSERT(pde != NULL, ("pmap_ts_referenced: no l1 table found")); KASSERT(lvl == 1, ("pmap_ts_referenced: invalid pde level %d", lvl)); tpde = pmap_load(pde); KASSERT((tpde & ATTR_DESCR_MASK) == L1_TABLE, ("pmap_ts_referenced: found an invalid l1 table")); pte = pmap_l1_to_l2(pde, pv->pv_va); tpte = pmap_load(pte); if (pmap_pte_dirty(pmap, tpte)) { /* * Although "tpte" is mapping a 2MB page, because * this function is called at a 4KB page granularity, * we only update the 4KB page under test. */ vm_page_dirty(m); } if ((tpte & ATTR_AF) != 0) { /* * Since this reference bit is shared by 512 4KB pages, * it should not be cleared every time it is tested. * Apply a simple "hash" function on the physical page * number, the virtual superpage number, and the pmap * address to select one 4KB page out of the 512 on * which testing the reference bit will result in * clearing that reference bit. This function is * designed to avoid the selection of the same 4KB page * for every 2MB page mapping. * * On demotion, a mapping that hasn't been referenced * is simply destroyed. To avoid the possibility of a * subsequent page fault on a demoted wired mapping, * always leave its reference bit set. Moreover, * since the superpage is wired, the current state of * its reference bit won't affect page replacement. */ if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> L2_SHIFT) ^ (uintptr_t)pmap) & (Ln_ENTRIES - 1)) == 0 && (tpte & ATTR_SW_WIRED) == 0) { pmap_clear_bits(pte, ATTR_AF); pmap_invalidate_page(pmap, pv->pv_va); cleared++; } else not_cleared++; } PMAP_UNLOCK(pmap); /* Rotate the PV list if it has more than one entry. */ if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) { TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); pvh->pv_gen++; } if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX) goto out; } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf); small_mappings: if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL) goto out; pv = pvf; do { if (pvf == NULL) pvf = pv; pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; md_gen = m->md.pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); goto retry; } } pde = pmap_pde(pmap, pv->pv_va, &lvl); KASSERT(pde != NULL, ("pmap_ts_referenced: no l2 table found")); KASSERT(lvl == 2, ("pmap_ts_referenced: invalid pde level %d", lvl)); tpde = pmap_load(pde); KASSERT((tpde & ATTR_DESCR_MASK) == L2_TABLE, ("pmap_ts_referenced: found an invalid l2 table")); pte = pmap_l2_to_l3(pde, pv->pv_va); tpte = pmap_load(pte); if (pmap_pte_dirty(pmap, tpte)) vm_page_dirty(m); if ((tpte & ATTR_AF) != 0) { if ((tpte & ATTR_SW_WIRED) == 0) { pmap_clear_bits(pte, ATTR_AF); pmap_invalidate_page(pmap, pv->pv_va); cleared++; } else not_cleared++; } PMAP_UNLOCK(pmap); /* Rotate the PV list if it has more than one entry. */ if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) { TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; } } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared + not_cleared < PMAP_TS_REFERENCED_MAX); out: rw_wunlock(lock); vm_page_free_pages_toq(&free, true); return (cleared + not_cleared); } /* * Apply the given advice to the specified range of addresses within the * given pmap. Depending on the advice, clear the referenced and/or * modified flags in each mapping and set the mapped page's dirty field. */ void pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice) { struct rwlock *lock; vm_offset_t va, va_next; vm_page_t m; pd_entry_t *l0, *l1, *l2, oldl2; pt_entry_t *l3, oldl3; PMAP_ASSERT_STAGE1(pmap); if (advice != MADV_DONTNEED && advice != MADV_FREE) return; PMAP_LOCK(pmap); for (; sva < eva; sva = va_next) { l0 = pmap_l0(pmap, sva); if (pmap_load(l0) == 0) { va_next = (sva + L0_SIZE) & ~L0_OFFSET; if (va_next < sva) va_next = eva; continue; } va_next = (sva + L1_SIZE) & ~L1_OFFSET; if (va_next < sva) va_next = eva; l1 = pmap_l0_to_l1(l0, sva); if (pmap_load(l1) == 0) continue; if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) { KASSERT(va_next <= eva, ("partial update of non-transparent 1G page " "l1 %#lx sva %#lx eva %#lx va_next %#lx", pmap_load(l1), sva, eva, va_next)); continue; } va_next = (sva + L2_SIZE) & ~L2_OFFSET; if (va_next < sva) va_next = eva; l2 = pmap_l1_to_l2(l1, sva); oldl2 = pmap_load(l2); if (oldl2 == 0) continue; if ((oldl2 & ATTR_DESCR_MASK) == L2_BLOCK) { if ((oldl2 & ATTR_SW_MANAGED) == 0) continue; lock = NULL; if (!pmap_demote_l2_locked(pmap, l2, sva, &lock)) { if (lock != NULL) rw_wunlock(lock); /* * The 2MB page mapping was destroyed. */ continue; } /* * Unless the page mappings are wired, remove the * mapping to a single page so that a subsequent * access may repromote. Choosing the last page * within the address range [sva, min(va_next, eva)) * generally results in more repromotions. Since the * underlying page table page is fully populated, this * removal never frees a page table page. */ if ((oldl2 & ATTR_SW_WIRED) == 0) { va = eva; if (va > va_next) va = va_next; va -= PAGE_SIZE; KASSERT(va >= sva, ("pmap_advise: no address gap")); l3 = pmap_l2_to_l3(l2, va); KASSERT(pmap_load(l3) != 0, ("pmap_advise: invalid PTE")); pmap_remove_l3(pmap, l3, va, pmap_load(l2), NULL, &lock); } if (lock != NULL) rw_wunlock(lock); } KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE, ("pmap_advise: invalid L2 entry after demotion")); if (va_next > eva) va_next = eva; va = va_next; for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++, sva += L3_SIZE) { oldl3 = pmap_load(l3); if ((oldl3 & (ATTR_SW_MANAGED | ATTR_DESCR_MASK)) != (ATTR_SW_MANAGED | L3_PAGE)) goto maybe_invlrng; else if (pmap_pte_dirty(pmap, oldl3)) { if (advice == MADV_DONTNEED) { /* * Future calls to pmap_is_modified() * can be avoided by making the page * dirty now. */ m = PHYS_TO_VM_PAGE(oldl3 & ~ATTR_MASK); vm_page_dirty(m); } while (!atomic_fcmpset_long(l3, &oldl3, (oldl3 & ~ATTR_AF) | ATTR_S1_AP(ATTR_S1_AP_RO))) cpu_spinwait(); } else if ((oldl3 & ATTR_AF) != 0) pmap_clear_bits(l3, ATTR_AF); else goto maybe_invlrng; if (va == va_next) va = sva; continue; maybe_invlrng: if (va != va_next) { pmap_invalidate_range(pmap, va, sva); va = va_next; } } if (va != va_next) pmap_invalidate_range(pmap, va, sva); } PMAP_UNLOCK(pmap); } /* * Clear the modify bits on the specified physical page. */ void pmap_clear_modify(vm_page_t m) { struct md_page *pvh; struct rwlock *lock; pmap_t pmap; pv_entry_t next_pv, pv; pd_entry_t *l2, oldl2; pt_entry_t *l3, oldl3; vm_offset_t va; int md_gen, pvh_gen; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_clear_modify: page %p is not managed", m)); vm_page_assert_busied(m); if (!pmap_page_is_write_mapped(m)) return; pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : page_to_pvh(m); lock = VM_PAGE_TO_PV_LIST_LOCK(m); rw_wlock(lock); restart: TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { pmap = PV_PMAP(pv); PMAP_ASSERT_STAGE1(pmap); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } va = pv->pv_va; l2 = pmap_l2(pmap, va); oldl2 = pmap_load(l2); /* If oldl2 has ATTR_SW_DBM set, then it is also dirty. */ if ((oldl2 & ATTR_SW_DBM) != 0 && pmap_demote_l2_locked(pmap, l2, va, &lock) && (oldl2 & ATTR_SW_WIRED) == 0) { /* * Write protect the mapping to a single page so that * a subsequent write access may repromote. */ va += VM_PAGE_TO_PHYS(m) - (oldl2 & ~ATTR_MASK); l3 = pmap_l2_to_l3(l2, va); oldl3 = pmap_load(l3); while (!atomic_fcmpset_long(l3, &oldl3, (oldl3 & ~ATTR_SW_DBM) | ATTR_S1_AP(ATTR_S1_AP_RO))) cpu_spinwait(); vm_page_dirty(m); pmap_invalidate_page(pmap, va); } PMAP_UNLOCK(pmap); } TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); PMAP_ASSERT_STAGE1(pmap); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; pvh_gen = pvh->pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } l2 = pmap_l2(pmap, pv->pv_va); l3 = pmap_l2_to_l3(l2, pv->pv_va); oldl3 = pmap_load(l3); if (pmap_l3_valid(oldl3) && (oldl3 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) == ATTR_SW_DBM){ pmap_set_bits(l3, ATTR_S1_AP(ATTR_S1_AP_RO)); pmap_invalidate_page(pmap, pv->pv_va); } PMAP_UNLOCK(pmap); } rw_wunlock(lock); } void * pmap_mapbios(vm_paddr_t pa, vm_size_t size) { struct pmap_preinit_mapping *ppim; vm_offset_t va, offset; pd_entry_t *pde; pt_entry_t *l2; int i, lvl, l2_blocks, free_l2_count, start_idx; if (!vm_initialized) { /* * No L3 ptables so map entire L2 blocks where start VA is: * preinit_map_va + start_idx * L2_SIZE * There may be duplicate mappings (multiple VA -> same PA) but * ARM64 dcache is always PIPT so that's acceptable. */ if (size == 0) return (NULL); /* Calculate how many L2 blocks are needed for the mapping */ l2_blocks = (roundup2(pa + size, L2_SIZE) - rounddown2(pa, L2_SIZE)) >> L2_SHIFT; offset = pa & L2_OFFSET; if (preinit_map_va == 0) return (NULL); /* Map 2MiB L2 blocks from reserved VA space */ free_l2_count = 0; start_idx = -1; /* Find enough free contiguous VA space */ for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { ppim = pmap_preinit_mapping + i; if (free_l2_count > 0 && ppim->pa != 0) { /* Not enough space here */ free_l2_count = 0; start_idx = -1; continue; } if (ppim->pa == 0) { /* Free L2 block */ if (start_idx == -1) start_idx = i; free_l2_count++; if (free_l2_count == l2_blocks) break; } } if (free_l2_count != l2_blocks) panic("%s: too many preinit mappings", __func__); va = preinit_map_va + (start_idx * L2_SIZE); for (i = start_idx; i < start_idx + l2_blocks; i++) { /* Mark entries as allocated */ ppim = pmap_preinit_mapping + i; ppim->pa = pa; ppim->va = va + offset; ppim->size = size; } /* Map L2 blocks */ pa = rounddown2(pa, L2_SIZE); for (i = 0; i < l2_blocks; i++) { pde = pmap_pde(kernel_pmap, va, &lvl); KASSERT(pde != NULL, ("pmap_mapbios: Invalid page entry, va: 0x%lx", va)); KASSERT(lvl == 1, ("pmap_mapbios: Invalid level %d", lvl)); /* Insert L2_BLOCK */ l2 = pmap_l1_to_l2(pde, va); pmap_load_store(l2, pa | ATTR_DEFAULT | ATTR_S1_XN | ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) | L2_BLOCK); va += L2_SIZE; pa += L2_SIZE; } pmap_invalidate_all(kernel_pmap); va = preinit_map_va + (start_idx * L2_SIZE); } else { /* kva_alloc may be used to map the pages */ offset = pa & PAGE_MASK; size = round_page(offset + size); va = kva_alloc(size); if (va == 0) panic("%s: Couldn't allocate KVA", __func__); pde = pmap_pde(kernel_pmap, va, &lvl); KASSERT(lvl == 2, ("pmap_mapbios: Invalid level %d", lvl)); /* L3 table is linked */ va = trunc_page(va); pa = trunc_page(pa); pmap_kenter(va, size, pa, memory_mapping_mode(pa)); } return ((void *)(va + offset)); } void pmap_unmapbios(vm_offset_t va, vm_size_t size) { struct pmap_preinit_mapping *ppim; vm_offset_t offset, tmpsize, va_trunc; pd_entry_t *pde; pt_entry_t *l2; int i, lvl, l2_blocks, block; bool preinit_map; l2_blocks = (roundup2(va + size, L2_SIZE) - rounddown2(va, L2_SIZE)) >> L2_SHIFT; KASSERT(l2_blocks > 0, ("pmap_unmapbios: invalid size %lx", size)); /* Remove preinit mapping */ preinit_map = false; block = 0; for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { ppim = pmap_preinit_mapping + i; if (ppim->va == va) { KASSERT(ppim->size == size, ("pmap_unmapbios: size mismatch")); ppim->va = 0; ppim->pa = 0; ppim->size = 0; preinit_map = true; offset = block * L2_SIZE; va_trunc = rounddown2(va, L2_SIZE) + offset; /* Remove L2_BLOCK */ pde = pmap_pde(kernel_pmap, va_trunc, &lvl); KASSERT(pde != NULL, ("pmap_unmapbios: Invalid page entry, va: 0x%lx", va_trunc)); l2 = pmap_l1_to_l2(pde, va_trunc); pmap_clear(l2); if (block == (l2_blocks - 1)) break; block++; } } if (preinit_map) { pmap_invalidate_all(kernel_pmap); return; } /* Unmap the pages reserved with kva_alloc. */ if (vm_initialized) { offset = va & PAGE_MASK; size = round_page(offset + size); va = trunc_page(va); pde = pmap_pde(kernel_pmap, va, &lvl); KASSERT(pde != NULL, ("pmap_unmapbios: Invalid page entry, va: 0x%lx", va)); KASSERT(lvl == 2, ("pmap_unmapbios: Invalid level %d", lvl)); /* Unmap and invalidate the pages */ for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE) pmap_kremove(va + tmpsize); kva_free(va, size); } } /* * Sets the memory attribute for the specified page. */ void pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) { m->md.pv_memattr = ma; /* * If "m" is a normal page, update its direct mapping. This update * can be relied upon to perform any cache operations that are * required for data coherence. */ if ((m->flags & PG_FICTITIOUS) == 0 && pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), PAGE_SIZE, m->md.pv_memattr) != 0) panic("memory attribute change on the direct map failed"); } /* * Changes the specified virtual address range's memory type to that given by * the parameter "mode". The specified virtual address range must be * completely contained within either the direct map or the kernel map. If * the virtual address range is contained within the kernel map, then the * memory type for each of the corresponding ranges of the direct map is also * changed. (The corresponding ranges of the direct map are those ranges that * map the same physical pages as the specified virtual address range.) These * changes to the direct map are necessary because Intel describes the * behavior of their processors as "undefined" if two or more mappings to the * same physical page have different memory types. * * Returns zero if the change completed successfully, and either EINVAL or * ENOMEM if the change failed. Specifically, EINVAL is returned if some part * of the virtual address range was not mapped, and ENOMEM is returned if * there was insufficient memory available to complete the change. In the * latter case, the memory type may have been changed on some part of the * virtual address range or the direct map. */ int pmap_change_attr(vm_offset_t va, vm_size_t size, int mode) { int error; PMAP_LOCK(kernel_pmap); error = pmap_change_attr_locked(va, size, mode); PMAP_UNLOCK(kernel_pmap); return (error); } static int pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode) { vm_offset_t base, offset, tmpva; pt_entry_t l3, *pte, *newpte; int lvl; PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED); base = trunc_page(va); offset = va & PAGE_MASK; size = round_page(offset + size); if (!VIRT_IN_DMAP(base) && !(base >= VM_MIN_KERNEL_ADDRESS && base < VM_MAX_KERNEL_ADDRESS)) return (EINVAL); for (tmpva = base; tmpva < base + size; ) { pte = pmap_pte(kernel_pmap, tmpva, &lvl); if (pte == NULL) return (EINVAL); if ((pmap_load(pte) & ATTR_S1_IDX_MASK) == ATTR_S1_IDX(mode)) { /* * We already have the correct attribute, * ignore this entry. */ switch (lvl) { default: panic("Invalid DMAP table level: %d\n", lvl); case 1: tmpva = (tmpva & ~L1_OFFSET) + L1_SIZE; break; case 2: tmpva = (tmpva & ~L2_OFFSET) + L2_SIZE; break; case 3: tmpva += PAGE_SIZE; break; } } else { /* * Split the entry to an level 3 table, then * set the new attribute. */ switch (lvl) { default: panic("Invalid DMAP table level: %d\n", lvl); case 1: newpte = pmap_demote_l1(kernel_pmap, pte, tmpva & ~L1_OFFSET); if (newpte == NULL) return (EINVAL); pte = pmap_l1_to_l2(pte, tmpva); case 2: newpte = pmap_demote_l2(kernel_pmap, pte, tmpva); if (newpte == NULL) return (EINVAL); pte = pmap_l2_to_l3(pte, tmpva); case 3: /* Update the entry */ l3 = pmap_load(pte); l3 &= ~ATTR_S1_IDX_MASK; l3 |= ATTR_S1_IDX(mode); if (mode == VM_MEMATTR_DEVICE) l3 |= ATTR_S1_XN; pmap_update_entry(kernel_pmap, pte, l3, tmpva, PAGE_SIZE); /* * If moving to a non-cacheable entry flush * the cache. */ if (mode == VM_MEMATTR_UNCACHEABLE) cpu_dcache_wbinv_range(tmpva, L3_SIZE); break; } tmpva += PAGE_SIZE; } } return (0); } /* * Create an L2 table to map all addresses within an L1 mapping. */ static pt_entry_t * pmap_demote_l1(pmap_t pmap, pt_entry_t *l1, vm_offset_t va) { pt_entry_t *l2, newl2, oldl1; vm_offset_t tmpl1; vm_paddr_t l2phys, phys; vm_page_t ml2; int i; PMAP_LOCK_ASSERT(pmap, MA_OWNED); oldl1 = pmap_load(l1); KASSERT((oldl1 & ATTR_DESCR_MASK) == L1_BLOCK, ("pmap_demote_l1: Demoting a non-block entry")); KASSERT((va & L1_OFFSET) == 0, ("pmap_demote_l1: Invalid virtual address %#lx", va)); KASSERT((oldl1 & ATTR_SW_MANAGED) == 0, ("pmap_demote_l1: Level 1 table shouldn't be managed")); tmpl1 = 0; if (va <= (vm_offset_t)l1 && va + L1_SIZE > (vm_offset_t)l1) { tmpl1 = kva_alloc(PAGE_SIZE); if (tmpl1 == 0) return (NULL); } if ((ml2 = vm_page_alloc(NULL, 0, VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) { CTR2(KTR_PMAP, "pmap_demote_l1: failure for va %#lx" " in pmap %p", va, pmap); return (NULL); } l2phys = VM_PAGE_TO_PHYS(ml2); l2 = (pt_entry_t *)PHYS_TO_DMAP(l2phys); /* Address the range points at */ phys = oldl1 & ~ATTR_MASK; /* The attributed from the old l1 table to be copied */ newl2 = oldl1 & ATTR_MASK; /* Create the new entries */ for (i = 0; i < Ln_ENTRIES; i++) { l2[i] = newl2 | phys; phys += L2_SIZE; } KASSERT(l2[0] == ((oldl1 & ~ATTR_DESCR_MASK) | L2_BLOCK), ("Invalid l2 page (%lx != %lx)", l2[0], (oldl1 & ~ATTR_DESCR_MASK) | L2_BLOCK)); if (tmpl1 != 0) { pmap_kenter(tmpl1, PAGE_SIZE, DMAP_TO_PHYS((vm_offset_t)l1) & ~L3_OFFSET, VM_MEMATTR_WRITE_BACK); l1 = (pt_entry_t *)(tmpl1 + ((vm_offset_t)l1 & PAGE_MASK)); } pmap_update_entry(pmap, l1, l2phys | L1_TABLE, va, PAGE_SIZE); if (tmpl1 != 0) { pmap_kremove(tmpl1); kva_free(tmpl1, PAGE_SIZE); } return (l2); } static void pmap_fill_l3(pt_entry_t *firstl3, pt_entry_t newl3) { pt_entry_t *l3; for (l3 = firstl3; l3 - firstl3 < Ln_ENTRIES; l3++) { *l3 = newl3; newl3 += L3_SIZE; } } static void pmap_demote_l2_abort(pmap_t pmap, vm_offset_t va, pt_entry_t *l2, struct rwlock **lockp) { struct spglist free; SLIST_INIT(&free); (void)pmap_remove_l2(pmap, l2, va, pmap_load(pmap_l1(pmap, va)), &free, lockp); vm_page_free_pages_toq(&free, true); } /* * Create an L3 table to map all addresses within an L2 mapping. */ static pt_entry_t * pmap_demote_l2_locked(pmap_t pmap, pt_entry_t *l2, vm_offset_t va, struct rwlock **lockp) { pt_entry_t *l3, newl3, oldl2; vm_offset_t tmpl2; vm_paddr_t l3phys; vm_page_t ml3; PMAP_LOCK_ASSERT(pmap, MA_OWNED); PMAP_ASSERT_STAGE1(pmap); l3 = NULL; oldl2 = pmap_load(l2); KASSERT((oldl2 & ATTR_DESCR_MASK) == L2_BLOCK, ("pmap_demote_l2: Demoting a non-block entry")); va &= ~L2_OFFSET; tmpl2 = 0; if (va <= (vm_offset_t)l2 && va + L2_SIZE > (vm_offset_t)l2) { tmpl2 = kva_alloc(PAGE_SIZE); if (tmpl2 == 0) return (NULL); } /* * Invalidate the 2MB page mapping and return "failure" if the * mapping was never accessed. */ if ((oldl2 & ATTR_AF) == 0) { KASSERT((oldl2 & ATTR_SW_WIRED) == 0, ("pmap_demote_l2: a wired mapping is missing ATTR_AF")); pmap_demote_l2_abort(pmap, va, l2, lockp); CTR2(KTR_PMAP, "pmap_demote_l2: failure for va %#lx in pmap %p", va, pmap); goto fail; } if ((ml3 = pmap_remove_pt_page(pmap, va)) == NULL) { KASSERT((oldl2 & ATTR_SW_WIRED) == 0, ("pmap_demote_l2: page table page for a wired mapping" " is missing")); /* * If the page table page is missing and the mapping * is for a kernel address, the mapping must belong to * the direct map. Page table pages are preallocated * for every other part of the kernel address space, * so the direct map region is the only part of the * kernel address space that must be handled here. */ KASSERT(va < VM_MAXUSER_ADDRESS || VIRT_IN_DMAP(va), ("pmap_demote_l2: No saved mpte for va %#lx", va)); /* * If the 2MB page mapping belongs to the direct map * region of the kernel's address space, then the page * allocation request specifies the highest possible * priority (VM_ALLOC_INTERRUPT). Otherwise, the * priority is normal. */ ml3 = vm_page_alloc(NULL, pmap_l2_pindex(va), (VIRT_IN_DMAP(va) ? VM_ALLOC_INTERRUPT : VM_ALLOC_NORMAL) | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED); /* * If the allocation of the new page table page fails, * invalidate the 2MB page mapping and return "failure". */ if (ml3 == NULL) { pmap_demote_l2_abort(pmap, va, l2, lockp); CTR2(KTR_PMAP, "pmap_demote_l2: failure for va %#lx" " in pmap %p", va, pmap); goto fail; } if (va < VM_MAXUSER_ADDRESS) { ml3->ref_count = NL3PG; pmap_resident_count_inc(pmap, 1); } } l3phys = VM_PAGE_TO_PHYS(ml3); l3 = (pt_entry_t *)PHYS_TO_DMAP(l3phys); newl3 = (oldl2 & ~ATTR_DESCR_MASK) | L3_PAGE; KASSERT((oldl2 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) != (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM), ("pmap_demote_l2: L2 entry is writeable but not dirty")); /* * If the page table page is not leftover from an earlier promotion, * or the mapping attributes have changed, (re)initialize the L3 table. * * When pmap_update_entry() clears the old L2 mapping, it (indirectly) * performs a dsb(). That dsb() ensures that the stores for filling * "l3" are visible before "l3" is added to the page table. */ if (ml3->valid == 0 || (l3[0] & ATTR_MASK) != (newl3 & ATTR_MASK)) pmap_fill_l3(l3, newl3); /* * Map the temporary page so we don't lose access to the l2 table. */ if (tmpl2 != 0) { pmap_kenter(tmpl2, PAGE_SIZE, DMAP_TO_PHYS((vm_offset_t)l2) & ~L3_OFFSET, VM_MEMATTR_WRITE_BACK); l2 = (pt_entry_t *)(tmpl2 + ((vm_offset_t)l2 & PAGE_MASK)); } /* * The spare PV entries must be reserved prior to demoting the * mapping, that is, prior to changing the PDE. Otherwise, the state * of the L2 and the PV lists will be inconsistent, which can result * in reclaim_pv_chunk() attempting to remove a PV entry from the * wrong PV list and pmap_pv_demote_l2() failing to find the expected * PV entry for the 2MB page mapping that is being demoted. */ if ((oldl2 & ATTR_SW_MANAGED) != 0) reserve_pv_entries(pmap, Ln_ENTRIES - 1, lockp); /* * Pass PAGE_SIZE so that a single TLB invalidation is performed on * the 2MB page mapping. */ pmap_update_entry(pmap, l2, l3phys | L2_TABLE, va, PAGE_SIZE); /* * Demote the PV entry. */ if ((oldl2 & ATTR_SW_MANAGED) != 0) pmap_pv_demote_l2(pmap, va, oldl2 & ~ATTR_MASK, lockp); atomic_add_long(&pmap_l2_demotions, 1); CTR3(KTR_PMAP, "pmap_demote_l2: success for va %#lx" " in pmap %p %lx", va, pmap, l3[0]); fail: if (tmpl2 != 0) { pmap_kremove(tmpl2); kva_free(tmpl2, PAGE_SIZE); } return (l3); } static pt_entry_t * pmap_demote_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va) { struct rwlock *lock; pt_entry_t *l3; lock = NULL; l3 = pmap_demote_l2_locked(pmap, l2, va, &lock); if (lock != NULL) rw_wunlock(lock); return (l3); } /* * Perform the pmap work for mincore(2). If the page is not both referenced and * modified by this pmap, returns its physical address so that the caller can * find other mappings. */ int pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *pap) { pt_entry_t *pte, tpte; vm_paddr_t mask, pa; int lvl, val; bool managed; PMAP_ASSERT_STAGE1(pmap); PMAP_LOCK(pmap); pte = pmap_pte(pmap, addr, &lvl); if (pte != NULL) { tpte = pmap_load(pte); switch (lvl) { case 3: mask = L3_OFFSET; break; case 2: mask = L2_OFFSET; break; case 1: mask = L1_OFFSET; break; default: panic("pmap_mincore: invalid level %d", lvl); } managed = (tpte & ATTR_SW_MANAGED) != 0; val = MINCORE_INCORE; if (lvl != 3) val |= MINCORE_PSIND(3 - lvl); if ((managed && pmap_pte_dirty(pmap, tpte)) || (!managed && (tpte & ATTR_S1_AP_RW_BIT) == ATTR_S1_AP(ATTR_S1_AP_RW))) val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; if ((tpte & ATTR_AF) == ATTR_AF) val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; pa = (tpte & ~ATTR_MASK) | (addr & mask); } else { managed = false; val = 0; } if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && managed) { *pap = pa; } PMAP_UNLOCK(pmap); return (val); } /* * Garbage collect every ASID that is neither active on a processor nor * reserved. */ static void pmap_reset_asid_set(pmap_t pmap) { pmap_t curpmap; int asid, cpuid, epoch; struct asid_set *set; enum pmap_stage stage; set = pmap->pm_asid_set; stage = pmap->pm_stage; set = pmap->pm_asid_set; KASSERT(set != NULL, ("%s: NULL asid set", __func__)); mtx_assert(&set->asid_set_mutex, MA_OWNED); /* * Ensure that the store to asid_epoch is globally visible before the * loads from pc_curpmap are performed. */ epoch = set->asid_epoch + 1; if (epoch == INT_MAX) epoch = 0; set->asid_epoch = epoch; dsb(ishst); if (stage == PM_STAGE1) { __asm __volatile("tlbi vmalle1is"); } else { KASSERT(pmap_clean_stage2_tlbi != NULL, ("%s: Unset stage 2 tlb invalidation callback\n", __func__)); pmap_clean_stage2_tlbi(); } dsb(ish); bit_nclear(set->asid_set, ASID_FIRST_AVAILABLE, set->asid_set_size - 1); CPU_FOREACH(cpuid) { if (cpuid == curcpu) continue; if (stage == PM_STAGE1) { curpmap = pcpu_find(cpuid)->pc_curpmap; PMAP_ASSERT_STAGE1(pmap); } else { curpmap = pcpu_find(cpuid)->pc_curvmpmap; if (curpmap == NULL) continue; PMAP_ASSERT_STAGE2(pmap); } KASSERT(curpmap->pm_asid_set == set, ("Incorrect set")); asid = COOKIE_TO_ASID(curpmap->pm_cookie); if (asid == -1) continue; bit_set(set->asid_set, asid); curpmap->pm_cookie = COOKIE_FROM(asid, epoch); } } /* * Allocate a new ASID for the specified pmap. */ static void pmap_alloc_asid(pmap_t pmap) { struct asid_set *set; int new_asid; set = pmap->pm_asid_set; KASSERT(set != NULL, ("%s: NULL asid set", __func__)); mtx_lock_spin(&set->asid_set_mutex); /* * While this processor was waiting to acquire the asid set mutex, * pmap_reset_asid_set() running on another processor might have * updated this pmap's cookie to the current epoch. In which case, we * don't need to allocate a new ASID. */ if (COOKIE_TO_EPOCH(pmap->pm_cookie) == set->asid_epoch) goto out; bit_ffc_at(set->asid_set, set->asid_next, set->asid_set_size, &new_asid); if (new_asid == -1) { bit_ffc_at(set->asid_set, ASID_FIRST_AVAILABLE, set->asid_next, &new_asid); if (new_asid == -1) { pmap_reset_asid_set(pmap); bit_ffc_at(set->asid_set, ASID_FIRST_AVAILABLE, set->asid_set_size, &new_asid); KASSERT(new_asid != -1, ("ASID allocation failure")); } } bit_set(set->asid_set, new_asid); set->asid_next = new_asid + 1; pmap->pm_cookie = COOKIE_FROM(new_asid, set->asid_epoch); out: mtx_unlock_spin(&set->asid_set_mutex); } /* * Compute the value that should be stored in ttbr0 to activate the specified * pmap. This value may change from time to time. */ uint64_t pmap_to_ttbr0(pmap_t pmap) { return (ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie)) | pmap->pm_l0_paddr); } static bool pmap_activate_int(pmap_t pmap) { struct asid_set *set; int epoch; KASSERT(PCPU_GET(curpmap) != NULL, ("no active pmap")); KASSERT(pmap != kernel_pmap, ("kernel pmap activation")); if ((pmap->pm_stage == PM_STAGE1 && pmap == PCPU_GET(curpmap)) || (pmap->pm_stage == PM_STAGE2 && pmap == PCPU_GET(curvmpmap))) { /* * Handle the possibility that the old thread was preempted * after an "ic" or "tlbi" instruction but before it performed * a "dsb" instruction. If the old thread migrates to a new * processor, its completion of a "dsb" instruction on that * new processor does not guarantee that the "ic" or "tlbi" * instructions performed on the old processor have completed. */ dsb(ish); return (false); } set = pmap->pm_asid_set; KASSERT(set != NULL, ("%s: NULL asid set", __func__)); /* * Ensure that the store to curpmap is globally visible before the * load from asid_epoch is performed. */ if (pmap->pm_stage == PM_STAGE1) PCPU_SET(curpmap, pmap); else PCPU_SET(curvmpmap, pmap); dsb(ish); epoch = COOKIE_TO_EPOCH(pmap->pm_cookie); if (epoch >= 0 && epoch != set->asid_epoch) pmap_alloc_asid(pmap); if (pmap->pm_stage == PM_STAGE1) { set_ttbr0(pmap_to_ttbr0(pmap)); if (PCPU_GET(bcast_tlbi_workaround) != 0) invalidate_local_icache(); } return (true); } void pmap_activate_vm(pmap_t pmap) { PMAP_ASSERT_STAGE2(pmap); (void)pmap_activate_int(pmap); } void pmap_activate(struct thread *td) { pmap_t pmap; pmap = vmspace_pmap(td->td_proc->p_vmspace); PMAP_ASSERT_STAGE1(pmap); critical_enter(); (void)pmap_activate_int(pmap); critical_exit(); } /* * To eliminate the unused parameter "old", we would have to add an instruction * to cpu_switch(). */ struct pcb * pmap_switch(struct thread *old __unused, struct thread *new) { pcpu_bp_harden bp_harden; struct pcb *pcb; /* Store the new curthread */ PCPU_SET(curthread, new); /* And the new pcb */ pcb = new->td_pcb; PCPU_SET(curpcb, pcb); /* * TODO: We may need to flush the cache here if switching * to a user process. */ if (pmap_activate_int(vmspace_pmap(new->td_proc->p_vmspace))) { /* * Stop userspace from training the branch predictor against * other processes. This will call into a CPU specific * function that clears the branch predictor state. */ bp_harden = PCPU_GET(bp_harden); if (bp_harden != NULL) bp_harden(); } return (pcb); } void pmap_sync_icache(pmap_t pmap, vm_offset_t va, vm_size_t sz) { PMAP_ASSERT_STAGE1(pmap); if (va >= VM_MIN_KERNEL_ADDRESS) { cpu_icache_sync_range(va, sz); } else { u_int len, offset; vm_paddr_t pa; /* Find the length of data in this page to flush */ offset = va & PAGE_MASK; len = imin(PAGE_SIZE - offset, sz); while (sz != 0) { /* Extract the physical address & find it in the DMAP */ pa = pmap_extract(pmap, va); if (pa != 0) cpu_icache_sync_range(PHYS_TO_DMAP(pa), len); /* Move to the next page */ sz -= len; va += len; /* Set the length for the next iteration */ len = imin(PAGE_SIZE, sz); } } } static int pmap_stage2_fault(pmap_t pmap, uint64_t esr, uint64_t far) { pd_entry_t *pdep; pt_entry_t *ptep, pte; int rv, lvl, dfsc; PMAP_ASSERT_STAGE2(pmap); rv = KERN_FAILURE; /* Data and insn aborts use same encoding for FSC field. */ dfsc = esr & ISS_DATA_DFSC_MASK; switch (dfsc) { case ISS_DATA_DFSC_TF_L0: case ISS_DATA_DFSC_TF_L1: case ISS_DATA_DFSC_TF_L2: case ISS_DATA_DFSC_TF_L3: PMAP_LOCK(pmap); pdep = pmap_pde(pmap, far, &lvl); if (pdep == NULL || lvl != (dfsc - ISS_DATA_DFSC_TF_L1)) { PMAP_LOCK(pmap); break; } switch (lvl) { case 0: ptep = pmap_l0_to_l1(pdep, far); break; case 1: ptep = pmap_l1_to_l2(pdep, far); break; case 2: ptep = pmap_l2_to_l3(pdep, far); break; default: panic("%s: Invalid pde level %d", __func__,lvl); } goto fault_exec; case ISS_DATA_DFSC_AFF_L1: case ISS_DATA_DFSC_AFF_L2: case ISS_DATA_DFSC_AFF_L3: PMAP_LOCK(pmap); ptep = pmap_pte(pmap, far, &lvl); fault_exec: if (ptep != NULL && (pte = pmap_load(ptep)) != 0) { if (icache_vmid) { pmap_invalidate_vpipt_icache(); } else { /* * If accessing an executable page invalidate * the I-cache so it will be valid when we * continue execution in the guest. The D-cache * is assumed to already be clean to the Point * of Coherency. */ if ((pte & ATTR_S2_XN_MASK) != ATTR_S2_XN(ATTR_S2_XN_NONE)) { invalidate_icache(); } } pmap_set_bits(ptep, ATTR_AF | ATTR_DESCR_VALID); rv = KERN_SUCCESS; } PMAP_UNLOCK(pmap); break; } return (rv); } int pmap_fault(pmap_t pmap, uint64_t esr, uint64_t far) { pt_entry_t pte, *ptep; register_t intr; uint64_t ec, par; int lvl, rv; rv = KERN_FAILURE; ec = ESR_ELx_EXCEPTION(esr); switch (ec) { case EXCP_INSN_ABORT_L: case EXCP_INSN_ABORT: case EXCP_DATA_ABORT_L: case EXCP_DATA_ABORT: break; default: return (rv); } if (pmap->pm_stage == PM_STAGE2) return (pmap_stage2_fault(pmap, esr, far)); /* Data and insn aborts use same encoding for FSC field. */ switch (esr & ISS_DATA_DFSC_MASK) { case ISS_DATA_DFSC_AFF_L1: case ISS_DATA_DFSC_AFF_L2: case ISS_DATA_DFSC_AFF_L3: PMAP_LOCK(pmap); ptep = pmap_pte(pmap, far, &lvl); if (ptep != NULL) { pmap_set_bits(ptep, ATTR_AF); rv = KERN_SUCCESS; /* * XXXMJ as an optimization we could mark the entry * dirty if this is a write fault. */ } PMAP_UNLOCK(pmap); break; case ISS_DATA_DFSC_PF_L1: case ISS_DATA_DFSC_PF_L2: case ISS_DATA_DFSC_PF_L3: if ((ec != EXCP_DATA_ABORT_L && ec != EXCP_DATA_ABORT) || (esr & ISS_DATA_WnR) == 0) return (rv); PMAP_LOCK(pmap); ptep = pmap_pte(pmap, far, &lvl); if (ptep != NULL && ((pte = pmap_load(ptep)) & ATTR_SW_DBM) != 0) { if ((pte & ATTR_S1_AP_RW_BIT) == ATTR_S1_AP(ATTR_S1_AP_RO)) { pmap_clear_bits(ptep, ATTR_S1_AP_RW_BIT); pmap_invalidate_page(pmap, far); } rv = KERN_SUCCESS; } PMAP_UNLOCK(pmap); break; case ISS_DATA_DFSC_TF_L0: case ISS_DATA_DFSC_TF_L1: case ISS_DATA_DFSC_TF_L2: case ISS_DATA_DFSC_TF_L3: /* * Retry the translation. A break-before-make sequence can * produce a transient fault. */ if (pmap == kernel_pmap) { /* * The translation fault may have occurred within a * critical section. Therefore, we must check the * address without acquiring the kernel pmap's lock. */ if (pmap_kextract(far) != 0) rv = KERN_SUCCESS; } else { PMAP_LOCK(pmap); /* Ask the MMU to check the address. */ intr = intr_disable(); par = arm64_address_translate_s1e0r(far); intr_restore(intr); PMAP_UNLOCK(pmap); /* * If the translation was successful, then we can * return success to the trap handler. */ if (PAR_SUCCESS(par)) rv = KERN_SUCCESS; } break; } return (rv); } /* * Increase the starting virtual address of the given mapping if a * different alignment might result in more superpage mappings. */ void pmap_align_superpage(vm_object_t object, vm_ooffset_t offset, vm_offset_t *addr, vm_size_t size) { vm_offset_t superpage_offset; if (size < L2_SIZE) return; if (object != NULL && (object->flags & OBJ_COLORED) != 0) offset += ptoa(object->pg_color); superpage_offset = offset & L2_OFFSET; if (size - ((L2_SIZE - superpage_offset) & L2_OFFSET) < L2_SIZE || (*addr & L2_OFFSET) == superpage_offset) return; if ((*addr & L2_OFFSET) < superpage_offset) *addr = (*addr & ~L2_OFFSET) + superpage_offset; else *addr = ((*addr + L2_OFFSET) & ~L2_OFFSET) + superpage_offset; } /** * Get the kernel virtual address of a set of physical pages. If there are * physical addresses not covered by the DMAP perform a transient mapping * that will be removed when calling pmap_unmap_io_transient. * * \param page The pages the caller wishes to obtain the virtual * address on the kernel memory map. * \param vaddr On return contains the kernel virtual memory address * of the pages passed in the page parameter. * \param count Number of pages passed in. * \param can_fault TRUE if the thread using the mapped pages can take * page faults, FALSE otherwise. * * \returns TRUE if the caller must call pmap_unmap_io_transient when * finished or FALSE otherwise. * */ boolean_t pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, boolean_t can_fault) { vm_paddr_t paddr; boolean_t needs_mapping; int error, i; /* * Allocate any KVA space that we need, this is done in a separate * loop to prevent calling vmem_alloc while pinned. */ needs_mapping = FALSE; for (i = 0; i < count; i++) { paddr = VM_PAGE_TO_PHYS(page[i]); if (__predict_false(!PHYS_IN_DMAP(paddr))) { error = vmem_alloc(kernel_arena, PAGE_SIZE, M_BESTFIT | M_WAITOK, &vaddr[i]); KASSERT(error == 0, ("vmem_alloc failed: %d", error)); needs_mapping = TRUE; } else { vaddr[i] = PHYS_TO_DMAP(paddr); } } /* Exit early if everything is covered by the DMAP */ if (!needs_mapping) return (FALSE); if (!can_fault) sched_pin(); for (i = 0; i < count; i++) { paddr = VM_PAGE_TO_PHYS(page[i]); if (!PHYS_IN_DMAP(paddr)) { panic( "pmap_map_io_transient: TODO: Map out of DMAP data"); } } return (needs_mapping); } void pmap_unmap_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, boolean_t can_fault) { vm_paddr_t paddr; int i; if (!can_fault) sched_unpin(); for (i = 0; i < count; i++) { paddr = VM_PAGE_TO_PHYS(page[i]); if (!PHYS_IN_DMAP(paddr)) { panic("ARM64TODO: pmap_unmap_io_transient: Unmap data"); } } } boolean_t pmap_is_valid_memattr(pmap_t pmap __unused, vm_memattr_t mode) { return (mode >= VM_MEMATTR_DEVICE && mode <= VM_MEMATTR_WRITE_THROUGH); } /* * Track a range of the kernel's virtual address space that is contiguous * in various mapping attributes. */ struct pmap_kernel_map_range { vm_offset_t sva; pt_entry_t attrs; int l3pages; int l3contig; int l2blocks; int l1blocks; }; static void sysctl_kmaps_dump(struct sbuf *sb, struct pmap_kernel_map_range *range, vm_offset_t eva) { const char *mode; int index; if (eva <= range->sva) return; index = range->attrs & ATTR_S1_IDX_MASK; switch (index) { case ATTR_S1_IDX(VM_MEMATTR_DEVICE): mode = "DEV"; break; case ATTR_S1_IDX(VM_MEMATTR_UNCACHEABLE): mode = "UC"; break; case ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK): mode = "WB"; break; case ATTR_S1_IDX(VM_MEMATTR_WRITE_THROUGH): mode = "WT"; break; default: printf( "%s: unknown memory type %x for range 0x%016lx-0x%016lx\n", __func__, index, range->sva, eva); mode = "??"; break; } sbuf_printf(sb, "0x%016lx-0x%016lx r%c%c%c %3s %d %d %d %d\n", range->sva, eva, (range->attrs & ATTR_S1_AP_RW_BIT) == ATTR_S1_AP_RW ? 'w' : '-', (range->attrs & ATTR_S1_PXN) != 0 ? '-' : 'x', (range->attrs & ATTR_S1_AP_USER) != 0 ? 'u' : 's', mode, range->l1blocks, range->l2blocks, range->l3contig, range->l3pages); /* Reset to sentinel value. */ range->sva = 0xfffffffffffffffful; } /* * Determine whether the attributes specified by a page table entry match those * being tracked by the current range. */ static bool sysctl_kmaps_match(struct pmap_kernel_map_range *range, pt_entry_t attrs) { return (range->attrs == attrs); } static void sysctl_kmaps_reinit(struct pmap_kernel_map_range *range, vm_offset_t va, pt_entry_t attrs) { memset(range, 0, sizeof(*range)); range->sva = va; range->attrs = attrs; } /* * Given a leaf PTE, derive the mapping's attributes. If they do not match * those of the current run, dump the address range and its attributes, and * begin a new run. */ static void sysctl_kmaps_check(struct sbuf *sb, struct pmap_kernel_map_range *range, vm_offset_t va, pd_entry_t l0e, pd_entry_t l1e, pd_entry_t l2e, pt_entry_t l3e) { pt_entry_t attrs; attrs = l0e & (ATTR_S1_AP_MASK | ATTR_S1_XN); attrs |= l1e & (ATTR_S1_AP_MASK | ATTR_S1_XN); if ((l1e & ATTR_DESCR_MASK) == L1_BLOCK) attrs |= l1e & ATTR_S1_IDX_MASK; attrs |= l2e & (ATTR_S1_AP_MASK | ATTR_S1_XN); if ((l2e & ATTR_DESCR_MASK) == L2_BLOCK) attrs |= l2e & ATTR_S1_IDX_MASK; attrs |= l3e & (ATTR_S1_AP_MASK | ATTR_S1_XN | ATTR_S1_IDX_MASK); if (range->sva > va || !sysctl_kmaps_match(range, attrs)) { sysctl_kmaps_dump(sb, range, va); sysctl_kmaps_reinit(range, va, attrs); } } static int sysctl_kmaps(SYSCTL_HANDLER_ARGS) { struct pmap_kernel_map_range range; struct sbuf sbuf, *sb; pd_entry_t l0e, *l1, l1e, *l2, l2e; pt_entry_t *l3, l3e; vm_offset_t sva; vm_paddr_t pa; int error, i, j, k, l; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); sb = &sbuf; sbuf_new_for_sysctl(sb, NULL, PAGE_SIZE, req); /* Sentinel value. */ range.sva = 0xfffffffffffffffful; /* * Iterate over the kernel page tables without holding the kernel pmap * lock. Kernel page table pages are never freed, so at worst we will * observe inconsistencies in the output. */ for (sva = 0xffff000000000000ul, i = pmap_l0_index(sva); i < Ln_ENTRIES; i++) { if (i == pmap_l0_index(DMAP_MIN_ADDRESS)) sbuf_printf(sb, "\nDirect map:\n"); else if (i == pmap_l0_index(VM_MIN_KERNEL_ADDRESS)) sbuf_printf(sb, "\nKernel map:\n"); l0e = kernel_pmap->pm_l0[i]; if ((l0e & ATTR_DESCR_VALID) == 0) { sysctl_kmaps_dump(sb, &range, sva); sva += L0_SIZE; continue; } pa = l0e & ~ATTR_MASK; l1 = (pd_entry_t *)PHYS_TO_DMAP(pa); for (j = pmap_l1_index(sva); j < Ln_ENTRIES; j++) { l1e = l1[j]; if ((l1e & ATTR_DESCR_VALID) == 0) { sysctl_kmaps_dump(sb, &range, sva); sva += L1_SIZE; continue; } if ((l1e & ATTR_DESCR_MASK) == L1_BLOCK) { sysctl_kmaps_check(sb, &range, sva, l0e, l1e, 0, 0); range.l1blocks++; sva += L1_SIZE; continue; } pa = l1e & ~ATTR_MASK; l2 = (pd_entry_t *)PHYS_TO_DMAP(pa); for (k = pmap_l2_index(sva); k < Ln_ENTRIES; k++) { l2e = l2[k]; if ((l2e & ATTR_DESCR_VALID) == 0) { sysctl_kmaps_dump(sb, &range, sva); sva += L2_SIZE; continue; } if ((l2e & ATTR_DESCR_MASK) == L2_BLOCK) { sysctl_kmaps_check(sb, &range, sva, l0e, l1e, l2e, 0); range.l2blocks++; sva += L2_SIZE; continue; } pa = l2e & ~ATTR_MASK; l3 = (pt_entry_t *)PHYS_TO_DMAP(pa); for (l = pmap_l3_index(sva); l < Ln_ENTRIES; l++, sva += L3_SIZE) { l3e = l3[l]; if ((l3e & ATTR_DESCR_VALID) == 0) { sysctl_kmaps_dump(sb, &range, sva); continue; } sysctl_kmaps_check(sb, &range, sva, l0e, l1e, l2e, l3e); if ((l3e & ATTR_CONTIGUOUS) != 0) range.l3contig += l % 16 == 0 ? 1 : 0; else range.l3pages++; } } } } error = sbuf_finish(sb); sbuf_delete(sb); return (error); } SYSCTL_OID(_vm_pmap, OID_AUTO, kernel_maps, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_kmaps, "A", "Dump kernel address layout"); Index: head/sys/arm64/arm64/uma_machdep.c =================================================================== --- head/sys/arm64/arm64/uma_machdep.c (revision 366710) +++ head/sys/arm64/arm64/uma_machdep.c (revision 366711) @@ -1,77 +1,78 @@ /*- * Copyright (c) 2003 Alan L. Cox * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include +#include #include #include +#include #include #include #include -#include void * uma_small_alloc(uma_zone_t zone, vm_size_t bytes, int domain, u_int8_t *flags, int wait) { vm_page_t m; vm_paddr_t pa; void *va; *flags = UMA_SLAB_PRIV; m = vm_page_alloc_domain(NULL, 0, domain, malloc2vm_flags(wait) | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED); if (m == NULL) return (NULL); pa = m->phys_addr; if ((wait & M_NODUMP) == 0) dump_add_page(pa); va = (void *)PHYS_TO_DMAP(pa); if ((wait & M_ZERO) && (m->flags & PG_ZERO) == 0) bzero(va, PAGE_SIZE); return (va); } void uma_small_free(void *mem, vm_size_t size, u_int8_t flags) { vm_page_t m; vm_paddr_t pa; pa = DMAP_TO_PHYS((vm_offset_t)mem); dump_drop_page(pa); m = PHYS_TO_VM_PAGE(pa); vm_page_unwire_noq(m); vm_page_free(m); } Index: head/sys/i386/i386/machdep.c =================================================================== --- head/sys/i386/i386/machdep.c (revision 366710) +++ head/sys/i386/i386/machdep.c (revision 366711) @@ -1,3257 +1,3258 @@ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (c) 2018 The FreeBSD Foundation * Copyright (c) 1992 Terrence R. Lambert. * Copyright (c) 1982, 1987, 1990 The Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * William Jolitz. * * Portions of this software were developed by A. Joseph Koshy under * sponsorship from the FreeBSD Foundation and Google, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91 */ #include __FBSDID("$FreeBSD$"); #include "opt_apic.h" #include "opt_atpic.h" #include "opt_cpu.h" #include "opt_ddb.h" #include "opt_inet.h" #include "opt_isa.h" #include "opt_kstack_pages.h" #include "opt_maxmem.h" #include "opt_mp_watchdog.h" #include "opt_perfmon.h" #include "opt_platform.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #include #include -#include #include +#include #ifdef DDB #ifndef KDB #error KDB must be enabled in order for DDB to work! #endif #include #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef PERFMON #include #endif #ifdef SMP #include #endif #ifdef FDT #include #endif #ifdef DEV_APIC #include #endif #ifdef DEV_ISA #include #endif /* Sanity check for __curthread() */ CTASSERT(offsetof(struct pcpu, pc_curthread) == 0); register_t init386(int first); void dblfault_handler(void); void identify_cpu(void); static void cpu_startup(void *); static void fpstate_drop(struct thread *td); static void get_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpusave, size_t xfpusave_len); static int set_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpustate, size_t xfpustate_len); SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL); /* Intel ICH registers */ #define ICH_PMBASE 0x400 #define ICH_SMI_EN ICH_PMBASE + 0x30 int _udatasel, _ucodesel; u_int basemem; static int above4g_allow = 1; static int above24g_allow = 0; int cold = 1; #ifdef COMPAT_43 static void osendsig(sig_t catcher, ksiginfo_t *, sigset_t *mask); #endif #ifdef COMPAT_FREEBSD4 static void freebsd4_sendsig(sig_t catcher, ksiginfo_t *, sigset_t *mask); #endif long Maxmem = 0; long realmem = 0; #ifdef PAE FEATURE(pae, "Physical Address Extensions"); #endif struct kva_md_info kmi; static struct trapframe proc0_tf; struct pcpu __pcpu[MAXCPU]; struct mtx icu_lock; struct mem_range_softc mem_range_softc; extern char start_exceptions[], end_exceptions[]; extern struct sysentvec elf32_freebsd_sysvec; /* Default init_ops implementation. */ struct init_ops init_ops = { .early_clock_source_init = i8254_init, .early_delay = i8254_delay, #ifdef DEV_APIC .msi_init = msi_init, #endif }; static void cpu_startup(dummy) void *dummy; { uintmax_t memsize; char *sysenv; /* * On MacBooks, we need to disallow the legacy USB circuit to * generate an SMI# because this can cause several problems, * namely: incorrect CPU frequency detection and failure to * start the APs. * We do this by disabling a bit in the SMI_EN (SMI Control and * Enable register) of the Intel ICH LPC Interface Bridge. */ sysenv = kern_getenv("smbios.system.product"); if (sysenv != NULL) { if (strncmp(sysenv, "MacBook1,1", 10) == 0 || strncmp(sysenv, "MacBook3,1", 10) == 0 || strncmp(sysenv, "MacBook4,1", 10) == 0 || strncmp(sysenv, "MacBookPro1,1", 13) == 0 || strncmp(sysenv, "MacBookPro1,2", 13) == 0 || strncmp(sysenv, "MacBookPro3,1", 13) == 0 || strncmp(sysenv, "MacBookPro4,1", 13) == 0 || strncmp(sysenv, "Macmini1,1", 10) == 0) { if (bootverbose) printf("Disabling LEGACY_USB_EN bit on " "Intel ICH.\n"); outl(ICH_SMI_EN, inl(ICH_SMI_EN) & ~0x8); } freeenv(sysenv); } /* * Good {morning,afternoon,evening,night}. */ startrtclock(); printcpuinfo(); panicifcpuunsupported(); #ifdef PERFMON perfmon_init(); #endif /* * Display physical memory if SMBIOS reports reasonable amount. */ memsize = 0; sysenv = kern_getenv("smbios.memory.enabled"); if (sysenv != NULL) { memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10; freeenv(sysenv); } if (memsize < ptoa((uintmax_t)vm_free_count())) memsize = ptoa((uintmax_t)Maxmem); printf("real memory = %ju (%ju MB)\n", memsize, memsize >> 20); realmem = atop(memsize); /* * Display any holes after the first chunk of extended memory. */ if (bootverbose) { int indx; printf("Physical memory chunk(s):\n"); for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) { vm_paddr_t size; size = phys_avail[indx + 1] - phys_avail[indx]; printf( "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n", (uintmax_t)phys_avail[indx], (uintmax_t)phys_avail[indx + 1] - 1, (uintmax_t)size, (uintmax_t)size / PAGE_SIZE); } } vm_ksubmap_init(&kmi); printf("avail memory = %ju (%ju MB)\n", ptoa((uintmax_t)vm_free_count()), ptoa((uintmax_t)vm_free_count()) / 1048576); /* * Set up buffers, so they can be used to read disk labels. */ bufinit(); vm_pager_bufferinit(); cpu_setregs(); } /* * Send an interrupt to process. * * Stack is set up to allow sigcode stored * at top to call routine, followed by call * to sigreturn routine below. After sigreturn * resets the signal mask, the stack, and the * frame pointer, it returns to the user * specified pc, psl. */ #ifdef COMPAT_43 static void osendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) { struct osigframe sf, *fp; struct proc *p; struct thread *td; struct sigacts *psp; struct trapframe *regs; int sig; int oonstack; td = curthread; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); sig = ksi->ksi_signo; psp = p->p_sigacts; mtx_assert(&psp->ps_mtx, MA_OWNED); regs = td->td_frame; oonstack = sigonstack(regs->tf_esp); /* Allocate space for the signal handler context. */ if ((td->td_pflags & TDP_ALTSTACK) && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { fp = (struct osigframe *)((uintptr_t)td->td_sigstk.ss_sp + td->td_sigstk.ss_size - sizeof(struct osigframe)); #if defined(COMPAT_43) td->td_sigstk.ss_flags |= SS_ONSTACK; #endif } else fp = (struct osigframe *)regs->tf_esp - 1; /* Build the argument list for the signal handler. */ sf.sf_signum = sig; sf.sf_scp = (register_t)&fp->sf_siginfo.si_sc; bzero(&sf.sf_siginfo, sizeof(sf.sf_siginfo)); if (SIGISMEMBER(psp->ps_siginfo, sig)) { /* Signal handler installed with SA_SIGINFO. */ sf.sf_arg2 = (register_t)&fp->sf_siginfo; sf.sf_siginfo.si_signo = sig; sf.sf_siginfo.si_code = ksi->ksi_code; sf.sf_ahu.sf_action = (__osiginfohandler_t *)catcher; sf.sf_addr = 0; } else { /* Old FreeBSD-style arguments. */ sf.sf_arg2 = ksi->ksi_code; sf.sf_addr = (register_t)ksi->ksi_addr; sf.sf_ahu.sf_handler = catcher; } mtx_unlock(&psp->ps_mtx); PROC_UNLOCK(p); /* Save most if not all of trap frame. */ sf.sf_siginfo.si_sc.sc_eax = regs->tf_eax; sf.sf_siginfo.si_sc.sc_ebx = regs->tf_ebx; sf.sf_siginfo.si_sc.sc_ecx = regs->tf_ecx; sf.sf_siginfo.si_sc.sc_edx = regs->tf_edx; sf.sf_siginfo.si_sc.sc_esi = regs->tf_esi; sf.sf_siginfo.si_sc.sc_edi = regs->tf_edi; sf.sf_siginfo.si_sc.sc_cs = regs->tf_cs; sf.sf_siginfo.si_sc.sc_ds = regs->tf_ds; sf.sf_siginfo.si_sc.sc_ss = regs->tf_ss; sf.sf_siginfo.si_sc.sc_es = regs->tf_es; sf.sf_siginfo.si_sc.sc_fs = regs->tf_fs; sf.sf_siginfo.si_sc.sc_gs = rgs(); sf.sf_siginfo.si_sc.sc_isp = regs->tf_isp; /* Build the signal context to be used by osigreturn(). */ sf.sf_siginfo.si_sc.sc_onstack = (oonstack) ? 1 : 0; SIG2OSIG(*mask, sf.sf_siginfo.si_sc.sc_mask); sf.sf_siginfo.si_sc.sc_sp = regs->tf_esp; sf.sf_siginfo.si_sc.sc_fp = regs->tf_ebp; sf.sf_siginfo.si_sc.sc_pc = regs->tf_eip; sf.sf_siginfo.si_sc.sc_ps = regs->tf_eflags; sf.sf_siginfo.si_sc.sc_trapno = regs->tf_trapno; sf.sf_siginfo.si_sc.sc_err = regs->tf_err; /* * If we're a vm86 process, we want to save the segment registers. * We also change eflags to be our emulated eflags, not the actual * eflags. */ if (regs->tf_eflags & PSL_VM) { /* XXX confusing names: `tf' isn't a trapframe; `regs' is. */ struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; struct vm86_kernel *vm86 = &td->td_pcb->pcb_ext->ext_vm86; sf.sf_siginfo.si_sc.sc_gs = tf->tf_vm86_gs; sf.sf_siginfo.si_sc.sc_fs = tf->tf_vm86_fs; sf.sf_siginfo.si_sc.sc_es = tf->tf_vm86_es; sf.sf_siginfo.si_sc.sc_ds = tf->tf_vm86_ds; if (vm86->vm86_has_vme == 0) sf.sf_siginfo.si_sc.sc_ps = (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) | (vm86->vm86_eflags & (PSL_VIF | PSL_VIP)); /* See sendsig() for comments. */ tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP); } /* * Copy the sigframe out to the user's stack. */ if (copyout(&sf, fp, sizeof(*fp)) != 0) { PROC_LOCK(p); sigexit(td, SIGILL); } regs->tf_esp = (int)fp; if (p->p_sysent->sv_sigcode_base != 0) { regs->tf_eip = p->p_sysent->sv_sigcode_base + szsigcode - szosigcode; } else { /* a.out sysentvec does not use shared page */ regs->tf_eip = p->p_sysent->sv_psstrings - szosigcode; } regs->tf_eflags &= ~(PSL_T | PSL_D); regs->tf_cs = _ucodesel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _udatasel; load_gs(_udatasel); regs->tf_ss = _udatasel; PROC_LOCK(p); mtx_lock(&psp->ps_mtx); } #endif /* COMPAT_43 */ #ifdef COMPAT_FREEBSD4 static void freebsd4_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) { struct sigframe4 sf, *sfp; struct proc *p; struct thread *td; struct sigacts *psp; struct trapframe *regs; int sig; int oonstack; td = curthread; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); sig = ksi->ksi_signo; psp = p->p_sigacts; mtx_assert(&psp->ps_mtx, MA_OWNED); regs = td->td_frame; oonstack = sigonstack(regs->tf_esp); /* Save user context. */ bzero(&sf, sizeof(sf)); sf.sf_uc.uc_sigmask = *mask; sf.sf_uc.uc_stack = td->td_sigstk; sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE; sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0; sf.sf_uc.uc_mcontext.mc_gs = rgs(); bcopy(regs, &sf.sf_uc.uc_mcontext.mc_fs, sizeof(*regs)); bzero(sf.sf_uc.uc_mcontext.mc_fpregs, sizeof(sf.sf_uc.uc_mcontext.mc_fpregs)); bzero(sf.sf_uc.uc_mcontext.__spare__, sizeof(sf.sf_uc.uc_mcontext.__spare__)); bzero(sf.sf_uc.__spare__, sizeof(sf.sf_uc.__spare__)); /* Allocate space for the signal handler context. */ if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { sfp = (struct sigframe4 *)((uintptr_t)td->td_sigstk.ss_sp + td->td_sigstk.ss_size - sizeof(struct sigframe4)); #if defined(COMPAT_43) td->td_sigstk.ss_flags |= SS_ONSTACK; #endif } else sfp = (struct sigframe4 *)regs->tf_esp - 1; /* Build the argument list for the signal handler. */ sf.sf_signum = sig; sf.sf_ucontext = (register_t)&sfp->sf_uc; bzero(&sf.sf_si, sizeof(sf.sf_si)); if (SIGISMEMBER(psp->ps_siginfo, sig)) { /* Signal handler installed with SA_SIGINFO. */ sf.sf_siginfo = (register_t)&sfp->sf_si; sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher; /* Fill in POSIX parts */ sf.sf_si.si_signo = sig; sf.sf_si.si_code = ksi->ksi_code; sf.sf_si.si_addr = ksi->ksi_addr; } else { /* Old FreeBSD-style arguments. */ sf.sf_siginfo = ksi->ksi_code; sf.sf_addr = (register_t)ksi->ksi_addr; sf.sf_ahu.sf_handler = catcher; } mtx_unlock(&psp->ps_mtx); PROC_UNLOCK(p); /* * If we're a vm86 process, we want to save the segment registers. * We also change eflags to be our emulated eflags, not the actual * eflags. */ if (regs->tf_eflags & PSL_VM) { struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; struct vm86_kernel *vm86 = &td->td_pcb->pcb_ext->ext_vm86; sf.sf_uc.uc_mcontext.mc_gs = tf->tf_vm86_gs; sf.sf_uc.uc_mcontext.mc_fs = tf->tf_vm86_fs; sf.sf_uc.uc_mcontext.mc_es = tf->tf_vm86_es; sf.sf_uc.uc_mcontext.mc_ds = tf->tf_vm86_ds; if (vm86->vm86_has_vme == 0) sf.sf_uc.uc_mcontext.mc_eflags = (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) | (vm86->vm86_eflags & (PSL_VIF | PSL_VIP)); /* * Clear PSL_NT to inhibit T_TSSFLT faults on return from * syscalls made by the signal handler. This just avoids * wasting time for our lazy fixup of such faults. PSL_NT * does nothing in vm86 mode, but vm86 programs can set it * almost legitimately in probes for old cpu types. */ tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP); } /* * Copy the sigframe out to the user's stack. */ if (copyout(&sf, sfp, sizeof(*sfp)) != 0) { PROC_LOCK(p); sigexit(td, SIGILL); } regs->tf_esp = (int)sfp; regs->tf_eip = p->p_sysent->sv_sigcode_base + szsigcode - szfreebsd4_sigcode; regs->tf_eflags &= ~(PSL_T | PSL_D); regs->tf_cs = _ucodesel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _udatasel; regs->tf_ss = _udatasel; PROC_LOCK(p); mtx_lock(&psp->ps_mtx); } #endif /* COMPAT_FREEBSD4 */ void sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) { struct sigframe sf, *sfp; struct proc *p; struct thread *td; struct sigacts *psp; char *sp; struct trapframe *regs; struct segment_descriptor *sdp; char *xfpusave; size_t xfpusave_len; int sig; int oonstack; td = curthread; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); sig = ksi->ksi_signo; psp = p->p_sigacts; mtx_assert(&psp->ps_mtx, MA_OWNED); #ifdef COMPAT_FREEBSD4 if (SIGISMEMBER(psp->ps_freebsd4, sig)) { freebsd4_sendsig(catcher, ksi, mask); return; } #endif #ifdef COMPAT_43 if (SIGISMEMBER(psp->ps_osigset, sig)) { osendsig(catcher, ksi, mask); return; } #endif regs = td->td_frame; oonstack = sigonstack(regs->tf_esp); if (cpu_max_ext_state_size > sizeof(union savefpu) && use_xsave) { xfpusave_len = cpu_max_ext_state_size - sizeof(union savefpu); xfpusave = __builtin_alloca(xfpusave_len); } else { xfpusave_len = 0; xfpusave = NULL; } /* Save user context. */ bzero(&sf, sizeof(sf)); sf.sf_uc.uc_sigmask = *mask; sf.sf_uc.uc_stack = td->td_sigstk; sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE; sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0; sf.sf_uc.uc_mcontext.mc_gs = rgs(); bcopy(regs, &sf.sf_uc.uc_mcontext.mc_fs, sizeof(*regs)); sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); /* magic */ get_fpcontext(td, &sf.sf_uc.uc_mcontext, xfpusave, xfpusave_len); fpstate_drop(td); /* * Unconditionally fill the fsbase and gsbase into the mcontext. */ sdp = &td->td_pcb->pcb_fsd; sf.sf_uc.uc_mcontext.mc_fsbase = sdp->sd_hibase << 24 | sdp->sd_lobase; sdp = &td->td_pcb->pcb_gsd; sf.sf_uc.uc_mcontext.mc_gsbase = sdp->sd_hibase << 24 | sdp->sd_lobase; bzero(sf.sf_uc.uc_mcontext.mc_spare2, sizeof(sf.sf_uc.uc_mcontext.mc_spare2)); /* Allocate space for the signal handler context. */ if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { sp = (char *)td->td_sigstk.ss_sp + td->td_sigstk.ss_size; #if defined(COMPAT_43) td->td_sigstk.ss_flags |= SS_ONSTACK; #endif } else sp = (char *)regs->tf_esp - 128; if (xfpusave != NULL) { sp -= xfpusave_len; sp = (char *)((unsigned int)sp & ~0x3F); sf.sf_uc.uc_mcontext.mc_xfpustate = (register_t)sp; } sp -= sizeof(struct sigframe); /* Align to 16 bytes. */ sfp = (struct sigframe *)((unsigned int)sp & ~0xF); /* Build the argument list for the signal handler. */ sf.sf_signum = sig; sf.sf_ucontext = (register_t)&sfp->sf_uc; bzero(&sf.sf_si, sizeof(sf.sf_si)); if (SIGISMEMBER(psp->ps_siginfo, sig)) { /* Signal handler installed with SA_SIGINFO. */ sf.sf_siginfo = (register_t)&sfp->sf_si; sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher; /* Fill in POSIX parts */ sf.sf_si = ksi->ksi_info; sf.sf_si.si_signo = sig; /* maybe a translated signal */ } else { /* Old FreeBSD-style arguments. */ sf.sf_siginfo = ksi->ksi_code; sf.sf_addr = (register_t)ksi->ksi_addr; sf.sf_ahu.sf_handler = catcher; } mtx_unlock(&psp->ps_mtx); PROC_UNLOCK(p); /* * If we're a vm86 process, we want to save the segment registers. * We also change eflags to be our emulated eflags, not the actual * eflags. */ if (regs->tf_eflags & PSL_VM) { struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; struct vm86_kernel *vm86 = &td->td_pcb->pcb_ext->ext_vm86; sf.sf_uc.uc_mcontext.mc_gs = tf->tf_vm86_gs; sf.sf_uc.uc_mcontext.mc_fs = tf->tf_vm86_fs; sf.sf_uc.uc_mcontext.mc_es = tf->tf_vm86_es; sf.sf_uc.uc_mcontext.mc_ds = tf->tf_vm86_ds; if (vm86->vm86_has_vme == 0) sf.sf_uc.uc_mcontext.mc_eflags = (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) | (vm86->vm86_eflags & (PSL_VIF | PSL_VIP)); /* * Clear PSL_NT to inhibit T_TSSFLT faults on return from * syscalls made by the signal handler. This just avoids * wasting time for our lazy fixup of such faults. PSL_NT * does nothing in vm86 mode, but vm86 programs can set it * almost legitimately in probes for old cpu types. */ tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP); } /* * Copy the sigframe out to the user's stack. */ if (copyout(&sf, sfp, sizeof(*sfp)) != 0 || (xfpusave != NULL && copyout(xfpusave, (void *)sf.sf_uc.uc_mcontext.mc_xfpustate, xfpusave_len) != 0)) { PROC_LOCK(p); sigexit(td, SIGILL); } regs->tf_esp = (int)sfp; regs->tf_eip = p->p_sysent->sv_sigcode_base; if (regs->tf_eip == 0) regs->tf_eip = p->p_sysent->sv_psstrings - szsigcode; regs->tf_eflags &= ~(PSL_T | PSL_D); regs->tf_cs = _ucodesel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _udatasel; regs->tf_ss = _udatasel; PROC_LOCK(p); mtx_lock(&psp->ps_mtx); } /* * System call to cleanup state after a signal * has been taken. Reset signal mask and * stack state from context left by sendsig (above). * Return to previous pc and psl as specified by * context left by sendsig. Check carefully to * make sure that the user has not modified the * state to gain improper privileges. * * MPSAFE */ #ifdef COMPAT_43 int osigreturn(td, uap) struct thread *td; struct osigreturn_args /* { struct osigcontext *sigcntxp; } */ *uap; { struct osigcontext sc; struct trapframe *regs; struct osigcontext *scp; int eflags, error; ksiginfo_t ksi; regs = td->td_frame; error = copyin(uap->sigcntxp, &sc, sizeof(sc)); if (error != 0) return (error); scp = ≻ eflags = scp->sc_ps; if (eflags & PSL_VM) { struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; struct vm86_kernel *vm86; /* * if pcb_ext == 0 or vm86_inited == 0, the user hasn't * set up the vm86 area, and we can't enter vm86 mode. */ if (td->td_pcb->pcb_ext == 0) return (EINVAL); vm86 = &td->td_pcb->pcb_ext->ext_vm86; if (vm86->vm86_inited == 0) return (EINVAL); /* Go back to user mode if both flags are set. */ if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) { ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; ksi.ksi_addr = (void *)regs->tf_eip; trapsignal(td, &ksi); } if (vm86->vm86_has_vme) { eflags = (tf->tf_eflags & ~VME_USERCHANGE) | (eflags & VME_USERCHANGE) | PSL_VM; } else { vm86->vm86_eflags = eflags; /* save VIF, VIP */ eflags = (tf->tf_eflags & ~VM_USERCHANGE) | (eflags & VM_USERCHANGE) | PSL_VM; } tf->tf_vm86_ds = scp->sc_ds; tf->tf_vm86_es = scp->sc_es; tf->tf_vm86_fs = scp->sc_fs; tf->tf_vm86_gs = scp->sc_gs; tf->tf_ds = _udatasel; tf->tf_es = _udatasel; tf->tf_fs = _udatasel; } else { /* * Don't allow users to change privileged or reserved flags. */ if (!EFL_SECURE(eflags, regs->tf_eflags)) { return (EINVAL); } /* * Don't allow users to load a valid privileged %cs. Let the * hardware check for invalid selectors, excess privilege in * other selectors, invalid %eip's and invalid %esp's. */ if (!CS_SECURE(scp->sc_cs)) { ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; ksi.ksi_trapno = T_PROTFLT; ksi.ksi_addr = (void *)regs->tf_eip; trapsignal(td, &ksi); return (EINVAL); } regs->tf_ds = scp->sc_ds; regs->tf_es = scp->sc_es; regs->tf_fs = scp->sc_fs; } /* Restore remaining registers. */ regs->tf_eax = scp->sc_eax; regs->tf_ebx = scp->sc_ebx; regs->tf_ecx = scp->sc_ecx; regs->tf_edx = scp->sc_edx; regs->tf_esi = scp->sc_esi; regs->tf_edi = scp->sc_edi; regs->tf_cs = scp->sc_cs; regs->tf_ss = scp->sc_ss; regs->tf_isp = scp->sc_isp; regs->tf_ebp = scp->sc_fp; regs->tf_esp = scp->sc_sp; regs->tf_eip = scp->sc_pc; regs->tf_eflags = eflags; #if defined(COMPAT_43) if (scp->sc_onstack & 1) td->td_sigstk.ss_flags |= SS_ONSTACK; else td->td_sigstk.ss_flags &= ~SS_ONSTACK; #endif kern_sigprocmask(td, SIG_SETMASK, (sigset_t *)&scp->sc_mask, NULL, SIGPROCMASK_OLD); return (EJUSTRETURN); } #endif /* COMPAT_43 */ #ifdef COMPAT_FREEBSD4 /* * MPSAFE */ int freebsd4_sigreturn(td, uap) struct thread *td; struct freebsd4_sigreturn_args /* { const ucontext4 *sigcntxp; } */ *uap; { struct ucontext4 uc; struct trapframe *regs; struct ucontext4 *ucp; int cs, eflags, error; ksiginfo_t ksi; error = copyin(uap->sigcntxp, &uc, sizeof(uc)); if (error != 0) return (error); ucp = &uc; regs = td->td_frame; eflags = ucp->uc_mcontext.mc_eflags; if (eflags & PSL_VM) { struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; struct vm86_kernel *vm86; /* * if pcb_ext == 0 or vm86_inited == 0, the user hasn't * set up the vm86 area, and we can't enter vm86 mode. */ if (td->td_pcb->pcb_ext == 0) return (EINVAL); vm86 = &td->td_pcb->pcb_ext->ext_vm86; if (vm86->vm86_inited == 0) return (EINVAL); /* Go back to user mode if both flags are set. */ if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) { ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; ksi.ksi_addr = (void *)regs->tf_eip; trapsignal(td, &ksi); } if (vm86->vm86_has_vme) { eflags = (tf->tf_eflags & ~VME_USERCHANGE) | (eflags & VME_USERCHANGE) | PSL_VM; } else { vm86->vm86_eflags = eflags; /* save VIF, VIP */ eflags = (tf->tf_eflags & ~VM_USERCHANGE) | (eflags & VM_USERCHANGE) | PSL_VM; } bcopy(&ucp->uc_mcontext.mc_fs, tf, sizeof(struct trapframe)); tf->tf_eflags = eflags; tf->tf_vm86_ds = tf->tf_ds; tf->tf_vm86_es = tf->tf_es; tf->tf_vm86_fs = tf->tf_fs; tf->tf_vm86_gs = ucp->uc_mcontext.mc_gs; tf->tf_ds = _udatasel; tf->tf_es = _udatasel; tf->tf_fs = _udatasel; } else { /* * Don't allow users to change privileged or reserved flags. */ if (!EFL_SECURE(eflags, regs->tf_eflags)) { uprintf("pid %d (%s): freebsd4_sigreturn eflags = 0x%x\n", td->td_proc->p_pid, td->td_name, eflags); return (EINVAL); } /* * Don't allow users to load a valid privileged %cs. Let the * hardware check for invalid selectors, excess privilege in * other selectors, invalid %eip's and invalid %esp's. */ cs = ucp->uc_mcontext.mc_cs; if (!CS_SECURE(cs)) { uprintf("pid %d (%s): freebsd4_sigreturn cs = 0x%x\n", td->td_proc->p_pid, td->td_name, cs); ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; ksi.ksi_trapno = T_PROTFLT; ksi.ksi_addr = (void *)regs->tf_eip; trapsignal(td, &ksi); return (EINVAL); } bcopy(&ucp->uc_mcontext.mc_fs, regs, sizeof(*regs)); } #if defined(COMPAT_43) if (ucp->uc_mcontext.mc_onstack & 1) td->td_sigstk.ss_flags |= SS_ONSTACK; else td->td_sigstk.ss_flags &= ~SS_ONSTACK; #endif kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0); return (EJUSTRETURN); } #endif /* COMPAT_FREEBSD4 */ /* * MPSAFE */ int sys_sigreturn(td, uap) struct thread *td; struct sigreturn_args /* { const struct __ucontext *sigcntxp; } */ *uap; { ucontext_t uc; struct proc *p; struct trapframe *regs; ucontext_t *ucp; char *xfpustate; size_t xfpustate_len; int cs, eflags, error, ret; ksiginfo_t ksi; p = td->td_proc; error = copyin(uap->sigcntxp, &uc, sizeof(uc)); if (error != 0) return (error); ucp = &uc; if ((ucp->uc_mcontext.mc_flags & ~_MC_FLAG_MASK) != 0) { uprintf("pid %d (%s): sigreturn mc_flags %x\n", p->p_pid, td->td_name, ucp->uc_mcontext.mc_flags); return (EINVAL); } regs = td->td_frame; eflags = ucp->uc_mcontext.mc_eflags; if (eflags & PSL_VM) { struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; struct vm86_kernel *vm86; /* * if pcb_ext == 0 or vm86_inited == 0, the user hasn't * set up the vm86 area, and we can't enter vm86 mode. */ if (td->td_pcb->pcb_ext == 0) return (EINVAL); vm86 = &td->td_pcb->pcb_ext->ext_vm86; if (vm86->vm86_inited == 0) return (EINVAL); /* Go back to user mode if both flags are set. */ if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) { ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; ksi.ksi_addr = (void *)regs->tf_eip; trapsignal(td, &ksi); } if (vm86->vm86_has_vme) { eflags = (tf->tf_eflags & ~VME_USERCHANGE) | (eflags & VME_USERCHANGE) | PSL_VM; } else { vm86->vm86_eflags = eflags; /* save VIF, VIP */ eflags = (tf->tf_eflags & ~VM_USERCHANGE) | (eflags & VM_USERCHANGE) | PSL_VM; } bcopy(&ucp->uc_mcontext.mc_fs, tf, sizeof(struct trapframe)); tf->tf_eflags = eflags; tf->tf_vm86_ds = tf->tf_ds; tf->tf_vm86_es = tf->tf_es; tf->tf_vm86_fs = tf->tf_fs; tf->tf_vm86_gs = ucp->uc_mcontext.mc_gs; tf->tf_ds = _udatasel; tf->tf_es = _udatasel; tf->tf_fs = _udatasel; } else { /* * Don't allow users to change privileged or reserved flags. */ if (!EFL_SECURE(eflags, regs->tf_eflags)) { uprintf("pid %d (%s): sigreturn eflags = 0x%x\n", td->td_proc->p_pid, td->td_name, eflags); return (EINVAL); } /* * Don't allow users to load a valid privileged %cs. Let the * hardware check for invalid selectors, excess privilege in * other selectors, invalid %eip's and invalid %esp's. */ cs = ucp->uc_mcontext.mc_cs; if (!CS_SECURE(cs)) { uprintf("pid %d (%s): sigreturn cs = 0x%x\n", td->td_proc->p_pid, td->td_name, cs); ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; ksi.ksi_trapno = T_PROTFLT; ksi.ksi_addr = (void *)regs->tf_eip; trapsignal(td, &ksi); return (EINVAL); } if ((uc.uc_mcontext.mc_flags & _MC_HASFPXSTATE) != 0) { xfpustate_len = uc.uc_mcontext.mc_xfpustate_len; if (xfpustate_len > cpu_max_ext_state_size - sizeof(union savefpu)) { uprintf( "pid %d (%s): sigreturn xfpusave_len = 0x%zx\n", p->p_pid, td->td_name, xfpustate_len); return (EINVAL); } xfpustate = __builtin_alloca(xfpustate_len); error = copyin((const void *)uc.uc_mcontext.mc_xfpustate, xfpustate, xfpustate_len); if (error != 0) { uprintf( "pid %d (%s): sigreturn copying xfpustate failed\n", p->p_pid, td->td_name); return (error); } } else { xfpustate = NULL; xfpustate_len = 0; } ret = set_fpcontext(td, &ucp->uc_mcontext, xfpustate, xfpustate_len); if (ret != 0) return (ret); bcopy(&ucp->uc_mcontext.mc_fs, regs, sizeof(*regs)); } #if defined(COMPAT_43) if (ucp->uc_mcontext.mc_onstack & 1) td->td_sigstk.ss_flags |= SS_ONSTACK; else td->td_sigstk.ss_flags &= ~SS_ONSTACK; #endif kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0); return (EJUSTRETURN); } #ifdef COMPAT_43 static void setup_priv_lcall_gate(struct proc *p) { struct i386_ldt_args uap; union descriptor desc; u_int lcall_addr; bzero(&uap, sizeof(uap)); uap.start = 0; uap.num = 1; lcall_addr = p->p_sysent->sv_psstrings - sz_lcall_tramp; bzero(&desc, sizeof(desc)); desc.sd.sd_type = SDT_MEMERA; desc.sd.sd_dpl = SEL_UPL; desc.sd.sd_p = 1; desc.sd.sd_def32 = 1; desc.sd.sd_gran = 1; desc.sd.sd_lolimit = 0xffff; desc.sd.sd_hilimit = 0xf; desc.sd.sd_lobase = lcall_addr; desc.sd.sd_hibase = lcall_addr >> 24; i386_set_ldt(curthread, &uap, &desc); } #endif /* * Reset registers to default values on exec. */ void exec_setregs(struct thread *td, struct image_params *imgp, uintptr_t stack) { struct trapframe *regs; struct pcb *pcb; register_t saved_eflags; regs = td->td_frame; pcb = td->td_pcb; /* Reset pc->pcb_gs and %gs before possibly invalidating it. */ pcb->pcb_gs = _udatasel; load_gs(_udatasel); mtx_lock_spin(&dt_lock); if (td->td_proc->p_md.md_ldt != NULL) user_ldt_free(td); else mtx_unlock_spin(&dt_lock); #ifdef COMPAT_43 if (td->td_proc->p_sysent->sv_psstrings != elf32_freebsd_sysvec.sv_psstrings) setup_priv_lcall_gate(td->td_proc); #endif /* * Reset the fs and gs bases. The values from the old address * space do not make sense for the new program. In particular, * gsbase might be the TLS base for the old program but the new * program has no TLS now. */ set_fsbase(td, 0); set_gsbase(td, 0); /* Make sure edx is 0x0 on entry. Linux binaries depend on it. */ saved_eflags = regs->tf_eflags & PSL_T; bzero((char *)regs, sizeof(struct trapframe)); regs->tf_eip = imgp->entry_addr; regs->tf_esp = stack; regs->tf_eflags = PSL_USER | saved_eflags; regs->tf_ss = _udatasel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _udatasel; regs->tf_cs = _ucodesel; /* PS_STRINGS value for BSD/OS binaries. It is 0 for non-BSD/OS. */ regs->tf_ebx = (register_t)imgp->ps_strings; /* * Reset the hardware debug registers if they were in use. * They won't have any meaning for the newly exec'd process. */ if (pcb->pcb_flags & PCB_DBREGS) { pcb->pcb_dr0 = 0; pcb->pcb_dr1 = 0; pcb->pcb_dr2 = 0; pcb->pcb_dr3 = 0; pcb->pcb_dr6 = 0; pcb->pcb_dr7 = 0; if (pcb == curpcb) { /* * Clear the debug registers on the running * CPU, otherwise they will end up affecting * the next process we switch to. */ reset_dbregs(); } pcb->pcb_flags &= ~PCB_DBREGS; } pcb->pcb_initial_npxcw = __INITIAL_NPXCW__; /* * Drop the FP state if we hold it, so that the process gets a * clean FP state if it uses the FPU again. */ fpstate_drop(td); } void cpu_setregs(void) { unsigned int cr0; cr0 = rcr0(); /* * CR0_MP, CR0_NE and CR0_TS are set for NPX (FPU) support: * * Prepare to trap all ESC (i.e., NPX) instructions and all WAIT * instructions. We must set the CR0_MP bit and use the CR0_TS * bit to control the trap, because setting the CR0_EM bit does * not cause WAIT instructions to trap. It's important to trap * WAIT instructions - otherwise the "wait" variants of no-wait * control instructions would degenerate to the "no-wait" variants * after FP context switches but work correctly otherwise. It's * particularly important to trap WAITs when there is no NPX - * otherwise the "wait" variants would always degenerate. * * Try setting CR0_NE to get correct error reporting on 486DX's. * Setting it should fail or do nothing on lesser processors. */ cr0 |= CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM; load_cr0(cr0); load_gs(_udatasel); } u_long bootdev; /* not a struct cdev *- encoding is different */ SYSCTL_ULONG(_machdep, OID_AUTO, guessed_bootdev, CTLFLAG_RD, &bootdev, 0, "Maybe the Boot device (not in struct cdev *format)"); static char bootmethod[16] = "BIOS"; SYSCTL_STRING(_machdep, OID_AUTO, bootmethod, CTLFLAG_RD, bootmethod, 0, "System firmware boot method"); /* * Initialize 386 and configure to run kernel */ /* * Initialize segments & interrupt table */ int _default_ldt; struct mtx dt_lock; /* lock for GDT and LDT */ union descriptor gdt0[NGDT]; /* initial global descriptor table */ union descriptor *gdt = gdt0; /* global descriptor table */ union descriptor *ldt; /* local descriptor table */ static struct gate_descriptor idt0[NIDT]; struct gate_descriptor *idt = &idt0[0]; /* interrupt descriptor table */ static struct i386tss *dblfault_tss; static char *dblfault_stack; static struct i386tss common_tss0; vm_offset_t proc0kstack; /* * software prototypes -- in more palatable form. * * GCODE_SEL through GUDATA_SEL must be in this order for syscall/sysret * GUFS_SEL and GUGS_SEL must be in this order (swtch.s knows it) */ struct soft_segment_descriptor gdt_segs[] = { /* GNULL_SEL 0 Null Descriptor */ { .ssd_base = 0x0, .ssd_limit = 0x0, .ssd_type = 0, .ssd_dpl = SEL_KPL, .ssd_p = 0, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 0, .ssd_gran = 0 }, /* GPRIV_SEL 1 SMP Per-Processor Private Data Descriptor */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMRWA, .ssd_dpl = SEL_KPL, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 1, .ssd_gran = 1 }, /* GUFS_SEL 2 %fs Descriptor for user */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMRWA, .ssd_dpl = SEL_UPL, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 1, .ssd_gran = 1 }, /* GUGS_SEL 3 %gs Descriptor for user */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMRWA, .ssd_dpl = SEL_UPL, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 1, .ssd_gran = 1 }, /* GCODE_SEL 4 Code Descriptor for kernel */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMERA, .ssd_dpl = SEL_KPL, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 1, .ssd_gran = 1 }, /* GDATA_SEL 5 Data Descriptor for kernel */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMRWA, .ssd_dpl = SEL_KPL, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 1, .ssd_gran = 1 }, /* GUCODE_SEL 6 Code Descriptor for user */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMERA, .ssd_dpl = SEL_UPL, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 1, .ssd_gran = 1 }, /* GUDATA_SEL 7 Data Descriptor for user */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMRWA, .ssd_dpl = SEL_UPL, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 1, .ssd_gran = 1 }, /* GBIOSLOWMEM_SEL 8 BIOS access to realmode segment 0x40, must be #8 in GDT */ { .ssd_base = 0x400, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMRWA, .ssd_dpl = SEL_KPL, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 1, .ssd_gran = 1 }, /* GPROC0_SEL 9 Proc 0 Tss Descriptor */ { .ssd_base = 0x0, .ssd_limit = sizeof(struct i386tss)-1, .ssd_type = SDT_SYS386TSS, .ssd_dpl = 0, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 0, .ssd_gran = 0 }, /* GLDT_SEL 10 LDT Descriptor */ { .ssd_base = 0, .ssd_limit = sizeof(union descriptor) * NLDT - 1, .ssd_type = SDT_SYSLDT, .ssd_dpl = SEL_UPL, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 0, .ssd_gran = 0 }, /* GUSERLDT_SEL 11 User LDT Descriptor per process */ { .ssd_base = 0, .ssd_limit = (512 * sizeof(union descriptor)-1), .ssd_type = SDT_SYSLDT, .ssd_dpl = 0, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 0, .ssd_gran = 0 }, /* GPANIC_SEL 12 Panic Tss Descriptor */ { .ssd_base = 0, .ssd_limit = sizeof(struct i386tss)-1, .ssd_type = SDT_SYS386TSS, .ssd_dpl = 0, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 0, .ssd_gran = 0 }, /* GBIOSCODE32_SEL 13 BIOS 32-bit interface (32bit Code) */ { .ssd_base = 0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMERA, .ssd_dpl = 0, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 0, .ssd_gran = 1 }, /* GBIOSCODE16_SEL 14 BIOS 32-bit interface (16bit Code) */ { .ssd_base = 0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMERA, .ssd_dpl = 0, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 0, .ssd_gran = 1 }, /* GBIOSDATA_SEL 15 BIOS 32-bit interface (Data) */ { .ssd_base = 0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMRWA, .ssd_dpl = 0, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 1, .ssd_gran = 1 }, /* GBIOSUTIL_SEL 16 BIOS 16-bit interface (Utility) */ { .ssd_base = 0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMRWA, .ssd_dpl = 0, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 0, .ssd_gran = 1 }, /* GBIOSARGS_SEL 17 BIOS 16-bit interface (Arguments) */ { .ssd_base = 0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMRWA, .ssd_dpl = 0, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 0, .ssd_gran = 1 }, /* GNDIS_SEL 18 NDIS Descriptor */ { .ssd_base = 0x0, .ssd_limit = 0x0, .ssd_type = 0, .ssd_dpl = 0, .ssd_p = 0, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 0, .ssd_gran = 0 }, }; static struct soft_segment_descriptor ldt_segs[] = { /* Null Descriptor - overwritten by call gate */ { .ssd_base = 0x0, .ssd_limit = 0x0, .ssd_type = 0, .ssd_dpl = 0, .ssd_p = 0, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 0, .ssd_gran = 0 }, /* Null Descriptor - overwritten by call gate */ { .ssd_base = 0x0, .ssd_limit = 0x0, .ssd_type = 0, .ssd_dpl = 0, .ssd_p = 0, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 0, .ssd_gran = 0 }, /* Null Descriptor - overwritten by call gate */ { .ssd_base = 0x0, .ssd_limit = 0x0, .ssd_type = 0, .ssd_dpl = 0, .ssd_p = 0, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 0, .ssd_gran = 0 }, /* Code Descriptor for user */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMERA, .ssd_dpl = SEL_UPL, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 1, .ssd_gran = 1 }, /* Null Descriptor - overwritten by call gate */ { .ssd_base = 0x0, .ssd_limit = 0x0, .ssd_type = 0, .ssd_dpl = 0, .ssd_p = 0, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 0, .ssd_gran = 0 }, /* Data Descriptor for user */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMRWA, .ssd_dpl = SEL_UPL, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 1, .ssd_gran = 1 }, }; uintptr_t setidt_disp; void setidt(int idx, inthand_t *func, int typ, int dpl, int selec) { uintptr_t off; off = func != NULL ? (uintptr_t)func + setidt_disp : 0; setidt_nodisp(idx, off, typ, dpl, selec); } void setidt_nodisp(int idx, uintptr_t off, int typ, int dpl, int selec) { struct gate_descriptor *ip; ip = idt + idx; ip->gd_looffset = off; ip->gd_selector = selec; ip->gd_stkcpy = 0; ip->gd_xx = 0; ip->gd_type = typ; ip->gd_dpl = dpl; ip->gd_p = 1; ip->gd_hioffset = ((u_int)off) >> 16 ; } extern inthand_t IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl), IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm), IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot), IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align), IDTVEC(xmm), #ifdef KDTRACE_HOOKS IDTVEC(dtrace_ret), #endif #ifdef XENHVM IDTVEC(xen_intr_upcall), #endif IDTVEC(int0x80_syscall); #ifdef DDB /* * Display the index and function name of any IDT entries that don't use * the default 'rsvd' entry point. */ DB_SHOW_COMMAND(idt, db_show_idt) { struct gate_descriptor *ip; int idx; uintptr_t func, func_trm; bool trm; ip = idt; for (idx = 0; idx < NIDT && !db_pager_quit; idx++) { if (ip->gd_type == SDT_SYSTASKGT) { db_printf("%3d\t\n", idx); } else { func = (ip->gd_hioffset << 16 | ip->gd_looffset); if (func >= PMAP_TRM_MIN_ADDRESS) { func_trm = func; func -= setidt_disp; trm = true; } else trm = false; if (func != (uintptr_t)&IDTVEC(rsvd)) { db_printf("%3d\t", idx); db_printsym(func, DB_STGY_PROC); if (trm) db_printf(" (trampoline %#x)", func_trm); db_printf("\n"); } } ip++; } } /* Show privileged registers. */ DB_SHOW_COMMAND(sysregs, db_show_sysregs) { uint64_t idtr, gdtr; idtr = ridt(); db_printf("idtr\t0x%08x/%04x\n", (u_int)(idtr >> 16), (u_int)idtr & 0xffff); gdtr = rgdt(); db_printf("gdtr\t0x%08x/%04x\n", (u_int)(gdtr >> 16), (u_int)gdtr & 0xffff); db_printf("ldtr\t0x%04x\n", rldt()); db_printf("tr\t0x%04x\n", rtr()); db_printf("cr0\t0x%08x\n", rcr0()); db_printf("cr2\t0x%08x\n", rcr2()); db_printf("cr3\t0x%08x\n", rcr3()); db_printf("cr4\t0x%08x\n", rcr4()); if (rcr4() & CR4_XSAVE) db_printf("xcr0\t0x%016llx\n", rxcr(0)); if (amd_feature & (AMDID_NX | AMDID_LM)) db_printf("EFER\t0x%016llx\n", rdmsr(MSR_EFER)); if (cpu_feature2 & (CPUID2_VMX | CPUID2_SMX)) db_printf("FEATURES_CTL\t0x%016llx\n", rdmsr(MSR_IA32_FEATURE_CONTROL)); if (((cpu_vendor_id == CPU_VENDOR_INTEL || cpu_vendor_id == CPU_VENDOR_AMD) && CPUID_TO_FAMILY(cpu_id) >= 6) || cpu_vendor_id == CPU_VENDOR_HYGON) db_printf("DEBUG_CTL\t0x%016llx\n", rdmsr(MSR_DEBUGCTLMSR)); if (cpu_feature & CPUID_PAT) db_printf("PAT\t0x%016llx\n", rdmsr(MSR_PAT)); } DB_SHOW_COMMAND(dbregs, db_show_dbregs) { db_printf("dr0\t0x%08x\n", rdr0()); db_printf("dr1\t0x%08x\n", rdr1()); db_printf("dr2\t0x%08x\n", rdr2()); db_printf("dr3\t0x%08x\n", rdr3()); db_printf("dr6\t0x%08x\n", rdr6()); db_printf("dr7\t0x%08x\n", rdr7()); } DB_SHOW_COMMAND(frame, db_show_frame) { struct trapframe *frame; frame = have_addr ? (struct trapframe *)addr : curthread->td_frame; printf("ss %#x esp %#x efl %#x cs %#x eip %#x\n", frame->tf_ss, frame->tf_esp, frame->tf_eflags, frame->tf_cs, frame->tf_eip); printf("err %#x trapno %d\n", frame->tf_err, frame->tf_trapno); printf("ds %#x es %#x fs %#x\n", frame->tf_ds, frame->tf_es, frame->tf_fs); printf("eax %#x ecx %#x edx %#x ebx %#x\n", frame->tf_eax, frame->tf_ecx, frame->tf_edx, frame->tf_ebx); printf("ebp %#x esi %#x edi %#x\n", frame->tf_ebp, frame->tf_esi, frame->tf_edi); } #endif void sdtossd(sd, ssd) struct segment_descriptor *sd; struct soft_segment_descriptor *ssd; { ssd->ssd_base = (sd->sd_hibase << 24) | sd->sd_lobase; ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit; ssd->ssd_type = sd->sd_type; ssd->ssd_dpl = sd->sd_dpl; ssd->ssd_p = sd->sd_p; ssd->ssd_def32 = sd->sd_def32; ssd->ssd_gran = sd->sd_gran; } static int add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap, int *physmap_idxp) { uint64_t lim, ign; int i, insert_idx, physmap_idx; physmap_idx = *physmap_idxp; if (length == 0) return (1); lim = 0x100000000; /* 4G */ if (pae_mode && above4g_allow) lim = above24g_allow ? -1ULL : 0x600000000; /* 24G */ if (base >= lim) { printf("%uK of memory above %uGB ignored, pae %d " "above4g_allow %d above24g_allow %d\n", (u_int)(length / 1024), (u_int)(lim >> 30), pae_mode, above4g_allow, above24g_allow); return (1); } if (base + length >= lim) { ign = base + length - lim; length -= ign; printf("%uK of memory above %uGB ignored, pae %d " "above4g_allow %d above24g_allow %d\n", (u_int)(ign / 1024), (u_int)(lim >> 30), pae_mode, above4g_allow, above24g_allow); } /* * Find insertion point while checking for overlap. Start off by * assuming the new entry will be added to the end. */ insert_idx = physmap_idx + 2; for (i = 0; i <= physmap_idx; i += 2) { if (base < physmap[i + 1]) { if (base + length <= physmap[i]) { insert_idx = i; break; } if (boothowto & RB_VERBOSE) printf( "Overlapping memory regions, ignoring second region\n"); return (1); } } /* See if we can prepend to the next entry. */ if (insert_idx <= physmap_idx && base + length == physmap[insert_idx]) { physmap[insert_idx] = base; return (1); } /* See if we can append to the previous entry. */ if (insert_idx > 0 && base == physmap[insert_idx - 1]) { physmap[insert_idx - 1] += length; return (1); } physmap_idx += 2; *physmap_idxp = physmap_idx; if (physmap_idx == PHYS_AVAIL_ENTRIES) { printf( "Too many segments in the physical address map, giving up\n"); return (0); } /* * Move the last 'N' entries down to make room for the new * entry if needed. */ for (i = physmap_idx; i > insert_idx; i -= 2) { physmap[i] = physmap[i - 2]; physmap[i + 1] = physmap[i - 1]; } /* Insert the new entry. */ physmap[insert_idx] = base; physmap[insert_idx + 1] = base + length; return (1); } static int add_smap_entry(struct bios_smap *smap, vm_paddr_t *physmap, int *physmap_idxp) { if (boothowto & RB_VERBOSE) printf("SMAP type=%02x base=%016llx len=%016llx\n", smap->type, smap->base, smap->length); if (smap->type != SMAP_TYPE_MEMORY) return (1); return (add_physmap_entry(smap->base, smap->length, physmap, physmap_idxp)); } static void add_smap_entries(struct bios_smap *smapbase, vm_paddr_t *physmap, int *physmap_idxp) { struct bios_smap *smap, *smapend; u_int32_t smapsize; /* * Memory map from INT 15:E820. * * subr_module.c says: * "Consumer may safely assume that size value precedes data." * ie: an int32_t immediately precedes SMAP. */ smapsize = *((u_int32_t *)smapbase - 1); smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize); for (smap = smapbase; smap < smapend; smap++) if (!add_smap_entry(smap, physmap, physmap_idxp)) break; } static void basemem_setup(void) { if (basemem > 640) { printf("Preposterous BIOS basemem of %uK, truncating to 640K\n", basemem); basemem = 640; } pmap_basemem_setup(basemem); } /* * Populate the (physmap) array with base/bound pairs describing the * available physical memory in the system, then test this memory and * build the phys_avail array describing the actually-available memory. * * If we cannot accurately determine the physical memory map, then use * value from the 0xE801 call, and failing that, the RTC. * * Total memory size may be set by the kernel environment variable * hw.physmem or the compile-time define MAXMEM. * * XXX first should be vm_paddr_t. */ static void getmemsize(int first) { int has_smap, off, physmap_idx, pa_indx, da_indx; u_long memtest; vm_paddr_t physmap[PHYS_AVAIL_ENTRIES]; quad_t dcons_addr, dcons_size, physmem_tunable; int hasbrokenint12, i, res; u_int extmem; struct vm86frame vmf; struct vm86context vmc; vm_paddr_t pa; struct bios_smap *smap, *smapbase; caddr_t kmdp; has_smap = 0; bzero(&vmf, sizeof(vmf)); bzero(physmap, sizeof(physmap)); basemem = 0; /* * Tell the physical memory allocator about pages used to store * the kernel and preloaded data. See kmem_bootstrap_free(). */ vm_phys_early_add_seg((vm_paddr_t)KERNLOAD, trunc_page(first)); TUNABLE_INT_FETCH("hw.above4g_allow", &above4g_allow); TUNABLE_INT_FETCH("hw.above24g_allow", &above24g_allow); /* * Check if the loader supplied an SMAP memory map. If so, * use that and do not make any VM86 calls. */ physmap_idx = 0; kmdp = preload_search_by_type("elf kernel"); if (kmdp == NULL) kmdp = preload_search_by_type("elf32 kernel"); smapbase = (struct bios_smap *)preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_SMAP); if (smapbase != NULL) { add_smap_entries(smapbase, physmap, &physmap_idx); has_smap = 1; goto have_smap; } /* * Some newer BIOSes have a broken INT 12H implementation * which causes a kernel panic immediately. In this case, we * need use the SMAP to determine the base memory size. */ hasbrokenint12 = 0; TUNABLE_INT_FETCH("hw.hasbrokenint12", &hasbrokenint12); if (hasbrokenint12 == 0) { /* Use INT12 to determine base memory size. */ vm86_intcall(0x12, &vmf); basemem = vmf.vmf_ax; basemem_setup(); } /* * Fetch the memory map with INT 15:E820. Map page 1 R/W into * the kernel page table so we can use it as a buffer. The * kernel will unmap this page later. */ vmc.npages = 0; smap = (void *)vm86_addpage(&vmc, 1, PMAP_MAP_LOW + ptoa(1)); res = vm86_getptr(&vmc, (vm_offset_t)smap, &vmf.vmf_es, &vmf.vmf_di); KASSERT(res != 0, ("vm86_getptr() failed: address not found")); vmf.vmf_ebx = 0; do { vmf.vmf_eax = 0xE820; vmf.vmf_edx = SMAP_SIG; vmf.vmf_ecx = sizeof(struct bios_smap); i = vm86_datacall(0x15, &vmf, &vmc); if (i || vmf.vmf_eax != SMAP_SIG) break; has_smap = 1; if (!add_smap_entry(smap, physmap, &physmap_idx)) break; } while (vmf.vmf_ebx != 0); have_smap: /* * If we didn't fetch the "base memory" size from INT12, * figure it out from the SMAP (or just guess). */ if (basemem == 0) { for (i = 0; i <= physmap_idx; i += 2) { if (physmap[i] == 0x00000000) { basemem = physmap[i + 1] / 1024; break; } } /* XXX: If we couldn't find basemem from SMAP, just guess. */ if (basemem == 0) basemem = 640; basemem_setup(); } if (physmap[1] != 0) goto physmap_done; /* * If we failed to find an SMAP, figure out the extended * memory size. We will then build a simple memory map with * two segments, one for "base memory" and the second for * "extended memory". Note that "extended memory" starts at a * physical address of 1MB and that both basemem and extmem * are in units of 1KB. * * First, try to fetch the extended memory size via INT 15:E801. */ vmf.vmf_ax = 0xE801; if (vm86_intcall(0x15, &vmf) == 0) { extmem = vmf.vmf_cx + vmf.vmf_dx * 64; } else { /* * If INT15:E801 fails, this is our last ditch effort * to determine the extended memory size. Currently * we prefer the RTC value over INT15:88. */ #if 0 vmf.vmf_ah = 0x88; vm86_intcall(0x15, &vmf); extmem = vmf.vmf_ax; #else extmem = rtcin(RTC_EXTLO) + (rtcin(RTC_EXTHI) << 8); #endif } /* * Special hack for chipsets that still remap the 384k hole when * there's 16MB of memory - this really confuses people that * are trying to use bus mastering ISA controllers with the * "16MB limit"; they only have 16MB, but the remapping puts * them beyond the limit. * * If extended memory is between 15-16MB (16-17MB phys address range), * chop it to 15MB. */ if ((extmem > 15 * 1024) && (extmem < 16 * 1024)) extmem = 15 * 1024; physmap[0] = 0; physmap[1] = basemem * 1024; physmap_idx = 2; physmap[physmap_idx] = 0x100000; physmap[physmap_idx + 1] = physmap[physmap_idx] + extmem * 1024; physmap_done: /* * Now, physmap contains a map of physical memory. */ #ifdef SMP /* make hole for AP bootstrap code */ alloc_ap_trampoline(physmap, &physmap_idx); #endif /* * Maxmem isn't the "maximum memory", it's one larger than the * highest page of the physical address space. It should be * called something like "Maxphyspage". We may adjust this * based on ``hw.physmem'' and the results of the memory test. * * This is especially confusing when it is much larger than the * memory size and is displayed as "realmem". */ Maxmem = atop(physmap[physmap_idx + 1]); #ifdef MAXMEM Maxmem = MAXMEM / 4; #endif if (TUNABLE_QUAD_FETCH("hw.physmem", &physmem_tunable)) Maxmem = atop(physmem_tunable); /* * If we have an SMAP, don't allow MAXMEM or hw.physmem to extend * the amount of memory in the system. */ if (has_smap && Maxmem > atop(physmap[physmap_idx + 1])) Maxmem = atop(physmap[physmap_idx + 1]); /* * The boot memory test is disabled by default, as it takes a * significant amount of time on large-memory systems, and is * unfriendly to virtual machines as it unnecessarily touches all * pages. * * A general name is used as the code may be extended to support * additional tests beyond the current "page present" test. */ memtest = 0; TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest); if (atop(physmap[physmap_idx + 1]) != Maxmem && (boothowto & RB_VERBOSE)) printf("Physical memory use set to %ldK\n", Maxmem * 4); /* * If Maxmem has been increased beyond what the system has detected, * extend the last memory segment to the new limit. */ if (atop(physmap[physmap_idx + 1]) < Maxmem) physmap[physmap_idx + 1] = ptoa((vm_paddr_t)Maxmem); /* call pmap initialization to make new kernel address space */ pmap_bootstrap(first); /* * Size up each available chunk of physical memory. */ physmap[0] = PAGE_SIZE; /* mask off page 0 */ pa_indx = 0; da_indx = 1; phys_avail[pa_indx++] = physmap[0]; phys_avail[pa_indx] = physmap[0]; dump_avail[da_indx] = physmap[0]; /* * Get dcons buffer address */ if (getenv_quad("dcons.addr", &dcons_addr) == 0 || getenv_quad("dcons.size", &dcons_size) == 0) dcons_addr = 0; /* * physmap is in bytes, so when converting to page boundaries, * round up the start address and round down the end address. */ for (i = 0; i <= physmap_idx; i += 2) { vm_paddr_t end; end = ptoa((vm_paddr_t)Maxmem); if (physmap[i + 1] < end) end = trunc_page(physmap[i + 1]); for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) { int tmp, page_bad, full; int *ptr; full = FALSE; /* * block out kernel memory as not available. */ if (pa >= KERNLOAD && pa < first) goto do_dump_avail; /* * block out dcons buffer */ if (dcons_addr > 0 && pa >= trunc_page(dcons_addr) && pa < dcons_addr + dcons_size) goto do_dump_avail; page_bad = FALSE; if (memtest == 0) goto skip_memtest; /* * map page into kernel: valid, read/write,non-cacheable */ ptr = (int *)pmap_cmap3(pa, PG_V | PG_RW | PG_N); tmp = *(int *)ptr; /* * Test for alternating 1's and 0's */ *(volatile int *)ptr = 0xaaaaaaaa; if (*(volatile int *)ptr != 0xaaaaaaaa) page_bad = TRUE; /* * Test for alternating 0's and 1's */ *(volatile int *)ptr = 0x55555555; if (*(volatile int *)ptr != 0x55555555) page_bad = TRUE; /* * Test for all 1's */ *(volatile int *)ptr = 0xffffffff; if (*(volatile int *)ptr != 0xffffffff) page_bad = TRUE; /* * Test for all 0's */ *(volatile int *)ptr = 0x0; if (*(volatile int *)ptr != 0x0) page_bad = TRUE; /* * Restore original value. */ *(int *)ptr = tmp; skip_memtest: /* * Adjust array of valid/good pages. */ if (page_bad == TRUE) continue; /* * If this good page is a continuation of the * previous set of good pages, then just increase * the end pointer. Otherwise start a new chunk. * Note that "end" points one higher than end, * making the range >= start and < end. * If we're also doing a speculative memory * test and we at or past the end, bump up Maxmem * so that we keep going. The first bad page * will terminate the loop. */ if (phys_avail[pa_indx] == pa) { phys_avail[pa_indx] += PAGE_SIZE; } else { pa_indx++; if (pa_indx == PHYS_AVAIL_ENTRIES) { printf( "Too many holes in the physical address space, giving up\n"); pa_indx--; full = TRUE; goto do_dump_avail; } phys_avail[pa_indx++] = pa; /* start */ phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */ } physmem++; do_dump_avail: if (dump_avail[da_indx] == pa) { dump_avail[da_indx] += PAGE_SIZE; } else { da_indx++; if (da_indx == PHYS_AVAIL_ENTRIES) { da_indx--; goto do_next; } dump_avail[da_indx++] = pa; /* start */ dump_avail[da_indx] = pa + PAGE_SIZE; /* end */ } do_next: if (full) break; } } pmap_cmap3(0, 0); /* * XXX * The last chunk must contain at least one page plus the message * buffer to avoid complicating other code (message buffer address * calculation, etc.). */ while (phys_avail[pa_indx - 1] + PAGE_SIZE + round_page(msgbufsize) >= phys_avail[pa_indx]) { physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]); phys_avail[pa_indx--] = 0; phys_avail[pa_indx--] = 0; } Maxmem = atop(phys_avail[pa_indx]); /* Trim off space for the message buffer. */ phys_avail[pa_indx] -= round_page(msgbufsize); /* Map the message buffer. */ for (off = 0; off < round_page(msgbufsize); off += PAGE_SIZE) pmap_kenter((vm_offset_t)msgbufp + off, phys_avail[pa_indx] + off); } static void i386_kdb_init(void) { #ifdef DDB db_fetch_ksymtab(bootinfo.bi_symtab, bootinfo.bi_esymtab, 0); #endif kdb_init(); #ifdef KDB if (boothowto & RB_KDB) kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger"); #endif } static void fixup_idt(void) { struct gate_descriptor *ip; uintptr_t off; int x; for (x = 0; x < NIDT; x++) { ip = &idt[x]; if (ip->gd_type != SDT_SYS386IGT && ip->gd_type != SDT_SYS386TGT) continue; off = ip->gd_looffset + (((u_int)ip->gd_hioffset) << 16); KASSERT(off >= (uintptr_t)start_exceptions && off < (uintptr_t)end_exceptions, ("IDT[%d] type %d off %#x", x, ip->gd_type, off)); off += setidt_disp; MPASS(off >= PMAP_TRM_MIN_ADDRESS && off < PMAP_TRM_MAX_ADDRESS); ip->gd_looffset = off; ip->gd_hioffset = off >> 16; } } static void i386_setidt1(void) { int x; /* exceptions */ for (x = 0; x < NIDT; x++) setidt(x, &IDTVEC(rsvd), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_DE, &IDTVEC(div), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_DB, &IDTVEC(dbg), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_NMI, &IDTVEC(nmi), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_BP, &IDTVEC(bpt), SDT_SYS386IGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_OF, &IDTVEC(ofl), SDT_SYS386IGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_BR, &IDTVEC(bnd), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_UD, &IDTVEC(ill), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_NM, &IDTVEC(dna), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_DF, 0, SDT_SYSTASKGT, SEL_KPL, GSEL(GPANIC_SEL, SEL_KPL)); setidt(IDT_FPUGP, &IDTVEC(fpusegm), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_TS, &IDTVEC(tss), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_NP, &IDTVEC(missing), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_SS, &IDTVEC(stk), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_GP, &IDTVEC(prot), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_PF, &IDTVEC(page), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_MF, &IDTVEC(fpu), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_AC, &IDTVEC(align), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_MC, &IDTVEC(mchk), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_XF, &IDTVEC(xmm), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_SYSCALL, &IDTVEC(int0x80_syscall), SDT_SYS386IGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL)); #ifdef KDTRACE_HOOKS setidt(IDT_DTRACE_RET, &IDTVEC(dtrace_ret), SDT_SYS386IGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL)); #endif #ifdef XENHVM setidt(IDT_EVTCHN, &IDTVEC(xen_intr_upcall), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); #endif } static void i386_setidt2(void) { setidt(IDT_UD, &IDTVEC(ill), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_GP, &IDTVEC(prot), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); } #if defined(DEV_ISA) && !defined(DEV_ATPIC) static void i386_setidt3(void) { setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); } #endif register_t init386(int first) { struct region_descriptor r_gdt, r_idt; /* table descriptors */ int gsel_tss, metadata_missing, x, pa; struct pcpu *pc; struct xstate_hdr *xhdr; caddr_t kmdp; vm_offset_t addend; size_t ucode_len; int late_console; thread0.td_kstack = proc0kstack; thread0.td_kstack_pages = TD0_KSTACK_PAGES; /* * This may be done better later if it gets more high level * components in it. If so just link td->td_proc here. */ proc_linkup0(&proc0, &thread0); if (bootinfo.bi_modulep) { metadata_missing = 0; addend = (vm_paddr_t)bootinfo.bi_modulep < KERNBASE ? PMAP_MAP_LOW : 0; preload_metadata = (caddr_t)bootinfo.bi_modulep + addend; preload_bootstrap_relocate(addend); } else { metadata_missing = 1; } if (bootinfo.bi_envp != 0) { addend = (vm_paddr_t)bootinfo.bi_envp < KERNBASE ? PMAP_MAP_LOW : 0; init_static_kenv((char *)bootinfo.bi_envp + addend, 0); } else { init_static_kenv(NULL, 0); } /* * Re-evaluate CPU features if we loaded a microcode update. */ ucode_len = ucode_load_bsp(first); if (ucode_len != 0) { identify_cpu(); first = roundup2(first + ucode_len, PAGE_SIZE); } identify_hypervisor(); /* Init basic tunables, hz etc */ init_param1(); /* * Make gdt memory segments. All segments cover the full 4GB * of address space and permissions are enforced at page level. */ gdt_segs[GCODE_SEL].ssd_limit = atop(0 - 1); gdt_segs[GDATA_SEL].ssd_limit = atop(0 - 1); gdt_segs[GUCODE_SEL].ssd_limit = atop(0 - 1); gdt_segs[GUDATA_SEL].ssd_limit = atop(0 - 1); gdt_segs[GUFS_SEL].ssd_limit = atop(0 - 1); gdt_segs[GUGS_SEL].ssd_limit = atop(0 - 1); pc = &__pcpu[0]; gdt_segs[GPRIV_SEL].ssd_limit = atop(0 - 1); gdt_segs[GPRIV_SEL].ssd_base = (int)pc; gdt_segs[GPROC0_SEL].ssd_base = (int)&common_tss0; for (x = 0; x < NGDT; x++) ssdtosd(&gdt_segs[x], &gdt0[x].sd); r_gdt.rd_limit = NGDT * sizeof(gdt0[0]) - 1; r_gdt.rd_base = (int)gdt0; mtx_init(&dt_lock, "descriptor tables", NULL, MTX_SPIN); lgdt(&r_gdt); pcpu_init(pc, 0, sizeof(struct pcpu)); for (pa = first; pa < first + DPCPU_SIZE; pa += PAGE_SIZE) pmap_kenter(pa, pa); dpcpu_init((void *)first, 0); first += DPCPU_SIZE; PCPU_SET(prvspace, pc); PCPU_SET(curthread, &thread0); /* Non-late cninit() and printf() can be moved up to here. */ /* * Initialize mutexes. * * icu_lock: in order to allow an interrupt to occur in a critical * section, to set pcpu->ipending (etc...) properly, we * must be able to get the icu lock, so it can't be * under witness. */ mutex_init(); mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS | MTX_NOPROFILE); i386_setidt1(); r_idt.rd_limit = sizeof(idt0) - 1; r_idt.rd_base = (int) idt; lidt(&r_idt); /* * Initialize the clock before the console so that console * initialization can use DELAY(). */ clock_init(); finishidentcpu(); /* Final stage of CPU initialization */ i386_setidt2(); pmap_set_nx(); initializecpu(); /* Initialize CPU registers */ initializecpucache(); /* pointer to selector slot for %fs/%gs */ PCPU_SET(fsgs_gdt, &gdt[GUFS_SEL].sd); /* Initialize the tss (except for the final esp0) early for vm86. */ common_tss0.tss_esp0 = thread0.td_kstack + thread0.td_kstack_pages * PAGE_SIZE - VM86_STACK_SPACE; common_tss0.tss_ss0 = GSEL(GDATA_SEL, SEL_KPL); common_tss0.tss_ioopt = sizeof(struct i386tss) << 16; gsel_tss = GSEL(GPROC0_SEL, SEL_KPL); PCPU_SET(tss_gdt, &gdt[GPROC0_SEL].sd); PCPU_SET(common_tssd, *PCPU_GET(tss_gdt)); ltr(gsel_tss); /* Initialize the PIC early for vm86 calls. */ #ifdef DEV_ISA #ifdef DEV_ATPIC elcr_probe(); atpic_startup(); #else /* Reset and mask the atpics and leave them shut down. */ atpic_reset(); /* * Point the ICU spurious interrupt vectors at the APIC spurious * interrupt handler. */ i386_setidt3(); #endif #endif /* * The console and kdb should be initialized even earlier than here, * but some console drivers don't work until after getmemsize(). * Default to late console initialization to support these drivers. * This loses mainly printf()s in getmemsize() and early debugging. */ late_console = 1; TUNABLE_INT_FETCH("debug.late_console", &late_console); if (!late_console) { cninit(); i386_kdb_init(); } kmdp = preload_search_by_type("elf kernel"); link_elf_ireloc(kmdp); vm86_initialize(); getmemsize(first); init_param2(physmem); /* now running on new page tables, configured,and u/iom is accessible */ if (late_console) cninit(); if (metadata_missing) printf("WARNING: loader(8) metadata is missing!\n"); if (late_console) i386_kdb_init(); msgbufinit(msgbufp, msgbufsize); npxinit(true); /* * Set up thread0 pcb after npxinit calculated pcb + fpu save * area size. Zero out the extended state header in fpu save * area. */ thread0.td_pcb = get_pcb_td(&thread0); thread0.td_pcb->pcb_save = get_pcb_user_save_td(&thread0); bzero(get_pcb_user_save_td(&thread0), cpu_max_ext_state_size); if (use_xsave) { xhdr = (struct xstate_hdr *)(get_pcb_user_save_td(&thread0) + 1); xhdr->xstate_bv = xsave_mask; } PCPU_SET(curpcb, thread0.td_pcb); /* Move esp0 in the tss to its final place. */ /* Note: -16 is so we can grow the trapframe if we came from vm86 */ common_tss0.tss_esp0 = (vm_offset_t)thread0.td_pcb - VM86_STACK_SPACE; PCPU_SET(kesp0, common_tss0.tss_esp0); gdt[GPROC0_SEL].sd.sd_type = SDT_SYS386TSS; /* clear busy bit */ ltr(gsel_tss); /* transfer to user mode */ _ucodesel = GSEL(GUCODE_SEL, SEL_UPL); _udatasel = GSEL(GUDATA_SEL, SEL_UPL); /* setup proc 0's pcb */ thread0.td_pcb->pcb_flags = 0; thread0.td_pcb->pcb_cr3 = pmap_get_kcr3(); thread0.td_pcb->pcb_ext = 0; thread0.td_frame = &proc0_tf; cpu_probe_amdc1e(); #ifdef FDT x86_init_fdt(); #endif /* Location of kernel stack for locore */ return ((register_t)thread0.td_pcb); } static void machdep_init_trampoline(void) { struct region_descriptor r_gdt, r_idt; struct i386tss *tss; char *copyout_buf, *trampoline, *tramp_stack_base; int x; gdt = pmap_trm_alloc(sizeof(union descriptor) * NGDT * mp_ncpus, M_NOWAIT | M_ZERO); bcopy(gdt0, gdt, sizeof(union descriptor) * NGDT); r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1; r_gdt.rd_base = (int)gdt; lgdt(&r_gdt); tss = pmap_trm_alloc(sizeof(struct i386tss) * mp_ncpus, M_NOWAIT | M_ZERO); bcopy(&common_tss0, tss, sizeof(struct i386tss)); gdt[GPROC0_SEL].sd.sd_lobase = (int)tss; gdt[GPROC0_SEL].sd.sd_hibase = (u_int)tss >> 24; gdt[GPROC0_SEL].sd.sd_type = SDT_SYS386TSS; PCPU_SET(fsgs_gdt, &gdt[GUFS_SEL].sd); PCPU_SET(tss_gdt, &gdt[GPROC0_SEL].sd); PCPU_SET(common_tssd, *PCPU_GET(tss_gdt)); PCPU_SET(common_tssp, tss); ltr(GSEL(GPROC0_SEL, SEL_KPL)); trampoline = pmap_trm_alloc(end_exceptions - start_exceptions, M_NOWAIT); bcopy(start_exceptions, trampoline, end_exceptions - start_exceptions); tramp_stack_base = pmap_trm_alloc(TRAMP_STACK_SZ, M_NOWAIT); PCPU_SET(trampstk, (uintptr_t)tramp_stack_base + TRAMP_STACK_SZ - VM86_STACK_SPACE); tss[0].tss_esp0 = PCPU_GET(trampstk); idt = pmap_trm_alloc(sizeof(idt0), M_NOWAIT | M_ZERO); bcopy(idt0, idt, sizeof(idt0)); /* Re-initialize new IDT since the handlers were relocated */ setidt_disp = trampoline - start_exceptions; fixup_idt(); r_idt.rd_limit = sizeof(struct gate_descriptor) * NIDT - 1; r_idt.rd_base = (int)idt; lidt(&r_idt); /* dblfault TSS */ dblfault_tss = pmap_trm_alloc(sizeof(struct i386tss), M_NOWAIT | M_ZERO); dblfault_stack = pmap_trm_alloc(PAGE_SIZE, M_NOWAIT); dblfault_tss->tss_esp = dblfault_tss->tss_esp0 = dblfault_tss->tss_esp1 = dblfault_tss->tss_esp2 = (int)dblfault_stack + PAGE_SIZE; dblfault_tss->tss_ss = dblfault_tss->tss_ss0 = dblfault_tss->tss_ss1 = dblfault_tss->tss_ss2 = GSEL(GDATA_SEL, SEL_KPL); dblfault_tss->tss_cr3 = pmap_get_kcr3(); dblfault_tss->tss_eip = (int)dblfault_handler; dblfault_tss->tss_eflags = PSL_KERNEL; dblfault_tss->tss_ds = dblfault_tss->tss_es = dblfault_tss->tss_gs = GSEL(GDATA_SEL, SEL_KPL); dblfault_tss->tss_fs = GSEL(GPRIV_SEL, SEL_KPL); dblfault_tss->tss_cs = GSEL(GCODE_SEL, SEL_KPL); dblfault_tss->tss_ldt = GSEL(GLDT_SEL, SEL_KPL); gdt[GPANIC_SEL].sd.sd_lobase = (int)dblfault_tss; gdt[GPANIC_SEL].sd.sd_hibase = (u_int)dblfault_tss >> 24; /* make ldt memory segments */ ldt = pmap_trm_alloc(sizeof(union descriptor) * NLDT, M_NOWAIT | M_ZERO); gdt[GLDT_SEL].sd.sd_lobase = (int)ldt; gdt[GLDT_SEL].sd.sd_hibase = (u_int)ldt >> 24; ldt_segs[LUCODE_SEL].ssd_limit = atop(0 - 1); ldt_segs[LUDATA_SEL].ssd_limit = atop(0 - 1); for (x = 0; x < nitems(ldt_segs); x++) ssdtosd(&ldt_segs[x], &ldt[x].sd); _default_ldt = GSEL(GLDT_SEL, SEL_KPL); lldt(_default_ldt); PCPU_SET(currentldt, _default_ldt); copyout_buf = pmap_trm_alloc(TRAMP_COPYOUT_SZ, M_NOWAIT); PCPU_SET(copyout_buf, copyout_buf); copyout_init_tramp(); } SYSINIT(vm_mem, SI_SUB_VM, SI_ORDER_SECOND, machdep_init_trampoline, NULL); #ifdef COMPAT_43 static void i386_setup_lcall_gate(void) { struct sysentvec *sv; struct user_segment_descriptor desc; u_int lcall_addr; sv = &elf32_freebsd_sysvec; lcall_addr = (uintptr_t)sv->sv_psstrings - sz_lcall_tramp; bzero(&desc, sizeof(desc)); desc.sd_type = SDT_MEMERA; desc.sd_dpl = SEL_UPL; desc.sd_p = 1; desc.sd_def32 = 1; desc.sd_gran = 1; desc.sd_lolimit = 0xffff; desc.sd_hilimit = 0xf; desc.sd_lobase = lcall_addr; desc.sd_hibase = lcall_addr >> 24; bcopy(&desc, &ldt[LSYS5CALLS_SEL], sizeof(desc)); } SYSINIT(elf32, SI_SUB_EXEC, SI_ORDER_ANY, i386_setup_lcall_gate, NULL); #endif void cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size) { pcpu->pc_acpi_id = 0xffffffff; } static int smap_sysctl_handler(SYSCTL_HANDLER_ARGS) { struct bios_smap *smapbase; struct bios_smap_xattr smap; caddr_t kmdp; uint32_t *smapattr; int count, error, i; /* Retrieve the system memory map from the loader. */ kmdp = preload_search_by_type("elf kernel"); if (kmdp == NULL) kmdp = preload_search_by_type("elf32 kernel"); smapbase = (struct bios_smap *)preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_SMAP); if (smapbase == NULL) return (0); smapattr = (uint32_t *)preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_SMAP_XATTR); count = *((u_int32_t *)smapbase - 1) / sizeof(*smapbase); error = 0; for (i = 0; i < count; i++) { smap.base = smapbase[i].base; smap.length = smapbase[i].length; smap.type = smapbase[i].type; if (smapattr != NULL) smap.xattr = smapattr[i]; else smap.xattr = 0; error = SYSCTL_OUT(req, &smap, sizeof(smap)); } return (error); } SYSCTL_PROC(_machdep, OID_AUTO, smap, CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, smap_sysctl_handler, "S,bios_smap_xattr", "Raw BIOS SMAP data"); void spinlock_enter(void) { struct thread *td; register_t flags; td = curthread; if (td->td_md.md_spinlock_count == 0) { flags = intr_disable(); td->td_md.md_spinlock_count = 1; td->td_md.md_saved_flags = flags; critical_enter(); } else td->td_md.md_spinlock_count++; } void spinlock_exit(void) { struct thread *td; register_t flags; td = curthread; flags = td->td_md.md_saved_flags; td->td_md.md_spinlock_count--; if (td->td_md.md_spinlock_count == 0) { critical_exit(); intr_restore(flags); } } #if defined(I586_CPU) && !defined(NO_F00F_HACK) static void f00f_hack(void *unused); SYSINIT(f00f_hack, SI_SUB_INTRINSIC, SI_ORDER_FIRST, f00f_hack, NULL); static void f00f_hack(void *unused) { struct region_descriptor r_idt; struct gate_descriptor *new_idt; vm_offset_t tmp; if (!has_f00f_bug) return; GIANT_REQUIRED; printf("Intel Pentium detected, installing workaround for F00F bug\n"); tmp = (vm_offset_t)pmap_trm_alloc(PAGE_SIZE * 3, M_NOWAIT | M_ZERO); if (tmp == 0) panic("kmem_malloc returned 0"); tmp = round_page(tmp); /* Put the problematic entry (#6) at the end of the lower page. */ new_idt = (struct gate_descriptor *) (tmp + PAGE_SIZE - 7 * sizeof(struct gate_descriptor)); bcopy(idt, new_idt, sizeof(idt0)); r_idt.rd_base = (u_int)new_idt; r_idt.rd_limit = sizeof(idt0) - 1; lidt(&r_idt); /* SMP machines do not need the F00F hack. */ idt = new_idt; pmap_protect(kernel_pmap, tmp, tmp + PAGE_SIZE, VM_PROT_READ); } #endif /* defined(I586_CPU) && !NO_F00F_HACK */ /* * Construct a PCB from a trapframe. This is called from kdb_trap() where * we want to start a backtrace from the function that caused us to enter * the debugger. We have the context in the trapframe, but base the trace * on the PCB. The PCB doesn't have to be perfect, as long as it contains * enough for a backtrace. */ void makectx(struct trapframe *tf, struct pcb *pcb) { pcb->pcb_edi = tf->tf_edi; pcb->pcb_esi = tf->tf_esi; pcb->pcb_ebp = tf->tf_ebp; pcb->pcb_ebx = tf->tf_ebx; pcb->pcb_eip = tf->tf_eip; pcb->pcb_esp = (ISPL(tf->tf_cs)) ? tf->tf_esp : (int)(tf + 1) - 8; pcb->pcb_gs = rgs(); } int ptrace_set_pc(struct thread *td, u_long addr) { td->td_frame->tf_eip = addr; return (0); } int ptrace_single_step(struct thread *td) { PROC_LOCK_ASSERT(td->td_proc, MA_OWNED); if ((td->td_frame->tf_eflags & PSL_T) == 0) { td->td_frame->tf_eflags |= PSL_T; td->td_dbgflags |= TDB_STEP; } return (0); } int ptrace_clear_single_step(struct thread *td) { PROC_LOCK_ASSERT(td->td_proc, MA_OWNED); td->td_frame->tf_eflags &= ~PSL_T; td->td_dbgflags &= ~TDB_STEP; return (0); } int fill_regs(struct thread *td, struct reg *regs) { struct pcb *pcb; struct trapframe *tp; tp = td->td_frame; pcb = td->td_pcb; regs->r_gs = pcb->pcb_gs; return (fill_frame_regs(tp, regs)); } int fill_frame_regs(struct trapframe *tp, struct reg *regs) { regs->r_fs = tp->tf_fs; regs->r_es = tp->tf_es; regs->r_ds = tp->tf_ds; regs->r_edi = tp->tf_edi; regs->r_esi = tp->tf_esi; regs->r_ebp = tp->tf_ebp; regs->r_ebx = tp->tf_ebx; regs->r_edx = tp->tf_edx; regs->r_ecx = tp->tf_ecx; regs->r_eax = tp->tf_eax; regs->r_eip = tp->tf_eip; regs->r_cs = tp->tf_cs; regs->r_eflags = tp->tf_eflags; regs->r_esp = tp->tf_esp; regs->r_ss = tp->tf_ss; regs->r_err = 0; regs->r_trapno = 0; return (0); } int set_regs(struct thread *td, struct reg *regs) { struct pcb *pcb; struct trapframe *tp; tp = td->td_frame; if (!EFL_SECURE(regs->r_eflags, tp->tf_eflags) || !CS_SECURE(regs->r_cs)) return (EINVAL); pcb = td->td_pcb; tp->tf_fs = regs->r_fs; tp->tf_es = regs->r_es; tp->tf_ds = regs->r_ds; tp->tf_edi = regs->r_edi; tp->tf_esi = regs->r_esi; tp->tf_ebp = regs->r_ebp; tp->tf_ebx = regs->r_ebx; tp->tf_edx = regs->r_edx; tp->tf_ecx = regs->r_ecx; tp->tf_eax = regs->r_eax; tp->tf_eip = regs->r_eip; tp->tf_cs = regs->r_cs; tp->tf_eflags = regs->r_eflags; tp->tf_esp = regs->r_esp; tp->tf_ss = regs->r_ss; pcb->pcb_gs = regs->r_gs; return (0); } int fill_fpregs(struct thread *td, struct fpreg *fpregs) { KASSERT(td == curthread || TD_IS_SUSPENDED(td) || P_SHOULDSTOP(td->td_proc), ("not suspended thread %p", td)); npxgetregs(td); if (cpu_fxsr) npx_fill_fpregs_xmm(&get_pcb_user_save_td(td)->sv_xmm, (struct save87 *)fpregs); else bcopy(&get_pcb_user_save_td(td)->sv_87, fpregs, sizeof(*fpregs)); return (0); } int set_fpregs(struct thread *td, struct fpreg *fpregs) { critical_enter(); if (cpu_fxsr) npx_set_fpregs_xmm((struct save87 *)fpregs, &get_pcb_user_save_td(td)->sv_xmm); else bcopy(fpregs, &get_pcb_user_save_td(td)->sv_87, sizeof(*fpregs)); npxuserinited(td); critical_exit(); return (0); } /* * Get machine context. */ int get_mcontext(struct thread *td, mcontext_t *mcp, int flags) { struct trapframe *tp; struct segment_descriptor *sdp; tp = td->td_frame; PROC_LOCK(curthread->td_proc); mcp->mc_onstack = sigonstack(tp->tf_esp); PROC_UNLOCK(curthread->td_proc); mcp->mc_gs = td->td_pcb->pcb_gs; mcp->mc_fs = tp->tf_fs; mcp->mc_es = tp->tf_es; mcp->mc_ds = tp->tf_ds; mcp->mc_edi = tp->tf_edi; mcp->mc_esi = tp->tf_esi; mcp->mc_ebp = tp->tf_ebp; mcp->mc_isp = tp->tf_isp; mcp->mc_eflags = tp->tf_eflags; if (flags & GET_MC_CLEAR_RET) { mcp->mc_eax = 0; mcp->mc_edx = 0; mcp->mc_eflags &= ~PSL_C; } else { mcp->mc_eax = tp->tf_eax; mcp->mc_edx = tp->tf_edx; } mcp->mc_ebx = tp->tf_ebx; mcp->mc_ecx = tp->tf_ecx; mcp->mc_eip = tp->tf_eip; mcp->mc_cs = tp->tf_cs; mcp->mc_esp = tp->tf_esp; mcp->mc_ss = tp->tf_ss; mcp->mc_len = sizeof(*mcp); get_fpcontext(td, mcp, NULL, 0); sdp = &td->td_pcb->pcb_fsd; mcp->mc_fsbase = sdp->sd_hibase << 24 | sdp->sd_lobase; sdp = &td->td_pcb->pcb_gsd; mcp->mc_gsbase = sdp->sd_hibase << 24 | sdp->sd_lobase; mcp->mc_flags = 0; mcp->mc_xfpustate = 0; mcp->mc_xfpustate_len = 0; bzero(mcp->mc_spare2, sizeof(mcp->mc_spare2)); return (0); } /* * Set machine context. * * However, we don't set any but the user modifiable flags, and we won't * touch the cs selector. */ int set_mcontext(struct thread *td, mcontext_t *mcp) { struct trapframe *tp; char *xfpustate; int eflags, ret; tp = td->td_frame; if (mcp->mc_len != sizeof(*mcp) || (mcp->mc_flags & ~_MC_FLAG_MASK) != 0) return (EINVAL); eflags = (mcp->mc_eflags & PSL_USERCHANGE) | (tp->tf_eflags & ~PSL_USERCHANGE); if (mcp->mc_flags & _MC_HASFPXSTATE) { if (mcp->mc_xfpustate_len > cpu_max_ext_state_size - sizeof(union savefpu)) return (EINVAL); xfpustate = __builtin_alloca(mcp->mc_xfpustate_len); ret = copyin((void *)mcp->mc_xfpustate, xfpustate, mcp->mc_xfpustate_len); if (ret != 0) return (ret); } else xfpustate = NULL; ret = set_fpcontext(td, mcp, xfpustate, mcp->mc_xfpustate_len); if (ret != 0) return (ret); tp->tf_fs = mcp->mc_fs; tp->tf_es = mcp->mc_es; tp->tf_ds = mcp->mc_ds; tp->tf_edi = mcp->mc_edi; tp->tf_esi = mcp->mc_esi; tp->tf_ebp = mcp->mc_ebp; tp->tf_ebx = mcp->mc_ebx; tp->tf_edx = mcp->mc_edx; tp->tf_ecx = mcp->mc_ecx; tp->tf_eax = mcp->mc_eax; tp->tf_eip = mcp->mc_eip; tp->tf_eflags = eflags; tp->tf_esp = mcp->mc_esp; tp->tf_ss = mcp->mc_ss; td->td_pcb->pcb_gs = mcp->mc_gs; return (0); } static void get_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpusave, size_t xfpusave_len) { size_t max_len, len; mcp->mc_ownedfp = npxgetregs(td); bcopy(get_pcb_user_save_td(td), &mcp->mc_fpstate[0], sizeof(mcp->mc_fpstate)); mcp->mc_fpformat = npxformat(); if (!use_xsave || xfpusave_len == 0) return; max_len = cpu_max_ext_state_size - sizeof(union savefpu); len = xfpusave_len; if (len > max_len) { len = max_len; bzero(xfpusave + max_len, len - max_len); } mcp->mc_flags |= _MC_HASFPXSTATE; mcp->mc_xfpustate_len = len; bcopy(get_pcb_user_save_td(td) + 1, xfpusave, len); } static int set_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpustate, size_t xfpustate_len) { int error; if (mcp->mc_fpformat == _MC_FPFMT_NODEV) return (0); else if (mcp->mc_fpformat != _MC_FPFMT_387 && mcp->mc_fpformat != _MC_FPFMT_XMM) return (EINVAL); else if (mcp->mc_ownedfp == _MC_FPOWNED_NONE) { /* We don't care what state is left in the FPU or PCB. */ fpstate_drop(td); error = 0; } else if (mcp->mc_ownedfp == _MC_FPOWNED_FPU || mcp->mc_ownedfp == _MC_FPOWNED_PCB) { error = npxsetregs(td, (union savefpu *)&mcp->mc_fpstate, xfpustate, xfpustate_len); } else return (EINVAL); return (error); } static void fpstate_drop(struct thread *td) { KASSERT(PCB_USER_FPU(td->td_pcb), ("fpstate_drop: kernel-owned fpu")); critical_enter(); if (PCPU_GET(fpcurthread) == td) npxdrop(); /* * XXX force a full drop of the npx. The above only drops it if we * owned it. npxgetregs() has the same bug in the !cpu_fxsr case. * * XXX I don't much like npxgetregs()'s semantics of doing a full * drop. Dropping only to the pcb matches fnsave's behaviour. * We only need to drop to !PCB_INITDONE in sendsig(). But * sendsig() is the only caller of npxgetregs()... perhaps we just * have too many layers. */ curthread->td_pcb->pcb_flags &= ~(PCB_NPXINITDONE | PCB_NPXUSERINITDONE); critical_exit(); } int fill_dbregs(struct thread *td, struct dbreg *dbregs) { struct pcb *pcb; if (td == NULL) { dbregs->dr[0] = rdr0(); dbregs->dr[1] = rdr1(); dbregs->dr[2] = rdr2(); dbregs->dr[3] = rdr3(); dbregs->dr[6] = rdr6(); dbregs->dr[7] = rdr7(); } else { pcb = td->td_pcb; dbregs->dr[0] = pcb->pcb_dr0; dbregs->dr[1] = pcb->pcb_dr1; dbregs->dr[2] = pcb->pcb_dr2; dbregs->dr[3] = pcb->pcb_dr3; dbregs->dr[6] = pcb->pcb_dr6; dbregs->dr[7] = pcb->pcb_dr7; } dbregs->dr[4] = 0; dbregs->dr[5] = 0; return (0); } int set_dbregs(struct thread *td, struct dbreg *dbregs) { struct pcb *pcb; int i; if (td == NULL) { load_dr0(dbregs->dr[0]); load_dr1(dbregs->dr[1]); load_dr2(dbregs->dr[2]); load_dr3(dbregs->dr[3]); load_dr6(dbregs->dr[6]); load_dr7(dbregs->dr[7]); } else { /* * Don't let an illegal value for dr7 get set. Specifically, * check for undefined settings. Setting these bit patterns * result in undefined behaviour and can lead to an unexpected * TRCTRAP. */ for (i = 0; i < 4; i++) { if (DBREG_DR7_ACCESS(dbregs->dr[7], i) == 0x02) return (EINVAL); if (DBREG_DR7_LEN(dbregs->dr[7], i) == 0x02) return (EINVAL); } pcb = td->td_pcb; /* * Don't let a process set a breakpoint that is not within the * process's address space. If a process could do this, it * could halt the system by setting a breakpoint in the kernel * (if ddb was enabled). Thus, we need to check to make sure * that no breakpoints are being enabled for addresses outside * process's address space. * * XXX - what about when the watched area of the user's * address space is written into from within the kernel * ... wouldn't that still cause a breakpoint to be generated * from within kernel mode? */ if (DBREG_DR7_ENABLED(dbregs->dr[7], 0)) { /* dr0 is enabled */ if (dbregs->dr[0] >= VM_MAXUSER_ADDRESS) return (EINVAL); } if (DBREG_DR7_ENABLED(dbregs->dr[7], 1)) { /* dr1 is enabled */ if (dbregs->dr[1] >= VM_MAXUSER_ADDRESS) return (EINVAL); } if (DBREG_DR7_ENABLED(dbregs->dr[7], 2)) { /* dr2 is enabled */ if (dbregs->dr[2] >= VM_MAXUSER_ADDRESS) return (EINVAL); } if (DBREG_DR7_ENABLED(dbregs->dr[7], 3)) { /* dr3 is enabled */ if (dbregs->dr[3] >= VM_MAXUSER_ADDRESS) return (EINVAL); } pcb->pcb_dr0 = dbregs->dr[0]; pcb->pcb_dr1 = dbregs->dr[1]; pcb->pcb_dr2 = dbregs->dr[2]; pcb->pcb_dr3 = dbregs->dr[3]; pcb->pcb_dr6 = dbregs->dr[6]; pcb->pcb_dr7 = dbregs->dr[7]; pcb->pcb_flags |= PCB_DBREGS; } return (0); } /* * Return > 0 if a hardware breakpoint has been hit, and the * breakpoint was in user space. Return 0, otherwise. */ int user_dbreg_trap(register_t dr6) { u_int32_t dr7; u_int32_t bp; /* breakpoint bits extracted from dr6 */ int nbp; /* number of breakpoints that triggered */ caddr_t addr[4]; /* breakpoint addresses */ int i; bp = dr6 & DBREG_DR6_BMASK; if (bp == 0) { /* * None of the breakpoint bits are set meaning this * trap was not caused by any of the debug registers */ return 0; } dr7 = rdr7(); if ((dr7 & 0x000000ff) == 0) { /* * all GE and LE bits in the dr7 register are zero, * thus the trap couldn't have been caused by the * hardware debug registers */ return 0; } nbp = 0; /* * at least one of the breakpoints were hit, check to see * which ones and if any of them are user space addresses */ if (bp & 0x01) { addr[nbp++] = (caddr_t)rdr0(); } if (bp & 0x02) { addr[nbp++] = (caddr_t)rdr1(); } if (bp & 0x04) { addr[nbp++] = (caddr_t)rdr2(); } if (bp & 0x08) { addr[nbp++] = (caddr_t)rdr3(); } for (i = 0; i < nbp; i++) { if (addr[i] < (caddr_t)VM_MAXUSER_ADDRESS) { /* * addr[i] is in user space */ return nbp; } } /* * None of the breakpoints are in user space. */ return 0; } #ifdef KDB /* * Provide inb() and outb() as functions. They are normally only available as * inline functions, thus cannot be called from the debugger. */ /* silence compiler warnings */ u_char inb_(u_short); void outb_(u_short, u_char); u_char inb_(u_short port) { return inb(port); } void outb_(u_short port, u_char data) { outb(port, data); } #endif /* KDB */ Index: head/sys/i386/i386/minidump_machdep_base.c =================================================================== --- head/sys/i386/i386/minidump_machdep_base.c (revision 366710) +++ head/sys/i386/i386/minidump_machdep_base.c (revision 366711) @@ -1,356 +1,356 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2006 Peter Wemm * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_watchdog.h" #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include -#include #include CTASSERT(sizeof(struct kerneldumpheader) == 512); #define MD_ALIGN(x) (((off_t)(x) + PAGE_MASK) & ~PAGE_MASK) #define DEV_ALIGN(x) roundup2((off_t)(x), DEV_BSIZE) static struct kerneldumpheader kdh; /* Handle chunked writes. */ static size_t fragsz; static void *dump_va; static uint64_t counter, progress; static int is_dumpable(vm_paddr_t pa) { int i; for (i = 0; dump_avail[i] != 0 || dump_avail[i + 1] != 0; i += 2) { if (pa >= dump_avail[i] && pa < dump_avail[i + 1]) return (1); } return (0); } #define PG2MB(pgs) (((pgs) + (1 << 8) - 1) >> 8) static int blk_flush(struct dumperinfo *di) { int error; if (fragsz == 0) return (0); error = dump_append(di, dump_va, 0, fragsz); fragsz = 0; return (error); } static int blk_write(struct dumperinfo *di, char *ptr, vm_paddr_t pa, size_t sz) { size_t len; int error, i, c; u_int maxdumpsz; maxdumpsz = min(di->maxiosize, MAXDUMPPGS * PAGE_SIZE); if (maxdumpsz == 0) /* seatbelt */ maxdumpsz = PAGE_SIZE; error = 0; if ((sz % PAGE_SIZE) != 0) { printf("size not page aligned\n"); return (EINVAL); } if (ptr != NULL && pa != 0) { printf("cant have both va and pa!\n"); return (EINVAL); } if (pa != 0 && (((uintptr_t)ptr) % PAGE_SIZE) != 0) { printf("address not page aligned\n"); return (EINVAL); } if (ptr != NULL) { /* If we're doing a virtual dump, flush any pre-existing pa pages */ error = blk_flush(di); if (error) return (error); } while (sz) { len = maxdumpsz - fragsz; if (len > sz) len = sz; counter += len; progress -= len; if (counter >> 24) { printf(" %lld", PG2MB(progress >> PAGE_SHIFT)); counter &= (1<<24) - 1; } wdog_kern_pat(WD_LASTVAL); if (ptr) { error = dump_append(di, ptr, 0, len); if (error) return (error); ptr += len; sz -= len; } else { for (i = 0; i < len; i += PAGE_SIZE) dump_va = pmap_kenter_temporary(pa + i, (i + fragsz) >> PAGE_SHIFT); fragsz += len; pa += len; sz -= len; if (fragsz == maxdumpsz) { error = blk_flush(di); if (error) return (error); } } /* Check for user abort. */ c = cncheckc(); if (c == 0x03) return (ECANCELED); if (c != -1) printf(" (CTRL-C to abort) "); } return (0); } /* A fake page table page, to avoid having to handle both 4K and 2M pages */ static pt_entry_t fakept[NPTEPG]; #ifdef PMAP_PAE_COMP #define minidumpsys minidumpsys_pae #define IdlePTD IdlePTD_pae #else #define minidumpsys minidumpsys_nopae #define IdlePTD IdlePTD_nopae #endif int minidumpsys(struct dumperinfo *di) { uint64_t dumpsize; uint32_t ptesize; vm_offset_t va; int error; uint64_t pa; pd_entry_t *pd; pt_entry_t *pt; int j, k; struct minidumphdr mdhdr; counter = 0; /* Walk page table pages, set bits in vm_page_dump */ ptesize = 0; for (va = KERNBASE; va < kernel_vm_end; va += NBPDR) { /* * We always write a page, even if it is zero. Each * page written corresponds to 2MB of space */ ptesize += PAGE_SIZE; pd = IdlePTD; /* always mapped! */ j = va >> PDRSHIFT; if ((pd[j] & (PG_PS | PG_V)) == (PG_PS | PG_V)) { /* This is an entire 2M page. */ pa = pd[j] & PG_PS_FRAME; for (k = 0; k < NPTEPG; k++) { if (is_dumpable(pa)) dump_add_page(pa); pa += PAGE_SIZE; } continue; } if ((pd[j] & PG_V) == PG_V) { /* set bit for each valid page in this 2MB block */ pt = pmap_kenter_temporary(pd[j] & PG_FRAME, 0); for (k = 0; k < NPTEPG; k++) { if ((pt[k] & PG_V) == PG_V) { pa = pt[k] & PG_FRAME; if (is_dumpable(pa)) dump_add_page(pa); } } } else { /* nothing, we're going to dump a null page */ } } /* Calculate dump size. */ dumpsize = ptesize; dumpsize += round_page(msgbufp->msg_size); dumpsize += round_page(sizeof(dump_avail)); dumpsize += round_page(BITSET_SIZE(vm_page_dump_pages)); VM_PAGE_DUMP_FOREACH(pa) { /* Clear out undumpable pages now if needed */ if (is_dumpable(pa)) { dumpsize += PAGE_SIZE; } else { dump_drop_page(pa); } } dumpsize += PAGE_SIZE; progress = dumpsize; /* Initialize mdhdr */ bzero(&mdhdr, sizeof(mdhdr)); strcpy(mdhdr.magic, MINIDUMP_MAGIC); mdhdr.version = MINIDUMP_VERSION; mdhdr.msgbufsize = msgbufp->msg_size; mdhdr.bitmapsize = round_page(BITSET_SIZE(vm_page_dump_pages)); mdhdr.ptesize = ptesize; mdhdr.kernbase = KERNBASE; mdhdr.paemode = pae_mode; mdhdr.dumpavailsize = round_page(sizeof(dump_avail)); dump_init_header(di, &kdh, KERNELDUMPMAGIC, KERNELDUMP_I386_VERSION, dumpsize); error = dump_start(di, &kdh); if (error != 0) goto fail; printf("Physical memory: %ju MB\n", ptoa((uintmax_t)physmem) / 1048576); printf("Dumping %llu MB:", (long long)dumpsize >> 20); /* Dump my header */ bzero(&fakept, sizeof(fakept)); bcopy(&mdhdr, &fakept, sizeof(mdhdr)); error = blk_write(di, (char *)&fakept, 0, PAGE_SIZE); if (error) goto fail; /* Dump msgbuf up front */ error = blk_write(di, (char *)msgbufp->msg_ptr, 0, round_page(msgbufp->msg_size)); if (error) goto fail; /* Dump dump_avail */ _Static_assert(sizeof(dump_avail) <= sizeof(fakept), "Large dump_avail not handled"); bzero(fakept, sizeof(fakept)); memcpy(fakept, dump_avail, sizeof(dump_avail)); error = blk_write(di, (char *)&fakept, 0, PAGE_SIZE); if (error) goto fail; /* Dump bitmap */ error = blk_write(di, (char *)vm_page_dump, 0, round_page(BITSET_SIZE(vm_page_dump_pages))); if (error) goto fail; /* Dump kernel page table pages */ for (va = KERNBASE; va < kernel_vm_end; va += NBPDR) { /* We always write a page, even if it is zero */ pd = IdlePTD; /* always mapped! */ j = va >> PDRSHIFT; if ((pd[j] & (PG_PS | PG_V)) == (PG_PS | PG_V)) { /* This is a single 2M block. Generate a fake PTP */ pa = pd[j] & PG_PS_FRAME; for (k = 0; k < NPTEPG; k++) { fakept[k] = (pa + (k * PAGE_SIZE)) | PG_V | PG_RW | PG_A | PG_M; } error = blk_write(di, (char *)&fakept, 0, PAGE_SIZE); if (error) goto fail; /* flush, in case we reuse fakept in the same block */ error = blk_flush(di); if (error) goto fail; continue; } if ((pd[j] & PG_V) == PG_V) { pa = pd[j] & PG_FRAME; error = blk_write(di, 0, pa, PAGE_SIZE); if (error) goto fail; } else { bzero(fakept, sizeof(fakept)); error = blk_write(di, (char *)&fakept, 0, PAGE_SIZE); if (error) goto fail; /* flush, in case we reuse fakept in the same block */ error = blk_flush(di); if (error) goto fail; } } /* Dump memory chunks */ VM_PAGE_DUMP_FOREACH(pa) { error = blk_write(di, 0, pa, PAGE_SIZE); if (error) goto fail; } error = blk_flush(di); if (error) goto fail; error = dump_finish(di, &kdh); if (error != 0) goto fail; printf("\nDump complete\n"); return (0); fail: if (error < 0) error = -error; if (error == ECANCELED) printf("\nDump aborted\n"); else if (error == E2BIG || error == ENOSPC) { printf("\nDump failed. Partition too small (about %lluMB were " "needed this time).\n", (long long)dumpsize >> 20); } else printf("\n** DUMP FAILED (ERROR %d) **\n", error); return (error); } Index: head/sys/kern/kern_dump.c =================================================================== --- head/sys/kern/kern_dump.c (revision 366710) +++ head/sys/kern/kern_dump.c (revision 366711) @@ -1,386 +1,387 @@ /*- * Copyright (c) 2002 Marcel Moolenaar * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #include CTASSERT(sizeof(struct kerneldumpheader) == 512); #define MD_ALIGN(x) roundup2((off_t)(x), PAGE_SIZE) /* Handle buffered writes. */ static size_t fragsz; struct dump_pa dump_map[DUMPSYS_MD_PA_NPAIRS]; #if !defined(__powerpc__) void dumpsys_gen_pa_init(void) { int n, idx; bzero(dump_map, sizeof(dump_map)); for (n = 0; n < nitems(dump_map); n++) { idx = n * 2; if (dump_avail[idx] == 0 && dump_avail[idx + 1] == 0) break; dump_map[n].pa_start = dump_avail[idx]; dump_map[n].pa_size = dump_avail[idx + 1] - dump_avail[idx]; } } #endif struct dump_pa * dumpsys_gen_pa_next(struct dump_pa *mdp) { if (mdp == NULL) return (&dump_map[0]); mdp++; if (mdp->pa_size == 0) mdp = NULL; return (mdp); } void dumpsys_gen_wbinv_all(void) { } void dumpsys_gen_unmap_chunk(vm_paddr_t pa __unused, size_t chunk __unused, void *va __unused) { } int dumpsys_gen_write_aux_headers(struct dumperinfo *di) { return (0); } int dumpsys_buf_seek(struct dumperinfo *di, size_t sz) { static uint8_t buf[DEV_BSIZE]; size_t nbytes; int error; bzero(buf, sizeof(buf)); while (sz > 0) { nbytes = MIN(sz, sizeof(buf)); error = dump_append(di, buf, 0, nbytes); if (error) return (error); sz -= nbytes; } return (0); } int dumpsys_buf_write(struct dumperinfo *di, char *ptr, size_t sz) { size_t len; int error; while (sz) { len = di->blocksize - fragsz; if (len > sz) len = sz; memcpy((char *)di->blockbuf + fragsz, ptr, len); fragsz += len; ptr += len; sz -= len; if (fragsz == di->blocksize) { error = dump_append(di, di->blockbuf, 0, di->blocksize); if (error) return (error); fragsz = 0; } } return (0); } int dumpsys_buf_flush(struct dumperinfo *di) { int error; if (fragsz == 0) return (0); error = dump_append(di, di->blockbuf, 0, di->blocksize); fragsz = 0; return (error); } CTASSERT(PAGE_SHIFT < 20); #define PG2MB(pgs) ((pgs + (1 << (20 - PAGE_SHIFT)) - 1) >> (20 - PAGE_SHIFT)) int dumpsys_cb_dumpdata(struct dump_pa *mdp, int seqnr, void *arg) { struct dumperinfo *di = (struct dumperinfo*)arg; vm_paddr_t pa; void *va; uint64_t pgs; size_t counter, sz, chunk; int c, error; u_int maxdumppgs; error = 0; /* catch case in which chunk size is 0 */ counter = 0; /* Update twiddle every 16MB */ va = NULL; pgs = mdp->pa_size / PAGE_SIZE; pa = mdp->pa_start; maxdumppgs = min(di->maxiosize / PAGE_SIZE, MAXDUMPPGS); if (maxdumppgs == 0) /* seatbelt */ maxdumppgs = 1; printf(" chunk %d: %juMB (%ju pages)", seqnr, (uintmax_t)PG2MB(pgs), (uintmax_t)pgs); dumpsys_wbinv_all(); while (pgs) { chunk = pgs; if (chunk > maxdumppgs) chunk = maxdumppgs; sz = chunk << PAGE_SHIFT; counter += sz; if (counter >> 24) { printf(" %ju", (uintmax_t)PG2MB(pgs)); counter &= (1 << 24) - 1; } dumpsys_map_chunk(pa, chunk, &va); wdog_kern_pat(WD_LASTVAL); error = dump_append(di, va, 0, sz); dumpsys_unmap_chunk(pa, chunk, va); if (error) break; pgs -= chunk; pa += sz; /* Check for user abort. */ c = cncheckc(); if (c == 0x03) return (ECANCELED); if (c != -1) printf(" (CTRL-C to abort) "); } printf(" ... %s\n", (error) ? "fail" : "ok"); return (error); } int dumpsys_foreach_chunk(dumpsys_callback_t cb, void *arg) { struct dump_pa *mdp; int error, seqnr; seqnr = 0; mdp = dumpsys_pa_next(NULL); while (mdp != NULL) { error = (*cb)(mdp, seqnr++, arg); if (error) return (-error); mdp = dumpsys_pa_next(mdp); } return (seqnr); } static off_t fileofs; static int cb_dumphdr(struct dump_pa *mdp, int seqnr, void *arg) { struct dumperinfo *di = (struct dumperinfo*)arg; Elf_Phdr phdr; uint64_t size; int error; size = mdp->pa_size; bzero(&phdr, sizeof(phdr)); phdr.p_type = PT_LOAD; phdr.p_flags = PF_R; /* XXX */ phdr.p_offset = fileofs; #ifdef __powerpc__ phdr.p_vaddr = (do_minidump? mdp->pa_start : ~0L); phdr.p_paddr = (do_minidump? ~0L : mdp->pa_start); #else phdr.p_vaddr = mdp->pa_start; phdr.p_paddr = mdp->pa_start; #endif phdr.p_filesz = size; phdr.p_memsz = size; phdr.p_align = PAGE_SIZE; error = dumpsys_buf_write(di, (char*)&phdr, sizeof(phdr)); fileofs += phdr.p_filesz; return (error); } static int cb_size(struct dump_pa *mdp, int seqnr, void *arg) { uint64_t *sz; sz = (uint64_t *)arg; *sz += (uint64_t)mdp->pa_size; return (0); } int dumpsys_generic(struct dumperinfo *di) { static struct kerneldumpheader kdh; Elf_Ehdr ehdr; uint64_t dumpsize; off_t hdrgap; size_t hdrsz; int error; #if !defined(__powerpc__) || defined(__powerpc64__) if (do_minidump) return (minidumpsys(di)); #endif bzero(&ehdr, sizeof(ehdr)); ehdr.e_ident[EI_MAG0] = ELFMAG0; ehdr.e_ident[EI_MAG1] = ELFMAG1; ehdr.e_ident[EI_MAG2] = ELFMAG2; ehdr.e_ident[EI_MAG3] = ELFMAG3; ehdr.e_ident[EI_CLASS] = ELF_CLASS; #if BYTE_ORDER == LITTLE_ENDIAN ehdr.e_ident[EI_DATA] = ELFDATA2LSB; #else ehdr.e_ident[EI_DATA] = ELFDATA2MSB; #endif ehdr.e_ident[EI_VERSION] = EV_CURRENT; ehdr.e_ident[EI_OSABI] = ELFOSABI_STANDALONE; /* XXX big picture? */ ehdr.e_type = ET_CORE; ehdr.e_machine = EM_VALUE; ehdr.e_phoff = sizeof(ehdr); ehdr.e_flags = 0; ehdr.e_ehsize = sizeof(ehdr); ehdr.e_phentsize = sizeof(Elf_Phdr); ehdr.e_shentsize = sizeof(Elf_Shdr); dumpsys_pa_init(); /* Calculate dump size. */ dumpsize = 0L; ehdr.e_phnum = dumpsys_foreach_chunk(cb_size, &dumpsize) + DUMPSYS_NUM_AUX_HDRS; hdrsz = ehdr.e_phoff + ehdr.e_phnum * ehdr.e_phentsize; fileofs = MD_ALIGN(hdrsz); dumpsize += fileofs; hdrgap = fileofs - roundup2((off_t)hdrsz, di->blocksize); dump_init_header(di, &kdh, KERNELDUMPMAGIC, KERNELDUMP_ARCH_VERSION, dumpsize); error = dump_start(di, &kdh); if (error != 0) goto fail; printf("Dumping %ju MB (%d chunks)\n", (uintmax_t)dumpsize >> 20, ehdr.e_phnum - DUMPSYS_NUM_AUX_HDRS); /* Dump ELF header */ error = dumpsys_buf_write(di, (char*)&ehdr, sizeof(ehdr)); if (error) goto fail; /* Dump program headers */ error = dumpsys_foreach_chunk(cb_dumphdr, di); if (error < 0) goto fail; error = dumpsys_write_aux_headers(di); if (error < 0) goto fail; dumpsys_buf_flush(di); /* * All headers are written using blocked I/O, so we know the * current offset is (still) block aligned. Skip the alignement * in the file to have the segment contents aligned at page * boundary. */ error = dumpsys_buf_seek(di, (size_t)hdrgap); if (error) goto fail; /* Dump memory chunks. */ error = dumpsys_foreach_chunk(dumpsys_cb_dumpdata, di); if (error < 0) goto fail; error = dump_finish(di, &kdh); if (error != 0) goto fail; printf("\nDump complete\n"); return (0); fail: if (error < 0) error = -error; if (error == ECANCELED) printf("\nDump aborted\n"); else if (error == E2BIG || error == ENOSPC) printf("\nDump failed. Partition too small.\n"); else printf("\n** DUMP FAILED (ERROR %d) **\n", error); return (error); } Index: head/sys/kern/subr_physmem.c =================================================================== --- head/sys/kern/subr_physmem.c (revision 366710) +++ head/sys/kern/subr_physmem.c (revision 366711) @@ -1,399 +1,400 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2014 Ian Lepore * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_acpi.h" #include "opt_ddb.h" /* * Routines for describing and initializing anything related to physical memory. */ #include #include #include #include #include #include #include +#include #include /* * These structures are used internally to keep track of regions of physical * ram, and regions within the physical ram that need to be excluded. An * exclusion region can be excluded from crash dumps, from the vm pool of pages * that can be allocated, or both, depending on the exclusion flags associated * with the region. */ #ifdef DEV_ACPI #define MAX_HWCNT 32 /* ACPI needs more regions */ #define MAX_EXCNT 32 #else #define MAX_HWCNT 16 #define MAX_EXCNT 16 #endif #if defined(__arm__) #define MAX_PHYS_ADDR 0xFFFFFFFFull #elif defined(__aarch64__) || defined(__riscv) #define MAX_PHYS_ADDR 0xFFFFFFFFFFFFFFFFull #endif struct region { vm_paddr_t addr; vm_size_t size; uint32_t flags; }; static struct region hwregions[MAX_HWCNT]; static struct region exregions[MAX_EXCNT]; static size_t hwcnt; static size_t excnt; /* * realmem is the total number of hardware pages, excluded or not. * Maxmem is one greater than the last physical page number. */ long realmem; long Maxmem; /* * Print the contents of the physical and excluded region tables using the * provided printf-like output function (which will be either printf or * db_printf). */ static void physmem_dump_tables(int (*prfunc)(const char *, ...)) { int flags, i; uintmax_t addr, size; const unsigned int mbyte = 1024 * 1024; prfunc("Physical memory chunk(s):\n"); for (i = 0; i < hwcnt; ++i) { addr = hwregions[i].addr; size = hwregions[i].size; prfunc(" 0x%08jx - 0x%08jx, %5ju MB (%7ju pages)\n", addr, addr + size - 1, size / mbyte, size / PAGE_SIZE); } prfunc("Excluded memory regions:\n"); for (i = 0; i < excnt; ++i) { addr = exregions[i].addr; size = exregions[i].size; flags = exregions[i].flags; prfunc(" 0x%08jx - 0x%08jx, %5ju MB (%7ju pages) %s %s\n", addr, addr + size - 1, size / mbyte, size / PAGE_SIZE, (flags & EXFLAG_NOALLOC) ? "NoAlloc" : "", (flags & EXFLAG_NODUMP) ? "NoDump" : ""); } #ifdef DEBUG prfunc("Avail lists:\n"); for (i = 0; phys_avail[i] != 0; ++i) { prfunc(" phys_avail[%d] 0x%08x\n", i, phys_avail[i]); } for (i = 0; dump_avail[i] != 0; ++i) { prfunc(" dump_avail[%d] 0x%08x\n", i, dump_avail[i]); } #endif } /* * Print the contents of the static mapping table. Used for bootverbose. */ void physmem_print_tables(void) { physmem_dump_tables(printf); } /* * Walk the list of hardware regions, processing it against the list of * exclusions that contain the given exflags, and generating an "avail list". * * Updates the value at *pavail with the sum of all pages in all hw regions. * * Returns the number of pages of non-excluded memory added to the avail list. */ static size_t regions_to_avail(vm_paddr_t *avail, uint32_t exflags, size_t maxavail, long *pavail, long *prealmem) { size_t acnt, exi, hwi; uint64_t end, start, xend, xstart; long availmem, totalmem; const struct region *exp, *hwp; totalmem = 0; availmem = 0; acnt = 0; for (hwi = 0, hwp = hwregions; hwi < hwcnt; ++hwi, ++hwp) { start = hwp->addr; end = hwp->size + start; totalmem += atop((vm_offset_t)(end - start)); for (exi = 0, exp = exregions; exi < excnt; ++exi, ++exp) { /* * If the excluded region does not match given flags, * continue checking with the next excluded region. */ if ((exp->flags & exflags) == 0) continue; xstart = exp->addr; xend = exp->size + xstart; /* * If the excluded region ends before this hw region, * continue checking with the next excluded region. */ if (xend <= start) continue; /* * If the excluded region begins after this hw region * we're done because both lists are sorted. */ if (xstart >= end) break; /* * If the excluded region completely covers this hw * region, shrink this hw region to zero size. */ if ((start >= xstart) && (end <= xend)) { start = xend; end = xend; break; } /* * If the excluded region falls wholly within this hw * region without abutting or overlapping the beginning * or end, create an available entry from the leading * fragment, then adjust the start of this hw region to * the end of the excluded region, and continue checking * the next excluded region because another exclusion * could affect the remainder of this hw region. */ if ((xstart > start) && (xend < end)) { if (acnt > 0 && avail[acnt - 1] == (vm_paddr_t)start) { avail[acnt - 1] = (vm_paddr_t)xstart; } else { avail[acnt++] = (vm_paddr_t)start; avail[acnt++] = (vm_paddr_t)xstart; } availmem += atop((vm_offset_t)(xstart - start)); start = xend; continue; } /* * We know the excluded region overlaps either the start * or end of this hardware region (but not both), trim * the excluded portion off the appropriate end. */ if (xstart <= start) start = xend; else end = xstart; } /* * If the trimming actions above left a non-zero size, create an * available entry for it. */ if (end > start) { if (acnt > 0 && avail[acnt - 1] == (vm_paddr_t)start) { avail[acnt - 1] = (vm_paddr_t)end; } else { avail[acnt++] = (vm_paddr_t)start; avail[acnt++] = (vm_paddr_t)end; } availmem += atop((vm_offset_t)(end - start)); } if (acnt >= maxavail) panic("Not enough space in the dump/phys_avail arrays"); } if (pavail != NULL) *pavail = availmem; if (prealmem != NULL) *prealmem = totalmem; return (acnt); } /* * Insertion-sort a new entry into a regions list; sorted by start address. */ static size_t insert_region(struct region *regions, size_t rcnt, vm_paddr_t addr, vm_size_t size, uint32_t flags) { size_t i; struct region *ep, *rp; ep = regions + rcnt; for (i = 0, rp = regions; i < rcnt; ++i, ++rp) { if (rp->addr == addr && rp->size == size) /* Pure dup. */ return (rcnt); if (flags == rp->flags) { if (addr + size == rp->addr) { rp->addr = addr; rp->size += size; return (rcnt); } else if (rp->addr + rp->size == addr) { rp->size += size; return (rcnt); } } if (addr < rp->addr) { bcopy(rp, rp + 1, (ep - rp) * sizeof(*rp)); break; } } rp->addr = addr; rp->size = size; rp->flags = flags; rcnt++; return (rcnt); } /* * Add a hardware memory region. */ void physmem_hardware_region(uint64_t pa, uint64_t sz) { vm_offset_t adj; /* * Filter out the page at PA 0x00000000. The VM can't handle it, as * pmap_extract() == 0 means failure. */ if (pa == 0) { if (sz <= PAGE_SIZE) return; pa = PAGE_SIZE; sz -= PAGE_SIZE; } else if (pa > MAX_PHYS_ADDR) { /* This range is past usable memory, ignore it */ return; } /* * Also filter out the page at the end of the physical address space -- * if addr is non-zero and addr+size is zero we wrapped to the next byte * beyond what vm_paddr_t can express. That leads to a NULL pointer * deref early in startup; work around it by leaving the last page out. * * XXX This just in: subtract out a whole megabyte, not just 1 page. * Reducing the size by anything less than 1MB results in the NULL * pointer deref in _vm_map_lock_read(). Better to give up a megabyte * than leave some folks with an unusable system while we investigate. */ if ((pa + sz) > (MAX_PHYS_ADDR - 1024 * 1024)) { sz = MAX_PHYS_ADDR - pa + 1; if (sz <= 1024 * 1024) return; sz -= 1024 * 1024; } /* * Round the starting address up to a page boundary, and truncate the * ending page down to a page boundary. */ adj = round_page(pa) - pa; pa = round_page(pa); sz = trunc_page(sz - adj); if (sz > 0 && hwcnt < nitems(hwregions)) hwcnt = insert_region(hwregions, hwcnt, pa, sz, 0); } /* * Add an exclusion region. */ void physmem_exclude_region(vm_paddr_t pa, vm_size_t sz, uint32_t exflags) { vm_offset_t adj; /* * Truncate the starting address down to a page boundary, and round the * ending page up to a page boundary. */ adj = pa - trunc_page(pa); pa = trunc_page(pa); sz = round_page(sz + adj); if (excnt >= nitems(exregions)) panic("failed to exclude region %#jx-%#jx", (uintmax_t)pa, (uintmax_t)(pa + sz)); excnt = insert_region(exregions, excnt, pa, sz, exflags); } size_t physmem_avail(vm_paddr_t *avail, size_t maxavail) { return (regions_to_avail(avail, EXFLAG_NOALLOC, maxavail, NULL, NULL)); } /* * Process all the regions added earlier into the global avail lists. * * Updates the kernel global 'physmem' with the number of physical pages * available for use (all pages not in any exclusion region). * * Updates the kernel global 'Maxmem' with the page number one greater then the * last page of physical memory in the system. */ void physmem_init_kernel_globals(void) { size_t nextidx; regions_to_avail(dump_avail, EXFLAG_NODUMP, PHYS_AVAIL_ENTRIES, NULL, NULL); nextidx = regions_to_avail(phys_avail, EXFLAG_NOALLOC, PHYS_AVAIL_ENTRIES, &physmem, &realmem); if (nextidx == 0) panic("No memory entries in phys_avail"); Maxmem = atop(phys_avail[nextidx - 1]); } #ifdef DDB #include DB_SHOW_COMMAND(physmem, db_show_physmem) { physmem_dump_tables(db_printf); } #endif /* DDB */ Index: head/sys/mips/atheros/ar531x/ar5315_machdep.c =================================================================== --- head/sys/mips/atheros/ar531x/ar5315_machdep.c (revision 366710) +++ head/sys/mips/atheros/ar531x/ar5315_machdep.c (revision 366711) @@ -1,322 +1,323 @@ /*- * Copyright (c) 2016, Hiroki Mori * Copyright (c) 2009 Oleksandr Tymoshenko * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_ar531x.h" #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #include #include #include #include #include #include extern char edata[], end[]; uint32_t ar711_base_mac[ETHER_ADDR_LEN]; /* 4KB static data aread to keep a copy of the bootload env until the dynamic kenv is setup */ char boot1_env[4096]; void platform_cpu_init() { /* Nothing special */ } void platform_reset(void) { ar531x_device_reset(); /* Wait for reset */ while(1) ; } /* * Obtain the MAC address via the Redboot environment. */ static void ar5315_redboot_get_macaddr(void) { char *var; int count = 0; /* * "ethaddr" is passed via envp on RedBoot platforms * "kmac" is passed via argv on RouterBOOT platforms */ if ((var = kern_getenv("ethaddr")) != NULL || (var = kern_getenv("kmac")) != NULL) { count = sscanf(var, "%x%*c%x%*c%x%*c%x%*c%x%*c%x", &ar711_base_mac[0], &ar711_base_mac[1], &ar711_base_mac[2], &ar711_base_mac[3], &ar711_base_mac[4], &ar711_base_mac[5]); if (count < 6) memset(ar711_base_mac, 0, sizeof(ar711_base_mac)); freeenv(var); } } #if defined(SOC_VENDOR) || defined(SOC_MODEL) || defined(SOC_REV) static SYSCTL_NODE(_hw, OID_AUTO, soc, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "System on Chip information"); #endif #if defined(SOC_VENDOR) static char hw_soc_vendor[] = SOC_VENDOR; SYSCTL_STRING(_hw_soc, OID_AUTO, vendor, CTLFLAG_RD, hw_soc_vendor, 0, "SoC vendor"); #endif #if defined(SOC_MODEL) static char hw_soc_model[] = SOC_MODEL; SYSCTL_STRING(_hw_soc, OID_AUTO, model, CTLFLAG_RD, hw_soc_model, 0, "SoC model"); #endif #if defined(SOC_REV) static char hw_soc_revision[] = SOC_REV; SYSCTL_STRING(_hw_soc, OID_AUTO, revision, CTLFLAG_RD, hw_soc_revision, 0, "SoC revision"); #endif #if defined(DEVICE_VENDOR) || defined(DEVICE_MODEL) || defined(DEVICE_REV) static SYSCTL_NODE(_hw, OID_AUTO, device, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "Board information"); #endif #if defined(DEVICE_VENDOR) static char hw_device_vendor[] = DEVICE_VENDOR; SYSCTL_STRING(_hw_device, OID_AUTO, vendor, CTLFLAG_RD, hw_device_vendor, 0, "Board vendor"); #endif #if defined(DEVICE_MODEL) static char hw_device_model[] = DEVICE_MODEL; SYSCTL_STRING(_hw_device, OID_AUTO, model, CTLFLAG_RD, hw_device_model, 0, "Board model"); #endif #if defined(DEVICE_REV) static char hw_device_revision[] = DEVICE_REV; SYSCTL_STRING(_hw_device, OID_AUTO, revision, CTLFLAG_RD, hw_device_revision, 0, "Board revision"); #endif void platform_start(__register_t a0 __unused, __register_t a1 __unused, __register_t a2 __unused, __register_t a3 __unused) { uint64_t platform_counter_freq; int argc = 0, i; char **argv = NULL; #ifndef AR531X_ENV_UBOOT char **envp = NULL; #endif vm_offset_t kernend; /* * clear the BSS and SBSS segments, this should be first call in * the function */ kernend = (vm_offset_t)&end; memset(&edata, 0, kernend - (vm_offset_t)(&edata)); mips_postboot_fixup(); /* Initialize pcpu stuff */ mips_pcpu0_init(); /* * Until some more sensible abstractions for uboot/redboot * environment handling, we have to make this a compile-time * hack. The existing code handles the uboot environment * very incorrectly so we should just ignore initialising * the relevant pointers. */ #ifndef AR531X_ENV_UBOOT argc = a0; argv = (char**)a1; envp = (char**)a2; #endif /* * Protect ourselves from garbage in registers */ if (MIPS_IS_VALID_PTR(envp)) { for (i = 0; envp[i]; i += 2) { if (strcmp(envp[i], "memsize") == 0) realmem = btoc(strtoul(envp[i+1], NULL, 16)); } } ar5315_detect_sys_type(); // RedBoot SDRAM Detect is missing // ar531x_detect_mem_size(); /* * Just wild guess. RedBoot let us down and didn't reported * memory size */ if (realmem == 0) realmem = btoc(16*1024*1024); /* * Allow build-time override in case Redboot lies * or in other situations (eg where there's u-boot) * where there isn't (yet) a convienent method of * being told how much RAM is available. * * This happens on at least the Ubiquiti LS-SR71A * board, where redboot says there's 16mb of RAM * but in fact there's 32mb. */ #if defined(AR531X_REALMEM) realmem = btoc(AR531X_REALMEM); #endif /* phys_avail regions are in bytes */ phys_avail[0] = MIPS_KSEG0_TO_PHYS(kernel_kseg0_end); phys_avail[1] = ctob(realmem); dump_avail[0] = phys_avail[0]; dump_avail[1] = phys_avail[1] - phys_avail[0]; physmem = realmem; /* * ns8250 uart code uses DELAY so ticker should be inititalized * before cninit. And tick_init_params refers to hz, so * init_param1 * should be called first. */ init_param1(); boothowto |= (RB_SERIAL | RB_MULTIPLE); /* Use multiple consoles */ // boothowto |= RB_VERBOSE; // boothowto |= (RB_SINGLE); /* Detect the system type - this is needed for subsequent chipset-specific calls */ ar531x_device_soc_init(); ar531x_detect_sys_frequency(); platform_counter_freq = ar531x_cpu_freq(); mips_timer_init_params(platform_counter_freq, 1); cninit(); init_static_kenv(boot1_env, sizeof(boot1_env)); printf("CPU platform: %s\n", ar5315_get_system_type()); printf("CPU Frequency=%d MHz\n", ar531x_cpu_freq() / 1000000); printf("CPU DDR Frequency=%d MHz\n", ar531x_ddr_freq() / 1000000); printf("CPU AHB Frequency=%d MHz\n", ar531x_ahb_freq() / 1000000); printf("platform frequency: %lld\n", platform_counter_freq); printf("arguments: \n"); printf(" a0 = %08x\n", a0); printf(" a1 = %08x\n", a1); printf(" a2 = %08x\n", a2); printf(" a3 = %08x\n", a3); strcpy(cpu_model, ar5315_get_system_type()); /* * XXX this code is very redboot specific. */ printf("Cmd line:"); if (MIPS_IS_VALID_PTR(argv)) { for (i = 0; i < argc; i++) { printf(" %s", argv[i]); boothowto |= boot_parse_arg(argv[i]); } } else printf ("argv is invalid"); printf("\n"); printf("Environment:\n"); #if 0 if (MIPS_IS_VALID_PTR(envp)) { if (envp[0] && strchr(envp[0], '=') ) { char *env_val; // for (i = 0; envp[i]; i++) { env_val = strchr(envp[i], '='); /* Not sure if we correct to change data, but env in RAM */ *(env_val++) = '\0'; printf("= %s = %s\n", envp[i], env_val); kern_setenv(envp[i], env_val); } } else { for (i = 0; envp[i]; i+=2) { printf(" %s = %s\n", envp[i], envp[i+1]); kern_setenv(envp[i], envp[i+1]); } } } else printf ("envp is invalid\n"); #else printf ("envp skiped\n"); #endif /* Redboot if_are MAC address is in the environment */ ar5315_redboot_get_macaddr(); init_param2(physmem); mips_cpu_init(); pmap_bootstrap(); mips_proc0_init(); mutex_init(); ar531x_device_start(); kdb_init(); #ifdef KDB if (boothowto & RB_KDB) kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger"); #endif } Index: head/sys/mips/atheros/ar71xx_machdep.c =================================================================== --- head/sys/mips/atheros/ar71xx_machdep.c (revision 366710) +++ head/sys/mips/atheros/ar71xx_machdep.c (revision 366711) @@ -1,453 +1,454 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2009 Oleksandr Tymoshenko * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_ar71xx.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #include #include #include #include #include #include #include extern char edata[], end[]; /* 4KB static data aread to keep a copy of the bootload env until the dynamic kenv is setup */ char boot1_env[4096]; void platform_cpu_init() { /* Nothing special */ } void platform_reset(void) { ar71xx_device_stop(RST_RESET_FULL_CHIP); /* Wait for reset */ while(1) ; } /* * Obtain the MAC address via the Redboot environment. */ static int ar71xx_redboot_get_macaddr(void) { char *var; int count = 0, i; uint32_t macaddr[ETHER_ADDR_LEN]; uint8_t tmpmac[ETHER_ADDR_LEN]; /* * "ethaddr" is passed via envp on RedBoot platforms * "kmac" is passed via argv on RouterBOOT platforms */ if ((var = kern_getenv("ethaddr")) != NULL || (var = kern_getenv("kmac")) != NULL) { count = sscanf(var, "%x%*c%x%*c%x%*c%x%*c%x%*c%x", &macaddr[0], &macaddr[1], &macaddr[2], &macaddr[3], &macaddr[4], &macaddr[5]); if (count < 6) { memset(macaddr, 0, sizeof(macaddr)); } else { for (i = 0; i < ETHER_ADDR_LEN; i++) tmpmac[i] = macaddr[i] & 0xff; (void) ar71xx_mac_addr_init(ar71xx_board_mac_addr, tmpmac, 0, /* offset */ 0); /* is_local */ } freeenv(var); return (0); } return (-1); } #ifdef AR71XX_ENV_ROUTERBOOT /* * RouterBoot gives us the board memory in a command line argument. */ static int ar71xx_routerboot_get_mem(int argc, char **argv) { int i, board_mem; /* * Protect ourselves from garbage in registers. */ if (!MIPS_IS_VALID_PTR(argv)) return (0); for (i = 0; i < argc; i++) { if (argv[i] == NULL) continue; if (strncmp(argv[i], "mem=", 4) == 0) { if (sscanf(argv[i] + 4, "%dM", &board_mem) == 1) return (btoc(board_mem * 1024 * 1024)); } } return (0); } #endif /* * Handle initialising the MAC address from a specific EEPROM * offset. * * This is done during (very) early boot. * * hint.ar71xx.0.eeprom_mac_addr=
* hint.ar71xx.0.eeprom_mac_isascii=<0|1> */ static int ar71xx_platform_read_eeprom_mac(void) { long eeprom_mac_addr = 0; const char *mac; int i, readascii = 0; uint8_t macaddr[ETHER_ADDR_LEN]; if (resource_long_value("ar71xx", 0, "eeprom_mac_addr", &eeprom_mac_addr) != 0) return (-1); /* get a pointer to the EEPROM MAC address */ mac = (const char *) MIPS_PHYS_TO_KSEG1(eeprom_mac_addr); /* Check if it's ASCII or not */ if (resource_int_value("ar71xx", 0, "eeprom_mac_isascii", &readascii) == 0 && readascii == 1) { printf("ar71xx: Overriding MAC from EEPROM (ascii)\n"); for (i = 0; i < 6; i++) { macaddr[i] = strtol(&(mac[i*3]), NULL, 16); } } else { printf("ar71xx: Overriding MAC from EEPROM\n"); for (i = 0; i < 6; i++) { macaddr[i] = mac[i]; } } /* Set the default board MAC */ (void) ar71xx_mac_addr_init(ar71xx_board_mac_addr, macaddr, 0, /* offset */ 0); /* is_local */ printf("ar71xx: Board MAC: %6D\n", ar71xx_board_mac_addr, ":"); return (0); } /* * Populate a kenv hint for the given device based on the given * MAC address and offset. * * Returns 0 if ok, < 0 on error. */ static int ar71xx_platform_set_mac_hint(const char *dev, int unit, const uint8_t *macaddr, int offset, int islocal) { char macstr[32]; uint8_t lclmac[ETHER_ADDR_LEN]; char devstr[32]; /* Initialise the MAC address, plus/minus the offset */ if (ar71xx_mac_addr_init(lclmac, macaddr, offset, islocal) != 0) { return (-1); } /* Turn it into a string */ snprintf(macstr, 32, "%6D", lclmac, ":"); snprintf(devstr, 32, "hint.%s.%d.macaddr", dev, unit); printf(" %s => %s\n", devstr, macstr); /* Call setenv */ if (kern_setenv(devstr, macstr) != 0) { printf("%s: failed to set hint (%s => %s)\n", __func__, devstr, macstr); return (-1); } return (0); } /* * Iterate through the list of boot time hints that populate * a device MAC address hint based on the "board" MAC address. * * ar71xx_mac_map.X.devid= * ar71xx_mac_map.X.unitid= * ar71xx_mac_map.X.offset= * ar71xx_mac_map.X.is_local=<1 or 0> */ static int ar71xx_platform_check_mac_hints(void) { int i; const char *devid; int offset, is_local, unitid; for (i = 0; i < 8; i++) { if (resource_string_value("ar71xx_mac_map", i, "devid", &devid) != 0) break; if (resource_int_value("ar71xx_mac_map", i, "unitid", &unitid) != 0) break; if (resource_int_value("ar71xx_mac_map", i, "offset", &offset) != 0) break; if (resource_int_value("ar71xx_mac_map", i, "is_local", &is_local) != 0) break; printf("ar71xx: devid '%s.%d', MAC offset '%d'\n", devid, unitid, offset); (void) ar71xx_platform_set_mac_hint(devid, unitid, ar71xx_board_mac_addr, offset, is_local); } return (0); } void platform_start(__register_t a0 __unused, __register_t a1 __unused, __register_t a2 __unused, __register_t a3 __unused) { uint64_t platform_counter_freq; int argc = 0, i; char **argv = NULL, **envp = NULL; vm_offset_t kernend; /* * clear the BSS and SBSS segments, this should be first call in * the function */ kernend = (vm_offset_t)&end; memset(&edata, 0, kernend - (vm_offset_t)(&edata)); mips_postboot_fixup(); /* Initialize pcpu stuff */ mips_pcpu0_init(); /* * Until some more sensible abstractions for uboot/redboot * environment handling, we have to make this a compile-time * hack. The existing code handles the uboot environment * very incorrectly so we should just ignore initialising * the relevant pointers. */ #ifndef AR71XX_ENV_UBOOT argc = a0; argv = (char**)a1; envp = (char**)a2; #endif /* * Protect ourselves from garbage in registers */ if (MIPS_IS_VALID_PTR(envp)) { for (i = 0; envp[i]; i += 2) { if (strcmp(envp[i], "memsize") == 0) realmem = btoc(strtoul(envp[i+1], NULL, 16)); else if (strcmp(envp[i], "bootverbose") == 0) bootverbose = btoc(strtoul(envp[i+1], NULL, 10)); } } bootverbose = 1; #ifdef AR71XX_ENV_ROUTERBOOT /* * RouterBoot informs the board memory as a command line argument. */ if (realmem == 0) realmem = ar71xx_routerboot_get_mem(argc, argv); #endif /* * Just wild guess. RedBoot let us down and didn't reported * memory size */ if (realmem == 0) realmem = btoc(32*1024*1024); /* * Allow build-time override in case Redboot lies * or in other situations (eg where there's u-boot) * where there isn't (yet) a convienent method of * being told how much RAM is available. * * This happens on at least the Ubiquiti LS-SR71A * board, where redboot says there's 16mb of RAM * but in fact there's 32mb. */ #if defined(AR71XX_REALMEM) realmem = btoc(AR71XX_REALMEM); #endif /* phys_avail regions are in bytes */ phys_avail[0] = MIPS_KSEG0_TO_PHYS(kernel_kseg0_end); phys_avail[1] = ctob(realmem); dump_avail[0] = 0; dump_avail[1] = phys_avail[1]; physmem = realmem; /* * ns8250 uart code uses DELAY so ticker should be inititalized * before cninit. And tick_init_params refers to hz, so * init_param1 * should be called first. */ init_param1(); /* Detect the system type - this is needed for subsequent chipset-specific calls */ ar71xx_detect_sys_type(); ar71xx_detect_sys_frequency(); platform_counter_freq = ar71xx_cpu_freq(); mips_timer_init_params(platform_counter_freq, 1); cninit(); init_static_kenv(boot1_env, sizeof(boot1_env)); printf("CPU platform: %s\n", ar71xx_get_system_type()); printf("CPU Frequency=%d MHz\n", u_ar71xx_cpu_freq / 1000000); printf("CPU DDR Frequency=%d MHz\n", u_ar71xx_ddr_freq / 1000000); printf("CPU AHB Frequency=%d MHz\n", u_ar71xx_ahb_freq / 1000000); printf("platform frequency: %lld MHz\n", platform_counter_freq / 1000000); printf("CPU reference clock: %d MHz\n", u_ar71xx_refclk / 1000000); printf("CPU MDIO clock: %d MHz\n", u_ar71xx_mdio_freq / 1000000); printf("arguments: \n"); printf(" a0 = %08x\n", a0); printf(" a1 = %08x\n", a1); printf(" a2 = %08x\n", a2); printf(" a3 = %08x\n", a3); strcpy(cpu_model, ar71xx_get_system_type()); /* * XXX this code is very redboot specific. */ printf("Cmd line:"); if (MIPS_IS_VALID_PTR(argv)) { for (i = 0; i < argc; i++) { printf(" %s", argv[i]); boothowto |= boot_parse_arg(argv[i]); } } else printf ("argv is invalid"); printf("\n"); printf("Environment:\n"); if (MIPS_IS_VALID_PTR(envp)) { for (i = 0; envp[i]; i+=2) { printf(" %s = %s\n", envp[i], envp[i+1]); kern_setenv(envp[i], envp[i+1]); } } else printf ("envp is invalid\n"); /* Platform setup */ init_param2(physmem); mips_cpu_init(); pmap_bootstrap(); mips_proc0_init(); mutex_init(); /* * Reset USB devices */ ar71xx_init_usb_peripheral(); /* * Reset internal ethernet switch, if one exists */ ar71xx_reset_ethernet_switch(); /* * Initialise the gmac driver. */ ar71xx_init_gmac(); /* Redboot if_arge MAC address is in the environment */ (void) ar71xx_redboot_get_macaddr(); /* Various other boards need things to come out of EEPROM */ (void) ar71xx_platform_read_eeprom_mac(); /* Initialise the MAC address hint map */ ar71xx_platform_check_mac_hints(); kdb_init(); #ifdef KDB if (boothowto & RB_KDB) kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger"); #endif } Index: head/sys/mips/beri/beri_machdep.c =================================================================== --- head/sys/mips/beri/beri_machdep.c (revision 366710) +++ head/sys/mips/beri/beri_machdep.c (revision 366711) @@ -1,278 +1,279 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2006 Wojciech A. Koszek * Copyright (c) 2012-2014 Robert N. M. Watson * All rights reserved. * * This software was developed by SRI International and the University of * Cambridge Computer Laboratory under DARPA/AFRL contract (FA8750-10-C-0237) * ("CTSRD"), as part of the DARPA CRASH research programme. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_platform.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef FDT #include #include #include #endif #include #include #include #include #include +#include #include #include #include #include #include #include #include #include #include extern int *edata; extern int *end; void platform_cpu_init() { /* Nothing special */ } static void mips_init(void) { int i; #ifdef FDT struct mem_region mr[FDT_MEM_REGIONS]; uint64_t val; int mr_cnt; int j; #endif for (i = 0; i < 10; i++) { phys_avail[i] = 0; } /* phys_avail regions are in bytes */ phys_avail[0] = MIPS_KSEG0_TO_PHYS(kernel_kseg0_end); phys_avail[1] = ctob(realmem); dump_avail[0] = phys_avail[0]; dump_avail[1] = phys_avail[1]; physmem = realmem; #ifdef FDT if (fdt_get_mem_regions(mr, &mr_cnt, &val) == 0) { physmem = btoc(val); KASSERT((phys_avail[0] >= mr[0].mr_start) && \ (phys_avail[0] < (mr[0].mr_start + mr[0].mr_size)), ("First region is not within FDT memory range")); /* Limit size of the first region */ phys_avail[1] = (mr[0].mr_start + MIN(mr[0].mr_size, ctob(realmem))); dump_avail[1] = phys_avail[1]; /* Add the rest of regions */ for (i = 1, j = 2; i < mr_cnt; i++, j+=2) { phys_avail[j] = mr[i].mr_start; phys_avail[j+1] = (mr[i].mr_start + mr[i].mr_size); dump_avail[j] = phys_avail[j]; dump_avail[j+1] = phys_avail[j+1]; } } #endif init_param1(); init_param2(physmem); mips_cpu_init(); pmap_bootstrap(); mips_proc0_init(); mutex_init(); kdb_init(); #ifdef KDB if (boothowto & RB_KDB) kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger"); #endif } /* * Perform a board-level soft-reset. */ void platform_reset(void) { /* XXX SMP will likely require us to do more. */ __asm__ __volatile__( "mfc0 $k0, $12\n\t" "li $k1, 0x00100000\n\t" "or $k0, $k0, $k1\n\t" "mtc0 $k0, $12\n"); for( ; ; ) __asm__ __volatile("wait"); } void platform_start(__register_t a0, __register_t a1, __register_t a2, __register_t a3) { struct bootinfo *bootinfop; vm_offset_t kernend; uint64_t platform_counter_freq; int argc = a0; char **argv = (char **)a1; char **envp = (char **)a2; long memsize; #ifdef FDT vm_offset_t dtbp; void *kmdp; #endif int i; /* clear the BSS and SBSS segments */ kernend = (vm_offset_t)&end; memset(&edata, 0, kernend - (vm_offset_t)(&edata)); mips_postboot_fixup(); mips_pcpu0_init(); /* * Over time, we've changed out boot-time binary interface for the * kernel. Miniboot simply provides a 'memsize' in a3, whereas the * FreeBSD boot loader provides a 'bootinfo *' in a3. While slightly * grody, we support both here by detecting 'pointer-like' values in * a3 and assuming physical memory can never be that back. * * XXXRW: Pull more values than memsize out of bootinfop -- e.g., * module information. */ if (a3 >= 0x9800000000000000ULL) { bootinfop = (void *)a3; memsize = bootinfop->bi_memsize; preload_metadata = (caddr_t)bootinfop->bi_modulep; } else { bootinfop = NULL; memsize = a3; } #ifdef FDT /* * Find the dtb passed in by the boot loader (currently fictional). */ kmdp = preload_search_by_type("elf kernel"); dtbp = MD_FETCH(kmdp, MODINFOMD_DTBP, vm_offset_t); #if defined(FDT_DTB_STATIC) /* * In case the device tree blob was not retrieved (from metadata) try * to use the statically embedded one. */ if (dtbp == (vm_offset_t)NULL) dtbp = (vm_offset_t)&fdt_static_dtb; #else #error "Non-static FDT not yet supported on BERI" #endif if (OF_install(OFW_FDT, 0) == FALSE) while (1); if (OF_init((void *)dtbp) != 0) while (1); /* * Configure more boot-time parameters passed in by loader. */ boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int); init_static_kenv(MD_FETCH(kmdp, MODINFOMD_ENVP, char *), 0); /* * Get bootargs from FDT if specified. */ ofw_parse_bootargs(); #endif /* * XXXRW: We have no way to compare wallclock time to cycle rate on * BERI, so for now assume we run at the MALTA default (100MHz). */ platform_counter_freq = MIPS_DEFAULT_HZ; mips_timer_early_init(platform_counter_freq); cninit(); printf("entry: platform_start()\n"); bootverbose = 1; if (bootverbose) { printf("cmd line: "); for (i = 0; i < argc; i++) printf("%s ", argv[i]); printf("\n"); printf("envp:\n"); for (i = 0; envp[i]; i += 2) printf("\t%s = %s\n", envp[i], envp[i+1]); if (bootinfop != NULL) printf("bootinfo found at %p\n", bootinfop); printf("memsize = %p\n", (void *)memsize); } realmem = btoc(memsize); mips_init(); mips_timer_init_params(platform_counter_freq, 0); } Index: head/sys/mips/broadcom/bcm_machdep.c =================================================================== --- head/sys/mips/broadcom/bcm_machdep.c (revision 366710) +++ head/sys/mips/broadcom/bcm_machdep.c (revision 366711) @@ -1,652 +1,653 @@ /*- * Copyright (c) 2007 Bruce M. Simpson. * Copyright (c) 2016 Michael Zhilin * Copyright (c) 2016 Landon Fuller * Copyright (c) 2017 The FreeBSD Foundation * All rights reserved. * * Portions of this software were developed by Landon Fuller * under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "bcm_machdep.h" #include "bcm_bmips_exts.h" #ifdef CFE #include #include #endif #if 0 #define BCM_TRACE(_fmt, ...) printf(_fmt, ##__VA_ARGS__) #else #define BCM_TRACE(_fmt, ...) #endif static int bcm_init_platform_data(struct bcm_platform *bp); static int bcm_find_core(struct bcm_platform *bp, const struct bhnd_core_match *descs, size_t num_descs, struct bhnd_core_info *info, uintptr_t *addr); static int bcm_erom_probe_and_attach(bhnd_erom_class_t **erom_cls, kobj_ops_t erom_ops, bhnd_erom_t *erom, size_t esize, struct bhnd_erom_io *eio, struct bhnd_chipid *cid); extern int *edata; extern int *end; static struct bcm_platform bcm_platform_data; static bool bcm_platform_data_avail = false; #ifdef CFE static struct bcm_nvram_iocfe bcm_cfe_nvram; #endif static const struct bhnd_core_match bcm_chipc_cores[] = { { BHND_MATCH_CORE(BHND_MFGID_BCM, BHND_COREID_CC) }, { BHND_MATCH_CORE(BHND_MFGID_BCM, BHND_COREID_4706_CC) }, }; static const struct bhnd_core_match bcm_cpu0_cores[] = { { BHND_MATCH_CORE_CLASS(BHND_DEVCLASS_CPU), BHND_MATCH_CORE_UNIT(0) } }; static const struct bhnd_core_match bcm_pmu_cores[] = { { BHND_MATCH_CORE(BHND_MFGID_BCM, BHND_COREID_PMU) }, }; struct bcm_platform * bcm_get_platform(void) { if (!bcm_platform_data_avail) panic("platform data not available"); return (&bcm_platform_data); } static bus_addr_t bcm_get_bus_addr(void) { long maddr; if (resource_long_value("bhnd", 0, "maddr", &maddr) == 0) return ((u_long)maddr); return (BHND_DEFAULT_CHIPC_ADDR); } static bus_size_t bcm_get_bus_size(void) { long msize; if (resource_long_value("bhnd", 0, "msize", &msize) == 0) return ((u_long)msize); return (BHND_DEFAULT_ENUM_SIZE); } /** * Search the device enumeration table for a core matching @p descs, * * @param bp Platform state containing a valid EROM parser. * @param descs The core match descriptor table. * @param num_descs The number of match descriptors in @p descs. * @param[out] info If non-NULL, will be populated with the core * info. * @param[out] addr If non-NULL, will be populated with the core's * physical register address. */ static int bcm_find_core(struct bcm_platform *bp, const struct bhnd_core_match *descs, size_t num_descs, struct bhnd_core_info *info, uintptr_t *addr) { bhnd_addr_t b_addr; bhnd_size_t b_size; int error; /* Fetch core info */ for (size_t i = 0; i < num_descs; i++) { error = bhnd_erom_lookup_core_addr(&bp->erom.obj, &descs[i], BHND_PORT_DEVICE, 0, 0, info, &b_addr, &b_size); /* Terminate search on first match */ if (error == 0) break; /* Terminate on first error (other than core not found) */ if (error != ENOENT) return (error); /* Continue search ... */ } /* Provide the core's base address */ if (addr != NULL && b_addr > UINTPTR_MAX) { BCM_ERR("core address %#jx overflows native address width\n", (uintmax_t)b_addr); return (ERANGE); } if (addr != NULL) *addr = b_addr; return (0); } /** * Read a variable directly from NVRAM, decoding as @p type. * * @param bp Platform state. * @param name The raw name of the variable to be fetched, * including any device path (/pci/1/1/varname) or * alias prefix (0:varname). * @param[out] buf On success, the requested value will be written * to this buffer. This argment may be NULL if * the value is not desired. * @param[in,out] len The capacity of @p buf. On success, will be set * to the actual size of the requested value. * @param type The data type to be written to @p buf. * * @retval 0 success * @retval ENOMEM If @p buf is non-NULL and a buffer of @p len is too * small to hold the requested value. * @retval ENOENT If @p name is not found. * @retval EFTYPE If the variable data cannot be coerced to @p type. * @retval ERANGE If value coercion would overflow @p type. * @retval non-zero If parsing NVRAM otherwise fails, a regular unix error * code will be returned. */ int bcm_get_nvram(struct bcm_platform *bp, const char *name, void *buf, size_t *len, bhnd_nvram_type type) { if (bp->nvram_io == NULL || bp->nvram_cls == NULL) return (ENOENT); return (bhnd_nvram_data_getvar_direct(bp->nvram_cls, bp->nvram_io, name, buf, len, type)); } /** * Probe and attach a bhnd_erom parser instance for the bhnd bus. * * @param[out] erom_cls The probed EROM class. * @param[out] erom_ops The storage to be used when compiling * @p erom_cls. * @param[out] erom The storage to be used when initializing the * static instance of @p erom_cls. * @param esize The total available number of bytes allocated * for @p erom. If this is less than is required * by @p erom_cls ENOMEM will be returned. * @param eio EROM I/O callbacks to be used. * @param[out] cid On success, the probed chip identification. */ static int bcm_erom_probe_and_attach(bhnd_erom_class_t **erom_cls, kobj_ops_t erom_ops, bhnd_erom_t *erom, size_t esize, struct bhnd_erom_io *eio, struct bhnd_chipid *cid) { bhnd_erom_class_t **clsp; bus_addr_t bus_addr; int error, prio, result; *erom_cls = NULL; prio = 0; /* Map our first bus core for the erom probe */ bus_addr = bcm_get_bus_addr(); if ((error = bhnd_erom_io_map(eio, bus_addr, BHND_DEFAULT_CORE_SIZE))) { BCM_ERR("failed to map first core at %#jx+%#jx: %d\n", (uintmax_t)bus_addr, (uintmax_t)BHND_DEFAULT_CORE_SIZE, error); return (error); } SET_FOREACH(clsp, bhnd_erom_class_set) { struct bhnd_chipid pcid; bhnd_erom_class_t *cls; struct kobj_ops kops; cls = *clsp; /* Compile the class' ops table */ kobj_class_compile_static(cls, &kops); /* Probe the bus address */ result = bhnd_erom_probe(cls, eio, NULL, &pcid); /* Drop pointer to stack allocated ops table */ cls->ops = NULL; /* The parser did not match if an error was returned */ if (result > 0) continue; /* Check for a new highest priority match */ if (*erom_cls == NULL || result > prio) { prio = result; *cid = pcid; *erom_cls = cls; } /* Terminate immediately on BUS_PROBE_SPECIFIC */ if (result == BUS_PROBE_SPECIFIC) break; } /* Valid EROM class probed? */ if (*erom_cls == NULL) { BCM_ERR("no erom parser found for root bus at %#jx\n", (uintmax_t)bus_addr); return (ENOENT); } /* Using the provided storage, recompile the erom class ... */ kobj_class_compile_static(*erom_cls, erom_ops); /* ... and initialize the erom parser instance */ error = bhnd_erom_init_static(*erom_cls, erom, esize, cid, eio); return (error); } /** * Populate platform configuration data. */ static int bcm_init_platform_data(struct bcm_platform *bp) { bus_addr_t bus_addr, bus_size; bus_space_tag_t erom_bst; bus_space_handle_t erom_bsh; bool aob, pmu; int error; bus_addr = bcm_get_bus_addr(); bus_size = bcm_get_bus_size(); #ifdef CFE /* Fetch CFE console handle (if any). Must be initialized before * any calls to printf/early_putc. */ if ((bp->cfe_console = cfe_getstdhandle(CFE_STDHANDLE_CONSOLE)) < 0) bp->cfe_console = -1; /* Probe CFE NVRAM sources */ bp->nvram_io = &bcm_cfe_nvram.io; error = bcm_nvram_find_cfedev(&bcm_cfe_nvram, &bp->nvram_cls); if (error) { bp->nvram_io = NULL; bp->nvram_cls = NULL; } #endif /* CFE */ /* Probe and attach device table provider, populating our * chip identification */ erom_bst = mips_bus_space_generic; erom_bsh = BCM_SOC_BSH(bus_addr, 0); error = bhnd_erom_iobus_init(&bp->erom_io, bus_addr, bus_size, erom_bst, erom_bsh); if (error) { BCM_ERR("failed to initialize erom I/O callbacks: %d\n", error); return (error); } error = bcm_erom_probe_and_attach(&bp->erom_impl, &bp->erom_ops, &bp->erom.obj, sizeof(bp->erom), &bp->erom_io.eio, &bp->cid); if (error) { BCM_ERR("error attaching erom parser: %d\n", error); bhnd_erom_io_fini(&bp->erom_io.eio); return (error); } if (bootverbose) bhnd_erom_dump(&bp->erom.obj); /* Fetch chipcommon core info */ error = bcm_find_core(bp, bcm_chipc_cores, nitems(bcm_chipc_cores), &bp->cc_id, &bp->cc_addr); if (error) { BCM_ERR("error locating chipc core: %d\n", error); return (error); } /* Fetch chipc capability flags */ bp->cc_caps = BCM_SOC_READ_4(bp->cc_addr, CHIPC_CAPABILITIES); bp->cc_caps_ext = 0x0; if (CHIPC_HWREV_HAS_CAP_EXT(bp->cc_id.hwrev)) bp->cc_caps_ext = BCM_CHIPC_READ_4(bp, CHIPC_CAPABILITIES_EXT); /* Fetch PMU info */ pmu = CHIPC_GET_FLAG(bp->cc_caps, CHIPC_CAP_PMU); aob = CHIPC_GET_FLAG(bp->cc_caps_ext, CHIPC_CAP2_AOB); if (pmu && aob) { /* PMU block mapped to a PMU core on the Always-on-Bus (aob) */ error = bcm_find_core(bp, bcm_pmu_cores, nitems(bcm_pmu_cores), &bp->pmu_id, &bp->pmu_addr); if (error) { BCM_ERR("error locating pmu core: %d\n", error); return (error); } } else if (pmu) { /* PMU block mapped to chipc */ bp->pmu_addr = bp->cc_addr; bp->pmu_id = bp->cc_id; } else { /* No PMU */ bp->pmu_addr = 0x0; memset(&bp->pmu_id, 0, sizeof(bp->pmu_id)); } /* Initialize PMU query state */ if (pmu) { error = bhnd_pmu_query_init(&bp->pmu, NULL, bp->cid, &bcm_pmu_soc_io, bp); if (error) { BCM_ERR("bhnd_pmu_query_init() failed: %d\n", error); return (error); } } /* Find CPU core info */ error = bcm_find_core(bp, bcm_cpu0_cores, nitems(bcm_cpu0_cores), &bp->cpu_id, &bp->cpu_addr); if (error) { BCM_ERR("error locating CPU core: %d\n", error); return (error); } /* Initialize our platform service registry */ if ((error = bhnd_service_registry_init(&bp->services))) { BCM_ERR("error initializing service registry: %d\n", error); return (error); } bcm_platform_data_avail = true; return (0); } void platform_cpu_init() { /* Nothing special */ } static void mips_init(void) { int i, j; printf("entry: mips_init()\n"); #ifdef CFE /* * Query DRAM memory map from CFE. */ physmem = 0; for (i = 0; i < 10; i += 2) { int result; uint64_t addr, len, type; result = cfe_enummem(i / 2, 0, &addr, &len, &type); if (result < 0) { BCM_TRACE("There is no phys memory for: %d\n", i); phys_avail[i] = phys_avail[i + 1] = 0; break; } if (type != CFE_MI_AVAILABLE) { BCM_TRACE("phys memory is not available: %d\n", i); continue; } phys_avail[i] = addr; if (i == 0 && addr == 0) { /* * If this is the first physical memory segment probed * from CFE, omit the region at the start of physical * memory where the kernel has been loaded. */ phys_avail[i] += MIPS_KSEG0_TO_PHYS(kernel_kseg0_end); } BCM_TRACE("phys memory is available for: %d\n", i); BCM_TRACE(" => addr = %jx\n", addr); BCM_TRACE(" => len = %jd\n", len); phys_avail[i + 1] = addr + len; physmem += len; } BCM_TRACE("Total phys memory is : %ld\n", physmem); realmem = btoc(physmem); #endif for (j = 0; j < i; j++) dump_avail[j] = phys_avail[j]; physmem = realmem; init_param1(); init_param2(physmem); mips_cpu_init(); pmap_bootstrap(); mips_proc0_init(); mutex_init(); kdb_init(); #ifdef KDB if (boothowto & RB_KDB) kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger"); #endif } void platform_reset(void) { struct bcm_platform *bp; bool bcm4785war; printf("bcm::platform_reset()\n"); intr_disable(); #ifdef CFE /* Fall back on CFE if reset requested during platform * data initialization */ if (!bcm_platform_data_avail) { cfe_exit(0, 0); while (1); } #endif bp = bcm_get_platform(); bcm4785war = false; /* Handle BCM4785-specific behavior */ if (bp->cid.chip_id == BHND_CHIPID_BCM4785) { bcm4785war = true; /* Switch to async mode */ bcm_bmips_wr_pllcfg3(BMIPS_BCMCFG_PLLCFG3_SM); } /* Set watchdog (PMU or ChipCommon) */ if (bp->pmu_addr != 0x0) { BCM_PMU_WRITE_4(bp, BHND_PMU_WATCHDOG, 1); } else BCM_CHIPC_WRITE_4(bp, CHIPC_WATCHDOG, 1); /* BCM4785 */ if (bcm4785war) { mips_sync(); __asm __volatile("wait"); } while (1); } void platform_start(__register_t a0, __register_t a1, __register_t a2, __register_t a3) { vm_offset_t kernend; uint64_t platform_counter_freq; int error; /* clear the BSS and SBSS segments */ kernend = (vm_offset_t)&end; memset(&edata, 0, kernend - (vm_offset_t)(&edata)); mips_postboot_fixup(); /* Initialize pcpu stuff */ mips_pcpu0_init(); #ifdef CFE /* * Initialize CFE firmware trampolines. This must be done * before any CFE APIs are called, including writing * to the CFE console. * * CFE passes the following values in registers: * a0: firmware handle * a2: firmware entry point * a3: entry point seal */ if (a3 == CFE_EPTSEAL) cfe_init(a0, a2); #endif /* Init BCM platform data */ if ((error = bcm_init_platform_data(&bcm_platform_data))) panic("bcm_init_platform_data() failed: %d", error); platform_counter_freq = bcm_get_cpufreq(bcm_get_platform()); /* CP0 ticks every two cycles */ mips_timer_early_init(platform_counter_freq / 2); cninit(); mips_init(); mips_timer_init_params(platform_counter_freq, 1); } /* * CFE-based EARLY_PRINTF support. To use, add the following to the kernel * config: * option EARLY_PRINTF * option CFE * device cfe */ #if defined(EARLY_PRINTF) && defined(CFE) static void bcm_cfe_eputc(int c) { unsigned char ch; int handle; ch = (unsigned char) c; /* bcm_get_platform() cannot be used here, as we may be called * from bcm_init_platform_data(). */ if ((handle = bcm_platform_data.cfe_console) < 0) return; if (ch == '\n') early_putc('\r'); while ((cfe_write(handle, &ch, 1)) == 0) continue; } early_putc_t *early_putc = bcm_cfe_eputc; #endif /* EARLY_PRINTF */ Index: head/sys/mips/cavium/octeon_machdep.c =================================================================== --- head/sys/mips/cavium/octeon_machdep.c (revision 366710) +++ head/sys/mips/cavium/octeon_machdep.c (revision 366711) @@ -1,697 +1,698 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2006 Wojciech A. Koszek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(__mips_n64) #define MAX_APP_DESC_ADDR 0xffffffffafffffff #else #define MAX_APP_DESC_ADDR 0xafffffff #endif struct octeon_feature_description { octeon_feature_t ofd_feature; const char *ofd_string; }; extern int *end; static char octeon_kenv[0x2000]; static const struct octeon_feature_description octeon_feature_descriptions[] = { { OCTEON_FEATURE_SAAD, "SAAD" }, { OCTEON_FEATURE_ZIP, "ZIP" }, { OCTEON_FEATURE_CRYPTO, "CRYPTO" }, { OCTEON_FEATURE_DORM_CRYPTO, "DORM_CRYPTO" }, { OCTEON_FEATURE_PCIE, "PCIE" }, { OCTEON_FEATURE_SRIO, "SRIO" }, { OCTEON_FEATURE_KEY_MEMORY, "KEY_MEMORY" }, { OCTEON_FEATURE_LED_CONTROLLER, "LED_CONTROLLER" }, { OCTEON_FEATURE_TRA, "TRA" }, { OCTEON_FEATURE_MGMT_PORT, "MGMT_PORT" }, { OCTEON_FEATURE_RAID, "RAID" }, { OCTEON_FEATURE_USB, "USB" }, { OCTEON_FEATURE_NO_WPTR, "NO_WPTR" }, { OCTEON_FEATURE_DFA, "DFA" }, { OCTEON_FEATURE_MDIO_CLAUSE_45, "MDIO_CLAUSE_45" }, { OCTEON_FEATURE_NPEI, "NPEI" }, { OCTEON_FEATURE_ILK, "ILK" }, { OCTEON_FEATURE_HFA, "HFA" }, { OCTEON_FEATURE_DFM, "DFM" }, { OCTEON_FEATURE_CIU2, "CIU2" }, { OCTEON_FEATURE_DICI_MODE, "DICI_MODE" }, { OCTEON_FEATURE_BIT_EXTRACTOR, "BIT_EXTRACTOR" }, { OCTEON_FEATURE_NAND, "NAND" }, { OCTEON_FEATURE_MMC, "MMC" }, { OCTEON_FEATURE_PKND, "PKND" }, { OCTEON_FEATURE_CN68XX_WQE, "CN68XX_WQE" }, { 0, NULL } }; static uint64_t octeon_get_ticks(void); static unsigned octeon_get_timecount(struct timecounter *tc); static void octeon_boot_params_init(register_t ptr); static void octeon_init_kenv(register_t ptr); static struct timecounter octeon_timecounter = { octeon_get_timecount, /* get_timecount */ 0, /* no poll_pps */ 0xffffffffu, /* octeon_mask */ 0, /* frequency */ "Octeon", /* name */ 900, /* quality (adjusted in code) */ }; void platform_cpu_init() { /* Nothing special yet */ } /* * Perform a board-level soft-reset. */ void platform_reset(void) { cvmx_write_csr(CVMX_CIU_SOFT_RST, 1); } /* * octeon_debug_symbol * * Does nothing. * Used to mark the point for simulator to begin tracing */ void octeon_debug_symbol(void) { } /* * octeon_ciu_reset * * Shutdown all CIU to IP2, IP3 mappings */ void octeon_ciu_reset(void) { uint64_t cvmctl; /* Disable all CIU interrupts by default */ cvmx_write_csr(CVMX_CIU_INTX_EN0(cvmx_get_core_num()*2), 0); cvmx_write_csr(CVMX_CIU_INTX_EN0(cvmx_get_core_num()*2+1), 0); cvmx_write_csr(CVMX_CIU_INTX_EN1(cvmx_get_core_num()*2), 0); cvmx_write_csr(CVMX_CIU_INTX_EN1(cvmx_get_core_num()*2+1), 0); #ifdef SMP /* Enable the MBOX interrupts. */ cvmx_write_csr(CVMX_CIU_INTX_EN0(cvmx_get_core_num()*2+1), (1ull << (OCTEON_IRQ_MBOX0 - 8)) | (1ull << (OCTEON_IRQ_MBOX1 - 8))); #endif /* * Move the Performance Counter interrupt to OCTEON_PMC_IRQ */ cvmctl = mips_rd_cvmctl(); cvmctl &= ~(7 << 7); cvmctl |= (OCTEON_PMC_IRQ + 2) << 7; mips_wr_cvmctl(cvmctl); } static void octeon_memory_init(void) { vm_paddr_t phys_end; int64_t addr; unsigned i, j; phys_end = round_page(MIPS_KSEG0_TO_PHYS((vm_offset_t)&end)); if (cvmx_sysinfo_get()->board_type == CVMX_BOARD_TYPE_SIM) { /* Simulator we limit to 96 meg */ phys_avail[0] = phys_end; phys_avail[1] = 96 << 20; dump_avail[0] = phys_avail[0]; dump_avail[1] = phys_avail[1]; realmem = physmem = btoc(phys_avail[1] - phys_avail[0]); return; } /* * Allocate memory from bootmem 1MB at a time and merge * adjacent entries. */ i = 0; while (i < PHYS_AVAIL_ENTRIES) { /* * If there is less than 2MB of memory available in 128-byte * blocks, do not steal any more memory. We need to leave some * memory for the command queues to be allocated out of. */ if (cvmx_bootmem_available_mem(128) < 2 << 20) break; addr = cvmx_bootmem_phy_alloc(1 << 20, phys_end, ~(vm_paddr_t)0, PAGE_SIZE, 0); if (addr == -1) break; /* * The SDK needs to be able to easily map any memory that might * come to it e.g. in the form of an mbuf. Because on !n64 we * can't direct-map some addresses and we don't want to manage * temporary mappings within the SDK, don't feed memory that * can't be direct-mapped to the kernel. */ #if !defined(__mips_n64) if (!MIPS_DIRECT_MAPPABLE(addr + (1 << 20) - 1)) continue; #endif physmem += btoc(1 << 20); if (i > 0 && phys_avail[i - 1] == addr) { phys_avail[i - 1] += 1 << 20; continue; } phys_avail[i + 0] = addr; phys_avail[i + 1] = addr + (1 << 20); i += 2; } for (j = 0; j < i; j++) dump_avail[j] = phys_avail[j]; realmem = physmem; } void platform_start(__register_t a0, __register_t a1, __register_t a2 __unused, __register_t a3) { const struct octeon_feature_description *ofd; uint64_t platform_counter_freq; int rv; mips_postboot_fixup(); /* * Initialize boot parameters so that we can determine things like * which console we shoud use, etc. */ octeon_boot_params_init(a3); /* Initialize pcpu stuff */ mips_pcpu0_init(); mips_timer_early_init(cvmx_sysinfo_get()->cpu_clock_hz); /* Initialize console. */ cninit(); /* * Display information about the CPU. */ #if !defined(OCTEON_MODEL) printf("Using runtime CPU model checks.\n"); #else printf("Compiled for CPU model: " __XSTRING(OCTEON_MODEL) "\n"); #endif strcpy(cpu_model, octeon_model_get_string(cvmx_get_proc_id())); printf("CPU Model: %s\n", cpu_model); printf("CPU clock: %uMHz Core Mask: %#x\n", cvmx_sysinfo_get()->cpu_clock_hz / 1000000, cvmx_sysinfo_get()->core_mask); rv = octeon_model_version_check(cvmx_get_proc_id()); if (rv == -1) panic("%s: kernel not compatible with this processor.", __func__); /* * Display information about the board. */ #if defined(OCTEON_BOARD_CAPK_0100ND) strcpy(cpu_board, "CAPK-0100ND"); if (cvmx_sysinfo_get()->board_type != CVMX_BOARD_TYPE_CN3010_EVB_HS5) { panic("Compiled for %s, but board type is %s.", cpu_board, cvmx_board_type_to_string(cvmx_sysinfo_get()->board_type)); } #else strcpy(cpu_board, cvmx_board_type_to_string(cvmx_sysinfo_get()->board_type)); #endif printf("Board: %s\n", cpu_board); printf("Board Type: %u Revision: %u/%u\n", cvmx_sysinfo_get()->board_type, cvmx_sysinfo_get()->board_rev_major, cvmx_sysinfo_get()->board_rev_minor); printf("Serial number: %s\n", cvmx_sysinfo_get()->board_serial_number); /* * Additional on-chip hardware/settings. * * XXX Display PCI host/target? What else? */ printf("MAC address base: %6D (%u configured)\n", cvmx_sysinfo_get()->mac_addr_base, ":", cvmx_sysinfo_get()->mac_addr_count); octeon_ciu_reset(); /* * Convert U-Boot 'bootoctlinux' loader command line arguments into * boot flags and kernel environment variables. */ bootverbose = 1; octeon_init_kenv(a3); /* * For some reason on the cn38xx simulator ebase register is set to * 0x80001000 at bootup time. Move it back to the default, but * when we move to having support for multiple executives, we need * to rethink this. */ mips_wr_ebase(0x80000000); octeon_memory_init(); init_param1(); init_param2(physmem); mips_cpu_init(); pmap_bootstrap(); mips_proc0_init(); mutex_init(); kdb_init(); #ifdef KDB if (boothowto & RB_KDB) kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger"); #endif cpu_clock = cvmx_sysinfo_get()->cpu_clock_hz; platform_counter_freq = cpu_clock; octeon_timecounter.tc_frequency = cpu_clock; platform_timecounter = &octeon_timecounter; mips_timer_init_params(platform_counter_freq, 0); set_cputicker(octeon_get_ticks, cpu_clock, 0); #ifdef SMP /* * Clear any pending IPIs. */ cvmx_write_csr(CVMX_CIU_MBOX_CLRX(0), 0xffffffff); #endif printf("Octeon SDK: %s\n", OCTEON_SDK_VERSION_STRING); printf("Available Octeon features:"); for (ofd = octeon_feature_descriptions; ofd->ofd_string != NULL; ofd++) if (octeon_has_feature(ofd->ofd_feature)) printf(" %s", ofd->ofd_string); printf("\n"); } static uint64_t octeon_get_ticks(void) { uint64_t cvmcount; CVMX_MF_CYCLE(cvmcount); return (cvmcount); } static unsigned octeon_get_timecount(struct timecounter *tc) { return ((unsigned)octeon_get_ticks()); } static int sysctl_machdep_led_display(SYSCTL_HANDLER_ARGS) { size_t buflen; char buf[9]; int error; if (req->newptr == NULL) return (EINVAL); if (cvmx_sysinfo_get()->led_display_base_addr == 0) return (ENODEV); /* * Revision 1.x of the EBT3000 only supports 4 characters, but * other devices support 8. */ if (cvmx_sysinfo_get()->board_type == CVMX_BOARD_TYPE_EBT3000 && cvmx_sysinfo_get()->board_rev_major == 1) buflen = 4; else buflen = 8; if (req->newlen > buflen) return (E2BIG); error = SYSCTL_IN(req, buf, req->newlen); if (error != 0) return (error); buf[req->newlen] = '\0'; ebt3000_str_write(buf); return (0); } SYSCTL_PROC(_machdep, OID_AUTO, led_display, CTLTYPE_STRING | CTLFLAG_WR | CTLFLAG_NEEDGIANT, NULL, 0, sysctl_machdep_led_display, "A", "String to display on LED display"); void cvmx_dvprintf(const char *fmt, va_list ap) { if (!bootverbose) return; vprintf(fmt, ap); } void cvmx_dprintf(const char *fmt, ...) { va_list ap; va_start(ap, fmt); cvmx_dvprintf(fmt, ap); va_end(ap); } /** * version of printf that works better in exception context. * * @param format * * XXX If this function weren't in cvmx-interrupt.c, we'd use the SDK version. */ void cvmx_safe_printf(const char *format, ...) { char buffer[256]; char *ptr = buffer; int count; va_list args; va_start(args, format); #ifndef __U_BOOT__ count = vsnprintf(buffer, sizeof(buffer), format, args); #else count = vsprintf(buffer, format, args); #endif va_end(args); while (count-- > 0) { cvmx_uart_lsr_t lsrval; /* Spin until there is room */ do { lsrval.u64 = cvmx_read_csr(CVMX_MIO_UARTX_LSR(0)); #if !defined(CONFIG_OCTEON_SIM_SPEED) if (lsrval.s.temt == 0) cvmx_wait(10000); /* Just to reduce the load on the system */ #endif } while (lsrval.s.temt == 0); if (*ptr == '\n') cvmx_write_csr(CVMX_MIO_UARTX_THR(0), '\r'); cvmx_write_csr(CVMX_MIO_UARTX_THR(0), *ptr++); } } /* impSTART: This stuff should move back into the Cavium SDK */ /* **************************************************************************************** * * APP/BOOT DESCRIPTOR STUFF * **************************************************************************************** */ /* Define the struct that is initialized by the bootloader used by the * startup code. * * Copyright (c) 2004, 2005, 2006 Cavium Networks. * * The authors hereby grant permission to use, copy, modify, distribute, * and license this software and its documentation for any purpose, provided * that existing copyright notices are retained in all copies and that this * notice is included verbatim in any distributions. No written agreement, * license, or royalty fee is required for any of the authorized uses. * Modifications to this software may be copyrighted by their authors * and need not follow the licensing terms described here, provided that * the new terms are clearly indicated on the first page of each file where * they apply. */ #define OCTEON_CURRENT_DESC_VERSION 6 #define OCTEON_ARGV_MAX_ARGS (64) #define OCTOEN_SERIAL_LEN 20 typedef struct { /* Start of block referenced by assembly code - do not change! */ uint32_t desc_version; uint32_t desc_size; uint64_t stack_top; uint64_t heap_base; uint64_t heap_end; uint64_t entry_point; /* Only used by bootloader */ uint64_t desc_vaddr; /* End of This block referenced by assembly code - do not change! */ uint32_t exception_base_addr; uint32_t stack_size; uint32_t heap_size; uint32_t argc; /* Argc count for application */ uint32_t argv[OCTEON_ARGV_MAX_ARGS]; uint32_t flags; uint32_t core_mask; uint32_t dram_size; /**< DRAM size in megabyes */ uint32_t phy_mem_desc_addr; /**< physical address of free memory descriptor block*/ uint32_t debugger_flags_base_addr; /**< used to pass flags from app to debugger */ uint32_t eclock_hz; /**< CPU clock speed, in hz */ uint32_t dclock_hz; /**< DRAM clock speed, in hz */ uint32_t spi_clock_hz; /**< SPI4 clock in hz */ uint16_t board_type; uint8_t board_rev_major; uint8_t board_rev_minor; uint16_t chip_type; uint8_t chip_rev_major; uint8_t chip_rev_minor; char board_serial_number[OCTOEN_SERIAL_LEN]; uint8_t mac_addr_base[6]; uint8_t mac_addr_count; uint64_t cvmx_desc_vaddr; } octeon_boot_descriptor_t; static cvmx_bootinfo_t * octeon_process_app_desc_ver_6(octeon_boot_descriptor_t *app_desc_ptr) { cvmx_bootinfo_t *octeon_bootinfo; /* XXX Why is 0x00000000ffffffffULL a bad value? */ if (app_desc_ptr->cvmx_desc_vaddr == 0 || app_desc_ptr->cvmx_desc_vaddr == 0xfffffffful) { cvmx_safe_printf("Bad octeon_bootinfo %#jx\n", (uintmax_t)app_desc_ptr->cvmx_desc_vaddr); return (NULL); } octeon_bootinfo = cvmx_phys_to_ptr(app_desc_ptr->cvmx_desc_vaddr); if (octeon_bootinfo->major_version != 1) { cvmx_safe_printf("Incompatible CVMX descriptor from bootloader: %d.%d %p\n", (int) octeon_bootinfo->major_version, (int) octeon_bootinfo->minor_version, octeon_bootinfo); return (NULL); } cvmx_sysinfo_minimal_initialize(octeon_bootinfo->phy_mem_desc_addr, octeon_bootinfo->board_type, octeon_bootinfo->board_rev_major, octeon_bootinfo->board_rev_minor, octeon_bootinfo->eclock_hz); memcpy(cvmx_sysinfo_get()->mac_addr_base, octeon_bootinfo->mac_addr_base, 6); cvmx_sysinfo_get()->mac_addr_count = octeon_bootinfo->mac_addr_count; cvmx_sysinfo_get()->compact_flash_common_base_addr = octeon_bootinfo->compact_flash_common_base_addr; cvmx_sysinfo_get()->compact_flash_attribute_base_addr = octeon_bootinfo->compact_flash_attribute_base_addr; cvmx_sysinfo_get()->core_mask = octeon_bootinfo->core_mask; cvmx_sysinfo_get()->led_display_base_addr = octeon_bootinfo->led_display_base_addr; memcpy(cvmx_sysinfo_get()->board_serial_number, octeon_bootinfo->board_serial_number, sizeof cvmx_sysinfo_get()->board_serial_number); return (octeon_bootinfo); } static void octeon_boot_params_init(register_t ptr) { octeon_boot_descriptor_t *app_desc_ptr; cvmx_bootinfo_t *octeon_bootinfo; if (ptr == 0 || ptr >= MAX_APP_DESC_ADDR) { cvmx_safe_printf("app descriptor passed at invalid address %#jx\n", (uintmax_t)ptr); platform_reset(); } app_desc_ptr = (octeon_boot_descriptor_t *)(intptr_t)ptr; if (app_desc_ptr->desc_version < 6) { cvmx_safe_printf("Your boot code is too old to be supported.\n"); platform_reset(); } octeon_bootinfo = octeon_process_app_desc_ver_6(app_desc_ptr); if (octeon_bootinfo == NULL) { cvmx_safe_printf("Could not parse boot descriptor.\n"); platform_reset(); } if (cvmx_sysinfo_get()->led_display_base_addr != 0) { /* * Revision 1.x of the EBT3000 only supports 4 characters, but * other devices support 8. */ if (cvmx_sysinfo_get()->board_type == CVMX_BOARD_TYPE_EBT3000 && cvmx_sysinfo_get()->board_rev_major == 1) ebt3000_str_write("FBSD"); else ebt3000_str_write("FreeBSD!"); } if (cvmx_sysinfo_get()->phy_mem_desc_addr == (uint64_t)0) { cvmx_safe_printf("Your boot loader did not supply a memory descriptor.\n"); platform_reset(); } cvmx_bootmem_init(cvmx_sysinfo_get()->phy_mem_desc_addr); octeon_feature_init(); __cvmx_helper_cfg_init(); } /* impEND: This stuff should move back into the Cavium SDK */ /* * The boot loader command line may specify kernel environment variables or * applicable boot flags of boot(8). */ static void octeon_init_kenv(register_t ptr) { int i; char *n; char *v; octeon_boot_descriptor_t *app_desc_ptr; app_desc_ptr = (octeon_boot_descriptor_t *)(intptr_t)ptr; memset(octeon_kenv, 0, sizeof(octeon_kenv)); init_static_kenv(octeon_kenv, sizeof(octeon_kenv)); for (i = 0; i < app_desc_ptr->argc; i++) { v = cvmx_phys_to_ptr(app_desc_ptr->argv[i]); if (v == NULL) continue; if (*v == '-') { boothowto |= boot_parse_arg(v); continue; } n = strsep(&v, "="); if (v == NULL) kern_setenv(n, "1"); else kern_setenv(n, v); } } Index: head/sys/mips/ingenic/jz4780_machdep.c =================================================================== --- head/sys/mips/ingenic/jz4780_machdep.c (revision 366710) +++ head/sys/mips/ingenic/jz4780_machdep.c (revision 366711) @@ -1,241 +1,242 @@ /*- * Copyright (c) 2009 Oleksandr Tymoshenko * Copyright (c) 2015 Alexander Kabaev * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_platform.h" #include #include #include #include #include #include #include #include #include #include #ifdef FDT #include #include #endif #include #include #include #include +#include #include #include #include #include #include #include #include #include #include uint32_t * const led = (uint32_t *)0xb0010548; extern char edata[], end[]; static char boot1_env[4096]; void platform_cpu_init(void) { uint32_t reg; /* * Do not expect mbox interrups while writing * mbox */ reg = mips_rd_xburst_reim(); reg &= ~JZ_REIM_MIRQ0M; mips_wr_xburst_reim(reg); /* Clean mailboxes */ mips_wr_xburst_mbox0(0); mips_wr_xburst_mbox1(0); mips_wr_xburst_core_sts(~JZ_CORESTS_MIRQ0P); /* Unmask mbox interrupts */ reg |= JZ_REIM_MIRQ0M; mips_wr_xburst_reim(reg); } void platform_reset(void) { /* * For now, provoke a watchdog reset in about a second, so UART buffers * have a fighting chance to flush before we pull the plug */ writereg(JZ_TCU_BASE + JZ_WDOG_TCER, 0); /* disable watchdog */ writereg(JZ_TCU_BASE + JZ_WDOG_TCNT, 0); /* reset counter */ writereg(JZ_TCU_BASE + JZ_WDOG_TDR, 128); /* wait for ~1s */ writereg(JZ_TCU_BASE + JZ_WDOG_TCSR, TCSR_RTC_EN | TCSR_DIV_256); writereg(JZ_TCU_BASE + JZ_WDOG_TCER, TCER_ENABLE); /* fire! */ /* Wait for reset */ while (1) ; } static void mips_init(void) { int i; #ifdef FDT struct mem_region mr[FDT_MEM_REGIONS]; uint64_t val; int mr_cnt; int j; #endif for (i = 0; i < 10; i++) { phys_avail[i] = 0; } /* The minimal amount of memory Ingenic SoC can have. */ dump_avail[0] = phys_avail[0] = MIPS_KSEG0_TO_PHYS(kernel_kseg0_end); physmem = realmem = btoc(32 * 1024 * 1024); /* * X1000 mips cpu special. * TODO: do anyone know what is this ? */ __asm( "li $2, 0xa9000000 \n\t" "mtc0 $2, $5, 4 \n\t" "nop \n\t" ::"r"(2)); #ifdef FDT if (fdt_get_mem_regions(mr, &mr_cnt, &val) == 0) { physmem = realmem = btoc(val); KASSERT((phys_avail[0] >= mr[0].mr_start) && \ (phys_avail[0] < (mr[0].mr_start + mr[0].mr_size)), ("First region is not within FDT memory range")); /* Limit size of the first region */ phys_avail[1] = (mr[0].mr_start + MIN(mr[0].mr_size, ctob(realmem))); dump_avail[1] = phys_avail[1]; /* Add the rest of regions */ for (i = 1, j = 2; i < mr_cnt; i++, j+=2) { phys_avail[j] = mr[i].mr_start; phys_avail[j+1] = (mr[i].mr_start + mr[i].mr_size); dump_avail[j] = phys_avail[j]; dump_avail[j+1] = phys_avail[j+1]; } } #endif init_param1(); init_param2(physmem); mips_cpu_init(); pmap_bootstrap(); mips_proc0_init(); mutex_init(); kdb_init(); led[0] = 0x8000; #ifdef KDB if (boothowto & RB_KDB) kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger"); #endif } void platform_start(__register_t a0, __register_t a1, __register_t a2 __unused, __register_t a3 __unused) { char **argv; int argc; vm_offset_t kernend; #ifdef FDT vm_offset_t dtbp; phandle_t chosen; char buf[2048]; /* early stack supposedly big enough */ #endif /* * clear the BSS and SBSS segments, this should be first call in * the function */ kernend = (vm_offset_t)&end; memset(&edata, 0, kernend - (vm_offset_t)(&edata)); mips_postboot_fixup(); /* Initialize pcpu stuff */ mips_pcpu0_init(); /* Something to hold kernel env until kmem is available */ init_static_kenv(boot1_env, sizeof(boot1_env)); #ifdef FDT /* * Find the dtb passed in by the boot loader (currently fictional). */ dtbp = (vm_offset_t)NULL; #if defined(FDT_DTB_STATIC) /* * In case the device tree blob was not retrieved (from metadata) try * to use the statically embedded one. */ if (dtbp == (vm_offset_t)NULL) dtbp = (vm_offset_t)&fdt_static_dtb; #else #error "Non-static FDT not supported on JZ4780" #endif if (OF_install(OFW_FDT, 0) == FALSE) while (1); if (OF_init((void *)dtbp) != 0) while (1); #endif cninit(); #ifdef FDT /* * Get bootargs from FDT if specified. */ chosen = OF_finddevice("/chosen"); if (OF_getprop(chosen, "bootargs", buf, sizeof(buf)) != -1) boothowto |= boot_parse_cmdline(buf); #endif /* Parse cmdline from U-Boot */ argc = a0; argv = (char **)a1; boothowto |= boot_parse_args(argc, argv); mips_init(); } Index: head/sys/mips/malta/malta_machdep.c =================================================================== --- head/sys/mips/malta/malta_machdep.c (revision 366710) +++ head/sys/mips/malta/malta_machdep.c (revision 366711) @@ -1,371 +1,372 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2006 Wojciech A. Koszek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #include #include #include #ifdef TICK_USE_YAMON_FREQ #include #endif #ifdef TICK_USE_MALTA_RTC #include #include #include #endif #include extern int *edata; extern int *end; void lcd_init(void); void lcd_puts(char *); void malta_reset(void); /* * Temporary boot environment used at startup. */ static char boot1_env[4096]; /* * Offsets to MALTA LCD characters. */ static int malta_lcd_offs[] = { MALTA_ASCIIPOS0, MALTA_ASCIIPOS1, MALTA_ASCIIPOS2, MALTA_ASCIIPOS3, MALTA_ASCIIPOS4, MALTA_ASCIIPOS5, MALTA_ASCIIPOS6, MALTA_ASCIIPOS7 }; void platform_cpu_init() { /* Nothing special */ } /* * Put character to Malta LCD at given position. */ static void malta_lcd_putc(int pos, char c) { void *addr; char *ch; if (pos < 0 || pos > 7) return; addr = (void *)(MALTA_ASCII_BASE + malta_lcd_offs[pos]); ch = (char *)MIPS_PHYS_TO_KSEG0(addr); *ch = c; } /* * Print given string on LCD. */ static void malta_lcd_print(char *str) { int i; if (str == NULL) return; for (i = 0; *str != '\0'; i++, str++) malta_lcd_putc(i, *str); } void lcd_init(void) { malta_lcd_print("FreeBSD_"); } void lcd_puts(char *s) { malta_lcd_print(s); } #ifdef TICK_USE_MALTA_RTC static __inline uint8_t rtcin(uint8_t addr) { *((volatile uint8_t *) MIPS_PHYS_TO_KSEG1(MALTA_PCI0_ADDR(MALTA_RTCADR))) = addr; return (*((volatile uint8_t *) MIPS_PHYS_TO_KSEG1(MALTA_PCI0_ADDR(MALTA_RTCDAT)))); } static __inline void writertc(uint8_t addr, uint8_t val) { *((volatile uint8_t *) MIPS_PHYS_TO_KSEG1(MALTA_PCI0_ADDR(MALTA_RTCADR))) = addr; *((volatile uint8_t *) MIPS_PHYS_TO_KSEG1(MALTA_PCI0_ADDR(MALTA_RTCDAT))) = val; } #endif static void mips_init(unsigned long memsize, uint64_t ememsize) { int i; for (i = 0; i < PHYS_AVAIL_ENTRIES; i++) { phys_avail[i] = 0; } /* * memsize is the amount of RAM available below 256MB. * ememsize is the total amount of RAM available. * * The second bank starts at 0x90000000. */ /* phys_avail regions are in bytes */ phys_avail[0] = MIPS_KSEG0_TO_PHYS(kernel_kseg0_end); phys_avail[1] = memsize; dump_avail[0] = 0; dump_avail[1] = phys_avail[1]; /* Only specify the extended region if it's set */ if (ememsize > memsize) { phys_avail[2] = 0x90000000; phys_avail[3] = 0x90000000 + (ememsize - memsize); dump_avail[2] = phys_avail[2]; dump_avail[3] = phys_avail[3]; } /* XXX realmem assigned in the caller of mips_init() */ physmem = realmem; init_param1(); init_param2(physmem); mips_cpu_init(); pmap_bootstrap(); mips_proc0_init(); mutex_init(); kdb_init(); #ifdef KDB if (boothowto & RB_KDB) kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger"); #endif } /* * Perform a board-level soft-reset. * Note that this is not emulated by gxemul. */ void platform_reset(void) { char *c; c = (char *)MIPS_PHYS_TO_KSEG0(MALTA_SOFTRES); *c = MALTA_GORESET; } static uint64_t malta_cpu_freq(void) { uint64_t platform_counter_freq = 0; #if defined(TICK_USE_YAMON_FREQ) /* * If we are running on a board which uses YAMON firmware, * then query CPU pipeline clock from the syscon object. * If unsuccessful, use hard-coded default. */ platform_counter_freq = yamon_getcpufreq(); #elif defined(TICK_USE_MALTA_RTC) /* * If we are running on a board with the MC146818 RTC, * use it to determine CPU pipeline clock frequency. */ u_int64_t counterval[2]; /* Set RTC to binary mode. */ writertc(RTC_STATUSB, (rtcin(RTC_STATUSB) | RTCSB_BCD)); /* Busy-wait for falling edge of RTC update. */ while (((rtcin(RTC_STATUSA) & RTCSA_TUP) == 0)) ; while (((rtcin(RTC_STATUSA)& RTCSA_TUP) != 0)) ; counterval[0] = mips_rd_count(); /* Busy-wait for falling edge of RTC update. */ while (((rtcin(RTC_STATUSA) & RTCSA_TUP) == 0)) ; while (((rtcin(RTC_STATUSA)& RTCSA_TUP) != 0)) ; counterval[1] = mips_rd_count(); platform_counter_freq = counterval[1] - counterval[0]; #endif if (platform_counter_freq == 0) platform_counter_freq = MIPS_DEFAULT_HZ; return (platform_counter_freq); } void platform_start(__register_t a0, __register_t a1, __register_t a2, __register_t a3) { vm_offset_t kernend; uint64_t platform_counter_freq; int argc = a0; int32_t *argv = (int32_t*)a1; int32_t *envp = (int32_t*)a2; unsigned int memsize = a3; uint64_t ememsize = 0; int i; /* clear the BSS and SBSS segments */ kernend = (vm_offset_t)&end; memset(&edata, 0, kernend - (vm_offset_t)(&edata)); mips_postboot_fixup(); mips_pcpu0_init(); platform_counter_freq = malta_cpu_freq(); mips_timer_early_init(platform_counter_freq); init_static_kenv(boot1_env, sizeof(boot1_env)); cninit(); printf("entry: platform_start()\n"); bootverbose = 1; /* * YAMON uses 32bit pointers to strings so * convert them to proper type manually */ if (bootverbose) { printf("cmd line: "); for (i = 0; i < argc; i++) printf("%s ", (char*)(intptr_t)argv[i]); printf("\n"); } if (bootverbose) printf("envp:\n"); /* * Parse the environment for things like ememsize. */ for (i = 0; envp[i]; i += 2) { const char *a, *v; a = (char *)(intptr_t)envp[i]; v = (char *)(intptr_t)envp[i+1]; if (bootverbose) printf("\t%s = %s\n", a, v); if (strcmp(a, "ememsize") == 0) { ememsize = strtoul(v, NULL, 0); } } if (bootverbose) { printf("memsize = %llu (0x%08x)\n", (unsigned long long) memsize, memsize); printf("ememsize = %llu\n", (unsigned long long) ememsize); #ifdef __mips_o32 /* * For O32 phys_avail[] can't address memory beyond 2^32, * so cap extended memory to 2GB minus one page. */ if (ememsize >= 2ULL * 1024 * 1024 * 1024) ememsize = 2ULL * 1024 * 1024 * 1024 - PAGE_SIZE; #endif } /* * For <= 256MB RAM amounts, ememsize should equal memsize. * For > 256MB RAM amounts it's the total RAM available; * split between two banks. * * XXX TODO: just push realmem assignment into mips_init() ? */ realmem = btoc(ememsize); mips_init(memsize, ememsize); mips_timer_init_params(platform_counter_freq, 0); } Index: head/sys/mips/mediatek/mtk_machdep.c =================================================================== --- head/sys/mips/mediatek/mtk_machdep.c (revision 366710) +++ head/sys/mips/mediatek/mtk_machdep.c (revision 366711) @@ -1,302 +1,303 @@ /*- * Copyright (C) 2015-2016 by Stanislav Galabov. All rights reserved. * Copyright (C) 2010-2011 by Aleksandr Rybalko. All rights reserved. * Copyright (C) 2007 by Oleksandr Tymoshenko. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR OR HIS RELATIVES BE LIABLE FOR ANY DIRECT, * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF MIND, USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. * */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "opt_platform.h" #include "opt_rt305x.h" #include #include extern int *edata; extern int *end; static char boot1_env[0x1000]; void platform_cpu_init() { /* Nothing special */ } static void mips_init(void) { struct mem_region mr[FDT_MEM_REGIONS]; uint64_t val; int i, j, mr_cnt; char *memsize; printf("entry: mips_init()\n"); bootverbose = 1; for (i = 0; i < 10; i++) phys_avail[i] = 0; dump_avail[0] = phys_avail[0] = MIPS_KSEG0_TO_PHYS(kernel_kseg0_end); /* * The most low memory MT7621 can have. Currently MT7621 is the chip * that supports the most memory, so that seems reasonable. */ realmem = btoc(448 * 1024 * 1024); if (fdt_get_mem_regions(mr, &mr_cnt, &val) == 0) { physmem = btoc(val); printf("RAM size: %ldMB (from FDT)\n", ctob(physmem) / (1024 * 1024)); KASSERT((phys_avail[0] >= mr[0].mr_start) && \ (phys_avail[0] < (mr[0].mr_start + mr[0].mr_size)), ("First region is not within FDT memory range")); /* Limit size of the first region */ phys_avail[1] = (mr[0].mr_start + MIN(mr[0].mr_size, ctob(realmem))); dump_avail[1] = phys_avail[1]; /* Add the rest of the regions */ for (i = 1, j = 2; i < mr_cnt; i++, j+=2) { phys_avail[j] = mr[i].mr_start; phys_avail[j+1] = (mr[i].mr_start + mr[i].mr_size); dump_avail[j] = phys_avail[j]; dump_avail[j+1] = phys_avail[j+1]; } } else { if ((memsize = kern_getenv("memsize")) != NULL) { physmem = btoc(strtol(memsize, NULL, 0) << 20); printf("RAM size: %ldMB (from memsize)\n", ctob(physmem) / (1024 * 1024)); } else { /* All else failed, assume 32MB */ physmem = btoc(32 * 1024 * 1024); printf("RAM size: %ldMB (assumed)\n", ctob(physmem) / (1024 * 1024)); } if (mtk_soc_get_socid() == MTK_SOC_RT2880) { /* RT2880 memory start is 88000000 */ dump_avail[1] = phys_avail[1] = ctob(physmem) + 0x08000000; } else if (ctob(physmem) < (448 * 1024 * 1024)) { /* * Anything up to 448MB is assumed to be directly * mappable as low memory... */ dump_avail[1] = phys_avail[1] = ctob(physmem); } else if (mtk_soc_get_socid() == MTK_SOC_MT7621) { /* * On MT7621 the low memory is limited to 448MB, the * rest is high memory, mapped at 0x20000000 */ phys_avail[1] = 448 * 1024 * 1024; phys_avail[2] = 0x20000000; phys_avail[3] = phys_avail[2] + ctob(physmem) - phys_avail[1]; dump_avail[1] = phys_avail[1] - phys_avail[0]; dump_avail[2] = phys_avail[2]; dump_avail[3] = phys_avail[3] - phys_avail[2]; } else { /* * We have > 448MB RAM and we're not MT7621? Currently * there is no such chip, so we'll just limit the RAM to * 32MB and let the user know... */ printf("Unknown chip, assuming 32MB RAM\n"); physmem = btoc(32 * 1024 * 1024); dump_avail[1] = phys_avail[1] = ctob(physmem); } } if (physmem < realmem) realmem = physmem; init_param1(); init_param2(physmem); mips_cpu_init(); pmap_bootstrap(); mips_proc0_init(); mutex_init(); kdb_init(); #ifdef KDB if (boothowto & RB_KDB) kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger"); #endif } void platform_reset(void) { mtk_soc_reset(); } void platform_start(__register_t a0 __unused, __register_t a1 __unused, __register_t a2 __unused, __register_t a3 __unused) { vm_offset_t kernend; int argc = a0, i;//, res; uint32_t timer_clk; char **argv = (char **)MIPS_PHYS_TO_KSEG0(a1); char **envp = (char **)MIPS_PHYS_TO_KSEG0(a2); void *dtbp; phandle_t chosen; char buf[2048]; /* clear the BSS and SBSS segments */ kernend = (vm_offset_t)&end; memset(&edata, 0, kernend - (vm_offset_t)(&edata)); mips_postboot_fixup(); /* Initialize pcpu stuff */ mips_pcpu0_init(); dtbp = &fdt_static_dtb; if (OF_install(OFW_FDT, 0) == FALSE) while (1); if (OF_init((void *)dtbp) != 0) while (1); mtk_soc_try_early_detect(); mtk_soc_set_cpu_model(); if ((timer_clk = mtk_soc_get_timerclk()) == 0) timer_clk = 1000000000; /* no such speed yet */ mips_timer_early_init(timer_clk); /* initialize console so that we have printf */ boothowto |= (RB_SERIAL | RB_MULTIPLE); /* Use multiple consoles */ boothowto |= (RB_VERBOSE); cninit(); init_static_kenv(boot1_env, sizeof(boot1_env)); /* * Get bsdbootargs from FDT if specified. */ chosen = OF_finddevice("/chosen"); if (OF_getprop(chosen, "bsdbootargs", buf, sizeof(buf)) != -1) boothowto |= boot_parse_cmdline(buf); printf("FDT DTB at: 0x%08x\n", (uint32_t)dtbp); printf("CPU clock: %4dMHz\n", mtk_soc_get_cpuclk()/(1000*1000)); printf("Timer clock: %4dMHz\n", timer_clk/(1000*1000)); printf("UART clock: %4dMHz\n\n", mtk_soc_get_uartclk()/(1000*1000)); printf("U-Boot args (from %d args):\n", argc - 1); if (argc == 1) printf("\tNone\n"); for (i = 1; i < argc; i++) { char *n = "argv ", *arg; if (i > 99) break; if (argv[i]) { arg = (char *)(intptr_t)MIPS_PHYS_TO_KSEG0(argv[i]); printf("\targv[%d] = %s\n", i, arg); sprintf(n, "argv%d", i); kern_setenv(n, arg); } } printf("Environment:\n"); for (i = 0; envp[i] && MIPS_IS_VALID_PTR(envp[i]); i++) { char *n, *arg; arg = (char *)(intptr_t)MIPS_PHYS_TO_KSEG0(envp[i]); if (! MIPS_IS_VALID_PTR(arg)) continue; printf("\t%s\n", arg); n = strsep(&arg, "="); if (arg == NULL) kern_setenv(n, "1"); else kern_setenv(n, arg); } mips_init(); mips_timer_init_params(timer_clk, 0); } Index: head/sys/mips/mips/minidump_machdep.c =================================================================== --- head/sys/mips/mips/minidump_machdep.c (revision 366710) +++ head/sys/mips/mips/minidump_machdep.c (revision 366711) @@ -1,326 +1,327 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2010 Oleksandr Tymoshenko * Copyright (c) 2008 Semihalf, Grzegorz Bernacki * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * from: FreeBSD: src/sys/arm/arm/minidump_machdep.c v214223 */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include +#include #include #include #include +#include #include #include #include -#include #include #include CTASSERT(sizeof(struct kerneldumpheader) == 512); static struct kerneldumpheader kdh; /* Handle chunked writes. */ static uint64_t counter, progress, dumpsize; /* Just auxiliary bufffer */ static char tmpbuffer[PAGE_SIZE]; extern pd_entry_t *kernel_segmap; static int is_dumpable(vm_paddr_t pa) { vm_page_t m; int i; if ((m = vm_phys_paddr_to_vm_page(pa)) != NULL) return ((m->flags & PG_NODUMP) == 0); for (i = 0; dump_avail[i] != 0 || dump_avail[i + 1] != 0; i += 2) { if (pa >= dump_avail[i] && pa < dump_avail[i + 1]) return (1); } return (0); } static struct { int min_per; int max_per; int visited; } progress_track[10] = { { 0, 10, 0}, { 10, 20, 0}, { 20, 30, 0}, { 30, 40, 0}, { 40, 50, 0}, { 50, 60, 0}, { 60, 70, 0}, { 70, 80, 0}, { 80, 90, 0}, { 90, 100, 0} }; static void report_progress(uint64_t progress, uint64_t dumpsize) { int sofar, i; sofar = 100 - ((progress * 100) / dumpsize); for (i = 0; i < nitems(progress_track); i++) { if (sofar < progress_track[i].min_per || sofar > progress_track[i].max_per) continue; if (progress_track[i].visited) return; progress_track[i].visited = 1; printf("..%d%%", sofar); return; } } static int write_buffer(struct dumperinfo *di, char *ptr, size_t sz) { size_t len; int error, c; u_int maxdumpsz; maxdumpsz = di->maxiosize; if (maxdumpsz == 0) /* seatbelt */ maxdumpsz = PAGE_SIZE; error = 0; while (sz) { len = min(maxdumpsz, sz); counter += len; progress -= len; if (counter >> 22) { report_progress(progress, dumpsize); counter &= (1<<22) - 1; } wdog_kern_pat(WD_LASTVAL); if (ptr) { error = dump_append(di, ptr, 0, len); if (error) return (error); ptr += len; sz -= len; } else { panic("pa is not supported"); } /* Check for user abort. */ c = cncheckc(); if (c == 0x03) return (ECANCELED); if (c != -1) printf(" (CTRL-C to abort) "); } return (0); } int minidumpsys(struct dumperinfo *di) { struct minidumphdr mdhdr; uint32_t ptesize; vm_paddr_t pa; vm_offset_t prev_pte = 0; uint32_t count = 0; vm_offset_t va; pt_entry_t *pte; int i, error; void *dump_va; /* Flush cache */ mips_dcache_wbinv_all(); counter = 0; /* Walk page table pages, set bits in vm_page_dump */ ptesize = 0; for (va = VM_MIN_KERNEL_ADDRESS; va < kernel_vm_end; va += NBPDR) { ptesize += PAGE_SIZE; pte = pmap_pte(kernel_pmap, va); KASSERT(pte != NULL, ("pte for %jx is NULL", (uintmax_t)va)); for (i = 0; i < NPTEPG; i++) { if (pte_test(&pte[i], PTE_V)) { pa = TLBLO_PTE_TO_PA(pte[i]); if (is_dumpable(pa)) dump_add_page(pa); } } } /* * Now mark pages from 0 to phys_avail[0], that's where kernel * and pages allocated by pmap_steal reside */ for (pa = 0; pa < phys_avail[0]; pa += PAGE_SIZE) { if (is_dumpable(pa)) dump_add_page(pa); } /* Calculate dump size. */ dumpsize = ptesize; dumpsize += round_page(msgbufp->msg_size); dumpsize += round_page(sizeof(dump_avail)); dumpsize += round_page(BITSET_SIZE(vm_page_dump_pages)); VM_PAGE_DUMP_FOREACH(pa) { /* Clear out undumpable pages now if needed */ if (is_dumpable(pa)) dumpsize += PAGE_SIZE; else dump_drop_page(pa); } dumpsize += PAGE_SIZE; progress = dumpsize; /* Initialize mdhdr */ bzero(&mdhdr, sizeof(mdhdr)); strcpy(mdhdr.magic, MINIDUMP_MAGIC); mdhdr.version = MINIDUMP_VERSION; mdhdr.msgbufsize = msgbufp->msg_size; mdhdr.bitmapsize = round_page(BITSET_SIZE(vm_page_dump_pages)); mdhdr.ptesize = ptesize; mdhdr.kernbase = VM_MIN_KERNEL_ADDRESS; mdhdr.dumpavailsize = round_page(sizeof(dump_avail)); dump_init_header(di, &kdh, KERNELDUMPMAGIC, KERNELDUMP_MIPS_VERSION, dumpsize); error = dump_start(di, &kdh); if (error != 0) goto fail; printf("Dumping %llu out of %ju MB:", (long long)dumpsize >> 20, ptoa((uintmax_t)physmem) / 1048576); /* Dump my header */ bzero(tmpbuffer, sizeof(tmpbuffer)); bcopy(&mdhdr, tmpbuffer, sizeof(mdhdr)); error = write_buffer(di, tmpbuffer, PAGE_SIZE); if (error) goto fail; /* Dump msgbuf up front */ error = write_buffer(di, (char *)msgbufp->msg_ptr, round_page(msgbufp->msg_size)); if (error) goto fail; /* Dump dump_avail */ _Static_assert(sizeof(dump_avail) <= sizeof(tmpbuffer), "Large dump_avail not handled"); bzero(tmpbuffer, sizeof(tmpbuffer)); memcpy(tmpbuffer, dump_avail, sizeof(dump_avail)); error = write_buffer(di, tmpbuffer, PAGE_SIZE); if (error) goto fail; /* Dump bitmap */ error = write_buffer(di, (char *)vm_page_dump, round_page(BITSET_SIZE(vm_page_dump_pages))); if (error) goto fail; /* Dump kernel page table pages */ for (va = VM_MIN_KERNEL_ADDRESS; va < kernel_vm_end; va += NBPDR) { pte = pmap_pte(kernel_pmap, va); KASSERT(pte != NULL, ("pte for %jx is NULL", (uintmax_t)va)); if (!count) { prev_pte = (vm_offset_t)pte; count++; } else { if ((vm_offset_t)pte == (prev_pte + count * PAGE_SIZE)) count++; else { error = write_buffer(di, (char*)prev_pte, count * PAGE_SIZE); if (error) goto fail; count = 1; prev_pte = (vm_offset_t)pte; } } } if (count) { error = write_buffer(di, (char*)prev_pte, count * PAGE_SIZE); if (error) goto fail; count = 0; prev_pte = 0; } /* Dump memory chunks page by page*/ VM_PAGE_DUMP_FOREACH(pa) { dump_va = pmap_kenter_temporary(pa, 0); error = write_buffer(di, dump_va, PAGE_SIZE); if (error) goto fail; pmap_kenter_temporary_free(pa); } error = dump_finish(di, &kdh); if (error != 0) goto fail; printf("\nDump complete\n"); return (0); fail: if (error < 0) error = -error; if (error == ECANCELED) printf("\nDump aborted\n"); else if (error == E2BIG || error == ENOSPC) { printf("\nDump failed. Partition too small (about %lluMB were " "needed this time).\n", (long long)dumpsize >> 20); } else printf("\n** DUMP FAILED (ERROR %d) **\n", error); return (error); } Index: head/sys/mips/mips/pmap.c =================================================================== --- head/sys/mips/mips/pmap.c (revision 366710) +++ head/sys/mips/mips/pmap.c (revision 366711) @@ -1,3765 +1,3766 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. * Copyright (c) 1994 David Greenman * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department and William Jolitz of UUNET Technologies Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 * from: src/sys/i386/i386/pmap.c,v 1.250.2.8 2000/11/21 00:09:14 ps * JNPR: pmap.c,v 1.11.2.1 2007/08/16 11:51:06 girish */ /* * Manages physical address maps. * * Since the information managed by this module is * also stored by the logical address mapping module, * this module may throw away valid virtual-to-physical * mappings at almost any time. However, invalidations * of virtual-to-physical mappings must be done as * requested. * * In order to cope with hardware architectures which * make virtual-to-physical map invalidates expensive, * this module may delay invalidate or reduced protection * operations until such time as they are actually * necessary. This module is given full information as * to which processors are currently using which maps, * and to when physical maps must be made correct. */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_pmap.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #undef PMAP_DEBUG #if !defined(DIAGNOSTIC) #define PMAP_INLINE __inline #else #define PMAP_INLINE #endif #ifdef PV_STATS #define PV_STAT(x) do { x ; } while (0) #else #define PV_STAT(x) do { } while (0) #endif /* * Get PDEs and PTEs for user/kernel address space */ #define pmap_seg_index(v) (((v) >> SEGSHIFT) & (NPDEPG - 1)) #define pmap_pde_index(v) (((v) >> PDRSHIFT) & (NPDEPG - 1)) #define pmap_pte_index(v) (((v) >> PAGE_SHIFT) & (NPTEPG - 1)) #define pmap_pde_pindex(v) ((v) >> PDRSHIFT) #ifdef __mips_n64 #define NUPDE (NPDEPG * NPDEPG) #define NUSERPGTBLS (NUPDE + NPDEPG) #else #define NUPDE (NPDEPG) #define NUSERPGTBLS (NUPDE) #endif #define is_kernel_pmap(x) ((x) == kernel_pmap) struct pmap kernel_pmap_store; pd_entry_t *kernel_segmap; vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ static int need_local_mappings; static int nkpt; unsigned pmap_max_asid; /* max ASID supported by the system */ #define PMAP_ASID_RESERVED 0 vm_offset_t kernel_vm_end = VM_MIN_KERNEL_ADDRESS; static void pmap_asid_alloc(pmap_t pmap); static struct rwlock_padalign pvh_global_lock; /* * Data for the pv entry allocation mechanism */ static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks); static int pv_entry_count; static void free_pv_chunk(struct pv_chunk *pc); static void free_pv_entry(pmap_t pmap, pv_entry_t pv); static pv_entry_t get_pv_entry(pmap_t pmap, boolean_t try); static vm_page_t pmap_pv_reclaim(pmap_t locked_pmap); static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va); static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va); static vm_page_t pmap_alloc_direct_page(unsigned int index, int req); static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, vm_page_t mpte); static void pmap_grow_direct_page(int req); static int pmap_remove_pte(struct pmap *pmap, pt_entry_t *ptq, vm_offset_t va, pd_entry_t pde); static void pmap_remove_page(struct pmap *pmap, vm_offset_t va); static void pmap_remove_entry(struct pmap *pmap, vm_page_t m, vm_offset_t va); static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_page_t mpte, vm_offset_t va, vm_page_t m); static void pmap_update_page(pmap_t pmap, vm_offset_t va, pt_entry_t pte); static void pmap_invalidate_all(pmap_t pmap); static void pmap_invalidate_page(pmap_t pmap, vm_offset_t va); static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m); static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, u_int flags); static vm_page_t _pmap_allocpte(pmap_t pmap, unsigned ptepindex, u_int flags); static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t); static pt_entry_t init_pte_prot(vm_page_t m, vm_prot_t access, vm_prot_t prot); static void pmap_invalidate_page_action(void *arg); static void pmap_invalidate_range_action(void *arg); static void pmap_update_page_action(void *arg); #ifndef __mips_n64 static vm_offset_t crashdumpva; /* * These functions are for high memory (memory above 512Meg in 32 bit) support. * The highmem area does not have a KSEG0 mapping, and we need a mechanism to * do temporary per-CPU mappings for pmap_zero_page, pmap_copy_page etc. * * At bootup, we reserve 2 virtual pages per CPU for mapping highmem pages. To * access a highmem physical address on a CPU, we map the physical address to * the reserved virtual address for the CPU in the kernel pagetable. */ static void pmap_init_reserved_pages(void) { struct pcpu *pc; vm_offset_t pages; int i; if (need_local_mappings == 0) return; CPU_FOREACH(i) { pc = pcpu_find(i); /* * Skip if the mapping has already been initialized, * i.e. this is the BSP. */ if (pc->pc_cmap1_addr != 0) continue; pages = kva_alloc(PAGE_SIZE * 3); if (pages == 0) panic("%s: unable to allocate KVA", __func__); pc->pc_cmap1_ptep = pmap_pte(kernel_pmap, pages); pc->pc_cmap2_ptep = pmap_pte(kernel_pmap, pages + PAGE_SIZE); pc->pc_qmap_ptep = pmap_pte(kernel_pmap, pages + (PAGE_SIZE * 2)); pc->pc_cmap1_addr = pages; pc->pc_cmap2_addr = pages + PAGE_SIZE; pc->pc_qmap_addr = pages + (PAGE_SIZE * 2); } } SYSINIT(rpages_init, SI_SUB_CPU, SI_ORDER_ANY, pmap_init_reserved_pages, NULL); static __inline void pmap_alloc_lmem_map(void) { PCPU_SET(cmap1_addr, virtual_avail); PCPU_SET(cmap2_addr, virtual_avail + PAGE_SIZE); PCPU_SET(cmap1_ptep, pmap_pte(kernel_pmap, virtual_avail)); PCPU_SET(cmap2_ptep, pmap_pte(kernel_pmap, virtual_avail + PAGE_SIZE)); PCPU_SET(qmap_addr, virtual_avail + (2 * PAGE_SIZE)); PCPU_SET(qmap_ptep, pmap_pte(kernel_pmap, virtual_avail + (2 * PAGE_SIZE))); crashdumpva = virtual_avail + (3 * PAGE_SIZE); virtual_avail += PAGE_SIZE * 4; } static __inline vm_offset_t pmap_lmem_map1(vm_paddr_t phys) { critical_enter(); *PCPU_GET(cmap1_ptep) = TLBLO_PA_TO_PFN(phys) | PTE_C_CACHE | PTE_D | PTE_V | PTE_G; return (PCPU_GET(cmap1_addr)); } static __inline vm_offset_t pmap_lmem_map2(vm_paddr_t phys1, vm_paddr_t phys2) { critical_enter(); *PCPU_GET(cmap1_ptep) = TLBLO_PA_TO_PFN(phys1) | PTE_C_CACHE | PTE_D | PTE_V | PTE_G; *PCPU_GET(cmap2_ptep) = TLBLO_PA_TO_PFN(phys2) | PTE_C_CACHE | PTE_D | PTE_V | PTE_G; return (PCPU_GET(cmap1_addr)); } static __inline void pmap_lmem_unmap(void) { *PCPU_GET(cmap1_ptep) = PTE_G; tlb_invalidate_address(kernel_pmap, PCPU_GET(cmap1_addr)); if (*PCPU_GET(cmap2_ptep) != PTE_G) { *PCPU_GET(cmap2_ptep) = PTE_G; tlb_invalidate_address(kernel_pmap, PCPU_GET(cmap2_addr)); } critical_exit(); } #else /* __mips_n64 */ static __inline void pmap_alloc_lmem_map(void) { } static __inline vm_offset_t pmap_lmem_map1(vm_paddr_t phys) { return (0); } static __inline vm_offset_t pmap_lmem_map2(vm_paddr_t phys1, vm_paddr_t phys2) { return (0); } static __inline vm_offset_t pmap_lmem_unmap(void) { return (0); } #endif /* !__mips_n64 */ static __inline int pmap_pte_cache_bits(vm_paddr_t pa, vm_page_t m) { vm_memattr_t ma; ma = pmap_page_get_memattr(m); if (ma == VM_MEMATTR_WRITE_BACK && !is_cacheable_mem(pa)) ma = VM_MEMATTR_UNCACHEABLE; return PTE_C(ma); } #define PMAP_PTE_SET_CACHE_BITS(pte, pa, m) { \ pte &= ~PTE_C_MASK; \ pte |= pmap_pte_cache_bits(pa, m); \ } /* * Page table entry lookup routines. */ static __inline pd_entry_t * pmap_segmap(pmap_t pmap, vm_offset_t va) { return (&pmap->pm_segtab[pmap_seg_index(va)]); } #ifdef __mips_n64 static __inline pd_entry_t * pmap_pdpe_to_pde(pd_entry_t *pdpe, vm_offset_t va) { pd_entry_t *pde; pde = (pd_entry_t *)*pdpe; return (&pde[pmap_pde_index(va)]); } static __inline pd_entry_t * pmap_pde(pmap_t pmap, vm_offset_t va) { pd_entry_t *pdpe; pdpe = pmap_segmap(pmap, va); if (*pdpe == NULL) return (NULL); return (pmap_pdpe_to_pde(pdpe, va)); } #else static __inline pd_entry_t * pmap_pdpe_to_pde(pd_entry_t *pdpe, vm_offset_t va) { return (pdpe); } static __inline pd_entry_t *pmap_pde(pmap_t pmap, vm_offset_t va) { return (pmap_segmap(pmap, va)); } #endif static __inline pt_entry_t * pmap_pde_to_pte(pd_entry_t *pde, vm_offset_t va) { pt_entry_t *pte; pte = (pt_entry_t *)*pde; return (&pte[pmap_pte_index(va)]); } pt_entry_t * pmap_pte(pmap_t pmap, vm_offset_t va) { pd_entry_t *pde; pde = pmap_pde(pmap, va); if (pde == NULL || *pde == NULL) return (NULL); return (pmap_pde_to_pte(pde, va)); } vm_offset_t pmap_steal_memory(vm_size_t size) { vm_paddr_t bank_size, pa; vm_offset_t va; size = round_page(size); bank_size = phys_avail[1] - phys_avail[0]; while (size > bank_size) { int i; for (i = 0; phys_avail[i + 2]; i += 2) { phys_avail[i] = phys_avail[i + 2]; phys_avail[i + 1] = phys_avail[i + 3]; } phys_avail[i] = 0; phys_avail[i + 1] = 0; if (!phys_avail[0]) panic("pmap_steal_memory: out of memory"); bank_size = phys_avail[1] - phys_avail[0]; } pa = phys_avail[0]; phys_avail[0] += size; if (MIPS_DIRECT_MAPPABLE(pa) == 0) panic("Out of memory below 512Meg?"); va = MIPS_PHYS_TO_DIRECT(pa); bzero((caddr_t)va, size); return (va); } /* * Bootstrap the system enough to run with virtual memory. This * assumes that the phys_avail array has been initialized. */ static void pmap_create_kernel_pagetable(void) { int i, j; vm_offset_t ptaddr; pt_entry_t *pte; #ifdef __mips_n64 pd_entry_t *pde; vm_offset_t pdaddr; int npt, npde; #endif /* * Allocate segment table for the kernel */ kernel_segmap = (pd_entry_t *)pmap_steal_memory(PAGE_SIZE); /* * Allocate second level page tables for the kernel */ #ifdef __mips_n64 npde = howmany(NKPT, NPDEPG); pdaddr = pmap_steal_memory(PAGE_SIZE * npde); #endif nkpt = NKPT; ptaddr = pmap_steal_memory(PAGE_SIZE * nkpt); /* * The R[4-7]?00 stores only one copy of the Global bit in the * translation lookaside buffer for each 2 page entry. Thus invalid * entrys must have the Global bit set so when Entry LO and Entry HI * G bits are anded together they will produce a global bit to store * in the tlb. */ for (i = 0, pte = (pt_entry_t *)ptaddr; i < (nkpt * NPTEPG); i++, pte++) *pte = PTE_G; #ifdef __mips_n64 for (i = 0, npt = nkpt; npt > 0; i++) { kernel_segmap[i] = (pd_entry_t)(pdaddr + i * PAGE_SIZE); pde = (pd_entry_t *)kernel_segmap[i]; for (j = 0; j < NPDEPG && npt > 0; j++, npt--) pde[j] = (pd_entry_t)(ptaddr + (i * NPDEPG + j) * PAGE_SIZE); } #else for (i = 0, j = pmap_seg_index(VM_MIN_KERNEL_ADDRESS); i < nkpt; i++, j++) kernel_segmap[j] = (pd_entry_t)(ptaddr + (i * PAGE_SIZE)); #endif PMAP_LOCK_INIT(kernel_pmap); kernel_pmap->pm_segtab = kernel_segmap; CPU_FILL(&kernel_pmap->pm_active); TAILQ_INIT(&kernel_pmap->pm_pvchunk); kernel_pmap->pm_asid[0].asid = PMAP_ASID_RESERVED; kernel_pmap->pm_asid[0].gen = 0; kernel_vm_end += nkpt * NPTEPG * PAGE_SIZE; } void pmap_bootstrap(void) { int i; /* Sort. */ again: for (i = 0; phys_avail[i + 1] != 0; i += 2) { /* * Keep the memory aligned on page boundary. */ phys_avail[i] = round_page(phys_avail[i]); phys_avail[i + 1] = trunc_page(phys_avail[i + 1]); if (i < 2) continue; if (phys_avail[i - 2] > phys_avail[i]) { vm_paddr_t ptemp[2]; ptemp[0] = phys_avail[i + 0]; ptemp[1] = phys_avail[i + 1]; phys_avail[i + 0] = phys_avail[i - 2]; phys_avail[i + 1] = phys_avail[i - 1]; phys_avail[i - 2] = ptemp[0]; phys_avail[i - 1] = ptemp[1]; goto again; } } /* * In 32 bit, we may have memory which cannot be mapped directly. * This memory will need temporary mapping before it can be * accessed. */ if (!MIPS_DIRECT_MAPPABLE(phys_avail[i - 1] - 1)) need_local_mappings = 1; /* * Copy the phys_avail[] array before we start stealing memory from it. */ for (i = 0; phys_avail[i + 1] != 0; i += 2) { physmem_desc[i] = phys_avail[i]; physmem_desc[i + 1] = phys_avail[i + 1]; } Maxmem = atop(phys_avail[i - 1]); if (bootverbose) { printf("Physical memory chunk(s):\n"); for (i = 0; phys_avail[i + 1] != 0; i += 2) { vm_paddr_t size; size = phys_avail[i + 1] - phys_avail[i]; printf("%#08jx - %#08jx, %ju bytes (%ju pages)\n", (uintmax_t) phys_avail[i], (uintmax_t) phys_avail[i + 1] - 1, (uintmax_t) size, (uintmax_t) size / PAGE_SIZE); } printf("Maxmem is 0x%0jx\n", ptoa((uintmax_t)Maxmem)); } /* * Steal the message buffer from the beginning of memory. */ msgbufp = (struct msgbuf *)pmap_steal_memory(msgbufsize); msgbufinit(msgbufp, msgbufsize); /* * Steal thread0 kstack. */ kstack0 = pmap_steal_memory(KSTACK_PAGES << PAGE_SHIFT); virtual_avail = VM_MIN_KERNEL_ADDRESS; virtual_end = VM_MAX_KERNEL_ADDRESS; #ifdef SMP /* * Steal some virtual address space to map the pcpu area. */ virtual_avail = roundup2(virtual_avail, PAGE_SIZE * 2); pcpup = (struct pcpu *)virtual_avail; virtual_avail += PAGE_SIZE * 2; /* * Initialize the wired TLB entry mapping the pcpu region for * the BSP at 'pcpup'. Up until this point we were operating * with the 'pcpup' for the BSP pointing to a virtual address * in KSEG0 so there was no need for a TLB mapping. */ mips_pcpu_tlb_init(PCPU_ADDR(0)); if (bootverbose) printf("pcpu is available at virtual address %p.\n", pcpup); #endif pmap_create_kernel_pagetable(); if (need_local_mappings) pmap_alloc_lmem_map(); pmap_max_asid = VMNUM_PIDS; mips_wr_entryhi(0); mips_wr_pagemask(0); /* * Initialize the global pv list lock. */ rw_init(&pvh_global_lock, "pmap pv global"); } /* * Initialize a vm_page's machine-dependent fields. */ void pmap_page_init(vm_page_t m) { TAILQ_INIT(&m->md.pv_list); m->md.pv_flags = VM_MEMATTR_DEFAULT << PV_MEMATTR_SHIFT; } /* * Initialize the pmap module. * Called by vm_init, to initialize any structures that the pmap * system needs to map virtual memory. */ void pmap_init(void) { } /*************************************************** * Low level helper routines..... ***************************************************/ #ifdef SMP static __inline void pmap_call_on_active_cpus(pmap_t pmap, void (*fn)(void *), void *arg) { int cpuid, cpu, self; cpuset_t active_cpus; sched_pin(); if (is_kernel_pmap(pmap)) { smp_rendezvous(NULL, fn, NULL, arg); goto out; } /* Force ASID update on inactive CPUs */ CPU_FOREACH(cpu) { if (!CPU_ISSET(cpu, &pmap->pm_active)) pmap->pm_asid[cpu].gen = 0; } cpuid = PCPU_GET(cpuid); /* * XXX: barrier/locking for active? * * Take a snapshot of active here, any further changes are ignored. * tlb update/invalidate should be harmless on inactive CPUs */ active_cpus = pmap->pm_active; self = CPU_ISSET(cpuid, &active_cpus); CPU_CLR(cpuid, &active_cpus); /* Optimize for the case where this cpu is the only active one */ if (CPU_EMPTY(&active_cpus)) { if (self) fn(arg); } else { if (self) CPU_SET(cpuid, &active_cpus); smp_rendezvous_cpus(active_cpus, NULL, fn, NULL, arg); } out: sched_unpin(); } #else /* !SMP */ static __inline void pmap_call_on_active_cpus(pmap_t pmap, void (*fn)(void *), void *arg) { int cpuid; if (is_kernel_pmap(pmap)) { fn(arg); return; } cpuid = PCPU_GET(cpuid); if (!CPU_ISSET(cpuid, &pmap->pm_active)) pmap->pm_asid[cpuid].gen = 0; else fn(arg); } #endif /* SMP */ static void pmap_invalidate_all(pmap_t pmap) { pmap_call_on_active_cpus(pmap, (void (*)(void *))tlb_invalidate_all_user, pmap); } struct pmap_invalidate_page_arg { pmap_t pmap; vm_offset_t va; }; static void pmap_invalidate_page_action(void *arg) { struct pmap_invalidate_page_arg *p = arg; tlb_invalidate_address(p->pmap, p->va); } static void pmap_invalidate_page(pmap_t pmap, vm_offset_t va) { struct pmap_invalidate_page_arg arg; arg.pmap = pmap; arg.va = va; pmap_call_on_active_cpus(pmap, pmap_invalidate_page_action, &arg); } struct pmap_invalidate_range_arg { pmap_t pmap; vm_offset_t sva; vm_offset_t eva; }; static void pmap_invalidate_range_action(void *arg) { struct pmap_invalidate_range_arg *p = arg; tlb_invalidate_range(p->pmap, p->sva, p->eva); } static void pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { struct pmap_invalidate_range_arg arg; arg.pmap = pmap; arg.sva = sva; arg.eva = eva; pmap_call_on_active_cpus(pmap, pmap_invalidate_range_action, &arg); } struct pmap_update_page_arg { pmap_t pmap; vm_offset_t va; pt_entry_t pte; }; static void pmap_update_page_action(void *arg) { struct pmap_update_page_arg *p = arg; tlb_update(p->pmap, p->va, p->pte); } static void pmap_update_page(pmap_t pmap, vm_offset_t va, pt_entry_t pte) { struct pmap_update_page_arg arg; arg.pmap = pmap; arg.va = va; arg.pte = pte; pmap_call_on_active_cpus(pmap, pmap_update_page_action, &arg); } /* * Routine: pmap_extract * Function: * Extract the physical page address associated * with the given map/virtual_address pair. */ vm_paddr_t pmap_extract(pmap_t pmap, vm_offset_t va) { pt_entry_t *pte; vm_offset_t retval = 0; PMAP_LOCK(pmap); pte = pmap_pte(pmap, va); if (pte) { retval = TLBLO_PTE_TO_PA(*pte) | (va & PAGE_MASK); } PMAP_UNLOCK(pmap); return (retval); } /* * Routine: pmap_extract_and_hold * Function: * Atomically extract and hold the physical page * with the given pmap and virtual address pair * if that mapping permits the given protection. */ vm_page_t pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) { pt_entry_t pte, *ptep; vm_paddr_t pa; vm_page_t m; m = NULL; PMAP_LOCK(pmap); ptep = pmap_pte(pmap, va); if (ptep != NULL) { pte = *ptep; if (pte_test(&pte, PTE_V) && (!pte_test(&pte, PTE_RO) || (prot & VM_PROT_WRITE) == 0)) { pa = TLBLO_PTE_TO_PA(pte); m = PHYS_TO_VM_PAGE(pa); if (!vm_page_wire_mapped(m)) m = NULL; } } PMAP_UNLOCK(pmap); return (m); } /*************************************************** * Low level mapping routines..... ***************************************************/ /* * add a wired page to the kva */ void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, vm_memattr_t ma) { pt_entry_t *pte; pt_entry_t opte, npte; #ifdef PMAP_DEBUG printf("pmap_kenter: va: %p -> pa: %p\n", (void *)va, (void *)pa); #endif pte = pmap_pte(kernel_pmap, va); opte = *pte; npte = TLBLO_PA_TO_PFN(pa) | PTE_C(ma) | PTE_D | PTE_V | PTE_G; *pte = npte; if (pte_test(&opte, PTE_V) && opte != npte) pmap_update_page(kernel_pmap, va, npte); } void pmap_kenter(vm_offset_t va, vm_paddr_t pa) { KASSERT(is_cacheable_mem(pa), ("pmap_kenter: memory at 0x%lx is not cacheable", (u_long)pa)); pmap_kenter_attr(va, pa, VM_MEMATTR_DEFAULT); } void pmap_kenter_device(vm_offset_t va, vm_size_t size, vm_paddr_t pa) { KASSERT((size & PAGE_MASK) == 0, ("%s: device mapping not page-sized", __func__)); for (; size > 0; size -= PAGE_SIZE) { /* * XXXCEM: this is somewhat inefficient on SMP systems in that * every single page is individually TLB-invalidated via * rendezvous (pmap_update_page()), instead of invalidating the * entire range via a single rendezvous. */ pmap_kenter_attr(va, pa, VM_MEMATTR_UNCACHEABLE); va += PAGE_SIZE; pa += PAGE_SIZE; } } void pmap_kremove_device(vm_offset_t va, vm_size_t size) { KASSERT((size & PAGE_MASK) == 0, ("%s: device mapping not page-sized", __func__)); /* * XXXCEM: Similar to pmap_kenter_device, this is inefficient on SMP, * in that pages are invalidated individually instead of a single range * rendezvous. */ for (; size > 0; size -= PAGE_SIZE) { pmap_kremove(va); va += PAGE_SIZE; } } /* * remove a page from the kernel pagetables */ /* PMAP_INLINE */ void pmap_kremove(vm_offset_t va) { pt_entry_t *pte; /* * Write back all caches from the page being destroyed */ mips_dcache_wbinv_range_index(va, PAGE_SIZE); pte = pmap_pte(kernel_pmap, va); *pte = PTE_G; pmap_invalidate_page(kernel_pmap, va); } /* * Used to map a range of physical addresses into kernel * virtual address space. * * The value passed in '*virt' is a suggested virtual address for * the mapping. Architectures which can support a direct-mapped * physical to virtual region can return the appropriate address * within that region, leaving '*virt' unchanged. Other * architectures should map the pages starting at '*virt' and * update '*virt' with the first usable address after the mapped * region. * * Use XKPHYS for 64 bit, and KSEG0 where possible for 32 bit. */ vm_offset_t pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) { vm_offset_t va, sva; if (MIPS_DIRECT_MAPPABLE(end - 1)) return (MIPS_PHYS_TO_DIRECT(start)); va = sva = *virt; while (start < end) { pmap_kenter(va, start); va += PAGE_SIZE; start += PAGE_SIZE; } *virt = va; return (sva); } /* * Add a list of wired pages to the kva * this routine is only used for temporary * kernel mappings that do not need to have * page modification or references recorded. * Note that old mappings are simply written * over. The page *must* be wired. */ void pmap_qenter(vm_offset_t va, vm_page_t *m, int count) { int i; vm_offset_t origva = va; for (i = 0; i < count; i++) { pmap_flush_pvcache(m[i]); pmap_kenter(va, VM_PAGE_TO_PHYS(m[i])); va += PAGE_SIZE; } mips_dcache_wbinv_range_index(origva, PAGE_SIZE*count); } /* * this routine jerks page mappings from the * kernel -- it is meant only for temporary mappings. */ void pmap_qremove(vm_offset_t va, int count) { pt_entry_t *pte; vm_offset_t origva; if (count < 1) return; mips_dcache_wbinv_range_index(va, PAGE_SIZE * count); origva = va; do { pte = pmap_pte(kernel_pmap, va); *pte = PTE_G; va += PAGE_SIZE; } while (--count > 0); pmap_invalidate_range(kernel_pmap, origva, va); } /*************************************************** * Page table page management routines..... ***************************************************/ /* * Decrements a page table page's reference count, which is used to record the * number of valid page table entries within the page. If the reference count * drops to zero, then the page table page is unmapped. Returns TRUE if the * page table page was unmapped and FALSE otherwise. */ static PMAP_INLINE boolean_t pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m) { --m->ref_count; if (m->ref_count == 0) { _pmap_unwire_ptp(pmap, va, m); return (TRUE); } else return (FALSE); } static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m) { pd_entry_t *pde; vm_offset_t sva, eva; PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * unmap the page table page */ #ifdef __mips_n64 if (m->pindex < NUPDE) { pde = pmap_pde(pmap, va); sva = va & ~PDRMASK; eva = sva + NBPDR; } else { pde = pmap_segmap(pmap, va); sva = va & ~SEGMASK; eva = sva + NBSEG; } #else pde = pmap_pde(pmap, va); sva = va & ~SEGMASK; eva = sva + NBSEG; #endif *pde = 0; pmap->pm_stats.resident_count--; #ifdef __mips_n64 if (m->pindex < NUPDE) { pd_entry_t *pdp; vm_page_t pdpg; /* * Recursively decrement next level pagetable refcount. * Either that shoots down a larger range from TLBs (below) * or we're to shoot down just the page in question. */ pdp = (pd_entry_t *)*pmap_segmap(pmap, va); pdpg = PHYS_TO_VM_PAGE(MIPS_DIRECT_TO_PHYS(pdp)); if (!pmap_unwire_ptp(pmap, va, pdpg)) { pmap_invalidate_range(pmap, sva, eva); } } else { /* Segmap entry shootdown */ pmap_invalidate_range(pmap, sva, eva); } #else /* Segmap entry shootdown */ pmap_invalidate_range(pmap, sva, eva); #endif /* * If the page is finally unwired, simply free it. */ vm_page_free_zero(m); vm_wire_sub(1); } /* * After removing a page table entry, this routine is used to * conditionally free the page, and manage the reference count. */ static int pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t pde) { vm_page_t mpte; if (va >= VM_MAXUSER_ADDRESS) return (0); KASSERT(pde != 0, ("pmap_unuse_pt: pde != 0")); mpte = PHYS_TO_VM_PAGE(MIPS_DIRECT_TO_PHYS(pde)); return (pmap_unwire_ptp(pmap, va, mpte)); } void pmap_pinit0(pmap_t pmap) { int i; PMAP_LOCK_INIT(pmap); pmap->pm_segtab = kernel_segmap; CPU_ZERO(&pmap->pm_active); for (i = 0; i < MAXCPU; i++) { pmap->pm_asid[i].asid = PMAP_ASID_RESERVED; pmap->pm_asid[i].gen = 0; } PCPU_SET(curpmap, pmap); TAILQ_INIT(&pmap->pm_pvchunk); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); } static void pmap_grow_direct_page(int req) { #ifdef __mips_n64 vm_wait(NULL); #else if (!vm_page_reclaim_contig(req, 1, 0, MIPS_KSEG0_LARGEST_PHYS, PAGE_SIZE, 0)) vm_wait(NULL); #endif } static vm_page_t pmap_alloc_direct_page(unsigned int index, int req) { vm_page_t m; m = vm_page_alloc_freelist(VM_FREELIST_DIRECT, req | VM_ALLOC_WIRED | VM_ALLOC_ZERO); if (m == NULL) return (NULL); if ((m->flags & PG_ZERO) == 0) pmap_zero_page(m); m->pindex = index; return (m); } /* * Initialize a preallocated and zeroed pmap structure, * such as one in a vmspace structure. */ int pmap_pinit(pmap_t pmap) { vm_offset_t ptdva; vm_page_t ptdpg; int i, req_class; /* * allocate the page directory page */ req_class = VM_ALLOC_NORMAL; while ((ptdpg = pmap_alloc_direct_page(NUSERPGTBLS, req_class)) == NULL) pmap_grow_direct_page(req_class); ptdva = MIPS_PHYS_TO_DIRECT(VM_PAGE_TO_PHYS(ptdpg)); pmap->pm_segtab = (pd_entry_t *)ptdva; CPU_ZERO(&pmap->pm_active); for (i = 0; i < MAXCPU; i++) { pmap->pm_asid[i].asid = PMAP_ASID_RESERVED; pmap->pm_asid[i].gen = 0; } TAILQ_INIT(&pmap->pm_pvchunk); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); return (1); } /* * this routine is called if the page table page is not * mapped correctly. */ static vm_page_t _pmap_allocpte(pmap_t pmap, unsigned ptepindex, u_int flags) { vm_offset_t pageva; vm_page_t m; int req_class; /* * Find or fabricate a new pagetable page */ req_class = VM_ALLOC_NORMAL; if ((m = pmap_alloc_direct_page(ptepindex, req_class)) == NULL) { if ((flags & PMAP_ENTER_NOSLEEP) == 0) { PMAP_UNLOCK(pmap); rw_wunlock(&pvh_global_lock); pmap_grow_direct_page(req_class); rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); } /* * Indicate the need to retry. While waiting, the page * table page may have been allocated. */ return (NULL); } /* * Map the pagetable page into the process address space, if it * isn't already there. */ pageva = MIPS_PHYS_TO_DIRECT(VM_PAGE_TO_PHYS(m)); #ifdef __mips_n64 if (ptepindex >= NUPDE) { pmap->pm_segtab[ptepindex - NUPDE] = (pd_entry_t)pageva; } else { pd_entry_t *pdep, *pde; int segindex = ptepindex >> (SEGSHIFT - PDRSHIFT); int pdeindex = ptepindex & (NPDEPG - 1); vm_page_t pg; pdep = &pmap->pm_segtab[segindex]; if (*pdep == NULL) { /* recurse for allocating page dir */ if (_pmap_allocpte(pmap, NUPDE + segindex, flags) == NULL) { /* alloc failed, release current */ vm_page_unwire_noq(m); vm_page_free_zero(m); return (NULL); } } else { pg = PHYS_TO_VM_PAGE(MIPS_DIRECT_TO_PHYS(*pdep)); pg->ref_count++; } /* Next level entry */ pde = (pd_entry_t *)*pdep; pde[pdeindex] = (pd_entry_t)pageva; } #else pmap->pm_segtab[ptepindex] = (pd_entry_t)pageva; #endif pmap->pm_stats.resident_count++; return (m); } static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, u_int flags) { unsigned ptepindex; pd_entry_t *pde; vm_page_t m; /* * Calculate pagetable page index */ ptepindex = pmap_pde_pindex(va); retry: /* * Get the page directory entry */ pde = pmap_pde(pmap, va); /* * If the page table page is mapped, we just increment the hold * count, and activate it. */ if (pde != NULL && *pde != NULL) { m = PHYS_TO_VM_PAGE(MIPS_DIRECT_TO_PHYS(*pde)); m->ref_count++; } else { /* * Here if the pte page isn't mapped, or if it has been * deallocated. */ m = _pmap_allocpte(pmap, ptepindex, flags); if (m == NULL && (flags & PMAP_ENTER_NOSLEEP) == 0) goto retry; } return (m); } /*************************************************** * Pmap allocation/deallocation routines. ***************************************************/ /* * Release any resources held by the given physical map. * Called when a pmap initialized by pmap_pinit is being released. * Should only be called if the map contains no valid mappings. */ void pmap_release(pmap_t pmap) { vm_offset_t ptdva; vm_page_t ptdpg; KASSERT(pmap->pm_stats.resident_count == 0, ("pmap_release: pmap resident count %ld != 0", pmap->pm_stats.resident_count)); ptdva = (vm_offset_t)pmap->pm_segtab; ptdpg = PHYS_TO_VM_PAGE(MIPS_DIRECT_TO_PHYS(ptdva)); vm_page_unwire_noq(ptdpg); vm_page_free_zero(ptdpg); } /* * grow the number of kernel page table entries, if needed */ void pmap_growkernel(vm_offset_t addr) { vm_page_t nkpg; pd_entry_t *pde, *pdpe; pt_entry_t *pte; int i, req_class; mtx_assert(&kernel_map->system_mtx, MA_OWNED); req_class = VM_ALLOC_INTERRUPT; addr = roundup2(addr, NBSEG); if (addr - 1 >= vm_map_max(kernel_map)) addr = vm_map_max(kernel_map); while (kernel_vm_end < addr) { pdpe = pmap_segmap(kernel_pmap, kernel_vm_end); #ifdef __mips_n64 if (*pdpe == 0) { /* new intermediate page table entry */ nkpg = pmap_alloc_direct_page(nkpt, req_class); if (nkpg == NULL) panic("pmap_growkernel: no memory to grow kernel"); *pdpe = (pd_entry_t)MIPS_PHYS_TO_DIRECT(VM_PAGE_TO_PHYS(nkpg)); continue; /* try again */ } #endif pde = pmap_pdpe_to_pde(pdpe, kernel_vm_end); if (*pde != 0) { kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK; if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { kernel_vm_end = vm_map_max(kernel_map); break; } continue; } /* * This index is bogus, but out of the way */ nkpg = pmap_alloc_direct_page(nkpt, req_class); #ifndef __mips_n64 if (nkpg == NULL && vm_page_reclaim_contig(req_class, 1, 0, MIPS_KSEG0_LARGEST_PHYS, PAGE_SIZE, 0)) nkpg = pmap_alloc_direct_page(nkpt, req_class); #endif if (nkpg == NULL) panic("pmap_growkernel: no memory to grow kernel"); nkpt++; *pde = (pd_entry_t)MIPS_PHYS_TO_DIRECT(VM_PAGE_TO_PHYS(nkpg)); /* * The R[4-7]?00 stores only one copy of the Global bit in * the translation lookaside buffer for each 2 page entry. * Thus invalid entrys must have the Global bit set so when * Entry LO and Entry HI G bits are anded together they will * produce a global bit to store in the tlb. */ pte = (pt_entry_t *)*pde; for (i = 0; i < NPTEPG; i++) pte[i] = PTE_G; kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK; if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { kernel_vm_end = vm_map_max(kernel_map); break; } } } /*************************************************** * page management routines. ***************************************************/ CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE); #ifdef __mips_n64 CTASSERT(_NPCM == 3); CTASSERT(_NPCPV == 168); #else CTASSERT(_NPCM == 11); CTASSERT(_NPCPV == 336); #endif static __inline struct pv_chunk * pv_to_chunk(pv_entry_t pv) { return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK)); } #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap) #ifdef __mips_n64 #define PC_FREE0_1 0xfffffffffffffffful #define PC_FREE2 0x000000fffffffffful #else #define PC_FREE0_9 0xfffffffful /* Free values for index 0 through 9 */ #define PC_FREE10 0x0000fffful /* Free values for index 10 */ #endif static const u_long pc_freemask[_NPCM] = { #ifdef __mips_n64 PC_FREE0_1, PC_FREE0_1, PC_FREE2 #else PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE10 #endif }; static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "VM/pmap parameters"); SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0, "Current number of pv entries"); #ifdef PV_STATS static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail; SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0, "Current number of pv entry chunks"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0, "Current number of pv entry chunks allocated"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0, "Current number of pv entry chunks frees"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0, "Number of times tried to get a chunk page but failed."); static long pv_entry_frees, pv_entry_allocs; static int pv_entry_spare; SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0, "Current number of pv entry frees"); SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0, "Current number of pv entry allocs"); SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0, "Current number of spare pv entries"); #endif /* * We are in a serious low memory condition. Resort to * drastic measures to free some pages so we can allocate * another pv entry chunk. */ static vm_page_t pmap_pv_reclaim(pmap_t locked_pmap) { struct pch newtail; struct pv_chunk *pc; pd_entry_t *pde; pmap_t pmap; pt_entry_t *pte, oldpte; pv_entry_t pv; vm_offset_t va; vm_page_t m, m_pc; u_long inuse; int bit, field, freed, idx; PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED); pmap = NULL; m_pc = NULL; TAILQ_INIT(&newtail); while ((pc = TAILQ_FIRST(&pv_chunks)) != NULL) { TAILQ_REMOVE(&pv_chunks, pc, pc_lru); if (pmap != pc->pc_pmap) { if (pmap != NULL) { pmap_invalidate_all(pmap); if (pmap != locked_pmap) PMAP_UNLOCK(pmap); } pmap = pc->pc_pmap; /* Avoid deadlock and lock recursion. */ if (pmap > locked_pmap) PMAP_LOCK(pmap); else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap)) { pmap = NULL; TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); continue; } } /* * Destroy every non-wired, 4 KB page mapping in the chunk. */ freed = 0; for (field = 0; field < _NPCM; field++) { for (inuse = ~pc->pc_map[field] & pc_freemask[field]; inuse != 0; inuse &= ~(1UL << bit)) { bit = ffsl(inuse) - 1; idx = field * sizeof(inuse) * NBBY + bit; pv = &pc->pc_pventry[idx]; va = pv->pv_va; pde = pmap_pde(pmap, va); KASSERT(pde != NULL && *pde != 0, ("pmap_pv_reclaim: pde")); pte = pmap_pde_to_pte(pde, va); oldpte = *pte; if (pte_test(&oldpte, PTE_W)) continue; if (is_kernel_pmap(pmap)) *pte = PTE_G; else *pte = 0; m = PHYS_TO_VM_PAGE(TLBLO_PTE_TO_PA(oldpte)); if (pte_test(&oldpte, PTE_D)) vm_page_dirty(m); if (m->md.pv_flags & PV_TABLE_REF) vm_page_aflag_set(m, PGA_REFERENCED); m->md.pv_flags &= ~PV_TABLE_REF; TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); if (TAILQ_EMPTY(&m->md.pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); pc->pc_map[field] |= 1UL << bit; /* * For simplicity, we will unconditionally shoot * down TLBs either at the end of this function * or at the top of the loop above if we switch * to a different pmap. */ (void)pmap_unuse_pt(pmap, va, *pde); freed++; } } if (freed == 0) { TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); continue; } /* Every freed mapping is for a 4 KB page. */ pmap->pm_stats.resident_count -= freed; PV_STAT(pv_entry_frees += freed); PV_STAT(pv_entry_spare += freed); pv_entry_count -= freed; TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); for (field = 0; field < _NPCM; field++) if (pc->pc_map[field] != pc_freemask[field]) { TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); /* * One freed pv entry in locked_pmap is * sufficient. */ if (pmap == locked_pmap) goto out; break; } if (field == _NPCM) { PV_STAT(pv_entry_spare -= _NPCPV); PV_STAT(pc_chunk_count--); PV_STAT(pc_chunk_frees++); /* Entire chunk is free; return it. */ m_pc = PHYS_TO_VM_PAGE(MIPS_DIRECT_TO_PHYS( (vm_offset_t)pc)); dump_drop_page(m_pc->phys_addr); break; } } out: TAILQ_CONCAT(&pv_chunks, &newtail, pc_lru); if (pmap != NULL) { pmap_invalidate_all(pmap); if (pmap != locked_pmap) PMAP_UNLOCK(pmap); } return (m_pc); } /* * free the pv_entry back to the free list */ static void free_pv_entry(pmap_t pmap, pv_entry_t pv) { struct pv_chunk *pc; int bit, field, idx; rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); PV_STAT(pv_entry_frees++); PV_STAT(pv_entry_spare++); pv_entry_count--; pc = pv_to_chunk(pv); idx = pv - &pc->pc_pventry[0]; field = idx / (sizeof(u_long) * NBBY); bit = idx % (sizeof(u_long) * NBBY); pc->pc_map[field] |= 1ul << bit; for (idx = 0; idx < _NPCM; idx++) if (pc->pc_map[idx] != pc_freemask[idx]) { /* * 98% of the time, pc is already at the head of the * list. If it isn't already, move it to the head. */ if (__predict_false(TAILQ_FIRST(&pmap->pm_pvchunk) != pc)) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); } return; } TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); free_pv_chunk(pc); } static void free_pv_chunk(struct pv_chunk *pc) { vm_page_t m; TAILQ_REMOVE(&pv_chunks, pc, pc_lru); PV_STAT(pv_entry_spare -= _NPCPV); PV_STAT(pc_chunk_count--); PV_STAT(pc_chunk_frees++); /* entire chunk is free, return it */ m = PHYS_TO_VM_PAGE(MIPS_DIRECT_TO_PHYS((vm_offset_t)pc)); dump_drop_page(m->phys_addr); vm_page_unwire_noq(m); vm_page_free(m); } /* * get a new pv_entry, allocating a block from the system * when needed. */ static pv_entry_t get_pv_entry(pmap_t pmap, boolean_t try) { struct pv_chunk *pc; pv_entry_t pv; vm_page_t m; int bit, field, idx; rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); PV_STAT(pv_entry_allocs++); pv_entry_count++; retry: pc = TAILQ_FIRST(&pmap->pm_pvchunk); if (pc != NULL) { for (field = 0; field < _NPCM; field++) { if (pc->pc_map[field]) { bit = ffsl(pc->pc_map[field]) - 1; break; } } if (field < _NPCM) { idx = field * sizeof(pc->pc_map[field]) * NBBY + bit; pv = &pc->pc_pventry[idx]; pc->pc_map[field] &= ~(1ul << bit); /* If this was the last item, move it to tail */ for (field = 0; field < _NPCM; field++) if (pc->pc_map[field] != 0) { PV_STAT(pv_entry_spare--); return (pv); /* not full, return */ } TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); PV_STAT(pv_entry_spare--); return (pv); } } /* No free items, allocate another chunk */ m = vm_page_alloc_freelist(VM_FREELIST_DIRECT, VM_ALLOC_NORMAL | VM_ALLOC_WIRED); if (m == NULL) { if (try) { pv_entry_count--; PV_STAT(pc_chunk_tryfail++); return (NULL); } m = pmap_pv_reclaim(pmap); if (m == NULL) goto retry; } PV_STAT(pc_chunk_count++); PV_STAT(pc_chunk_allocs++); dump_add_page(m->phys_addr); pc = (struct pv_chunk *)MIPS_PHYS_TO_DIRECT(VM_PAGE_TO_PHYS(m)); pc->pc_pmap = pmap; pc->pc_map[0] = pc_freemask[0] & ~1ul; /* preallocated bit 0 */ for (field = 1; field < _NPCM; field++) pc->pc_map[field] = pc_freemask[field]; TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); pv = &pc->pc_pventry[0]; TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); PV_STAT(pv_entry_spare += _NPCPV - 1); return (pv); } static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) { pv_entry_t pv; rw_assert(&pvh_global_lock, RA_WLOCKED); TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) { if (pmap == PV_PMAP(pv) && va == pv->pv_va) { TAILQ_REMOVE(&pvh->pv_list, pv, pv_list); break; } } return (pv); } static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) { pv_entry_t pv; pv = pmap_pvh_remove(pvh, pmap, va); KASSERT(pv != NULL, ("pmap_pvh_free: pv not found, pa %lx va %lx", (u_long)VM_PAGE_TO_PHYS(__containerof(pvh, struct vm_page, md)), (u_long)va)); free_pv_entry(pmap, pv); } static void pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va) { rw_assert(&pvh_global_lock, RA_WLOCKED); pmap_pvh_free(&m->md, pmap, va); if (TAILQ_EMPTY(&m->md.pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } /* * Conditionally create a pv entry. */ static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_page_t mpte, vm_offset_t va, vm_page_t m) { pv_entry_t pv; rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); if ((pv = get_pv_entry(pmap, TRUE)) != NULL) { pv->pv_va = va; TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); return (TRUE); } else return (FALSE); } /* * pmap_remove_pte: do the things to unmap a page in a process * * Returns true if this was the last PTE in the PT (and possibly the last PT in * the PD, and possibly the last PD in the segmap), in which case... * * 1) the TLB has been invalidated for the whole PT's span (at least), * already, to ensure that MipsDoTLBMiss does not attempt to follow a * dangling pointer into a freed page. No additional TLB shootdown is * required. * * 2) if this removal was part of a sweep to remove PTEs, it is safe to jump * to the PT span boundary and continue. * * 3) The given pde may now point onto a freed page and must not be * dereferenced * * If the return value is false, the TLB has not been shot down (and the segmap * entry, PD, and PT all remain in place). */ static int pmap_remove_pte(struct pmap *pmap, pt_entry_t *ptq, vm_offset_t va, pd_entry_t pde) { pt_entry_t oldpte; vm_page_t m; vm_paddr_t pa; rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * Write back all cache lines from the page being unmapped. */ mips_dcache_wbinv_range_index(va, PAGE_SIZE); oldpte = *ptq; if (is_kernel_pmap(pmap)) *ptq = PTE_G; else *ptq = 0; if (pte_test(&oldpte, PTE_W)) pmap->pm_stats.wired_count -= 1; pmap->pm_stats.resident_count -= 1; if (pte_test(&oldpte, PTE_MANAGED)) { pa = TLBLO_PTE_TO_PA(oldpte); m = PHYS_TO_VM_PAGE(pa); if (pte_test(&oldpte, PTE_D)) { KASSERT(!pte_test(&oldpte, PTE_RO), ("%s: modified page not writable: va: %p, pte: %#jx", __func__, (void *)va, (uintmax_t)oldpte)); vm_page_dirty(m); } if (m->md.pv_flags & PV_TABLE_REF) vm_page_aflag_set(m, PGA_REFERENCED); m->md.pv_flags &= ~PV_TABLE_REF; pmap_remove_entry(pmap, m, va); } return (pmap_unuse_pt(pmap, va, pde)); } /* * Remove a single page from a process address space */ static void pmap_remove_page(struct pmap *pmap, vm_offset_t va) { pd_entry_t *pde; pt_entry_t *ptq; rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); pde = pmap_pde(pmap, va); if (pde == NULL || *pde == 0) return; ptq = pmap_pde_to_pte(pde, va); /* * If there is no pte for this address, just skip it! */ if (!pte_test(ptq, PTE_V)) return; /* * Remove this PTE from the PT. If this is the last one, then * the TLB has already been shot down, so don't bother again */ if (!pmap_remove_pte(pmap, ptq, va, *pde)) pmap_invalidate_page(pmap, va); } /* * Remove the given range of addresses from the specified map. * * It is assumed that the start and end are properly * rounded to the page size. */ void pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { pd_entry_t *pde, *pdpe; pt_entry_t *pte; vm_offset_t va_next; vm_offset_t va_init, va_fini; bool need_tlb_shootdown; /* * Perform an unsynchronized read. This is, however, safe. */ if (pmap->pm_stats.resident_count == 0) return; rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); /* * special handling of removing one page. a very common operation * and easy to short circuit some code. */ if ((sva + PAGE_SIZE) == eva) { pmap_remove_page(pmap, sva); goto out; } for (; sva < eva; sva = va_next) { pdpe = pmap_segmap(pmap, sva); #ifdef __mips_n64 if (*pdpe == 0) { va_next = (sva + NBSEG) & ~SEGMASK; if (va_next < sva) va_next = eva; continue; } #endif /* Scan up to the end of the page table pointed to by pde */ va_next = (sva + NBPDR) & ~PDRMASK; if (va_next < sva) va_next = eva; pde = pmap_pdpe_to_pde(pdpe, sva); if (*pde == NULL) continue; /* * Limit our scan to either the end of the va represented * by the current page table page, or to the end of the * range being removed. */ if (va_next > eva) va_next = eva; need_tlb_shootdown = false; va_init = sva; va_fini = va_next; for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, sva += PAGE_SIZE) { /* Skip over invalid entries; no need to shootdown */ if (!pte_test(pte, PTE_V)) { /* * If we have not yet found a valid entry, then * we can move the lower edge of the region to * invalidate to the next PTE. */ if (!need_tlb_shootdown) va_init = sva + PAGE_SIZE; continue; } /* * A valid entry; the range we are shooting down must * include this page. va_fini is used instead of sva * so that if the range ends with a run of !PTE_V PTEs, * but doesn't clear out so much that pmap_remove_pte * removes the entire PT, we won't include these !PTE_V * entries in the region to be shot down. */ va_fini = sva + PAGE_SIZE; if (pmap_remove_pte(pmap, pte, sva, *pde)) { /* Entire PT removed and TLBs shot down. */ need_tlb_shootdown = false; break; } else { need_tlb_shootdown = true; } } if (need_tlb_shootdown) pmap_invalidate_range(pmap, va_init, va_fini); } out: rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); } /* * Routine: pmap_remove_all * Function: * Removes this physical page from * all physical maps in which it resides. * Reflects back modify bits to the pager. * * Notes: * Original versions of this routine were very * inefficient because they iteratively called * pmap_remove (slow...) */ void pmap_remove_all(vm_page_t m) { pv_entry_t pv; pmap_t pmap; pd_entry_t *pde; pt_entry_t *pte, tpte; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_remove_all: page %p is not managed", m)); rw_wlock(&pvh_global_lock); if (m->md.pv_flags & PV_TABLE_REF) vm_page_aflag_set(m, PGA_REFERENCED); while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); /* * If it's last mapping writeback all caches from * the page being destroyed */ if (TAILQ_NEXT(pv, pv_list) == NULL) mips_dcache_wbinv_range_index(pv->pv_va, PAGE_SIZE); pmap->pm_stats.resident_count--; pde = pmap_pde(pmap, pv->pv_va); KASSERT(pde != NULL && *pde != 0, ("pmap_remove_all: pde")); pte = pmap_pde_to_pte(pde, pv->pv_va); tpte = *pte; if (is_kernel_pmap(pmap)) *pte = PTE_G; else *pte = 0; if (pte_test(&tpte, PTE_W)) pmap->pm_stats.wired_count--; /* * Update the vm_page_t clean and reference bits. */ if (pte_test(&tpte, PTE_D)) { KASSERT(!pte_test(&tpte, PTE_RO), ("%s: modified page not writable: va: %p, pte: %#jx", __func__, (void *)pv->pv_va, (uintmax_t)tpte)); vm_page_dirty(m); } if (!pmap_unuse_pt(pmap, pv->pv_va, *pde)) pmap_invalidate_page(pmap, pv->pv_va); TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); free_pv_entry(pmap, pv); PMAP_UNLOCK(pmap); } vm_page_aflag_clear(m, PGA_WRITEABLE); m->md.pv_flags &= ~PV_TABLE_REF; rw_wunlock(&pvh_global_lock); } /* * Set the physical protection on the * specified range of this map as requested. */ void pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) { pt_entry_t pbits, *pte; pd_entry_t *pde, *pdpe; vm_offset_t va, va_next; vm_paddr_t pa; vm_page_t m; if ((prot & VM_PROT_READ) == VM_PROT_NONE) { pmap_remove(pmap, sva, eva); return; } if (prot & VM_PROT_WRITE) return; PMAP_LOCK(pmap); for (; sva < eva; sva = va_next) { pdpe = pmap_segmap(pmap, sva); #ifdef __mips_n64 if (*pdpe == 0) { va_next = (sva + NBSEG) & ~SEGMASK; if (va_next < sva) va_next = eva; continue; } #endif va_next = (sva + NBPDR) & ~PDRMASK; if (va_next < sva) va_next = eva; pde = pmap_pdpe_to_pde(pdpe, sva); if (*pde == NULL) continue; /* * Limit our scan to either the end of the va represented * by the current page table page, or to the end of the * range being write protected. */ if (va_next > eva) va_next = eva; va = va_next; for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, sva += PAGE_SIZE) { pbits = *pte; if (!pte_test(&pbits, PTE_V) || pte_test(&pbits, PTE_RO)) { if (va != va_next) { pmap_invalidate_range(pmap, va, sva); va = va_next; } continue; } pte_set(&pbits, PTE_RO); if (pte_test(&pbits, PTE_D)) { pte_clear(&pbits, PTE_D); if (pte_test(&pbits, PTE_MANAGED)) { pa = TLBLO_PTE_TO_PA(pbits); m = PHYS_TO_VM_PAGE(pa); vm_page_dirty(m); } if (va == va_next) va = sva; } else { /* * Unless PTE_D is set, any TLB entries * mapping "sva" don't allow write access, so * they needn't be invalidated. */ if (va != va_next) { pmap_invalidate_range(pmap, va, sva); va = va_next; } } *pte = pbits; } if (va != va_next) pmap_invalidate_range(pmap, va, sva); } PMAP_UNLOCK(pmap); } /* * Insert the given physical page (p) at * the specified virtual address (v) in the * target physical map with the protection requested. * * If specified, the page will be wired down, meaning * that the related pte can not be reclaimed. * * NB: This is the only routine which MAY NOT lazy-evaluate * or lose information. That is, this routine must actually * insert this page into the given map NOW. */ int pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, u_int flags, int8_t psind __unused) { vm_paddr_t pa, opa; pt_entry_t *pte; pt_entry_t origpte, newpte; pv_entry_t pv; vm_page_t mpte, om; va &= ~PAGE_MASK; KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig")); KASSERT((m->oflags & VPO_UNMANAGED) != 0 || va < kmi.clean_sva || va >= kmi.clean_eva, ("pmap_enter: managed mapping within the clean submap")); if ((m->oflags & VPO_UNMANAGED) == 0) VM_PAGE_OBJECT_BUSY_ASSERT(m); pa = VM_PAGE_TO_PHYS(m); newpte = TLBLO_PA_TO_PFN(pa) | init_pte_prot(m, flags, prot); if ((flags & PMAP_ENTER_WIRED) != 0) newpte |= PTE_W; if (is_kernel_pmap(pmap)) newpte |= PTE_G; PMAP_PTE_SET_CACHE_BITS(newpte, pa, m); if ((m->oflags & VPO_UNMANAGED) == 0) newpte |= PTE_MANAGED; mpte = NULL; rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); /* * In the case that a page table page is not resident, we are * creating it here. */ if (va < VM_MAXUSER_ADDRESS) { mpte = pmap_allocpte(pmap, va, flags); if (mpte == NULL) { KASSERT((flags & PMAP_ENTER_NOSLEEP) != 0, ("pmap_allocpte failed with sleep allowed")); rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); return (KERN_RESOURCE_SHORTAGE); } } pte = pmap_pte(pmap, va); /* * Page Directory table entry not valid, we need a new PT page */ if (pte == NULL) { panic("pmap_enter: invalid page directory, pdir=%p, va=%p", (void *)pmap->pm_segtab, (void *)va); } origpte = *pte; KASSERT(!pte_test(&origpte, PTE_D | PTE_RO | PTE_V), ("pmap_enter: modified page not writable: va: %p, pte: %#jx", (void *)va, (uintmax_t)origpte)); opa = TLBLO_PTE_TO_PA(origpte); /* * Mapping has not changed, must be protection or wiring change. */ if (pte_test(&origpte, PTE_V) && opa == pa) { /* * Wiring change, just update stats. We don't worry about * wiring PT pages as they remain resident as long as there * are valid mappings in them. Hence, if a user page is * wired, the PT page will be also. */ if (pte_test(&newpte, PTE_W) && !pte_test(&origpte, PTE_W)) pmap->pm_stats.wired_count++; else if (!pte_test(&newpte, PTE_W) && pte_test(&origpte, PTE_W)) pmap->pm_stats.wired_count--; /* * Remove extra pte reference */ if (mpte) mpte->ref_count--; if (pte_test(&origpte, PTE_MANAGED)) { m->md.pv_flags |= PV_TABLE_REF; if (!pte_test(&newpte, PTE_RO)) vm_page_aflag_set(m, PGA_WRITEABLE); } goto validate; } pv = NULL; /* * Mapping has changed, invalidate old range and fall through to * handle validating new mapping. */ if (opa) { if (is_kernel_pmap(pmap)) *pte = PTE_G; else *pte = 0; if (pte_test(&origpte, PTE_W)) pmap->pm_stats.wired_count--; if (pte_test(&origpte, PTE_MANAGED)) { om = PHYS_TO_VM_PAGE(opa); if (pte_test(&origpte, PTE_D)) vm_page_dirty(om); if ((om->md.pv_flags & PV_TABLE_REF) != 0) { om->md.pv_flags &= ~PV_TABLE_REF; vm_page_aflag_set(om, PGA_REFERENCED); } pv = pmap_pvh_remove(&om->md, pmap, va); if (!pte_test(&newpte, PTE_MANAGED)) free_pv_entry(pmap, pv); if ((om->a.flags & PGA_WRITEABLE) != 0 && TAILQ_EMPTY(&om->md.pv_list)) vm_page_aflag_clear(om, PGA_WRITEABLE); } pmap_invalidate_page(pmap, va); origpte = 0; if (mpte != NULL) { mpte->ref_count--; KASSERT(mpte->ref_count > 0, ("pmap_enter: missing reference to page table page," " va: %p", (void *)va)); } } else pmap->pm_stats.resident_count++; /* * Enter on the PV list if part of our managed memory. */ if (pte_test(&newpte, PTE_MANAGED)) { m->md.pv_flags |= PV_TABLE_REF; if (pv == NULL) { pv = get_pv_entry(pmap, FALSE); pv->pv_va = va; } TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); if (!pte_test(&newpte, PTE_RO)) vm_page_aflag_set(m, PGA_WRITEABLE); } /* * Increment counters */ if (pte_test(&newpte, PTE_W)) pmap->pm_stats.wired_count++; validate: #ifdef PMAP_DEBUG printf("pmap_enter: va: %p -> pa: %p\n", (void *)va, (void *)pa); #endif /* * if the mapping or permission bits are different, we need to * update the pte. */ if (origpte != newpte) { *pte = newpte; if (pte_test(&origpte, PTE_V)) { KASSERT(opa == pa, ("pmap_enter: invalid update")); if (pte_test(&origpte, PTE_D)) { if (pte_test(&origpte, PTE_MANAGED)) vm_page_dirty(m); } pmap_update_page(pmap, va, newpte); } } /* * Sync I & D caches for executable pages. Do this only if the * target pmap belongs to the current process. Otherwise, an * unresolvable TLB miss may occur. */ if (!is_kernel_pmap(pmap) && (pmap == &curproc->p_vmspace->vm_pmap) && (prot & VM_PROT_EXECUTE)) { mips_icache_sync_range(va, PAGE_SIZE); mips_dcache_wbinv_range(va, PAGE_SIZE); } rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); return (KERN_SUCCESS); } /* * this code makes some *MAJOR* assumptions: * 1. Current pmap & pmap exists. * 2. Not wired. * 3. Read access. * 4. No page table pages. * but is *MUCH* faster than pmap_enter... */ void pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) { rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL); rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); } static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, vm_page_t mpte) { pt_entry_t *pte, npte; vm_paddr_t pa; KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva || (m->oflags & VPO_UNMANAGED) != 0, ("pmap_enter_quick_locked: managed mapping within the clean submap")); rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * In the case that a page table page is not resident, we are * creating it here. */ if (va < VM_MAXUSER_ADDRESS) { pd_entry_t *pde; unsigned ptepindex; /* * Calculate pagetable page index */ ptepindex = pmap_pde_pindex(va); if (mpte && (mpte->pindex == ptepindex)) { mpte->ref_count++; } else { /* * Get the page directory entry */ pde = pmap_pde(pmap, va); /* * If the page table page is mapped, we just * increment the hold count, and activate it. */ if (pde && *pde != 0) { mpte = PHYS_TO_VM_PAGE( MIPS_DIRECT_TO_PHYS(*pde)); mpte->ref_count++; } else { mpte = _pmap_allocpte(pmap, ptepindex, PMAP_ENTER_NOSLEEP); if (mpte == NULL) return (mpte); } } } else { mpte = NULL; } pte = pmap_pte(pmap, va); if (pte_test(pte, PTE_V)) { if (mpte != NULL) { mpte->ref_count--; mpte = NULL; } return (mpte); } /* * Enter on the PV list if part of our managed memory. */ if ((m->oflags & VPO_UNMANAGED) == 0 && !pmap_try_insert_pv_entry(pmap, mpte, va, m)) { if (mpte != NULL) { pmap_unwire_ptp(pmap, va, mpte); mpte = NULL; } return (mpte); } /* * Increment counters */ pmap->pm_stats.resident_count++; pa = VM_PAGE_TO_PHYS(m); /* * Now validate mapping with RO protection */ npte = PTE_RO | TLBLO_PA_TO_PFN(pa) | PTE_V; if ((m->oflags & VPO_UNMANAGED) == 0) npte |= PTE_MANAGED; PMAP_PTE_SET_CACHE_BITS(npte, pa, m); if (is_kernel_pmap(pmap)) *pte = npte | PTE_G; else { *pte = npte; /* * Sync I & D caches. Do this only if the target pmap * belongs to the current process. Otherwise, an * unresolvable TLB miss may occur. */ if (pmap == &curproc->p_vmspace->vm_pmap) { va &= ~PAGE_MASK; mips_icache_sync_range(va, PAGE_SIZE); mips_dcache_wbinv_range(va, PAGE_SIZE); } } return (mpte); } /* * Make a temporary mapping for a physical address. This is only intended * to be used for panic dumps. * * Use XKPHYS for 64 bit, and KSEG0 where possible for 32 bit. */ void * pmap_kenter_temporary(vm_paddr_t pa, int i) { vm_offset_t va; if (i != 0) printf("%s: ERROR!!! More than one page of virtual address mapping not supported\n", __func__); if (MIPS_DIRECT_MAPPABLE(pa)) { va = MIPS_PHYS_TO_DIRECT(pa); } else { #ifndef __mips_n64 /* XXX : to be converted to new style */ pt_entry_t *pte, npte; pte = pmap_pte(kernel_pmap, crashdumpva); /* Since this is for the debugger, no locks or any other fun */ npte = TLBLO_PA_TO_PFN(pa) | PTE_C_CACHE | PTE_D | PTE_V | PTE_G; *pte = npte; pmap_update_page(kernel_pmap, crashdumpva, npte); va = crashdumpva; #endif } return ((void *)va); } void pmap_kenter_temporary_free(vm_paddr_t pa) { #ifndef __mips_n64 /* XXX : to be converted to new style */ pt_entry_t *pte; #endif if (MIPS_DIRECT_MAPPABLE(pa)) { /* nothing to do for this case */ return; } #ifndef __mips_n64 /* XXX : to be converted to new style */ pte = pmap_pte(kernel_pmap, crashdumpva); *pte = PTE_G; pmap_invalidate_page(kernel_pmap, crashdumpva); #endif } /* * Maps a sequence of resident pages belonging to the same object. * The sequence begins with the given page m_start. This page is * mapped at the given virtual address start. Each subsequent page is * mapped at a virtual address that is offset from start by the same * amount as the page is offset from m_start within the object. The * last page in the sequence is the page with the largest offset from * m_start that can be mapped at a virtual address less than the given * virtual address end. Not every virtual page between start and end * is mapped; only those for which a resident page exists with the * corresponding offset from m_start are mapped. */ void pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, vm_page_t m_start, vm_prot_t prot) { vm_page_t m, mpte; vm_pindex_t diff, psize; VM_OBJECT_ASSERT_LOCKED(m_start->object); psize = atop(end - start); mpte = NULL; m = m_start; rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { mpte = pmap_enter_quick_locked(pmap, start + ptoa(diff), m, prot, mpte); m = TAILQ_NEXT(m, listq); } rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); } /* * pmap_object_init_pt preloads the ptes for a given object * into the specified pmap. This eliminates the blast of soft * faults on process startup and immediately after an mmap. */ void pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, vm_pindex_t pindex, vm_size_t size) { VM_OBJECT_ASSERT_WLOCKED(object); KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, ("pmap_object_init_pt: non-device object")); } /* * Clear the wired attribute from the mappings for the specified range of * addresses in the given pmap. Every valid mapping within that range * must have the wired attribute set. In contrast, invalid mappings * cannot have the wired attribute set, so they are ignored. * * The wired attribute of the page table entry is not a hardware feature, * so there is no need to invalidate any TLB entries. */ void pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { pd_entry_t *pde, *pdpe; pt_entry_t *pte; vm_offset_t va_next; PMAP_LOCK(pmap); for (; sva < eva; sva = va_next) { pdpe = pmap_segmap(pmap, sva); #ifdef __mips_n64 if (*pdpe == NULL) { va_next = (sva + NBSEG) & ~SEGMASK; if (va_next < sva) va_next = eva; continue; } #endif va_next = (sva + NBPDR) & ~PDRMASK; if (va_next < sva) va_next = eva; pde = pmap_pdpe_to_pde(pdpe, sva); if (*pde == NULL) continue; if (va_next > eva) va_next = eva; for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, sva += PAGE_SIZE) { if (!pte_test(pte, PTE_V)) continue; if (!pte_test(pte, PTE_W)) panic("pmap_unwire: pte %#jx is missing PG_W", (uintmax_t)*pte); pte_clear(pte, PTE_W); pmap->pm_stats.wired_count--; } } PMAP_UNLOCK(pmap); } /* * Copy the range specified by src_addr/len * from the source map to the range dst_addr/len * in the destination map. * * This routine is only advisory and need not do anything. */ void pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, vm_offset_t src_addr) { } /* * pmap_zero_page zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. * * Use XKPHYS for 64 bit, and KSEG0 where possible for 32 bit. */ void pmap_zero_page(vm_page_t m) { vm_offset_t va; vm_paddr_t phys = VM_PAGE_TO_PHYS(m); if (MIPS_DIRECT_MAPPABLE(phys)) { va = MIPS_PHYS_TO_DIRECT(phys); bzero((caddr_t)va, PAGE_SIZE); mips_dcache_wbinv_range(va, PAGE_SIZE); } else { va = pmap_lmem_map1(phys); bzero((caddr_t)va, PAGE_SIZE); mips_dcache_wbinv_range(va, PAGE_SIZE); pmap_lmem_unmap(); } } /* * pmap_zero_page_area zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. * * off and size may not cover an area beyond a single hardware page. */ void pmap_zero_page_area(vm_page_t m, int off, int size) { vm_offset_t va; vm_paddr_t phys = VM_PAGE_TO_PHYS(m); if (MIPS_DIRECT_MAPPABLE(phys)) { va = MIPS_PHYS_TO_DIRECT(phys); bzero((char *)(caddr_t)va + off, size); mips_dcache_wbinv_range(va + off, size); } else { va = pmap_lmem_map1(phys); bzero((char *)va + off, size); mips_dcache_wbinv_range(va + off, size); pmap_lmem_unmap(); } } /* * pmap_copy_page copies the specified (machine independent) * page by mapping the page into virtual memory and using * bcopy to copy the page, one machine dependent page at a * time. * * Use XKPHYS for 64 bit, and KSEG0 where possible for 32 bit. */ void pmap_copy_page(vm_page_t src, vm_page_t dst) { vm_offset_t va_src, va_dst; vm_paddr_t phys_src = VM_PAGE_TO_PHYS(src); vm_paddr_t phys_dst = VM_PAGE_TO_PHYS(dst); if (MIPS_DIRECT_MAPPABLE(phys_src) && MIPS_DIRECT_MAPPABLE(phys_dst)) { /* easy case, all can be accessed via KSEG0 */ /* * Flush all caches for VA that are mapped to this page * to make sure that data in SDRAM is up to date */ pmap_flush_pvcache(src); mips_dcache_wbinv_range_index( MIPS_PHYS_TO_DIRECT(phys_dst), PAGE_SIZE); va_src = MIPS_PHYS_TO_DIRECT(phys_src); va_dst = MIPS_PHYS_TO_DIRECT(phys_dst); bcopy((caddr_t)va_src, (caddr_t)va_dst, PAGE_SIZE); mips_dcache_wbinv_range(va_dst, PAGE_SIZE); } else { va_src = pmap_lmem_map2(phys_src, phys_dst); va_dst = va_src + PAGE_SIZE; bcopy((void *)va_src, (void *)va_dst, PAGE_SIZE); mips_dcache_wbinv_range(va_dst, PAGE_SIZE); pmap_lmem_unmap(); } } int unmapped_buf_allowed; void pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], vm_offset_t b_offset, int xfersize) { char *a_cp, *b_cp; vm_page_t a_m, b_m; vm_offset_t a_pg_offset, b_pg_offset; vm_paddr_t a_phys, b_phys; int cnt; while (xfersize > 0) { a_pg_offset = a_offset & PAGE_MASK; cnt = min(xfersize, PAGE_SIZE - a_pg_offset); a_m = ma[a_offset >> PAGE_SHIFT]; a_phys = VM_PAGE_TO_PHYS(a_m); b_pg_offset = b_offset & PAGE_MASK; cnt = min(cnt, PAGE_SIZE - b_pg_offset); b_m = mb[b_offset >> PAGE_SHIFT]; b_phys = VM_PAGE_TO_PHYS(b_m); if (MIPS_DIRECT_MAPPABLE(a_phys) && MIPS_DIRECT_MAPPABLE(b_phys)) { pmap_flush_pvcache(a_m); mips_dcache_wbinv_range_index( MIPS_PHYS_TO_DIRECT(b_phys), PAGE_SIZE); a_cp = (char *)MIPS_PHYS_TO_DIRECT(a_phys) + a_pg_offset; b_cp = (char *)MIPS_PHYS_TO_DIRECT(b_phys) + b_pg_offset; bcopy(a_cp, b_cp, cnt); mips_dcache_wbinv_range((vm_offset_t)b_cp, cnt); } else { a_cp = (char *)pmap_lmem_map2(a_phys, b_phys); b_cp = (char *)a_cp + PAGE_SIZE; a_cp += a_pg_offset; b_cp += b_pg_offset; bcopy(a_cp, b_cp, cnt); mips_dcache_wbinv_range((vm_offset_t)b_cp, cnt); pmap_lmem_unmap(); } a_offset += cnt; b_offset += cnt; xfersize -= cnt; } } vm_offset_t pmap_quick_enter_page(vm_page_t m) { #if defined(__mips_n64) return MIPS_PHYS_TO_DIRECT(VM_PAGE_TO_PHYS(m)); #else vm_offset_t qaddr; vm_paddr_t pa; pt_entry_t *pte, npte; pa = VM_PAGE_TO_PHYS(m); if (MIPS_DIRECT_MAPPABLE(pa)) { if (pmap_page_get_memattr(m) != VM_MEMATTR_WRITE_BACK) return (MIPS_PHYS_TO_DIRECT_UNCACHED(pa)); else return (MIPS_PHYS_TO_DIRECT(pa)); } critical_enter(); qaddr = PCPU_GET(qmap_addr); pte = PCPU_GET(qmap_ptep); KASSERT(*pte == PTE_G, ("pmap_quick_enter_page: PTE busy")); npte = TLBLO_PA_TO_PFN(pa) | PTE_D | PTE_V | PTE_G; PMAP_PTE_SET_CACHE_BITS(npte, pa, m); *pte = npte; return (qaddr); #endif } void pmap_quick_remove_page(vm_offset_t addr) { mips_dcache_wbinv_range(addr, PAGE_SIZE); #if !defined(__mips_n64) pt_entry_t *pte; if (addr >= MIPS_KSEG0_START && addr < MIPS_KSEG0_END) return; pte = PCPU_GET(qmap_ptep); KASSERT(*pte != PTE_G, ("pmap_quick_remove_page: PTE not in use")); KASSERT(PCPU_GET(qmap_addr) == addr, ("pmap_quick_remove_page: invalid address")); *pte = PTE_G; tlb_invalidate_address(kernel_pmap, addr); critical_exit(); #endif } /* * Returns true if the pmap's pv is one of the first * 16 pvs linked to from this page. This count may * be changed upwards or downwards in the future; it * is only necessary that true be returned for a small * subset of pmaps for proper page aging. */ boolean_t pmap_page_exists_quick(pmap_t pmap, vm_page_t m) { pv_entry_t pv; int loops = 0; boolean_t rv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_page_exists_quick: page %p is not managed", m)); rv = FALSE; rw_wlock(&pvh_global_lock); TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { if (PV_PMAP(pv) == pmap) { rv = TRUE; break; } loops++; if (loops >= 16) break; } rw_wunlock(&pvh_global_lock); return (rv); } /* * Remove all pages from specified address space * this aids process exit speeds. Also, this code * is special cased for current process only, but * can have the more generic (and slightly slower) * mode enabled. This is much faster than pmap_remove * in the case of running down an entire address space. */ void pmap_remove_pages(pmap_t pmap) { pd_entry_t *pde; pt_entry_t *pte, tpte; pv_entry_t pv; vm_page_t m; struct pv_chunk *pc, *npc; u_long inuse, bitmask; int allfree, bit, field, idx; if (pmap != vmspace_pmap(curthread->td_proc->p_vmspace)) { printf("warning: pmap_remove_pages called with non-current pmap\n"); return; } rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { allfree = 1; for (field = 0; field < _NPCM; field++) { inuse = ~pc->pc_map[field] & pc_freemask[field]; while (inuse != 0) { bit = ffsl(inuse) - 1; bitmask = 1UL << bit; idx = field * sizeof(inuse) * NBBY + bit; pv = &pc->pc_pventry[idx]; inuse &= ~bitmask; pde = pmap_pde(pmap, pv->pv_va); KASSERT(pde != NULL && *pde != 0, ("pmap_remove_pages: pde")); pte = pmap_pde_to_pte(pde, pv->pv_va); if (!pte_test(pte, PTE_V)) panic("pmap_remove_pages: bad pte"); tpte = *pte; /* * We cannot remove wired pages from a process' mapping at this time */ if (pte_test(&tpte, PTE_W)) { allfree = 0; continue; } *pte = is_kernel_pmap(pmap) ? PTE_G : 0; m = PHYS_TO_VM_PAGE(TLBLO_PTE_TO_PA(tpte)); KASSERT(m != NULL, ("pmap_remove_pages: bad tpte %#jx", (uintmax_t)tpte)); /* * Update the vm_page_t clean and reference bits. */ if (pte_test(&tpte, PTE_D)) vm_page_dirty(m); /* Mark free */ PV_STAT(pv_entry_frees++); PV_STAT(pv_entry_spare++); pv_entry_count--; pc->pc_map[field] |= bitmask; pmap->pm_stats.resident_count--; TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); if (TAILQ_EMPTY(&m->md.pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); /* * For simplicity, unconditionally call * pmap_invalidate_all(), below. */ (void)pmap_unuse_pt(pmap, pv->pv_va, *pde); } } if (allfree) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); free_pv_chunk(pc); } } pmap_invalidate_all(pmap); PMAP_UNLOCK(pmap); rw_wunlock(&pvh_global_lock); } /* * pmap_testbit tests bits in pte's */ static boolean_t pmap_testbit(vm_page_t m, int bit) { pv_entry_t pv; pmap_t pmap; pt_entry_t *pte; boolean_t rv = FALSE; if (m->oflags & VPO_UNMANAGED) return (rv); rw_assert(&pvh_global_lock, RA_WLOCKED); TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte = pmap_pte(pmap, pv->pv_va); rv = pte_test(pte, bit); PMAP_UNLOCK(pmap); if (rv) break; } return (rv); } /* * pmap_page_wired_mappings: * * Return the number of managed mappings to the given physical page * that are wired. */ int pmap_page_wired_mappings(vm_page_t m) { pv_entry_t pv; pmap_t pmap; pt_entry_t *pte; int count; count = 0; if ((m->oflags & VPO_UNMANAGED) != 0) return (count); rw_wlock(&pvh_global_lock); TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte = pmap_pte(pmap, pv->pv_va); if (pte_test(pte, PTE_W)) count++; PMAP_UNLOCK(pmap); } rw_wunlock(&pvh_global_lock); return (count); } /* * Clear the write and modified bits in each of the given page's mappings. */ void pmap_remove_write(vm_page_t m) { pmap_t pmap; pt_entry_t pbits, *pte; pv_entry_t pv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_remove_write: page %p is not managed", m)); vm_page_assert_busied(m); if (!pmap_page_is_write_mapped(m)) return; rw_wlock(&pvh_global_lock); TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte = pmap_pte(pmap, pv->pv_va); KASSERT(pte != NULL && pte_test(pte, PTE_V), ("page on pv_list has no pte")); pbits = *pte; if (pte_test(&pbits, PTE_D)) { pte_clear(&pbits, PTE_D); vm_page_dirty(m); } pte_set(&pbits, PTE_RO); if (pbits != *pte) { *pte = pbits; pmap_update_page(pmap, pv->pv_va, pbits); } PMAP_UNLOCK(pmap); } vm_page_aflag_clear(m, PGA_WRITEABLE); rw_wunlock(&pvh_global_lock); } /* * pmap_ts_referenced: * * Return the count of reference bits for a page, clearing all of them. */ int pmap_ts_referenced(vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_ts_referenced: page %p is not managed", m)); if (m->md.pv_flags & PV_TABLE_REF) { rw_wlock(&pvh_global_lock); m->md.pv_flags &= ~PV_TABLE_REF; rw_wunlock(&pvh_global_lock); return (1); } return (0); } /* * pmap_is_modified: * * Return whether or not the specified physical page was modified * in any physical maps. */ boolean_t pmap_is_modified(vm_page_t m) { boolean_t rv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_is_modified: page %p is not managed", m)); /* * If the page is not busied then this check is racy. */ if (!pmap_page_is_write_mapped(m)) return (FALSE); rw_wlock(&pvh_global_lock); rv = pmap_testbit(m, PTE_D); rw_wunlock(&pvh_global_lock); return (rv); } /* N/C */ /* * pmap_is_prefaultable: * * Return whether or not the specified virtual address is elgible * for prefault. */ boolean_t pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) { pd_entry_t *pde; pt_entry_t *pte; boolean_t rv; rv = FALSE; PMAP_LOCK(pmap); pde = pmap_pde(pmap, addr); if (pde != NULL && *pde != 0) { pte = pmap_pde_to_pte(pde, addr); rv = (*pte == 0); } PMAP_UNLOCK(pmap); return (rv); } /* * Apply the given advice to the specified range of addresses within the * given pmap. Depending on the advice, clear the referenced and/or * modified flags in each mapping and set the mapped page's dirty field. */ void pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice) { pd_entry_t *pde, *pdpe; pt_entry_t *pte; vm_offset_t va, va_next; vm_paddr_t pa; vm_page_t m; if (advice != MADV_DONTNEED && advice != MADV_FREE) return; rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); for (; sva < eva; sva = va_next) { pdpe = pmap_segmap(pmap, sva); #ifdef __mips_n64 if (*pdpe == 0) { va_next = (sva + NBSEG) & ~SEGMASK; if (va_next < sva) va_next = eva; continue; } #endif va_next = (sva + NBPDR) & ~PDRMASK; if (va_next < sva) va_next = eva; pde = pmap_pdpe_to_pde(pdpe, sva); if (*pde == NULL) continue; /* * Limit our scan to either the end of the va represented * by the current page table page, or to the end of the * range being write protected. */ if (va_next > eva) va_next = eva; va = va_next; for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, sva += PAGE_SIZE) { if (!pte_test(pte, PTE_MANAGED | PTE_V)) { if (va != va_next) { pmap_invalidate_range(pmap, va, sva); va = va_next; } continue; } pa = TLBLO_PTE_TO_PA(*pte); m = PHYS_TO_VM_PAGE(pa); m->md.pv_flags &= ~PV_TABLE_REF; if (pte_test(pte, PTE_D)) { if (advice == MADV_DONTNEED) { /* * Future calls to pmap_is_modified() * can be avoided by making the page * dirty now. */ vm_page_dirty(m); } else { pte_clear(pte, PTE_D); if (va == va_next) va = sva; } } else { /* * Unless PTE_D is set, any TLB entries * mapping "sva" don't allow write access, so * they needn't be invalidated. */ if (va != va_next) { pmap_invalidate_range(pmap, va, sva); va = va_next; } } } if (va != va_next) pmap_invalidate_range(pmap, va, sva); } rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); } /* * Clear the modify bits on the specified physical page. */ void pmap_clear_modify(vm_page_t m) { pmap_t pmap; pt_entry_t *pte; pv_entry_t pv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_clear_modify: page %p is not managed", m)); vm_page_assert_busied(m); if (!pmap_page_is_write_mapped(m)) return; rw_wlock(&pvh_global_lock); TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte = pmap_pte(pmap, pv->pv_va); if (pte_test(pte, PTE_D)) { pte_clear(pte, PTE_D); pmap_update_page(pmap, pv->pv_va, *pte); } PMAP_UNLOCK(pmap); } rw_wunlock(&pvh_global_lock); } /* * pmap_is_referenced: * * Return whether or not the specified physical page was referenced * in any physical maps. */ boolean_t pmap_is_referenced(vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_is_referenced: page %p is not managed", m)); return ((m->md.pv_flags & PV_TABLE_REF) != 0); } /* * Miscellaneous support routines follow */ /* * Map a set of physical memory pages into the kernel virtual * address space. Return a pointer to where it is mapped. This * routine is intended to be used for mapping device memory, * NOT real memory. * * Use XKPHYS uncached for 64 bit, and KSEG1 where possible for 32 bit. */ void * pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, vm_memattr_t ma) { vm_offset_t va, tmpva, offset; /* * KSEG1 maps only first 512M of phys address space. For * pa > 0x20000000 we should make proper mapping * using pmap_kenter. */ if (MIPS_DIRECT_MAPPABLE(pa + size - 1) && ma == VM_MEMATTR_UNCACHEABLE) return ((void *)MIPS_PHYS_TO_DIRECT_UNCACHED(pa)); else { offset = pa & PAGE_MASK; size = roundup(size + offset, PAGE_SIZE); va = kva_alloc(size); if (!va) panic("pmap_mapdev: Couldn't alloc kernel virtual memory"); pa = trunc_page(pa); for (tmpva = va; size > 0;) { pmap_kenter_attr(tmpva, pa, ma); size -= PAGE_SIZE; tmpva += PAGE_SIZE; pa += PAGE_SIZE; } } return ((void *)(va + offset)); } void * pmap_mapdev(vm_paddr_t pa, vm_size_t size) { return pmap_mapdev_attr(pa, size, VM_MEMATTR_UNCACHEABLE); } void pmap_unmapdev(vm_offset_t va, vm_size_t size) { #ifndef __mips_n64 vm_offset_t base, offset; /* If the address is within KSEG1 then there is nothing to do */ if (va >= MIPS_KSEG1_START && va <= MIPS_KSEG1_END) return; base = trunc_page(va); offset = va & PAGE_MASK; size = roundup(size + offset, PAGE_SIZE); pmap_qremove(base, atop(size)); kva_free(base, size); #endif } /* * Perform the pmap work for mincore(2). If the page is not both referenced and * modified by this pmap, returns its physical address so that the caller can * find other mappings. */ int pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *pap) { pt_entry_t *ptep, pte; vm_paddr_t pa; vm_page_t m; int val; PMAP_LOCK(pmap); ptep = pmap_pte(pmap, addr); pte = (ptep != NULL) ? *ptep : 0; if (!pte_test(&pte, PTE_V)) { PMAP_UNLOCK(pmap); return (0); } val = MINCORE_INCORE; if (pte_test(&pte, PTE_D)) val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; pa = TLBLO_PTE_TO_PA(pte); if (pte_test(&pte, PTE_MANAGED)) { /* * This may falsely report the given address as * MINCORE_REFERENCED. Unfortunately, due to the lack of * per-PTE reference information, it is impossible to * determine if the address is MINCORE_REFERENCED. */ m = PHYS_TO_VM_PAGE(pa); if ((m->a.flags & PGA_REFERENCED) != 0) val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; } if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && pte_test(&pte, PTE_MANAGED)) { *pap = pa; } PMAP_UNLOCK(pmap); return (val); } void pmap_activate(struct thread *td) { pmap_t pmap, oldpmap; struct proc *p = td->td_proc; u_int cpuid; critical_enter(); pmap = vmspace_pmap(p->p_vmspace); oldpmap = PCPU_GET(curpmap); cpuid = PCPU_GET(cpuid); if (oldpmap) CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active); CPU_SET_ATOMIC(cpuid, &pmap->pm_active); pmap_asid_alloc(pmap); if (td == curthread) { PCPU_SET(segbase, pmap->pm_segtab); mips_wr_entryhi(pmap->pm_asid[cpuid].asid); } PCPU_SET(curpmap, pmap); critical_exit(); } static void pmap_sync_icache_one(void *arg __unused) { mips_icache_sync_all(); mips_dcache_wbinv_all(); } void pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz) { smp_rendezvous(NULL, pmap_sync_icache_one, NULL, NULL); } /* * Increase the starting virtual address of the given mapping if a * different alignment might result in more superpage mappings. */ void pmap_align_superpage(vm_object_t object, vm_ooffset_t offset, vm_offset_t *addr, vm_size_t size) { vm_offset_t superpage_offset; if (size < PDRSIZE) return; if (object != NULL && (object->flags & OBJ_COLORED) != 0) offset += ptoa(object->pg_color); superpage_offset = offset & PDRMASK; if (size - ((PDRSIZE - superpage_offset) & PDRMASK) < PDRSIZE || (*addr & PDRMASK) == superpage_offset) return; if ((*addr & PDRMASK) < superpage_offset) *addr = (*addr & ~PDRMASK) + superpage_offset; else *addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset; } #ifdef DDB DB_SHOW_COMMAND(ptable, ddb_pid_dump) { pmap_t pmap; struct thread *td = NULL; struct proc *p; int i, j, k; vm_paddr_t pa; vm_offset_t va; if (have_addr) { td = db_lookup_thread(addr, true); if (td == NULL) { db_printf("Invalid pid or tid"); return; } p = td->td_proc; if (p->p_vmspace == NULL) { db_printf("No vmspace for process"); return; } pmap = vmspace_pmap(p->p_vmspace); } else pmap = kernel_pmap; db_printf("pmap:%p segtab:%p asid:%x generation:%x\n", pmap, pmap->pm_segtab, pmap->pm_asid[0].asid, pmap->pm_asid[0].gen); for (i = 0; i < NPDEPG; i++) { pd_entry_t *pdpe; pt_entry_t *pde; pt_entry_t pte; pdpe = (pd_entry_t *)pmap->pm_segtab[i]; if (pdpe == NULL) continue; db_printf("[%4d] %p\n", i, pdpe); #ifdef __mips_n64 for (j = 0; j < NPDEPG; j++) { pde = (pt_entry_t *)pdpe[j]; if (pde == NULL) continue; db_printf("\t[%4d] %p\n", j, pde); #else { j = 0; pde = (pt_entry_t *)pdpe; #endif for (k = 0; k < NPTEPG; k++) { pte = pde[k]; if (pte == 0 || !pte_test(&pte, PTE_V)) continue; pa = TLBLO_PTE_TO_PA(pte); va = ((u_long)i << SEGSHIFT) | (j << PDRSHIFT) | (k << PAGE_SHIFT); db_printf("\t\t[%04d] va: %p pte: %8jx pa:%jx\n", k, (void *)va, (uintmax_t)pte, (uintmax_t)pa); } } } } #endif /* * Allocate TLB address space tag (called ASID or TLBPID) and return it. * It takes almost as much or more time to search the TLB for a * specific ASID and flush those entries as it does to flush the entire TLB. * Therefore, when we allocate a new ASID, we just take the next number. When * we run out of numbers, we flush the TLB, increment the generation count * and start over. ASID zero is reserved for kernel use. */ static void pmap_asid_alloc(pmap) pmap_t pmap; { if (pmap->pm_asid[PCPU_GET(cpuid)].asid != PMAP_ASID_RESERVED && pmap->pm_asid[PCPU_GET(cpuid)].gen == PCPU_GET(asid_generation)); else { if (PCPU_GET(next_asid) == pmap_max_asid) { tlb_invalidate_all_user(NULL); PCPU_SET(asid_generation, (PCPU_GET(asid_generation) + 1) & ASIDGEN_MASK); if (PCPU_GET(asid_generation) == 0) { PCPU_SET(asid_generation, 1); } PCPU_SET(next_asid, 1); /* 0 means invalid */ } pmap->pm_asid[PCPU_GET(cpuid)].asid = PCPU_GET(next_asid); pmap->pm_asid[PCPU_GET(cpuid)].gen = PCPU_GET(asid_generation); PCPU_SET(next_asid, PCPU_GET(next_asid) + 1); } } static pt_entry_t init_pte_prot(vm_page_t m, vm_prot_t access, vm_prot_t prot) { pt_entry_t rw; if (!(prot & VM_PROT_WRITE)) rw = PTE_V | PTE_RO; else if ((m->oflags & VPO_UNMANAGED) == 0) { if ((access & VM_PROT_WRITE) != 0) rw = PTE_V | PTE_D; else rw = PTE_V; } else /* Needn't emulate a modified bit for unmanaged pages. */ rw = PTE_V | PTE_D; return (rw); } /* * pmap_emulate_modified : do dirty bit emulation * * On SMP, update just the local TLB, other CPUs will update their * TLBs from PTE lazily, if they get the exception. * Returns 0 in case of sucess, 1 if the page is read only and we * need to fault. */ int pmap_emulate_modified(pmap_t pmap, vm_offset_t va) { pt_entry_t *pte; PMAP_LOCK(pmap); pte = pmap_pte(pmap, va); /* * It is possible that some other CPU or thread changed the pmap while * we weren't looking; in the SMP case, this is readily apparent, but * it can even happen in the UP case, because we may have been blocked * on PMAP_LOCK(pmap) above while someone changed this out from * underneath us. */ if (pte == NULL) { /* * This PTE's PTP (or one of its ancestors) has been reclaimed; * trigger a full fault to reconstruct it via pmap_enter. */ PMAP_UNLOCK(pmap); return (1); } if (!pte_test(pte, PTE_V)) { /* * This PTE is no longer valid; the other thread or other * processor must have arranged for our TLB to no longer * have this entry, possibly by IPI, so no tlb_update is * required. Fall out of the fast path and go take a * general fault before retrying the instruction (or taking * a signal). */ PMAP_UNLOCK(pmap); return (1); } if (pte_test(pte, PTE_D)) { /* * This PTE is valid and has the PTE_D bit asserted; since * this is an increase in permission, we may have been expected * to update the TLB lazily. Do so here and return, on the * fast path, to retry the instruction. */ tlb_update(pmap, va, *pte); PMAP_UNLOCK(pmap); return (0); } if (pte_test(pte, PTE_RO)) { /* * This PTE is valid, not dirty, and read-only. Go take a * full fault (most likely to upgrade this part of the address * space to writeable). */ PMAP_UNLOCK(pmap); return (1); } if (!pte_test(pte, PTE_MANAGED)) panic("pmap_emulate_modified: unmanaged page"); /* * PTE is valid, managed, not dirty, and not read-only. Set PTE_D * and eagerly update the local TLB, returning on the fast path. */ pte_set(pte, PTE_D); tlb_update(pmap, va, *pte); PMAP_UNLOCK(pmap); return (0); } /* * Routine: pmap_kextract * Function: * Extract the physical page address associated * virtual address. */ vm_paddr_t pmap_kextract(vm_offset_t va) { int mapped; /* * First, the direct-mapped regions. */ #if defined(__mips_n64) if (va >= MIPS_XKPHYS_START && va < MIPS_XKPHYS_END) return (MIPS_XKPHYS_TO_PHYS(va)); #endif if (va >= MIPS_KSEG0_START && va < MIPS_KSEG0_END) return (MIPS_KSEG0_TO_PHYS(va)); if (va >= MIPS_KSEG1_START && va < MIPS_KSEG1_END) return (MIPS_KSEG1_TO_PHYS(va)); /* * User virtual addresses. */ if (va < VM_MAXUSER_ADDRESS) { pt_entry_t *ptep; if (curproc && curproc->p_vmspace) { ptep = pmap_pte(&curproc->p_vmspace->vm_pmap, va); if (ptep) { return (TLBLO_PTE_TO_PA(*ptep) | (va & PAGE_MASK)); } return (0); } } /* * Should be kernel virtual here, otherwise fail */ mapped = (va >= MIPS_KSEG2_START || va < MIPS_KSEG2_END); #if defined(__mips_n64) mapped = mapped || (va >= MIPS_XKSEG_START || va < MIPS_XKSEG_END); #endif /* * Kernel virtual. */ if (mapped) { pt_entry_t *ptep; /* Is the kernel pmap initialized? */ if (!CPU_EMPTY(&kernel_pmap->pm_active)) { /* It's inside the virtual address range */ ptep = pmap_pte(kernel_pmap, va); if (ptep) { return (TLBLO_PTE_TO_PA(*ptep) | (va & PAGE_MASK)); } } return (0); } panic("%s for unknown address space %p.", __func__, (void *)va); } void pmap_flush_pvcache(vm_page_t m) { pv_entry_t pv; if (m != NULL) { for (pv = TAILQ_FIRST(&m->md.pv_list); pv; pv = TAILQ_NEXT(pv, pv_list)) { mips_dcache_wbinv_range_index(pv->pv_va, PAGE_SIZE); } } } void pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) { /* * It appears that this function can only be called before any mappings * for the page are established. If this ever changes, this code will * need to walk the pv_list and make each of the existing mappings * uncacheable, being careful to sync caches and PTEs (and maybe * invalidate TLB?) for any current mapping it modifies. */ if (TAILQ_FIRST(&m->md.pv_list) != NULL) panic("Can't change memattr on page with existing mappings"); /* Clean memattr portion of pv_flags */ m->md.pv_flags &= ~PV_MEMATTR_MASK; m->md.pv_flags |= (ma << PV_MEMATTR_SHIFT) & PV_MEMATTR_MASK; } static __inline void pmap_pte_attr(pt_entry_t *pte, vm_memattr_t ma) { u_int npte; npte = *(u_int *)pte; npte &= ~PTE_C_MASK; npte |= PTE_C(ma); *pte = npte; } int pmap_change_attr(vm_offset_t sva, vm_size_t size, vm_memattr_t ma) { pd_entry_t *pde, *pdpe; pt_entry_t *pte; vm_offset_t ova, eva, va, va_next; pmap_t pmap; ova = sva; eva = sva + size; if (eva < sva) return (EINVAL); pmap = kernel_pmap; PMAP_LOCK(pmap); for (; sva < eva; sva = va_next) { pdpe = pmap_segmap(pmap, sva); #ifdef __mips_n64 if (*pdpe == 0) { va_next = (sva + NBSEG) & ~SEGMASK; if (va_next < sva) va_next = eva; continue; } #endif va_next = (sva + NBPDR) & ~PDRMASK; if (va_next < sva) va_next = eva; pde = pmap_pdpe_to_pde(pdpe, sva); if (*pde == NULL) continue; /* * Limit our scan to either the end of the va represented * by the current page table page, or to the end of the * range being removed. */ if (va_next > eva) va_next = eva; va = va_next; for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, sva += PAGE_SIZE) { if (!pte_test(pte, PTE_V) || pte_cache_bits(pte) == ma) { if (va != va_next) { pmap_invalidate_range(pmap, va, sva); va = va_next; } continue; } if (va == va_next) va = sva; pmap_pte_attr(pte, ma); } if (va != va_next) pmap_invalidate_range(pmap, va, sva); } PMAP_UNLOCK(pmap); /* Flush caches to be in the safe side */ mips_dcache_wbinv_range(ova, size); return 0; } boolean_t pmap_is_valid_memattr(pmap_t pmap __unused, vm_memattr_t mode) { switch (mode) { case VM_MEMATTR_UNCACHEABLE: case VM_MEMATTR_WRITE_BACK: #ifdef MIPS_CCA_WC case VM_MEMATTR_WRITE_COMBINING: #endif return (TRUE); default: return (FALSE); } } Index: head/sys/mips/mips/uma_machdep.c =================================================================== --- head/sys/mips/mips/uma_machdep.c (revision 366710) +++ head/sys/mips/mips/uma_machdep.c (revision 366711) @@ -1,97 +1,98 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2003 Alan L. Cox * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include +#include #include #include +#include #include #include #include -#include void * uma_small_alloc(uma_zone_t zone, vm_size_t bytes, int domain, u_int8_t *flags, int wait) { vm_paddr_t pa; vm_page_t m; int pflags; void *va; *flags = UMA_SLAB_PRIV; pflags = malloc2vm_flags(wait) | VM_ALLOC_WIRED; #ifndef __mips_n64 pflags &= ~(VM_ALLOC_WAITOK | VM_ALLOC_WAITFAIL); pflags |= VM_ALLOC_NOWAIT; #endif for (;;) { m = vm_page_alloc_freelist_domain(domain, VM_FREELIST_DIRECT, pflags); #ifndef __mips_n64 if (m == NULL && vm_page_reclaim_contig(pflags, 1, 0, MIPS_KSEG0_LARGEST_PHYS, PAGE_SIZE, 0)) continue; #endif if (m != NULL) break; if ((wait & M_NOWAIT) != 0) return (NULL); vm_wait(NULL); } pa = VM_PAGE_TO_PHYS(m); if ((wait & M_NODUMP) == 0) dump_add_page(pa); va = (void *)MIPS_PHYS_TO_DIRECT(pa); if ((wait & M_ZERO) && (m->flags & PG_ZERO) == 0) bzero(va, PAGE_SIZE); return (va); } void uma_small_free(void *mem, vm_size_t size, u_int8_t flags) { vm_page_t m; vm_paddr_t pa; pa = MIPS_DIRECT_TO_PHYS((vm_offset_t)mem); dump_drop_page(pa); m = PHYS_TO_VM_PAGE(pa); vm_page_unwire_noq(m); vm_page_free(m); } Index: head/sys/mips/nlm/xlp_machdep.c =================================================================== --- head/sys/mips/nlm/xlp_machdep.c (revision 366710) +++ head/sys/mips/nlm/xlp_machdep.c (revision 366711) @@ -1,716 +1,717 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright 2003-2011 Netlogic Microsystems (Netlogic). All rights * reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are * met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the * distribution. * * THIS SOFTWARE IS PROVIDED BY Netlogic Microsystems ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NETLOGIC OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. * * NETLOGIC_BSD */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_platform.h" #include #include #include #include #include #include #include #include #include #include #include #include /* cinit() */ #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef FDT #include #include #endif /* 4KB static data aread to keep a copy of the bootload env until the dynamic kenv is setup */ char boot1_env[4096]; uint64_t xlp_cpu_frequency; uint64_t xlp_io_base = MIPS_PHYS_TO_DIRECT_UNCACHED(XLP_DEFAULT_IO_BASE); int xlp_ncores; int xlp_threads_per_core; uint32_t xlp_hw_thread_mask; int xlp_cpuid_to_hwtid[MAXCPU]; int xlp_hwtid_to_cpuid[MAXCPU]; uint64_t xlp_pic_base; static int xlp_mmuval; extern uint32_t _end; extern char XLPResetEntry[], XLPResetEntryEnd[]; static void xlp_setup_core(void) { uint64_t reg; reg = nlm_mfcr(LSU_DEFEATURE); /* Enable Unaligned and L2HPE */ reg |= (1 << 30) | (1 << 23); /* * Experimental : Enable SUE * Speculative Unmap Enable. Enable speculative L2 cache request for * unmapped access. */ reg |= (1ull << 31); /* Clear S1RCM - A0 errata */ reg &= ~0xeull; nlm_mtcr(LSU_DEFEATURE, reg); reg = nlm_mfcr(SCHED_DEFEATURE); /* Experimental: Disable BRU accepting ALU ops - A0 errata */ reg |= (1 << 24); nlm_mtcr(SCHED_DEFEATURE, reg); } static void xlp_setup_mmu(void) { uint32_t pagegrain; if (nlm_threadid() == 0) { nlm_setup_extended_pagemask(0); nlm_large_variable_tlb_en(1); nlm_extended_tlb_en(1); nlm_mmu_setup(0, 0, 0); } /* Enable no-read, no-exec, large-physical-address */ pagegrain = mips_rd_pagegrain(); pagegrain |= (1U << 31) | /* RIE */ (1 << 30) | /* XIE */ (1 << 29); /* ELPA */ mips_wr_pagegrain(pagegrain); } static void xlp_enable_blocks(void) { uint64_t sysbase; int i; for (i = 0; i < XLP_MAX_NODES; i++) { if (!nlm_dev_exists(XLP_IO_SYS_OFFSET(i))) continue; sysbase = nlm_get_sys_regbase(i); nlm_sys_enable_block(sysbase, DFS_DEVICE_RSA); } } static void xlp_parse_mmu_options(void) { uint64_t sysbase; uint32_t cpu_map = xlp_hw_thread_mask; uint32_t core0_thr_mask, core_thr_mask, cpu_rst_mask; int i, j, k; #ifdef SMP if (cpu_map == 0) cpu_map = 0xffffffff; #else /* Uniprocessor! */ if (cpu_map == 0) cpu_map = 0x1; else if (cpu_map != 0x1) { printf("WARNING: Starting uniprocessor kernel on cpumask [0x%lx]!\n" "WARNING: Other CPUs will be unused.\n", (u_long)cpu_map); cpu_map = 0x1; } #endif xlp_ncores = 1; core0_thr_mask = cpu_map & 0xf; switch (core0_thr_mask) { case 1: xlp_threads_per_core = 1; xlp_mmuval = 0; break; case 3: xlp_threads_per_core = 2; xlp_mmuval = 2; break; case 0xf: xlp_threads_per_core = 4; xlp_mmuval = 3; break; default: goto unsupp; } /* Try to find the enabled cores from SYS block */ sysbase = nlm_get_sys_regbase(0); cpu_rst_mask = nlm_read_sys_reg(sysbase, SYS_CPU_RESET) & 0xff; /* XLP 416 does not report this correctly, fix */ if (nlm_processor_id() == CHIP_PROCESSOR_ID_XLP_416) cpu_rst_mask = 0xe; /* Take out cores which do not exist on chip */ for (i = 1; i < XLP_MAX_CORES; i++) { if ((cpu_rst_mask & (1 << i)) == 0) cpu_map &= ~(0xfu << (4 * i)); } /* Verify other cores' CPU masks */ for (i = 1; i < XLP_MAX_CORES; i++) { core_thr_mask = (cpu_map >> (4 * i)) & 0xf; if (core_thr_mask == 0) continue; if (core_thr_mask != core0_thr_mask) goto unsupp; xlp_ncores++; } xlp_hw_thread_mask = cpu_map; /* setup hardware processor id to cpu id mapping */ for (i = 0; i< MAXCPU; i++) xlp_cpuid_to_hwtid[i] = xlp_hwtid_to_cpuid[i] = -1; for (i = 0, k = 0; i < XLP_MAX_CORES; i++) { if (((cpu_map >> (i * 4)) & 0xf) == 0) continue; for (j = 0; j < xlp_threads_per_core; j++) { xlp_cpuid_to_hwtid[k] = i * 4 + j; xlp_hwtid_to_cpuid[i * 4 + j] = k; k++; } } return; unsupp: printf("ERROR : Unsupported CPU mask [use 1,2 or 4 threads per core].\n" "\tcore0 thread mask [%lx], boot cpu mask [%lx].\n", (u_long)core0_thr_mask, (u_long)cpu_map); panic("Invalid CPU mask - halting.\n"); return; } #ifdef FDT static void xlp_bootargs_init(__register_t arg) { char buf[2048]; /* early stack is big enough */ void *dtbp; phandle_t chosen; ihandle_t mask; dtbp = (void *)(intptr_t)arg; #if defined(FDT_DTB_STATIC) /* * In case the device tree blob was not passed as argument try * to use the statically embedded one. */ if (dtbp == NULL) dtbp = &fdt_static_dtb; #endif if (OF_install(OFW_FDT, 0) == FALSE) while (1); if (OF_init((void *)dtbp) != 0) while (1); OF_interpret("perform-fixup", 0); chosen = OF_finddevice("/chosen"); if (OF_getprop(chosen, "cpumask", &mask, sizeof(mask)) != -1) { xlp_hw_thread_mask = mask; } if (OF_getprop(chosen, "bootargs", buf, sizeof(buf)) != -1) boothowto |= boot_parse_cmdline(buf); } #else /* * arg is a pointer to the environment block, the format of the block is * a=xyz\0b=pqr\0\0 */ static void xlp_bootargs_init(__register_t arg) { char buf[2048]; /* early stack is big enough */ char *p, *v, *n; uint32_t mask; /* * provide backward compat for passing cpu mask as arg */ if (arg & 1) { xlp_hw_thread_mask = arg; return; } p = (void *)(intptr_t)arg; while (*p != '\0') { strlcpy(buf, p, sizeof(buf)); v = buf; n = strsep(&v, "="); if (v == NULL) kern_setenv(n, "1"); else kern_setenv(n, v); p += strlen(p) + 1; } /* CPU mask can be passed thru env */ if (getenv_uint("cpumask", &mask) != 0) xlp_hw_thread_mask = mask; /* command line argument */ v = kern_getenv("bootargs"); if (v != NULL) { strlcpy(buf, v, sizeof(buf)); boothowto |= boot_parse_cmdline(buf); freeenv(v); } } #endif static void mips_init(void) { init_param1(); init_param2(physmem); mips_cpu_init(); cpuinfo.cache_coherent_dma = TRUE; pmap_bootstrap(); mips_proc0_init(); mutex_init(); #ifdef DDB kdb_init(); if (boothowto & RB_KDB) { kdb_enter("Boot flags requested debugger", NULL); } #endif } unsigned int platform_get_timecount(struct timecounter *tc __unused) { uint64_t count = nlm_pic_read_timer(xlp_pic_base, PIC_CLOCK_TIMER); return (unsigned int)~count; } static void xlp_pic_init(void) { struct timecounter pic_timecounter = { platform_get_timecount, /* get_timecount */ 0, /* no poll_pps */ ~0U, /* counter_mask */ XLP_IO_CLK, /* frequency */ "XLRPIC", /* name */ 2000, /* quality (adjusted in code) */ }; int i; int maxirt; xlp_pic_base = nlm_get_pic_regbase(0); /* TOOD: Add other nodes */ maxirt = nlm_read_reg(nlm_get_pic_pcibase(nlm_nodeid()), XLP_PCI_DEVINFO_REG0); printf("Initializing PIC...@%jx %d IRTs\n", (uintmax_t)xlp_pic_base, maxirt); /* Bind all PIC irqs to cpu 0 */ for (i = 0; i < maxirt; i++) nlm_pic_write_irt(xlp_pic_base, i, 0, 0, 1, 0, 1, 0, 0x1); nlm_pic_set_timer(xlp_pic_base, PIC_CLOCK_TIMER, ~0ULL, 0, 0); platform_timecounter = &pic_timecounter; } #if defined(__mips_n32) || defined(__mips_n64) /* PHYSADDR_64_BIT */ #ifdef XLP_SIM #define XLP_MEM_LIM 0x200000000ULL #else #define XLP_MEM_LIM 0x10000000000ULL #endif #else #define XLP_MEM_LIM 0xfffff000UL #endif static vm_paddr_t xlp_mem_excl[] = { 0, 0, /* for kernel image region, see xlp_mem_init */ 0x0c000000, 0x14000000, /* uboot area, cms queue and other stuff */ 0x1fc00000, 0x1fd00000, /* reset vec */ 0x1e000000, 0x1e200000, /* poe buffers */ }; static int mem_exclude_add(vm_paddr_t *avail, vm_paddr_t mstart, vm_paddr_t mend) { int i, pos; pos = 0; for (i = 0; i < nitems(xlp_mem_excl); i += 2) { if (mstart > xlp_mem_excl[i + 1]) continue; if (mstart < xlp_mem_excl[i]) { avail[pos++] = mstart; if (mend < xlp_mem_excl[i]) avail[pos++] = mend; else avail[pos++] = xlp_mem_excl[i]; } mstart = xlp_mem_excl[i + 1]; if (mend <= mstart) break; } if (mstart < mend) { avail[pos++] = mstart; avail[pos++] = mend; } return (pos); } static void xlp_mem_init(void) { vm_paddr_t physsz, tmp; uint64_t bridgebase, base, lim, val; int i, j, k, n; /* update kernel image area in exclude regions */ tmp = (vm_paddr_t)MIPS_KSEG0_TO_PHYS(&_end); tmp = round_page(tmp) + 0x20000; /* round up */ xlp_mem_excl[1] = tmp; printf("Memory (from DRAM BARs):\n"); bridgebase = nlm_get_bridge_regbase(0); /* TODO: Add other nodes */ physsz = 0; for (i = 0, j = 0; i < 8; i++) { val = nlm_read_bridge_reg(bridgebase, BRIDGE_DRAM_BAR(i)); val = (val >> 12) & 0xfffff; base = val << 20; val = nlm_read_bridge_reg(bridgebase, BRIDGE_DRAM_LIMIT(i)); val = (val >> 12) & 0xfffff; if (val == 0) /* BAR not enabled */ continue; lim = (val + 1) << 20; printf(" BAR %d: %#jx - %#jx : ", i, (intmax_t)base, (intmax_t)lim); if (lim <= base) { printf("\tskipped - malformed %#jx -> %#jx\n", (intmax_t)base, (intmax_t)lim); continue; } else if (base >= XLP_MEM_LIM) { printf(" skipped - outside usable limit %#jx.\n", (intmax_t)XLP_MEM_LIM); continue; } else if (lim >= XLP_MEM_LIM) { lim = XLP_MEM_LIM; printf(" truncated to %#jx.\n", (intmax_t)XLP_MEM_LIM); } else printf(" usable\n"); /* exclude unusable regions from BAR and add rest */ n = mem_exclude_add(&phys_avail[j], base, lim); for (k = j; k < j + n; k += 2) { physsz += phys_avail[k + 1] - phys_avail[k]; printf("\tMem[%d]: %#jx - %#jx\n", k/2, (intmax_t)phys_avail[k], (intmax_t)phys_avail[k+1]); } j = k; } /* setup final entry with 0 */ phys_avail[j] = phys_avail[j + 1] = 0; /* copy phys_avail to dump_avail */ for (i = 0; i <= j + 1; i++) dump_avail[i] = phys_avail[i]; realmem = physmem = btoc(physsz); } void platform_start(__register_t a0 __unused, __register_t a1 __unused, __register_t a2 __unused, __register_t a3 __unused) { /* Initialize pcpu stuff */ mips_pcpu0_init(); /* initialize console so that we have printf */ boothowto |= (RB_SERIAL | RB_MULTIPLE); /* Use multiple consoles */ init_static_kenv(boot1_env, sizeof(boot1_env)); xlp_bootargs_init(a0); /* clockrate used by delay, so initialize it here */ xlp_cpu_frequency = xlp_get_cpu_frequency(0, 0); cpu_clock = xlp_cpu_frequency / 1000000; mips_timer_early_init(xlp_cpu_frequency); /* Init console please */ cninit(); /* Early core init and fixes for errata */ xlp_setup_core(); xlp_parse_mmu_options(); xlp_mem_init(); bcopy(XLPResetEntry, (void *)MIPS_RESET_EXC_VEC, XLPResetEntryEnd - XLPResetEntry); #ifdef SMP /* * We will enable the other threads in core 0 here * so that the TLB and cache info is correct when * mips_init runs */ xlp_enable_threads(xlp_mmuval); #endif /* setup for the startup core */ xlp_setup_mmu(); xlp_enable_blocks(); /* Read/Guess/setup board information */ nlm_board_info_setup(); /* MIPS generic init */ mips_init(); /* * XLP specific post initialization * initialize other on chip stuff */ xlp_pic_init(); mips_timer_init_params(xlp_cpu_frequency, 0); } void platform_cpu_init() { } void platform_reset(void) { uint64_t sysbase = nlm_get_sys_regbase(0); nlm_write_sys_reg(sysbase, SYS_CHIP_RESET, 1); for( ; ; ) __asm __volatile("wait"); } #ifdef SMP /* * XLP threads are started simultaneously when we enable threads, this will * ensure that the threads are blocked in platform_init_ap, until they are * ready to proceed to smp_init_secondary() */ static volatile int thr_unblock[4]; int platform_start_ap(int cpuid) { uint32_t coremask, val; uint64_t sysbase = nlm_get_sys_regbase(0); int hwtid = xlp_cpuid_to_hwtid[cpuid]; int core, thr; core = hwtid / 4; thr = hwtid % 4; if (thr == 0) { /* First thread in core, do core wake up */ coremask = 1u << core; /* Enable core clock */ val = nlm_read_sys_reg(sysbase, SYS_CORE_DFS_DIS_CTRL); val &= ~coremask; nlm_write_sys_reg(sysbase, SYS_CORE_DFS_DIS_CTRL, val); /* Remove CPU Reset */ val = nlm_read_sys_reg(sysbase, SYS_CPU_RESET); val &= ~coremask & 0xff; nlm_write_sys_reg(sysbase, SYS_CPU_RESET, val); if (bootverbose) printf("Waking up core %d ...", core); /* Poll for CPU to mark itself coherent */ do { val = nlm_read_sys_reg(sysbase, SYS_CPU_NONCOHERENT_MODE); } while ((val & coremask) != 0); if (bootverbose) printf("Done\n"); } else { /* otherwise release the threads stuck in platform_init_ap */ thr_unblock[thr] = 1; } return (0); } void platform_init_ap(int cpuid) { uint32_t stat; int thr; /* The first thread has to setup the MMU and enable other threads */ thr = nlm_threadid(); if (thr == 0) { xlp_setup_core(); xlp_enable_threads(xlp_mmuval); } else { /* * FIXME busy wait here eats too many cycles, especially * in the core 0 while bootup */ while (thr_unblock[thr] == 0) __asm__ __volatile__ ("nop;nop;nop;nop"); thr_unblock[thr] = 0; } xlp_setup_mmu(); stat = mips_rd_status(); KASSERT((stat & MIPS_SR_INT_IE) == 0, ("Interrupts enabled in %s!", __func__)); stat |= MIPS_SR_COP_2_BIT | MIPS_SR_COP_0_BIT; mips_wr_status(stat); nlm_write_c0_eimr(0ull); xlp_enable_irq(IRQ_IPI); xlp_enable_irq(IRQ_TIMER); xlp_enable_irq(IRQ_MSGRING); return; } int platform_ipi_hardintr_num(void) { return (IRQ_IPI); } int platform_ipi_softintr_num(void) { return (-1); } void platform_ipi_send(int cpuid) { nlm_pic_send_ipi(xlp_pic_base, xlp_cpuid_to_hwtid[cpuid], platform_ipi_hardintr_num(), 0); } void platform_ipi_clear(void) { } int platform_processor_id(void) { return (xlp_hwtid_to_cpuid[nlm_cpuid()]); } void platform_cpu_mask(cpuset_t *mask) { int i, s; CPU_ZERO(mask); s = xlp_ncores * xlp_threads_per_core; for (i = 0; i < s; i++) CPU_SET(i, mask); } struct cpu_group * platform_smp_topo() { return (smp_topo_2level(CG_SHARE_L2, xlp_ncores, CG_SHARE_L1, xlp_threads_per_core, CG_FLAG_THREAD)); } #endif Index: head/sys/powerpc/aim/mmu_oea64.c =================================================================== --- head/sys/powerpc/aim/mmu_oea64.c (revision 366710) +++ head/sys/powerpc/aim/mmu_oea64.c (revision 366711) @@ -1,3232 +1,3233 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2008-2015 Nathan Whitehorn * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * Manages physical address maps. * * Since the information managed by this module is also stored by the * logical address mapping module, this module may throw away valid virtual * to physical mappings at almost any time. However, invalidations of * mappings must be done as requested. * * In order to cope with hardware architectures which make virtual to * physical map invalidates expensive, this module may delay invalidate * reduced protection operations until such time as they are actually * necessary. This module is given full information as to which processors * are currently using which maps, and to when physical maps must be made * correct. */ #include "opt_kstack_pages.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "mmu_oea64.h" void moea64_release_vsid(uint64_t vsid); uintptr_t moea64_get_unique_vsid(void); #define DISABLE_TRANS(msr) msr = mfmsr(); mtmsr(msr & ~PSL_DR) #define ENABLE_TRANS(msr) mtmsr(msr) #define VSID_MAKE(sr, hash) ((sr) | (((hash) & 0xfffff) << 4)) #define VSID_TO_HASH(vsid) (((vsid) >> 4) & 0xfffff) #define VSID_HASH_MASK 0x0000007fffffffffULL /* Get physical address from PVO. */ #define PVO_PADDR(pvo) ((pvo)->pvo_pte.pa & LPTE_RPGN) /* * Locking semantics: * * There are two locks of interest: the page locks and the pmap locks, which * protect their individual PVO lists and are locked in that order. The contents * of all PVO entries are protected by the locks of their respective pmaps. * The pmap of any PVO is guaranteed not to change so long as the PVO is linked * into any list. * */ #define PV_LOCK_COUNT PA_LOCK_COUNT static struct mtx_padalign pv_lock[PV_LOCK_COUNT]; /* * Cheap NUMA-izing of the pv locks, to reduce contention across domains. * NUMA domains on POWER9 appear to be indexed as sparse memory spaces, with the * index at (N << 45). */ #ifdef __powerpc64__ #define PV_LOCK_IDX(pa) ((pa_index(pa) * (((pa) >> 45) + 1)) % PV_LOCK_COUNT) #else #define PV_LOCK_IDX(pa) (pa_index(pa) % PV_LOCK_COUNT) #endif #define PV_LOCKPTR(pa) ((struct mtx *)(&pv_lock[PV_LOCK_IDX(pa)])) #define PV_LOCK(pa) mtx_lock(PV_LOCKPTR(pa)) #define PV_UNLOCK(pa) mtx_unlock(PV_LOCKPTR(pa)) #define PV_LOCKASSERT(pa) mtx_assert(PV_LOCKPTR(pa), MA_OWNED) #define PV_PAGE_LOCK(m) PV_LOCK(VM_PAGE_TO_PHYS(m)) #define PV_PAGE_UNLOCK(m) PV_UNLOCK(VM_PAGE_TO_PHYS(m)) #define PV_PAGE_LOCKASSERT(m) PV_LOCKASSERT(VM_PAGE_TO_PHYS(m)) struct ofw_map { cell_t om_va; cell_t om_len; uint64_t om_pa; cell_t om_mode; }; extern unsigned char _etext[]; extern unsigned char _end[]; extern void *slbtrap, *slbtrapend; /* * Map of physical memory regions. */ static struct mem_region *regions; static struct mem_region *pregions; static struct numa_mem_region *numa_pregions; static u_int phys_avail_count; static int regions_sz, pregions_sz, numapregions_sz; extern void bs_remap_earlyboot(void); /* * Lock for the SLB tables. */ struct mtx moea64_slb_mutex; /* * PTEG data. */ u_long moea64_pteg_count; u_long moea64_pteg_mask; /* * PVO data. */ uma_zone_t moea64_pvo_zone; /* zone for pvo entries */ static struct pvo_entry *moea64_bpvo_pool; static int moea64_bpvo_pool_index = 0; static int moea64_bpvo_pool_size = 0; SYSCTL_INT(_machdep, OID_AUTO, moea64_allocated_bpvo_entries, CTLFLAG_RD, &moea64_bpvo_pool_index, 0, ""); #define BPVO_POOL_SIZE 327680 /* Sensible historical default value */ #define BPVO_POOL_EXPANSION_FACTOR 3 #define VSID_NBPW (sizeof(u_int32_t) * 8) #ifdef __powerpc64__ #define NVSIDS (NPMAPS * 16) #define VSID_HASHMASK 0xffffffffUL #else #define NVSIDS NPMAPS #define VSID_HASHMASK 0xfffffUL #endif static u_int moea64_vsid_bitmap[NVSIDS / VSID_NBPW]; static boolean_t moea64_initialized = FALSE; #ifdef MOEA64_STATS /* * Statistics. */ u_int moea64_pte_valid = 0; u_int moea64_pte_overflow = 0; u_int moea64_pvo_entries = 0; u_int moea64_pvo_enter_calls = 0; u_int moea64_pvo_remove_calls = 0; SYSCTL_INT(_machdep, OID_AUTO, moea64_pte_valid, CTLFLAG_RD, &moea64_pte_valid, 0, ""); SYSCTL_INT(_machdep, OID_AUTO, moea64_pte_overflow, CTLFLAG_RD, &moea64_pte_overflow, 0, ""); SYSCTL_INT(_machdep, OID_AUTO, moea64_pvo_entries, CTLFLAG_RD, &moea64_pvo_entries, 0, ""); SYSCTL_INT(_machdep, OID_AUTO, moea64_pvo_enter_calls, CTLFLAG_RD, &moea64_pvo_enter_calls, 0, ""); SYSCTL_INT(_machdep, OID_AUTO, moea64_pvo_remove_calls, CTLFLAG_RD, &moea64_pvo_remove_calls, 0, ""); #endif vm_offset_t moea64_scratchpage_va[2]; struct pvo_entry *moea64_scratchpage_pvo[2]; struct mtx moea64_scratchpage_mtx; uint64_t moea64_large_page_mask = 0; uint64_t moea64_large_page_size = 0; int moea64_large_page_shift = 0; /* * PVO calls. */ static int moea64_pvo_enter(struct pvo_entry *pvo, struct pvo_head *pvo_head, struct pvo_entry **oldpvo); static void moea64_pvo_remove_from_pmap(struct pvo_entry *pvo); static void moea64_pvo_remove_from_page(struct pvo_entry *pvo); static void moea64_pvo_remove_from_page_locked( struct pvo_entry *pvo, vm_page_t m); static struct pvo_entry *moea64_pvo_find_va(pmap_t, vm_offset_t); /* * Utility routines. */ static boolean_t moea64_query_bit(vm_page_t, uint64_t); static u_int moea64_clear_bit(vm_page_t, uint64_t); static void moea64_kremove(vm_offset_t); static void moea64_syncicache(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, vm_size_t sz); static void moea64_pmap_init_qpages(void); /* * Kernel MMU interface */ void moea64_clear_modify(vm_page_t); void moea64_copy_page(vm_page_t, vm_page_t); void moea64_copy_pages(vm_page_t *ma, vm_offset_t a_offset, vm_page_t *mb, vm_offset_t b_offset, int xfersize); int moea64_enter(pmap_t, vm_offset_t, vm_page_t, vm_prot_t, u_int flags, int8_t psind); void moea64_enter_object(pmap_t, vm_offset_t, vm_offset_t, vm_page_t, vm_prot_t); void moea64_enter_quick(pmap_t, vm_offset_t, vm_page_t, vm_prot_t); vm_paddr_t moea64_extract(pmap_t, vm_offset_t); vm_page_t moea64_extract_and_hold(pmap_t, vm_offset_t, vm_prot_t); void moea64_init(void); boolean_t moea64_is_modified(vm_page_t); boolean_t moea64_is_prefaultable(pmap_t, vm_offset_t); boolean_t moea64_is_referenced(vm_page_t); int moea64_ts_referenced(vm_page_t); vm_offset_t moea64_map(vm_offset_t *, vm_paddr_t, vm_paddr_t, int); boolean_t moea64_page_exists_quick(pmap_t, vm_page_t); void moea64_page_init(vm_page_t); int moea64_page_wired_mappings(vm_page_t); int moea64_pinit(pmap_t); void moea64_pinit0(pmap_t); void moea64_protect(pmap_t, vm_offset_t, vm_offset_t, vm_prot_t); void moea64_qenter(vm_offset_t, vm_page_t *, int); void moea64_qremove(vm_offset_t, int); void moea64_release(pmap_t); void moea64_remove(pmap_t, vm_offset_t, vm_offset_t); void moea64_remove_pages(pmap_t); void moea64_remove_all(vm_page_t); void moea64_remove_write(vm_page_t); void moea64_unwire(pmap_t, vm_offset_t, vm_offset_t); void moea64_zero_page(vm_page_t); void moea64_zero_page_area(vm_page_t, int, int); void moea64_activate(struct thread *); void moea64_deactivate(struct thread *); void *moea64_mapdev(vm_paddr_t, vm_size_t); void *moea64_mapdev_attr(vm_paddr_t, vm_size_t, vm_memattr_t); void moea64_unmapdev(vm_offset_t, vm_size_t); vm_paddr_t moea64_kextract(vm_offset_t); void moea64_page_set_memattr(vm_page_t m, vm_memattr_t ma); void moea64_kenter_attr(vm_offset_t, vm_paddr_t, vm_memattr_t ma); void moea64_kenter(vm_offset_t, vm_paddr_t); boolean_t moea64_dev_direct_mapped(vm_paddr_t, vm_size_t); static void moea64_sync_icache(pmap_t, vm_offset_t, vm_size_t); void moea64_dumpsys_map(vm_paddr_t pa, size_t sz, void **va); void moea64_scan_init(void); vm_offset_t moea64_quick_enter_page(vm_page_t m); void moea64_quick_remove_page(vm_offset_t addr); boolean_t moea64_page_is_mapped(vm_page_t m); static int moea64_map_user_ptr(pmap_t pm, volatile const void *uaddr, void **kaddr, size_t ulen, size_t *klen); static int moea64_decode_kernel_ptr(vm_offset_t addr, int *is_user, vm_offset_t *decoded_addr); static size_t moea64_scan_pmap(void); static void *moea64_dump_pmap_init(unsigned blkpgs); #ifdef __powerpc64__ static void moea64_page_array_startup(long); #endif static int moea64_mincore(pmap_t, vm_offset_t, vm_paddr_t *); static struct pmap_funcs moea64_methods = { .clear_modify = moea64_clear_modify, .copy_page = moea64_copy_page, .copy_pages = moea64_copy_pages, .enter = moea64_enter, .enter_object = moea64_enter_object, .enter_quick = moea64_enter_quick, .extract = moea64_extract, .extract_and_hold = moea64_extract_and_hold, .init = moea64_init, .is_modified = moea64_is_modified, .is_prefaultable = moea64_is_prefaultable, .is_referenced = moea64_is_referenced, .ts_referenced = moea64_ts_referenced, .map = moea64_map, .mincore = moea64_mincore, .page_exists_quick = moea64_page_exists_quick, .page_init = moea64_page_init, .page_wired_mappings = moea64_page_wired_mappings, .pinit = moea64_pinit, .pinit0 = moea64_pinit0, .protect = moea64_protect, .qenter = moea64_qenter, .qremove = moea64_qremove, .release = moea64_release, .remove = moea64_remove, .remove_pages = moea64_remove_pages, .remove_all = moea64_remove_all, .remove_write = moea64_remove_write, .sync_icache = moea64_sync_icache, .unwire = moea64_unwire, .zero_page = moea64_zero_page, .zero_page_area = moea64_zero_page_area, .activate = moea64_activate, .deactivate = moea64_deactivate, .page_set_memattr = moea64_page_set_memattr, .quick_enter_page = moea64_quick_enter_page, .quick_remove_page = moea64_quick_remove_page, .page_is_mapped = moea64_page_is_mapped, #ifdef __powerpc64__ .page_array_startup = moea64_page_array_startup, #endif /* Internal interfaces */ .mapdev = moea64_mapdev, .mapdev_attr = moea64_mapdev_attr, .unmapdev = moea64_unmapdev, .kextract = moea64_kextract, .kenter = moea64_kenter, .kenter_attr = moea64_kenter_attr, .dev_direct_mapped = moea64_dev_direct_mapped, .dumpsys_pa_init = moea64_scan_init, .dumpsys_scan_pmap = moea64_scan_pmap, .dumpsys_dump_pmap_init = moea64_dump_pmap_init, .dumpsys_map_chunk = moea64_dumpsys_map, .map_user_ptr = moea64_map_user_ptr, .decode_kernel_ptr = moea64_decode_kernel_ptr, }; MMU_DEF(oea64_mmu, "mmu_oea64_base", moea64_methods); static struct pvo_head * vm_page_to_pvoh(vm_page_t m) { mtx_assert(PV_LOCKPTR(VM_PAGE_TO_PHYS(m)), MA_OWNED); return (&m->md.mdpg_pvoh); } static struct pvo_entry * alloc_pvo_entry(int bootstrap) { struct pvo_entry *pvo; if (!moea64_initialized || bootstrap) { if (moea64_bpvo_pool_index >= moea64_bpvo_pool_size) { panic("%s: bpvo pool exhausted, index=%d, size=%d, bytes=%zd." "Try setting machdep.moea64_bpvo_pool_size tunable", __func__, moea64_bpvo_pool_index, moea64_bpvo_pool_size, moea64_bpvo_pool_size * sizeof(struct pvo_entry)); } pvo = &moea64_bpvo_pool[ atomic_fetchadd_int(&moea64_bpvo_pool_index, 1)]; bzero(pvo, sizeof(*pvo)); pvo->pvo_vaddr = PVO_BOOTSTRAP; } else pvo = uma_zalloc(moea64_pvo_zone, M_NOWAIT | M_ZERO); return (pvo); } static void init_pvo_entry(struct pvo_entry *pvo, pmap_t pmap, vm_offset_t va) { uint64_t vsid; uint64_t hash; int shift; PMAP_LOCK_ASSERT(pmap, MA_OWNED); pvo->pvo_pmap = pmap; va &= ~ADDR_POFF; pvo->pvo_vaddr |= va; vsid = va_to_vsid(pmap, va); pvo->pvo_vpn = (uint64_t)((va & ADDR_PIDX) >> ADDR_PIDX_SHFT) | (vsid << 16); shift = (pvo->pvo_vaddr & PVO_LARGE) ? moea64_large_page_shift : ADDR_PIDX_SHFT; hash = (vsid & VSID_HASH_MASK) ^ (((uint64_t)va & ADDR_PIDX) >> shift); pvo->pvo_pte.slot = (hash & moea64_pteg_mask) << 3; } static void free_pvo_entry(struct pvo_entry *pvo) { if (!(pvo->pvo_vaddr & PVO_BOOTSTRAP)) uma_zfree(moea64_pvo_zone, pvo); } void moea64_pte_from_pvo(const struct pvo_entry *pvo, struct lpte *lpte) { lpte->pte_hi = moea64_pte_vpn_from_pvo_vpn(pvo); lpte->pte_hi |= LPTE_VALID; if (pvo->pvo_vaddr & PVO_LARGE) lpte->pte_hi |= LPTE_BIG; if (pvo->pvo_vaddr & PVO_WIRED) lpte->pte_hi |= LPTE_WIRED; if (pvo->pvo_vaddr & PVO_HID) lpte->pte_hi |= LPTE_HID; lpte->pte_lo = pvo->pvo_pte.pa; /* Includes WIMG bits */ if (pvo->pvo_pte.prot & VM_PROT_WRITE) lpte->pte_lo |= LPTE_BW; else lpte->pte_lo |= LPTE_BR; if (!(pvo->pvo_pte.prot & VM_PROT_EXECUTE)) lpte->pte_lo |= LPTE_NOEXEC; } static __inline uint64_t moea64_calc_wimg(vm_paddr_t pa, vm_memattr_t ma) { uint64_t pte_lo; int i; if (ma != VM_MEMATTR_DEFAULT) { switch (ma) { case VM_MEMATTR_UNCACHEABLE: return (LPTE_I | LPTE_G); case VM_MEMATTR_CACHEABLE: return (LPTE_M); case VM_MEMATTR_WRITE_COMBINING: case VM_MEMATTR_WRITE_BACK: case VM_MEMATTR_PREFETCHABLE: return (LPTE_I); case VM_MEMATTR_WRITE_THROUGH: return (LPTE_W | LPTE_M); } } /* * Assume the page is cache inhibited and access is guarded unless * it's in our available memory array. */ pte_lo = LPTE_I | LPTE_G; for (i = 0; i < pregions_sz; i++) { if ((pa >= pregions[i].mr_start) && (pa < (pregions[i].mr_start + pregions[i].mr_size))) { pte_lo &= ~(LPTE_I | LPTE_G); pte_lo |= LPTE_M; break; } } return pte_lo; } /* * Quick sort callout for comparing memory regions. */ static int om_cmp(const void *a, const void *b); static int om_cmp(const void *a, const void *b) { const struct ofw_map *mapa; const struct ofw_map *mapb; mapa = a; mapb = b; if (mapa->om_pa < mapb->om_pa) return (-1); else if (mapa->om_pa > mapb->om_pa) return (1); else return (0); } static void moea64_add_ofw_mappings(phandle_t mmu, size_t sz) { struct ofw_map translations[sz/(4*sizeof(cell_t))]; /*>= 4 cells per */ pcell_t acells, trans_cells[sz/sizeof(cell_t)]; struct pvo_entry *pvo; register_t msr; vm_offset_t off; vm_paddr_t pa_base; int i, j; bzero(translations, sz); OF_getencprop(OF_finddevice("/"), "#address-cells", &acells, sizeof(acells)); if (OF_getencprop(mmu, "translations", trans_cells, sz) == -1) panic("moea64_bootstrap: can't get ofw translations"); CTR0(KTR_PMAP, "moea64_add_ofw_mappings: translations"); sz /= sizeof(cell_t); for (i = 0, j = 0; i < sz; j++) { translations[j].om_va = trans_cells[i++]; translations[j].om_len = trans_cells[i++]; translations[j].om_pa = trans_cells[i++]; if (acells == 2) { translations[j].om_pa <<= 32; translations[j].om_pa |= trans_cells[i++]; } translations[j].om_mode = trans_cells[i++]; } KASSERT(i == sz, ("Translations map has incorrect cell count (%d/%zd)", i, sz)); sz = j; qsort(translations, sz, sizeof (*translations), om_cmp); for (i = 0; i < sz; i++) { pa_base = translations[i].om_pa; #ifndef __powerpc64__ if ((translations[i].om_pa >> 32) != 0) panic("OFW translations above 32-bit boundary!"); #endif if (pa_base % PAGE_SIZE) panic("OFW translation not page-aligned (phys)!"); if (translations[i].om_va % PAGE_SIZE) panic("OFW translation not page-aligned (virt)!"); CTR3(KTR_PMAP, "translation: pa=%#zx va=%#x len=%#x", pa_base, translations[i].om_va, translations[i].om_len); /* Now enter the pages for this mapping */ DISABLE_TRANS(msr); for (off = 0; off < translations[i].om_len; off += PAGE_SIZE) { /* If this address is direct-mapped, skip remapping */ if (hw_direct_map && translations[i].om_va == PHYS_TO_DMAP(pa_base) && moea64_calc_wimg(pa_base + off, VM_MEMATTR_DEFAULT) == LPTE_M) continue; PMAP_LOCK(kernel_pmap); pvo = moea64_pvo_find_va(kernel_pmap, translations[i].om_va + off); PMAP_UNLOCK(kernel_pmap); if (pvo != NULL) continue; moea64_kenter(translations[i].om_va + off, pa_base + off); } ENABLE_TRANS(msr); } } #ifdef __powerpc64__ static void moea64_probe_large_page(void) { uint16_t pvr = mfpvr() >> 16; switch (pvr) { case IBM970: case IBM970FX: case IBM970MP: powerpc_sync(); isync(); mtspr(SPR_HID4, mfspr(SPR_HID4) & ~HID4_970_DISABLE_LG_PG); powerpc_sync(); isync(); /* FALLTHROUGH */ default: if (moea64_large_page_size == 0) { moea64_large_page_size = 0x1000000; /* 16 MB */ moea64_large_page_shift = 24; } } moea64_large_page_mask = moea64_large_page_size - 1; } static void moea64_bootstrap_slb_prefault(vm_offset_t va, int large) { struct slb *cache; struct slb entry; uint64_t esid, slbe; uint64_t i; cache = PCPU_GET(aim.slb); esid = va >> ADDR_SR_SHFT; slbe = (esid << SLBE_ESID_SHIFT) | SLBE_VALID; for (i = 0; i < 64; i++) { if (cache[i].slbe == (slbe | i)) return; } entry.slbe = slbe; entry.slbv = KERNEL_VSID(esid) << SLBV_VSID_SHIFT; if (large) entry.slbv |= SLBV_L; slb_insert_kernel(entry.slbe, entry.slbv); } #endif static int moea64_kenter_large(vm_offset_t va, vm_paddr_t pa, uint64_t attr, int bootstrap) { struct pvo_entry *pvo; uint64_t pte_lo; int error; pte_lo = LPTE_M; pte_lo |= attr; pvo = alloc_pvo_entry(bootstrap); pvo->pvo_vaddr |= PVO_WIRED | PVO_LARGE; init_pvo_entry(pvo, kernel_pmap, va); pvo->pvo_pte.prot = VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE; pvo->pvo_pte.pa = pa | pte_lo; error = moea64_pvo_enter(pvo, NULL, NULL); if (error != 0) panic("Error %d inserting large page\n", error); return (0); } static void moea64_setup_direct_map(vm_offset_t kernelstart, vm_offset_t kernelend) { register_t msr; vm_paddr_t pa, pkernelstart, pkernelend; vm_offset_t size, off; uint64_t pte_lo; int i; if (moea64_large_page_size == 0) hw_direct_map = 0; DISABLE_TRANS(msr); if (hw_direct_map) { PMAP_LOCK(kernel_pmap); for (i = 0; i < pregions_sz; i++) { for (pa = pregions[i].mr_start; pa < pregions[i].mr_start + pregions[i].mr_size; pa += moea64_large_page_size) { pte_lo = LPTE_M; if (pa & moea64_large_page_mask) { pa &= moea64_large_page_mask; pte_lo |= LPTE_G; } if (pa + moea64_large_page_size > pregions[i].mr_start + pregions[i].mr_size) pte_lo |= LPTE_G; moea64_kenter_large(PHYS_TO_DMAP(pa), pa, pte_lo, 1); } } PMAP_UNLOCK(kernel_pmap); } /* * Make sure the kernel and BPVO pool stay mapped on systems either * without a direct map or on which the kernel is not already executing * out of the direct-mapped region. */ if (kernelstart < DMAP_BASE_ADDRESS) { /* * For pre-dmap execution, we need to use identity mapping * because we will be operating with the mmu on but in the * wrong address configuration until we __restartkernel(). */ for (pa = kernelstart & ~PAGE_MASK; pa < kernelend; pa += PAGE_SIZE) moea64_kenter(pa, pa); } else if (!hw_direct_map) { pkernelstart = kernelstart & ~DMAP_BASE_ADDRESS; pkernelend = kernelend & ~DMAP_BASE_ADDRESS; for (pa = pkernelstart & ~PAGE_MASK; pa < pkernelend; pa += PAGE_SIZE) moea64_kenter(pa | DMAP_BASE_ADDRESS, pa); } if (!hw_direct_map) { size = moea64_bpvo_pool_size*sizeof(struct pvo_entry); off = (vm_offset_t)(moea64_bpvo_pool); for (pa = off; pa < off + size; pa += PAGE_SIZE) moea64_kenter(pa, pa); /* Map exception vectors */ for (pa = EXC_RSVD; pa < EXC_LAST; pa += PAGE_SIZE) moea64_kenter(pa | DMAP_BASE_ADDRESS, pa); } ENABLE_TRANS(msr); /* * Allow user to override unmapped_buf_allowed for testing. * XXXKIB Only direct map implementation was tested. */ if (!TUNABLE_INT_FETCH("vfs.unmapped_buf_allowed", &unmapped_buf_allowed)) unmapped_buf_allowed = hw_direct_map; } /* Quick sort callout for comparing physical addresses. */ static int pa_cmp(const void *a, const void *b) { const vm_paddr_t *pa = a, *pb = b; if (*pa < *pb) return (-1); else if (*pa > *pb) return (1); else return (0); } void moea64_early_bootstrap(vm_offset_t kernelstart, vm_offset_t kernelend) { int i, j; vm_size_t physsz, hwphyssz; vm_paddr_t kernelphysstart, kernelphysend; int rm_pavail; #ifndef __powerpc64__ /* We don't have a direct map since there is no BAT */ hw_direct_map = 0; /* Make sure battable is zero, since we have no BAT */ for (i = 0; i < 16; i++) { battable[i].batu = 0; battable[i].batl = 0; } #else moea64_probe_large_page(); /* Use a direct map if we have large page support */ if (moea64_large_page_size > 0) hw_direct_map = 1; else hw_direct_map = 0; /* Install trap handlers for SLBs */ bcopy(&slbtrap, (void *)EXC_DSE,(size_t)&slbtrapend - (size_t)&slbtrap); bcopy(&slbtrap, (void *)EXC_ISE,(size_t)&slbtrapend - (size_t)&slbtrap); __syncicache((void *)EXC_DSE, 0x80); __syncicache((void *)EXC_ISE, 0x80); #endif kernelphysstart = kernelstart & ~DMAP_BASE_ADDRESS; kernelphysend = kernelend & ~DMAP_BASE_ADDRESS; /* Get physical memory regions from firmware */ mem_regions(&pregions, &pregions_sz, ®ions, ®ions_sz); CTR0(KTR_PMAP, "moea64_bootstrap: physical memory"); if (PHYS_AVAIL_ENTRIES < regions_sz) panic("moea64_bootstrap: phys_avail too small"); phys_avail_count = 0; physsz = 0; hwphyssz = 0; TUNABLE_ULONG_FETCH("hw.physmem", (u_long *) &hwphyssz); for (i = 0, j = 0; i < regions_sz; i++, j += 2) { CTR3(KTR_PMAP, "region: %#zx - %#zx (%#zx)", regions[i].mr_start, regions[i].mr_start + regions[i].mr_size, regions[i].mr_size); if (hwphyssz != 0 && (physsz + regions[i].mr_size) >= hwphyssz) { if (physsz < hwphyssz) { phys_avail[j] = regions[i].mr_start; phys_avail[j + 1] = regions[i].mr_start + hwphyssz - physsz; physsz = hwphyssz; phys_avail_count++; dump_avail[j] = phys_avail[j]; dump_avail[j + 1] = phys_avail[j + 1]; } break; } phys_avail[j] = regions[i].mr_start; phys_avail[j + 1] = regions[i].mr_start + regions[i].mr_size; phys_avail_count++; physsz += regions[i].mr_size; dump_avail[j] = phys_avail[j]; dump_avail[j + 1] = phys_avail[j + 1]; } /* Check for overlap with the kernel and exception vectors */ rm_pavail = 0; for (j = 0; j < 2*phys_avail_count; j+=2) { if (phys_avail[j] < EXC_LAST) phys_avail[j] += EXC_LAST; if (phys_avail[j] >= kernelphysstart && phys_avail[j+1] <= kernelphysend) { phys_avail[j] = phys_avail[j+1] = ~0; rm_pavail++; continue; } if (kernelphysstart >= phys_avail[j] && kernelphysstart < phys_avail[j+1]) { if (kernelphysend < phys_avail[j+1]) { phys_avail[2*phys_avail_count] = (kernelphysend & ~PAGE_MASK) + PAGE_SIZE; phys_avail[2*phys_avail_count + 1] = phys_avail[j+1]; phys_avail_count++; } phys_avail[j+1] = kernelphysstart & ~PAGE_MASK; } if (kernelphysend >= phys_avail[j] && kernelphysend < phys_avail[j+1]) { if (kernelphysstart > phys_avail[j]) { phys_avail[2*phys_avail_count] = phys_avail[j]; phys_avail[2*phys_avail_count + 1] = kernelphysstart & ~PAGE_MASK; phys_avail_count++; } phys_avail[j] = (kernelphysend & ~PAGE_MASK) + PAGE_SIZE; } } /* Remove physical available regions marked for removal (~0) */ if (rm_pavail) { qsort(phys_avail, 2*phys_avail_count, sizeof(phys_avail[0]), pa_cmp); phys_avail_count -= rm_pavail; for (i = 2*phys_avail_count; i < 2*(phys_avail_count + rm_pavail); i+=2) phys_avail[i] = phys_avail[i+1] = 0; } physmem = btoc(physsz); #ifdef PTEGCOUNT moea64_pteg_count = PTEGCOUNT; #else moea64_pteg_count = 0x1000; while (moea64_pteg_count < physmem) moea64_pteg_count <<= 1; moea64_pteg_count >>= 1; #endif /* PTEGCOUNT */ } void moea64_mid_bootstrap(vm_offset_t kernelstart, vm_offset_t kernelend) { int i; /* * Set PTEG mask */ moea64_pteg_mask = moea64_pteg_count - 1; /* * Initialize SLB table lock and page locks */ mtx_init(&moea64_slb_mutex, "SLB table", NULL, MTX_DEF); for (i = 0; i < PV_LOCK_COUNT; i++) mtx_init(&pv_lock[i], "page pv", NULL, MTX_DEF); /* * Initialise the bootstrap pvo pool. */ TUNABLE_INT_FETCH("machdep.moea64_bpvo_pool_size", &moea64_bpvo_pool_size); if (moea64_bpvo_pool_size == 0) { if (!hw_direct_map) moea64_bpvo_pool_size = ((ptoa((uintmax_t)physmem) * sizeof(struct vm_page)) / (PAGE_SIZE * PAGE_SIZE)) * BPVO_POOL_EXPANSION_FACTOR; else moea64_bpvo_pool_size = BPVO_POOL_SIZE; } if (boothowto & RB_VERBOSE) { printf("mmu_oea64: bpvo pool entries = %d, bpvo pool size = %zu MB\n", moea64_bpvo_pool_size, moea64_bpvo_pool_size*sizeof(struct pvo_entry) / 1048576); } moea64_bpvo_pool = (struct pvo_entry *)moea64_bootstrap_alloc( moea64_bpvo_pool_size*sizeof(struct pvo_entry), PAGE_SIZE); moea64_bpvo_pool_index = 0; /* Place at address usable through the direct map */ if (hw_direct_map) moea64_bpvo_pool = (struct pvo_entry *) PHYS_TO_DMAP((uintptr_t)moea64_bpvo_pool); /* * Make sure kernel vsid is allocated as well as VSID 0. */ #ifndef __powerpc64__ moea64_vsid_bitmap[(KERNEL_VSIDBITS & (NVSIDS - 1)) / VSID_NBPW] |= 1 << (KERNEL_VSIDBITS % VSID_NBPW); moea64_vsid_bitmap[0] |= 1; #endif /* * Initialize the kernel pmap (which is statically allocated). */ #ifdef __powerpc64__ for (i = 0; i < 64; i++) { pcpup->pc_aim.slb[i].slbv = 0; pcpup->pc_aim.slb[i].slbe = 0; } #else for (i = 0; i < 16; i++) kernel_pmap->pm_sr[i] = EMPTY_SEGMENT + i; #endif kernel_pmap->pmap_phys = kernel_pmap; CPU_FILL(&kernel_pmap->pm_active); RB_INIT(&kernel_pmap->pmap_pvo); PMAP_LOCK_INIT(kernel_pmap); /* * Now map in all the other buffers we allocated earlier */ moea64_setup_direct_map(kernelstart, kernelend); } void moea64_late_bootstrap(vm_offset_t kernelstart, vm_offset_t kernelend) { ihandle_t mmui; phandle_t chosen; phandle_t mmu; ssize_t sz; int i; vm_offset_t pa, va; void *dpcpu; /* * Set up the Open Firmware pmap and add its mappings if not in real * mode. */ chosen = OF_finddevice("/chosen"); if (chosen != -1 && OF_getencprop(chosen, "mmu", &mmui, 4) != -1) { mmu = OF_instance_to_package(mmui); if (mmu == -1 || (sz = OF_getproplen(mmu, "translations")) == -1) sz = 0; if (sz > 6144 /* tmpstksz - 2 KB headroom */) panic("moea64_bootstrap: too many ofw translations"); if (sz > 0) moea64_add_ofw_mappings(mmu, sz); } /* * Calculate the last available physical address. */ Maxmem = 0; for (i = 0; phys_avail[i + 2] != 0; i += 2) Maxmem = MAX(Maxmem, powerpc_btop(phys_avail[i + 1])); /* * Initialize MMU. */ pmap_cpu_bootstrap(0); mtmsr(mfmsr() | PSL_DR | PSL_IR); pmap_bootstrapped++; /* * Set the start and end of kva. */ virtual_avail = VM_MIN_KERNEL_ADDRESS; virtual_end = VM_MAX_SAFE_KERNEL_ADDRESS; /* * Map the entire KVA range into the SLB. We must not fault there. */ #ifdef __powerpc64__ for (va = virtual_avail; va < virtual_end; va += SEGMENT_LENGTH) moea64_bootstrap_slb_prefault(va, 0); #endif /* * Remap any early IO mappings (console framebuffer, etc.) */ bs_remap_earlyboot(); /* * Figure out how far we can extend virtual_end into segment 16 * without running into existing mappings. Segment 16 is guaranteed * to contain neither RAM nor devices (at least on Apple hardware), * but will generally contain some OFW mappings we should not * step on. */ #ifndef __powerpc64__ /* KVA is in high memory on PPC64 */ PMAP_LOCK(kernel_pmap); while (virtual_end < VM_MAX_KERNEL_ADDRESS && moea64_pvo_find_va(kernel_pmap, virtual_end+1) == NULL) virtual_end += PAGE_SIZE; PMAP_UNLOCK(kernel_pmap); #endif /* * Allocate a kernel stack with a guard page for thread0 and map it * into the kernel page map. */ pa = moea64_bootstrap_alloc(kstack_pages * PAGE_SIZE, PAGE_SIZE); va = virtual_avail + KSTACK_GUARD_PAGES * PAGE_SIZE; virtual_avail = va + kstack_pages * PAGE_SIZE; CTR2(KTR_PMAP, "moea64_bootstrap: kstack0 at %#x (%#x)", pa, va); thread0.td_kstack = va; thread0.td_kstack_pages = kstack_pages; for (i = 0; i < kstack_pages; i++) { moea64_kenter(va, pa); pa += PAGE_SIZE; va += PAGE_SIZE; } /* * Allocate virtual address space for the message buffer. */ pa = msgbuf_phys = moea64_bootstrap_alloc(msgbufsize, PAGE_SIZE); msgbufp = (struct msgbuf *)virtual_avail; va = virtual_avail; virtual_avail += round_page(msgbufsize); while (va < virtual_avail) { moea64_kenter(va, pa); pa += PAGE_SIZE; va += PAGE_SIZE; } /* * Allocate virtual address space for the dynamic percpu area. */ pa = moea64_bootstrap_alloc(DPCPU_SIZE, PAGE_SIZE); dpcpu = (void *)virtual_avail; va = virtual_avail; virtual_avail += DPCPU_SIZE; while (va < virtual_avail) { moea64_kenter(va, pa); pa += PAGE_SIZE; va += PAGE_SIZE; } dpcpu_init(dpcpu, curcpu); crashdumpmap = (caddr_t)virtual_avail; virtual_avail += MAXDUMPPGS * PAGE_SIZE; /* * Allocate some things for page zeroing. We put this directly * in the page table and use MOEA64_PTE_REPLACE to avoid any * of the PVO book-keeping or other parts of the VM system * from even knowing that this hack exists. */ if (!hw_direct_map) { mtx_init(&moea64_scratchpage_mtx, "pvo zero page", NULL, MTX_DEF); for (i = 0; i < 2; i++) { moea64_scratchpage_va[i] = (virtual_end+1) - PAGE_SIZE; virtual_end -= PAGE_SIZE; moea64_kenter(moea64_scratchpage_va[i], 0); PMAP_LOCK(kernel_pmap); moea64_scratchpage_pvo[i] = moea64_pvo_find_va( kernel_pmap, (vm_offset_t)moea64_scratchpage_va[i]); PMAP_UNLOCK(kernel_pmap); } } numa_mem_regions(&numa_pregions, &numapregions_sz); } static void moea64_pmap_init_qpages(void) { struct pcpu *pc; int i; if (hw_direct_map) return; CPU_FOREACH(i) { pc = pcpu_find(i); pc->pc_qmap_addr = kva_alloc(PAGE_SIZE); if (pc->pc_qmap_addr == 0) panic("pmap_init_qpages: unable to allocate KVA"); PMAP_LOCK(kernel_pmap); pc->pc_aim.qmap_pvo = moea64_pvo_find_va(kernel_pmap, pc->pc_qmap_addr); PMAP_UNLOCK(kernel_pmap); mtx_init(&pc->pc_aim.qmap_lock, "qmap lock", NULL, MTX_DEF); } } SYSINIT(qpages_init, SI_SUB_CPU, SI_ORDER_ANY, moea64_pmap_init_qpages, NULL); /* * Activate a user pmap. This mostly involves setting some non-CPU * state. */ void moea64_activate(struct thread *td) { pmap_t pm; pm = &td->td_proc->p_vmspace->vm_pmap; CPU_SET(PCPU_GET(cpuid), &pm->pm_active); #ifdef __powerpc64__ PCPU_SET(aim.userslb, pm->pm_slb); __asm __volatile("slbmte %0, %1; isync" :: "r"(td->td_pcb->pcb_cpu.aim.usr_vsid), "r"(USER_SLB_SLBE)); #else PCPU_SET(curpmap, pm->pmap_phys); mtsrin(USER_SR << ADDR_SR_SHFT, td->td_pcb->pcb_cpu.aim.usr_vsid); #endif } void moea64_deactivate(struct thread *td) { pmap_t pm; __asm __volatile("isync; slbie %0" :: "r"(USER_ADDR)); pm = &td->td_proc->p_vmspace->vm_pmap; CPU_CLR(PCPU_GET(cpuid), &pm->pm_active); #ifdef __powerpc64__ PCPU_SET(aim.userslb, NULL); #else PCPU_SET(curpmap, NULL); #endif } void moea64_unwire(pmap_t pm, vm_offset_t sva, vm_offset_t eva) { struct pvo_entry key, *pvo; vm_page_t m; int64_t refchg; key.pvo_vaddr = sva; PMAP_LOCK(pm); for (pvo = RB_NFIND(pvo_tree, &pm->pmap_pvo, &key); pvo != NULL && PVO_VADDR(pvo) < eva; pvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo)) { if ((pvo->pvo_vaddr & PVO_WIRED) == 0) panic("moea64_unwire: pvo %p is missing PVO_WIRED", pvo); pvo->pvo_vaddr &= ~PVO_WIRED; refchg = moea64_pte_replace(pvo, 0 /* No invalidation */); if ((pvo->pvo_vaddr & PVO_MANAGED) && (pvo->pvo_pte.prot & VM_PROT_WRITE)) { if (refchg < 0) refchg = LPTE_CHG; m = PHYS_TO_VM_PAGE(PVO_PADDR(pvo)); refchg |= atomic_readandclear_32(&m->md.mdpg_attrs); if (refchg & LPTE_CHG) vm_page_dirty(m); if (refchg & LPTE_REF) vm_page_aflag_set(m, PGA_REFERENCED); } pm->pm_stats.wired_count--; } PMAP_UNLOCK(pm); } static int moea64_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *pap) { struct pvo_entry *pvo; vm_paddr_t pa; vm_page_t m; int val; bool managed; PMAP_LOCK(pmap); /* XXX Add support for superpages */ pvo = moea64_pvo_find_va(pmap, addr); if (pvo != NULL) { pa = PVO_PADDR(pvo); m = PHYS_TO_VM_PAGE(pa); managed = (pvo->pvo_vaddr & PVO_MANAGED) == PVO_MANAGED; val = MINCORE_INCORE; } else { PMAP_UNLOCK(pmap); return (0); } PMAP_UNLOCK(pmap); if (m == NULL) return (0); if (managed) { if (moea64_is_modified(m)) val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; if (moea64_is_referenced(m)) val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; } if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && managed) { *pap = pa; } return (val); } /* * This goes through and sets the physical address of our * special scratch PTE to the PA we want to zero or copy. Because * of locking issues (this can get called in pvo_enter() by * the UMA allocator), we can't use most other utility functions here */ static __inline void moea64_set_scratchpage_pa(int which, vm_paddr_t pa) { struct pvo_entry *pvo; KASSERT(!hw_direct_map, ("Using OEA64 scratchpage with a direct map!")); mtx_assert(&moea64_scratchpage_mtx, MA_OWNED); pvo = moea64_scratchpage_pvo[which]; PMAP_LOCK(pvo->pvo_pmap); pvo->pvo_pte.pa = moea64_calc_wimg(pa, VM_MEMATTR_DEFAULT) | (uint64_t)pa; moea64_pte_replace(pvo, MOEA64_PTE_INVALIDATE); PMAP_UNLOCK(pvo->pvo_pmap); isync(); } void moea64_copy_page(vm_page_t msrc, vm_page_t mdst) { vm_offset_t dst; vm_offset_t src; dst = VM_PAGE_TO_PHYS(mdst); src = VM_PAGE_TO_PHYS(msrc); if (hw_direct_map) { bcopy((void *)PHYS_TO_DMAP(src), (void *)PHYS_TO_DMAP(dst), PAGE_SIZE); } else { mtx_lock(&moea64_scratchpage_mtx); moea64_set_scratchpage_pa(0, src); moea64_set_scratchpage_pa(1, dst); bcopy((void *)moea64_scratchpage_va[0], (void *)moea64_scratchpage_va[1], PAGE_SIZE); mtx_unlock(&moea64_scratchpage_mtx); } } static inline void moea64_copy_pages_dmap(vm_page_t *ma, vm_offset_t a_offset, vm_page_t *mb, vm_offset_t b_offset, int xfersize) { void *a_cp, *b_cp; vm_offset_t a_pg_offset, b_pg_offset; int cnt; while (xfersize > 0) { a_pg_offset = a_offset & PAGE_MASK; cnt = min(xfersize, PAGE_SIZE - a_pg_offset); a_cp = (char *)(uintptr_t)PHYS_TO_DMAP( VM_PAGE_TO_PHYS(ma[a_offset >> PAGE_SHIFT])) + a_pg_offset; b_pg_offset = b_offset & PAGE_MASK; cnt = min(cnt, PAGE_SIZE - b_pg_offset); b_cp = (char *)(uintptr_t)PHYS_TO_DMAP( VM_PAGE_TO_PHYS(mb[b_offset >> PAGE_SHIFT])) + b_pg_offset; bcopy(a_cp, b_cp, cnt); a_offset += cnt; b_offset += cnt; xfersize -= cnt; } } static inline void moea64_copy_pages_nodmap(vm_page_t *ma, vm_offset_t a_offset, vm_page_t *mb, vm_offset_t b_offset, int xfersize) { void *a_cp, *b_cp; vm_offset_t a_pg_offset, b_pg_offset; int cnt; mtx_lock(&moea64_scratchpage_mtx); while (xfersize > 0) { a_pg_offset = a_offset & PAGE_MASK; cnt = min(xfersize, PAGE_SIZE - a_pg_offset); moea64_set_scratchpage_pa(0, VM_PAGE_TO_PHYS(ma[a_offset >> PAGE_SHIFT])); a_cp = (char *)moea64_scratchpage_va[0] + a_pg_offset; b_pg_offset = b_offset & PAGE_MASK; cnt = min(cnt, PAGE_SIZE - b_pg_offset); moea64_set_scratchpage_pa(1, VM_PAGE_TO_PHYS(mb[b_offset >> PAGE_SHIFT])); b_cp = (char *)moea64_scratchpage_va[1] + b_pg_offset; bcopy(a_cp, b_cp, cnt); a_offset += cnt; b_offset += cnt; xfersize -= cnt; } mtx_unlock(&moea64_scratchpage_mtx); } void moea64_copy_pages(vm_page_t *ma, vm_offset_t a_offset, vm_page_t *mb, vm_offset_t b_offset, int xfersize) { if (hw_direct_map) { moea64_copy_pages_dmap(ma, a_offset, mb, b_offset, xfersize); } else { moea64_copy_pages_nodmap(ma, a_offset, mb, b_offset, xfersize); } } void moea64_zero_page_area(vm_page_t m, int off, int size) { vm_paddr_t pa = VM_PAGE_TO_PHYS(m); if (size + off > PAGE_SIZE) panic("moea64_zero_page: size + off > PAGE_SIZE"); if (hw_direct_map) { bzero((caddr_t)(uintptr_t)PHYS_TO_DMAP(pa) + off, size); } else { mtx_lock(&moea64_scratchpage_mtx); moea64_set_scratchpage_pa(0, pa); bzero((caddr_t)moea64_scratchpage_va[0] + off, size); mtx_unlock(&moea64_scratchpage_mtx); } } /* * Zero a page of physical memory by temporarily mapping it */ void moea64_zero_page(vm_page_t m) { vm_paddr_t pa = VM_PAGE_TO_PHYS(m); vm_offset_t va, off; if (!hw_direct_map) { mtx_lock(&moea64_scratchpage_mtx); moea64_set_scratchpage_pa(0, pa); va = moea64_scratchpage_va[0]; } else { va = PHYS_TO_DMAP(pa); } for (off = 0; off < PAGE_SIZE; off += cacheline_size) __asm __volatile("dcbz 0,%0" :: "r"(va + off)); if (!hw_direct_map) mtx_unlock(&moea64_scratchpage_mtx); } vm_offset_t moea64_quick_enter_page(vm_page_t m) { struct pvo_entry *pvo; vm_paddr_t pa = VM_PAGE_TO_PHYS(m); if (hw_direct_map) return (PHYS_TO_DMAP(pa)); /* * MOEA64_PTE_REPLACE does some locking, so we can't just grab * a critical section and access the PCPU data like on i386. * Instead, pin the thread and grab the PCPU lock to prevent * a preempting thread from using the same PCPU data. */ sched_pin(); mtx_assert(PCPU_PTR(aim.qmap_lock), MA_NOTOWNED); pvo = PCPU_GET(aim.qmap_pvo); mtx_lock(PCPU_PTR(aim.qmap_lock)); pvo->pvo_pte.pa = moea64_calc_wimg(pa, pmap_page_get_memattr(m)) | (uint64_t)pa; moea64_pte_replace(pvo, MOEA64_PTE_INVALIDATE); isync(); return (PCPU_GET(qmap_addr)); } void moea64_quick_remove_page(vm_offset_t addr) { if (hw_direct_map) return; mtx_assert(PCPU_PTR(aim.qmap_lock), MA_OWNED); KASSERT(PCPU_GET(qmap_addr) == addr, ("moea64_quick_remove_page: invalid address")); mtx_unlock(PCPU_PTR(aim.qmap_lock)); sched_unpin(); } boolean_t moea64_page_is_mapped(vm_page_t m) { return (!LIST_EMPTY(&(m)->md.mdpg_pvoh)); } /* * Map the given physical page at the specified virtual address in the * target pmap with the protection requested. If specified the page * will be wired down. */ int moea64_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, u_int flags, int8_t psind) { struct pvo_entry *pvo, *oldpvo; struct pvo_head *pvo_head; uint64_t pte_lo; int error; if ((m->oflags & VPO_UNMANAGED) == 0) { if ((flags & PMAP_ENTER_QUICK_LOCKED) == 0) VM_PAGE_OBJECT_BUSY_ASSERT(m); else VM_OBJECT_ASSERT_LOCKED(m->object); } pvo = alloc_pvo_entry(0); if (pvo == NULL) return (KERN_RESOURCE_SHORTAGE); pvo->pvo_pmap = NULL; /* to be filled in later */ pvo->pvo_pte.prot = prot; pte_lo = moea64_calc_wimg(VM_PAGE_TO_PHYS(m), pmap_page_get_memattr(m)); pvo->pvo_pte.pa = VM_PAGE_TO_PHYS(m) | pte_lo; if ((flags & PMAP_ENTER_WIRED) != 0) pvo->pvo_vaddr |= PVO_WIRED; if ((m->oflags & VPO_UNMANAGED) != 0 || !moea64_initialized) { pvo_head = NULL; } else { pvo_head = &m->md.mdpg_pvoh; pvo->pvo_vaddr |= PVO_MANAGED; } PV_PAGE_LOCK(m); PMAP_LOCK(pmap); if (pvo->pvo_pmap == NULL) init_pvo_entry(pvo, pmap, va); if (prot & VM_PROT_WRITE) if (pmap_bootstrapped && (m->oflags & VPO_UNMANAGED) == 0) vm_page_aflag_set(m, PGA_WRITEABLE); error = moea64_pvo_enter(pvo, pvo_head, &oldpvo); if (error == EEXIST) { if (oldpvo->pvo_vaddr == pvo->pvo_vaddr && oldpvo->pvo_pte.pa == pvo->pvo_pte.pa && oldpvo->pvo_pte.prot == prot) { /* Identical mapping already exists */ error = 0; /* If not in page table, reinsert it */ if (moea64_pte_synch(oldpvo) < 0) { STAT_MOEA64(moea64_pte_overflow--); moea64_pte_insert(oldpvo); } /* Then just clean up and go home */ PV_PAGE_UNLOCK(m); PMAP_UNLOCK(pmap); free_pvo_entry(pvo); goto out; } else { /* Otherwise, need to kill it first */ KASSERT(oldpvo->pvo_pmap == pmap, ("pmap of old " "mapping does not match new mapping")); moea64_pvo_remove_from_pmap(oldpvo); moea64_pvo_enter(pvo, pvo_head, NULL); } } PMAP_UNLOCK(pmap); PV_PAGE_UNLOCK(m); /* Free any dead pages */ if (error == EEXIST) { moea64_pvo_remove_from_page(oldpvo); free_pvo_entry(oldpvo); } out: /* * Flush the page from the instruction cache if this page is * mapped executable and cacheable. */ if (pmap != kernel_pmap && (m->a.flags & PGA_EXECUTABLE) == 0 && (pte_lo & (LPTE_I | LPTE_G | LPTE_NOEXEC)) == 0) { vm_page_aflag_set(m, PGA_EXECUTABLE); moea64_syncicache(pmap, va, VM_PAGE_TO_PHYS(m), PAGE_SIZE); } return (KERN_SUCCESS); } static void moea64_syncicache(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, vm_size_t sz) { /* * This is much trickier than on older systems because * we can't sync the icache on physical addresses directly * without a direct map. Instead we check a couple of cases * where the memory is already mapped in and, failing that, * use the same trick we use for page zeroing to create * a temporary mapping for this physical address. */ if (!pmap_bootstrapped) { /* * If PMAP is not bootstrapped, we are likely to be * in real mode. */ __syncicache((void *)(uintptr_t)pa, sz); } else if (pmap == kernel_pmap) { __syncicache((void *)va, sz); } else if (hw_direct_map) { __syncicache((void *)(uintptr_t)PHYS_TO_DMAP(pa), sz); } else { /* Use the scratch page to set up a temp mapping */ mtx_lock(&moea64_scratchpage_mtx); moea64_set_scratchpage_pa(1, pa & ~ADDR_POFF); __syncicache((void *)(moea64_scratchpage_va[1] + (va & ADDR_POFF)), sz); mtx_unlock(&moea64_scratchpage_mtx); } } /* * Maps a sequence of resident pages belonging to the same object. * The sequence begins with the given page m_start. This page is * mapped at the given virtual address start. Each subsequent page is * mapped at a virtual address that is offset from start by the same * amount as the page is offset from m_start within the object. The * last page in the sequence is the page with the largest offset from * m_start that can be mapped at a virtual address less than the given * virtual address end. Not every virtual page between start and end * is mapped; only those for which a resident page exists with the * corresponding offset from m_start are mapped. */ void moea64_enter_object(pmap_t pm, vm_offset_t start, vm_offset_t end, vm_page_t m_start, vm_prot_t prot) { vm_page_t m; vm_pindex_t diff, psize; VM_OBJECT_ASSERT_LOCKED(m_start->object); psize = atop(end - start); m = m_start; while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { moea64_enter(pm, start + ptoa(diff), m, prot & (VM_PROT_READ | VM_PROT_EXECUTE), PMAP_ENTER_NOSLEEP | PMAP_ENTER_QUICK_LOCKED, 0); m = TAILQ_NEXT(m, listq); } } void moea64_enter_quick(pmap_t pm, vm_offset_t va, vm_page_t m, vm_prot_t prot) { moea64_enter(pm, va, m, prot & (VM_PROT_READ | VM_PROT_EXECUTE), PMAP_ENTER_NOSLEEP | PMAP_ENTER_QUICK_LOCKED, 0); } vm_paddr_t moea64_extract(pmap_t pm, vm_offset_t va) { struct pvo_entry *pvo; vm_paddr_t pa; PMAP_LOCK(pm); pvo = moea64_pvo_find_va(pm, va); if (pvo == NULL) pa = 0; else pa = PVO_PADDR(pvo) | (va - PVO_VADDR(pvo)); PMAP_UNLOCK(pm); return (pa); } /* * Atomically extract and hold the physical page with the given * pmap and virtual address pair if that mapping permits the given * protection. */ vm_page_t moea64_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) { struct pvo_entry *pvo; vm_page_t m; m = NULL; PMAP_LOCK(pmap); pvo = moea64_pvo_find_va(pmap, va & ~ADDR_POFF); if (pvo != NULL && (pvo->pvo_pte.prot & prot) == prot) { m = PHYS_TO_VM_PAGE(PVO_PADDR(pvo)); if (!vm_page_wire_mapped(m)) m = NULL; } PMAP_UNLOCK(pmap); return (m); } static void * moea64_uma_page_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *flags, int wait) { struct pvo_entry *pvo; vm_offset_t va; vm_page_t m; int needed_lock; /* * This entire routine is a horrible hack to avoid bothering kmem * for new KVA addresses. Because this can get called from inside * kmem allocation routines, calling kmem for a new address here * can lead to multiply locking non-recursive mutexes. */ *flags = UMA_SLAB_PRIV; needed_lock = !PMAP_LOCKED(kernel_pmap); m = vm_page_alloc_domain(NULL, 0, domain, malloc2vm_flags(wait) | VM_ALLOC_WIRED | VM_ALLOC_NOOBJ); if (m == NULL) return (NULL); va = VM_PAGE_TO_PHYS(m); pvo = alloc_pvo_entry(1 /* bootstrap */); pvo->pvo_pte.prot = VM_PROT_READ | VM_PROT_WRITE; pvo->pvo_pte.pa = VM_PAGE_TO_PHYS(m) | LPTE_M; if (needed_lock) PMAP_LOCK(kernel_pmap); init_pvo_entry(pvo, kernel_pmap, va); pvo->pvo_vaddr |= PVO_WIRED; moea64_pvo_enter(pvo, NULL, NULL); if (needed_lock) PMAP_UNLOCK(kernel_pmap); if ((wait & M_ZERO) && (m->flags & PG_ZERO) == 0) bzero((void *)va, PAGE_SIZE); return (void *)va; } extern int elf32_nxstack; void moea64_init() { CTR0(KTR_PMAP, "moea64_init"); moea64_pvo_zone = uma_zcreate("UPVO entry", sizeof (struct pvo_entry), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM | UMA_ZONE_NOFREE); if (!hw_direct_map) { uma_zone_set_allocf(moea64_pvo_zone, moea64_uma_page_alloc); } #ifdef COMPAT_FREEBSD32 elf32_nxstack = 1; #endif moea64_initialized = TRUE; } boolean_t moea64_is_referenced(vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("moea64_is_referenced: page %p is not managed", m)); return (moea64_query_bit(m, LPTE_REF)); } boolean_t moea64_is_modified(vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("moea64_is_modified: page %p is not managed", m)); /* * If the page is not busied then this check is racy. */ if (!pmap_page_is_write_mapped(m)) return (FALSE); return (moea64_query_bit(m, LPTE_CHG)); } boolean_t moea64_is_prefaultable(pmap_t pmap, vm_offset_t va) { struct pvo_entry *pvo; boolean_t rv = TRUE; PMAP_LOCK(pmap); pvo = moea64_pvo_find_va(pmap, va & ~ADDR_POFF); if (pvo != NULL) rv = FALSE; PMAP_UNLOCK(pmap); return (rv); } void moea64_clear_modify(vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("moea64_clear_modify: page %p is not managed", m)); vm_page_assert_busied(m); if (!pmap_page_is_write_mapped(m)) return; moea64_clear_bit(m, LPTE_CHG); } /* * Clear the write and modified bits in each of the given page's mappings. */ void moea64_remove_write(vm_page_t m) { struct pvo_entry *pvo; int64_t refchg, ret; pmap_t pmap; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("moea64_remove_write: page %p is not managed", m)); vm_page_assert_busied(m); if (!pmap_page_is_write_mapped(m)) return powerpc_sync(); PV_PAGE_LOCK(m); refchg = 0; LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) { pmap = pvo->pvo_pmap; PMAP_LOCK(pmap); if (!(pvo->pvo_vaddr & PVO_DEAD) && (pvo->pvo_pte.prot & VM_PROT_WRITE)) { pvo->pvo_pte.prot &= ~VM_PROT_WRITE; ret = moea64_pte_replace(pvo, MOEA64_PTE_PROT_UPDATE); if (ret < 0) ret = LPTE_CHG; refchg |= ret; if (pvo->pvo_pmap == kernel_pmap) isync(); } PMAP_UNLOCK(pmap); } if ((refchg | atomic_readandclear_32(&m->md.mdpg_attrs)) & LPTE_CHG) vm_page_dirty(m); vm_page_aflag_clear(m, PGA_WRITEABLE); PV_PAGE_UNLOCK(m); } /* * moea64_ts_referenced: * * Return a count of reference bits for a page, clearing those bits. * It is not necessary for every reference bit to be cleared, but it * is necessary that 0 only be returned when there are truly no * reference bits set. * * XXX: The exact number of bits to check and clear is a matter that * should be tested and standardized at some point in the future for * optimal aging of shared pages. */ int moea64_ts_referenced(vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("moea64_ts_referenced: page %p is not managed", m)); return (moea64_clear_bit(m, LPTE_REF)); } /* * Modify the WIMG settings of all mappings for a page. */ void moea64_page_set_memattr(vm_page_t m, vm_memattr_t ma) { struct pvo_entry *pvo; int64_t refchg; pmap_t pmap; uint64_t lo; if ((m->oflags & VPO_UNMANAGED) != 0) { m->md.mdpg_cache_attrs = ma; return; } lo = moea64_calc_wimg(VM_PAGE_TO_PHYS(m), ma); PV_PAGE_LOCK(m); LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) { pmap = pvo->pvo_pmap; PMAP_LOCK(pmap); if (!(pvo->pvo_vaddr & PVO_DEAD)) { pvo->pvo_pte.pa &= ~LPTE_WIMG; pvo->pvo_pte.pa |= lo; refchg = moea64_pte_replace(pvo, MOEA64_PTE_INVALIDATE); if (refchg < 0) refchg = (pvo->pvo_pte.prot & VM_PROT_WRITE) ? LPTE_CHG : 0; if ((pvo->pvo_vaddr & PVO_MANAGED) && (pvo->pvo_pte.prot & VM_PROT_WRITE)) { refchg |= atomic_readandclear_32(&m->md.mdpg_attrs); if (refchg & LPTE_CHG) vm_page_dirty(m); if (refchg & LPTE_REF) vm_page_aflag_set(m, PGA_REFERENCED); } if (pvo->pvo_pmap == kernel_pmap) isync(); } PMAP_UNLOCK(pmap); } m->md.mdpg_cache_attrs = ma; PV_PAGE_UNLOCK(m); } /* * Map a wired page into kernel virtual address space. */ void moea64_kenter_attr(vm_offset_t va, vm_paddr_t pa, vm_memattr_t ma) { int error; struct pvo_entry *pvo, *oldpvo; do { pvo = alloc_pvo_entry(0); if (pvo == NULL) vm_wait(NULL); } while (pvo == NULL); pvo->pvo_pte.prot = VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE; pvo->pvo_pte.pa = (pa & ~ADDR_POFF) | moea64_calc_wimg(pa, ma); pvo->pvo_vaddr |= PVO_WIRED; PMAP_LOCK(kernel_pmap); oldpvo = moea64_pvo_find_va(kernel_pmap, va); if (oldpvo != NULL) moea64_pvo_remove_from_pmap(oldpvo); init_pvo_entry(pvo, kernel_pmap, va); error = moea64_pvo_enter(pvo, NULL, NULL); PMAP_UNLOCK(kernel_pmap); /* Free any dead pages */ if (oldpvo != NULL) { moea64_pvo_remove_from_page(oldpvo); free_pvo_entry(oldpvo); } if (error != 0) panic("moea64_kenter: failed to enter va %#zx pa %#jx: %d", va, (uintmax_t)pa, error); } void moea64_kenter(vm_offset_t va, vm_paddr_t pa) { moea64_kenter_attr(va, pa, VM_MEMATTR_DEFAULT); } /* * Extract the physical page address associated with the given kernel virtual * address. */ vm_paddr_t moea64_kextract(vm_offset_t va) { struct pvo_entry *pvo; vm_paddr_t pa; /* * Shortcut the direct-mapped case when applicable. We never put * anything but 1:1 (or 62-bit aliased) mappings below * VM_MIN_KERNEL_ADDRESS. */ if (va < VM_MIN_KERNEL_ADDRESS) return (va & ~DMAP_BASE_ADDRESS); PMAP_LOCK(kernel_pmap); pvo = moea64_pvo_find_va(kernel_pmap, va); KASSERT(pvo != NULL, ("moea64_kextract: no addr found for %#" PRIxPTR, va)); pa = PVO_PADDR(pvo) | (va - PVO_VADDR(pvo)); PMAP_UNLOCK(kernel_pmap); return (pa); } /* * Remove a wired page from kernel virtual address space. */ void moea64_kremove(vm_offset_t va) { moea64_remove(kernel_pmap, va, va + PAGE_SIZE); } /* * Provide a kernel pointer corresponding to a given userland pointer. * The returned pointer is valid until the next time this function is * called in this thread. This is used internally in copyin/copyout. */ static int moea64_map_user_ptr(pmap_t pm, volatile const void *uaddr, void **kaddr, size_t ulen, size_t *klen) { size_t l; #ifdef __powerpc64__ struct slb *slb; #endif register_t slbv; *kaddr = (char *)USER_ADDR + ((uintptr_t)uaddr & ~SEGMENT_MASK); l = ((char *)USER_ADDR + SEGMENT_LENGTH) - (char *)(*kaddr); if (l > ulen) l = ulen; if (klen) *klen = l; else if (l != ulen) return (EFAULT); #ifdef __powerpc64__ /* Try lockless look-up first */ slb = user_va_to_slb_entry(pm, (vm_offset_t)uaddr); if (slb == NULL) { /* If it isn't there, we need to pre-fault the VSID */ PMAP_LOCK(pm); slbv = va_to_vsid(pm, (vm_offset_t)uaddr) << SLBV_VSID_SHIFT; PMAP_UNLOCK(pm); } else { slbv = slb->slbv; } /* Mark segment no-execute */ slbv |= SLBV_N; #else slbv = va_to_vsid(pm, (vm_offset_t)uaddr); /* Mark segment no-execute */ slbv |= SR_N; #endif /* If we have already set this VSID, we can just return */ if (curthread->td_pcb->pcb_cpu.aim.usr_vsid == slbv) return (0); __asm __volatile("isync"); curthread->td_pcb->pcb_cpu.aim.usr_segm = (uintptr_t)uaddr >> ADDR_SR_SHFT; curthread->td_pcb->pcb_cpu.aim.usr_vsid = slbv; #ifdef __powerpc64__ __asm __volatile ("slbie %0; slbmte %1, %2; isync" :: "r"(USER_ADDR), "r"(slbv), "r"(USER_SLB_SLBE)); #else __asm __volatile("mtsr %0,%1; isync" :: "n"(USER_SR), "r"(slbv)); #endif return (0); } /* * Figure out where a given kernel pointer (usually in a fault) points * to from the VM's perspective, potentially remapping into userland's * address space. */ static int moea64_decode_kernel_ptr(vm_offset_t addr, int *is_user, vm_offset_t *decoded_addr) { vm_offset_t user_sr; if ((addr >> ADDR_SR_SHFT) == (USER_ADDR >> ADDR_SR_SHFT)) { user_sr = curthread->td_pcb->pcb_cpu.aim.usr_segm; addr &= ADDR_PIDX | ADDR_POFF; addr |= user_sr << ADDR_SR_SHFT; *decoded_addr = addr; *is_user = 1; } else { *decoded_addr = addr; *is_user = 0; } return (0); } /* * Map a range of physical addresses into kernel virtual address space. * * The value passed in *virt is a suggested virtual address for the mapping. * Architectures which can support a direct-mapped physical to virtual region * can return the appropriate address within that region, leaving '*virt' * unchanged. Other architectures should map the pages starting at '*virt' and * update '*virt' with the first usable address after the mapped region. */ vm_offset_t moea64_map(vm_offset_t *virt, vm_paddr_t pa_start, vm_paddr_t pa_end, int prot) { vm_offset_t sva, va; if (hw_direct_map) { /* * Check if every page in the region is covered by the direct * map. The direct map covers all of physical memory. Use * moea64_calc_wimg() as a shortcut to see if the page is in * physical memory as a way to see if the direct map covers it. */ for (va = pa_start; va < pa_end; va += PAGE_SIZE) if (moea64_calc_wimg(va, VM_MEMATTR_DEFAULT) != LPTE_M) break; if (va == pa_end) return (PHYS_TO_DMAP(pa_start)); } sva = *virt; va = sva; /* XXX respect prot argument */ for (; pa_start < pa_end; pa_start += PAGE_SIZE, va += PAGE_SIZE) moea64_kenter(va, pa_start); *virt = va; return (sva); } /* * Returns true if the pmap's pv is one of the first * 16 pvs linked to from this page. This count may * be changed upwards or downwards in the future; it * is only necessary that true be returned for a small * subset of pmaps for proper page aging. */ boolean_t moea64_page_exists_quick(pmap_t pmap, vm_page_t m) { int loops; struct pvo_entry *pvo; boolean_t rv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("moea64_page_exists_quick: page %p is not managed", m)); loops = 0; rv = FALSE; PV_PAGE_LOCK(m); LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) { if (!(pvo->pvo_vaddr & PVO_DEAD) && pvo->pvo_pmap == pmap) { rv = TRUE; break; } if (++loops >= 16) break; } PV_PAGE_UNLOCK(m); return (rv); } void moea64_page_init(vm_page_t m) { m->md.mdpg_attrs = 0; m->md.mdpg_cache_attrs = VM_MEMATTR_DEFAULT; LIST_INIT(&m->md.mdpg_pvoh); } /* * Return the number of managed mappings to the given physical page * that are wired. */ int moea64_page_wired_mappings(vm_page_t m) { struct pvo_entry *pvo; int count; count = 0; if ((m->oflags & VPO_UNMANAGED) != 0) return (count); PV_PAGE_LOCK(m); LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) if ((pvo->pvo_vaddr & (PVO_DEAD | PVO_WIRED)) == PVO_WIRED) count++; PV_PAGE_UNLOCK(m); return (count); } static uintptr_t moea64_vsidcontext; uintptr_t moea64_get_unique_vsid(void) { u_int entropy; register_t hash; uint32_t mask; int i; entropy = 0; __asm __volatile("mftb %0" : "=r"(entropy)); mtx_lock(&moea64_slb_mutex); for (i = 0; i < NVSIDS; i += VSID_NBPW) { u_int n; /* * Create a new value by mutiplying by a prime and adding in * entropy from the timebase register. This is to make the * VSID more random so that the PT hash function collides * less often. (Note that the prime casues gcc to do shifts * instead of a multiply.) */ moea64_vsidcontext = (moea64_vsidcontext * 0x1105) + entropy; hash = moea64_vsidcontext & (NVSIDS - 1); if (hash == 0) /* 0 is special, avoid it */ continue; n = hash >> 5; mask = 1 << (hash & (VSID_NBPW - 1)); hash = (moea64_vsidcontext & VSID_HASHMASK); if (moea64_vsid_bitmap[n] & mask) { /* collision? */ /* anything free in this bucket? */ if (moea64_vsid_bitmap[n] == 0xffffffff) { entropy = (moea64_vsidcontext >> 20); continue; } i = ffs(~moea64_vsid_bitmap[n]) - 1; mask = 1 << i; hash &= rounddown2(VSID_HASHMASK, VSID_NBPW); hash |= i; } if (hash == VSID_VRMA) /* also special, avoid this too */ continue; KASSERT(!(moea64_vsid_bitmap[n] & mask), ("Allocating in-use VSID %#zx\n", hash)); moea64_vsid_bitmap[n] |= mask; mtx_unlock(&moea64_slb_mutex); return (hash); } mtx_unlock(&moea64_slb_mutex); panic("%s: out of segments",__func__); } #ifdef __powerpc64__ int moea64_pinit(pmap_t pmap) { RB_INIT(&pmap->pmap_pvo); pmap->pm_slb_tree_root = slb_alloc_tree(); pmap->pm_slb = slb_alloc_user_cache(); pmap->pm_slb_len = 0; return (1); } #else int moea64_pinit(pmap_t pmap) { int i; uint32_t hash; RB_INIT(&pmap->pmap_pvo); if (pmap_bootstrapped) pmap->pmap_phys = (pmap_t)moea64_kextract((vm_offset_t)pmap); else pmap->pmap_phys = pmap; /* * Allocate some segment registers for this pmap. */ hash = moea64_get_unique_vsid(); for (i = 0; i < 16; i++) pmap->pm_sr[i] = VSID_MAKE(i, hash); KASSERT(pmap->pm_sr[0] != 0, ("moea64_pinit: pm_sr[0] = 0")); return (1); } #endif /* * Initialize the pmap associated with process 0. */ void moea64_pinit0(pmap_t pm) { PMAP_LOCK_INIT(pm); moea64_pinit(pm); bzero(&pm->pm_stats, sizeof(pm->pm_stats)); } /* * Set the physical protection on the specified range of this map as requested. */ static void moea64_pvo_protect( pmap_t pm, struct pvo_entry *pvo, vm_prot_t prot) { struct vm_page *pg; vm_prot_t oldprot; int32_t refchg; PMAP_LOCK_ASSERT(pm, MA_OWNED); /* * Change the protection of the page. */ oldprot = pvo->pvo_pte.prot; pvo->pvo_pte.prot = prot; pg = PHYS_TO_VM_PAGE(PVO_PADDR(pvo)); /* * If the PVO is in the page table, update mapping */ refchg = moea64_pte_replace(pvo, MOEA64_PTE_PROT_UPDATE); if (refchg < 0) refchg = (oldprot & VM_PROT_WRITE) ? LPTE_CHG : 0; if (pm != kernel_pmap && pg != NULL && (pg->a.flags & PGA_EXECUTABLE) == 0 && (pvo->pvo_pte.pa & (LPTE_I | LPTE_G | LPTE_NOEXEC)) == 0) { if ((pg->oflags & VPO_UNMANAGED) == 0) vm_page_aflag_set(pg, PGA_EXECUTABLE); moea64_syncicache(pm, PVO_VADDR(pvo), PVO_PADDR(pvo), PAGE_SIZE); } /* * Update vm about the REF/CHG bits if the page is managed and we have * removed write access. */ if (pg != NULL && (pvo->pvo_vaddr & PVO_MANAGED) && (oldprot & VM_PROT_WRITE)) { refchg |= atomic_readandclear_32(&pg->md.mdpg_attrs); if (refchg & LPTE_CHG) vm_page_dirty(pg); if (refchg & LPTE_REF) vm_page_aflag_set(pg, PGA_REFERENCED); } } void moea64_protect(pmap_t pm, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) { struct pvo_entry *pvo, *tpvo, key; CTR4(KTR_PMAP, "moea64_protect: pm=%p sva=%#x eva=%#x prot=%#x", pm, sva, eva, prot); KASSERT(pm == &curproc->p_vmspace->vm_pmap || pm == kernel_pmap, ("moea64_protect: non current pmap")); if ((prot & VM_PROT_READ) == VM_PROT_NONE) { moea64_remove(pm, sva, eva); return; } PMAP_LOCK(pm); key.pvo_vaddr = sva; for (pvo = RB_NFIND(pvo_tree, &pm->pmap_pvo, &key); pvo != NULL && PVO_VADDR(pvo) < eva; pvo = tpvo) { tpvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo); moea64_pvo_protect(pm, pvo, prot); } PMAP_UNLOCK(pm); } /* * Map a list of wired pages into kernel virtual address space. This is * intended for temporary mappings which do not need page modification or * references recorded. Existing mappings in the region are overwritten. */ void moea64_qenter(vm_offset_t va, vm_page_t *m, int count) { while (count-- > 0) { moea64_kenter(va, VM_PAGE_TO_PHYS(*m)); va += PAGE_SIZE; m++; } } /* * Remove page mappings from kernel virtual address space. Intended for * temporary mappings entered by moea64_qenter. */ void moea64_qremove(vm_offset_t va, int count) { while (count-- > 0) { moea64_kremove(va); va += PAGE_SIZE; } } void moea64_release_vsid(uint64_t vsid) { int idx, mask; mtx_lock(&moea64_slb_mutex); idx = vsid & (NVSIDS-1); mask = 1 << (idx % VSID_NBPW); idx /= VSID_NBPW; KASSERT(moea64_vsid_bitmap[idx] & mask, ("Freeing unallocated VSID %#jx", vsid)); moea64_vsid_bitmap[idx] &= ~mask; mtx_unlock(&moea64_slb_mutex); } void moea64_release(pmap_t pmap) { /* * Free segment registers' VSIDs */ #ifdef __powerpc64__ slb_free_tree(pmap); slb_free_user_cache(pmap->pm_slb); #else KASSERT(pmap->pm_sr[0] != 0, ("moea64_release: pm_sr[0] = 0")); moea64_release_vsid(VSID_TO_HASH(pmap->pm_sr[0])); #endif } /* * Remove all pages mapped by the specified pmap */ void moea64_remove_pages(pmap_t pm) { struct pvo_entry *pvo, *tpvo; struct pvo_dlist tofree; SLIST_INIT(&tofree); PMAP_LOCK(pm); RB_FOREACH_SAFE(pvo, pvo_tree, &pm->pmap_pvo, tpvo) { if (pvo->pvo_vaddr & PVO_WIRED) continue; /* * For locking reasons, remove this from the page table and * pmap, but save delinking from the vm_page for a second * pass */ moea64_pvo_remove_from_pmap(pvo); SLIST_INSERT_HEAD(&tofree, pvo, pvo_dlink); } PMAP_UNLOCK(pm); while (!SLIST_EMPTY(&tofree)) { pvo = SLIST_FIRST(&tofree); SLIST_REMOVE_HEAD(&tofree, pvo_dlink); moea64_pvo_remove_from_page(pvo); free_pvo_entry(pvo); } } /* * Remove the given range of addresses from the specified map. */ void moea64_remove(pmap_t pm, vm_offset_t sva, vm_offset_t eva) { struct pvo_entry *pvo, *tpvo, key; struct pvo_dlist tofree; /* * Perform an unsynchronized read. This is, however, safe. */ if (pm->pm_stats.resident_count == 0) return; key.pvo_vaddr = sva; SLIST_INIT(&tofree); PMAP_LOCK(pm); for (pvo = RB_NFIND(pvo_tree, &pm->pmap_pvo, &key); pvo != NULL && PVO_VADDR(pvo) < eva; pvo = tpvo) { tpvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo); /* * For locking reasons, remove this from the page table and * pmap, but save delinking from the vm_page for a second * pass */ moea64_pvo_remove_from_pmap(pvo); SLIST_INSERT_HEAD(&tofree, pvo, pvo_dlink); } PMAP_UNLOCK(pm); while (!SLIST_EMPTY(&tofree)) { pvo = SLIST_FIRST(&tofree); SLIST_REMOVE_HEAD(&tofree, pvo_dlink); moea64_pvo_remove_from_page(pvo); free_pvo_entry(pvo); } } /* * Remove physical page from all pmaps in which it resides. moea64_pvo_remove() * will reflect changes in pte's back to the vm_page. */ void moea64_remove_all(vm_page_t m) { struct pvo_entry *pvo, *next_pvo; struct pvo_head freequeue; int wasdead; pmap_t pmap; LIST_INIT(&freequeue); PV_PAGE_LOCK(m); LIST_FOREACH_SAFE(pvo, vm_page_to_pvoh(m), pvo_vlink, next_pvo) { pmap = pvo->pvo_pmap; PMAP_LOCK(pmap); wasdead = (pvo->pvo_vaddr & PVO_DEAD); if (!wasdead) moea64_pvo_remove_from_pmap(pvo); moea64_pvo_remove_from_page_locked(pvo, m); if (!wasdead) LIST_INSERT_HEAD(&freequeue, pvo, pvo_vlink); PMAP_UNLOCK(pmap); } KASSERT(!pmap_page_is_mapped(m), ("Page still has mappings")); KASSERT((m->a.flags & PGA_WRITEABLE) == 0, ("Page still writable")); PV_PAGE_UNLOCK(m); /* Clean up UMA allocations */ LIST_FOREACH_SAFE(pvo, &freequeue, pvo_vlink, next_pvo) free_pvo_entry(pvo); } /* * Allocate a physical page of memory directly from the phys_avail map. * Can only be called from moea64_bootstrap before avail start and end are * calculated. */ vm_offset_t moea64_bootstrap_alloc(vm_size_t size, vm_size_t align) { vm_offset_t s, e; int i, j; size = round_page(size); for (i = 0; phys_avail[i + 1] != 0; i += 2) { if (align != 0) s = roundup2(phys_avail[i], align); else s = phys_avail[i]; e = s + size; if (s < phys_avail[i] || e > phys_avail[i + 1]) continue; if (s + size > platform_real_maxaddr()) continue; if (s == phys_avail[i]) { phys_avail[i] += size; } else if (e == phys_avail[i + 1]) { phys_avail[i + 1] -= size; } else { for (j = phys_avail_count * 2; j > i; j -= 2) { phys_avail[j] = phys_avail[j - 2]; phys_avail[j + 1] = phys_avail[j - 1]; } phys_avail[i + 3] = phys_avail[i + 1]; phys_avail[i + 1] = s; phys_avail[i + 2] = e; phys_avail_count++; } return (s); } panic("moea64_bootstrap_alloc: could not allocate memory"); } static int moea64_pvo_enter(struct pvo_entry *pvo, struct pvo_head *pvo_head, struct pvo_entry **oldpvop) { struct pvo_entry *old_pvo; int err; PMAP_LOCK_ASSERT(pvo->pvo_pmap, MA_OWNED); STAT_MOEA64(moea64_pvo_enter_calls++); /* * Add to pmap list */ old_pvo = RB_INSERT(pvo_tree, &pvo->pvo_pmap->pmap_pvo, pvo); if (old_pvo != NULL) { if (oldpvop != NULL) *oldpvop = old_pvo; return (EEXIST); } if (pvo_head != NULL) { LIST_INSERT_HEAD(pvo_head, pvo, pvo_vlink); } if (pvo->pvo_vaddr & PVO_WIRED) pvo->pvo_pmap->pm_stats.wired_count++; pvo->pvo_pmap->pm_stats.resident_count++; /* * Insert it into the hardware page table */ err = moea64_pte_insert(pvo); if (err != 0) { panic("moea64_pvo_enter: overflow"); } STAT_MOEA64(moea64_pvo_entries++); if (pvo->pvo_pmap == kernel_pmap) isync(); #ifdef __powerpc64__ /* * Make sure all our bootstrap mappings are in the SLB as soon * as virtual memory is switched on. */ if (!pmap_bootstrapped) moea64_bootstrap_slb_prefault(PVO_VADDR(pvo), pvo->pvo_vaddr & PVO_LARGE); #endif return (0); } static void moea64_pvo_remove_from_pmap(struct pvo_entry *pvo) { struct vm_page *pg; int32_t refchg; KASSERT(pvo->pvo_pmap != NULL, ("Trying to remove PVO with no pmap")); PMAP_LOCK_ASSERT(pvo->pvo_pmap, MA_OWNED); KASSERT(!(pvo->pvo_vaddr & PVO_DEAD), ("Trying to remove dead PVO")); /* * If there is an active pte entry, we need to deactivate it */ refchg = moea64_pte_unset(pvo); if (refchg < 0) { /* * If it was evicted from the page table, be pessimistic and * dirty the page. */ if (pvo->pvo_pte.prot & VM_PROT_WRITE) refchg = LPTE_CHG; else refchg = 0; } /* * Update our statistics. */ pvo->pvo_pmap->pm_stats.resident_count--; if (pvo->pvo_vaddr & PVO_WIRED) pvo->pvo_pmap->pm_stats.wired_count--; /* * Remove this PVO from the pmap list. */ RB_REMOVE(pvo_tree, &pvo->pvo_pmap->pmap_pvo, pvo); /* * Mark this for the next sweep */ pvo->pvo_vaddr |= PVO_DEAD; /* Send RC bits to VM */ if ((pvo->pvo_vaddr & PVO_MANAGED) && (pvo->pvo_pte.prot & VM_PROT_WRITE)) { pg = PHYS_TO_VM_PAGE(PVO_PADDR(pvo)); if (pg != NULL) { refchg |= atomic_readandclear_32(&pg->md.mdpg_attrs); if (refchg & LPTE_CHG) vm_page_dirty(pg); if (refchg & LPTE_REF) vm_page_aflag_set(pg, PGA_REFERENCED); } } } static inline void moea64_pvo_remove_from_page_locked(struct pvo_entry *pvo, vm_page_t m) { KASSERT(pvo->pvo_vaddr & PVO_DEAD, ("Trying to delink live page")); /* Use NULL pmaps as a sentinel for races in page deletion */ if (pvo->pvo_pmap == NULL) return; pvo->pvo_pmap = NULL; /* * Update vm about page writeability/executability if managed */ PV_LOCKASSERT(PVO_PADDR(pvo)); if (pvo->pvo_vaddr & PVO_MANAGED) { if (m != NULL) { LIST_REMOVE(pvo, pvo_vlink); if (LIST_EMPTY(vm_page_to_pvoh(m))) vm_page_aflag_clear(m, PGA_WRITEABLE | PGA_EXECUTABLE); } } STAT_MOEA64(moea64_pvo_entries--); STAT_MOEA64(moea64_pvo_remove_calls++); } static void moea64_pvo_remove_from_page(struct pvo_entry *pvo) { vm_page_t pg = NULL; if (pvo->pvo_vaddr & PVO_MANAGED) pg = PHYS_TO_VM_PAGE(PVO_PADDR(pvo)); PV_LOCK(PVO_PADDR(pvo)); moea64_pvo_remove_from_page_locked(pvo, pg); PV_UNLOCK(PVO_PADDR(pvo)); } static struct pvo_entry * moea64_pvo_find_va(pmap_t pm, vm_offset_t va) { struct pvo_entry key; PMAP_LOCK_ASSERT(pm, MA_OWNED); key.pvo_vaddr = va & ~ADDR_POFF; return (RB_FIND(pvo_tree, &pm->pmap_pvo, &key)); } static boolean_t moea64_query_bit(vm_page_t m, uint64_t ptebit) { struct pvo_entry *pvo; int64_t ret; boolean_t rv; /* * See if this bit is stored in the page already. */ if (m->md.mdpg_attrs & ptebit) return (TRUE); /* * Examine each PTE. Sync so that any pending REF/CHG bits are * flushed to the PTEs. */ rv = FALSE; powerpc_sync(); PV_PAGE_LOCK(m); LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) { ret = 0; /* * See if this pvo has a valid PTE. if so, fetch the * REF/CHG bits from the valid PTE. If the appropriate * ptebit is set, return success. */ PMAP_LOCK(pvo->pvo_pmap); if (!(pvo->pvo_vaddr & PVO_DEAD)) ret = moea64_pte_synch(pvo); PMAP_UNLOCK(pvo->pvo_pmap); if (ret > 0) { atomic_set_32(&m->md.mdpg_attrs, ret & (LPTE_CHG | LPTE_REF)); if (ret & ptebit) { rv = TRUE; break; } } } PV_PAGE_UNLOCK(m); return (rv); } static u_int moea64_clear_bit(vm_page_t m, u_int64_t ptebit) { u_int count; struct pvo_entry *pvo; int64_t ret; /* * Sync so that any pending REF/CHG bits are flushed to the PTEs (so * we can reset the right ones). */ powerpc_sync(); /* * For each pvo entry, clear the pte's ptebit. */ count = 0; PV_PAGE_LOCK(m); LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) { ret = 0; PMAP_LOCK(pvo->pvo_pmap); if (!(pvo->pvo_vaddr & PVO_DEAD)) ret = moea64_pte_clear(pvo, ptebit); PMAP_UNLOCK(pvo->pvo_pmap); if (ret > 0 && (ret & ptebit)) count++; } atomic_clear_32(&m->md.mdpg_attrs, ptebit); PV_PAGE_UNLOCK(m); return (count); } boolean_t moea64_dev_direct_mapped(vm_paddr_t pa, vm_size_t size) { struct pvo_entry *pvo, key; vm_offset_t ppa; int error = 0; if (hw_direct_map && mem_valid(pa, size) == 0) return (0); PMAP_LOCK(kernel_pmap); ppa = pa & ~ADDR_POFF; key.pvo_vaddr = DMAP_BASE_ADDRESS + ppa; for (pvo = RB_FIND(pvo_tree, &kernel_pmap->pmap_pvo, &key); ppa < pa + size; ppa += PAGE_SIZE, pvo = RB_NEXT(pvo_tree, &kernel_pmap->pmap_pvo, pvo)) { if (pvo == NULL || PVO_PADDR(pvo) != ppa) { error = EFAULT; break; } } PMAP_UNLOCK(kernel_pmap); return (error); } /* * Map a set of physical memory pages into the kernel virtual * address space. Return a pointer to where it is mapped. This * routine is intended to be used for mapping device memory, * NOT real memory. */ void * moea64_mapdev_attr(vm_paddr_t pa, vm_size_t size, vm_memattr_t ma) { vm_offset_t va, tmpva, ppa, offset; ppa = trunc_page(pa); offset = pa & PAGE_MASK; size = roundup2(offset + size, PAGE_SIZE); va = kva_alloc(size); if (!va) panic("moea64_mapdev: Couldn't alloc kernel virtual memory"); for (tmpva = va; size > 0;) { moea64_kenter_attr(tmpva, ppa, ma); size -= PAGE_SIZE; tmpva += PAGE_SIZE; ppa += PAGE_SIZE; } return ((void *)(va + offset)); } void * moea64_mapdev(vm_paddr_t pa, vm_size_t size) { return moea64_mapdev_attr(pa, size, VM_MEMATTR_DEFAULT); } void moea64_unmapdev(vm_offset_t va, vm_size_t size) { vm_offset_t base, offset; base = trunc_page(va); offset = va & PAGE_MASK; size = roundup2(offset + size, PAGE_SIZE); moea64_qremove(base, atop(size)); kva_free(base, size); } void moea64_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz) { struct pvo_entry *pvo; vm_offset_t lim; vm_paddr_t pa; vm_size_t len; if (__predict_false(pm == NULL)) pm = &curthread->td_proc->p_vmspace->vm_pmap; PMAP_LOCK(pm); while (sz > 0) { lim = round_page(va+1); len = MIN(lim - va, sz); pvo = moea64_pvo_find_va(pm, va & ~ADDR_POFF); if (pvo != NULL && !(pvo->pvo_pte.pa & LPTE_I)) { pa = PVO_PADDR(pvo) | (va & ADDR_POFF); moea64_syncicache(pm, va, pa, len); } va += len; sz -= len; } PMAP_UNLOCK(pm); } void moea64_dumpsys_map(vm_paddr_t pa, size_t sz, void **va) { *va = (void *)(uintptr_t)pa; } extern struct dump_pa dump_map[PHYS_AVAIL_SZ + 1]; void moea64_scan_init() { struct pvo_entry *pvo; vm_offset_t va; int i; if (!do_minidump) { /* Initialize phys. segments for dumpsys(). */ memset(&dump_map, 0, sizeof(dump_map)); mem_regions(&pregions, &pregions_sz, ®ions, ®ions_sz); for (i = 0; i < pregions_sz; i++) { dump_map[i].pa_start = pregions[i].mr_start; dump_map[i].pa_size = pregions[i].mr_size; } return; } /* Virtual segments for minidumps: */ memset(&dump_map, 0, sizeof(dump_map)); /* 1st: kernel .data and .bss. */ dump_map[0].pa_start = trunc_page((uintptr_t)_etext); dump_map[0].pa_size = round_page((uintptr_t)_end) - dump_map[0].pa_start; /* 2nd: msgbuf and tables (see pmap_bootstrap()). */ dump_map[1].pa_start = (vm_paddr_t)(uintptr_t)msgbufp->msg_ptr; dump_map[1].pa_size = round_page(msgbufp->msg_size); /* 3rd: kernel VM. */ va = dump_map[1].pa_start + dump_map[1].pa_size; /* Find start of next chunk (from va). */ while (va < virtual_end) { /* Don't dump the buffer cache. */ if (va >= kmi.buffer_sva && va < kmi.buffer_eva) { va = kmi.buffer_eva; continue; } pvo = moea64_pvo_find_va(kernel_pmap, va & ~ADDR_POFF); if (pvo != NULL && !(pvo->pvo_vaddr & PVO_DEAD)) break; va += PAGE_SIZE; } if (va < virtual_end) { dump_map[2].pa_start = va; va += PAGE_SIZE; /* Find last page in chunk. */ while (va < virtual_end) { /* Don't run into the buffer cache. */ if (va == kmi.buffer_sva) break; pvo = moea64_pvo_find_va(kernel_pmap, va & ~ADDR_POFF); if (pvo == NULL || (pvo->pvo_vaddr & PVO_DEAD)) break; va += PAGE_SIZE; } dump_map[2].pa_size = va - dump_map[2].pa_start; } } #ifdef __powerpc64__ static size_t moea64_scan_pmap() { struct pvo_entry *pvo; vm_paddr_t pa, pa_end; vm_offset_t va, pgva, kstart, kend, kstart_lp, kend_lp; uint64_t lpsize; lpsize = moea64_large_page_size; kstart = trunc_page((vm_offset_t)_etext); kend = round_page((vm_offset_t)_end); kstart_lp = kstart & ~moea64_large_page_mask; kend_lp = (kend + moea64_large_page_mask) & ~moea64_large_page_mask; CTR4(KTR_PMAP, "moea64_scan_pmap: kstart=0x%016lx, kend=0x%016lx, " "kstart_lp=0x%016lx, kend_lp=0x%016lx", kstart, kend, kstart_lp, kend_lp); PMAP_LOCK(kernel_pmap); RB_FOREACH(pvo, pvo_tree, &kernel_pmap->pmap_pvo) { va = pvo->pvo_vaddr; if (va & PVO_DEAD) continue; /* Skip DMAP (except kernel area) */ if (va >= DMAP_BASE_ADDRESS && va <= DMAP_MAX_ADDRESS) { if (va & PVO_LARGE) { pgva = va & ~moea64_large_page_mask; if (pgva < kstart_lp || pgva >= kend_lp) continue; } else { pgva = trunc_page(va); if (pgva < kstart || pgva >= kend) continue; } } pa = PVO_PADDR(pvo); if (va & PVO_LARGE) { pa_end = pa + lpsize; for (; pa < pa_end; pa += PAGE_SIZE) { if (is_dumpable(pa)) dump_add_page(pa); } } else { if (is_dumpable(pa)) dump_add_page(pa); } } PMAP_UNLOCK(kernel_pmap); return (sizeof(struct lpte) * moea64_pteg_count * 8); } static struct dump_context dump_ctx; static void * moea64_dump_pmap_init(unsigned blkpgs) { dump_ctx.ptex = 0; dump_ctx.ptex_end = moea64_pteg_count * 8; dump_ctx.blksz = blkpgs * PAGE_SIZE; return (&dump_ctx); } #else static size_t moea64_scan_pmap() { return (0); } static void * moea64_dump_pmap_init(unsigned blkpgs) { return (NULL); } #endif #ifdef __powerpc64__ static void moea64_map_range(vm_offset_t va, vm_paddr_t pa, vm_size_t npages) { for (; npages > 0; --npages) { if (moea64_large_page_size != 0 && (pa & moea64_large_page_mask) == 0 && (va & moea64_large_page_mask) == 0 && npages >= (moea64_large_page_size >> PAGE_SHIFT)) { PMAP_LOCK(kernel_pmap); moea64_kenter_large(va, pa, 0, 0); PMAP_UNLOCK(kernel_pmap); pa += moea64_large_page_size; va += moea64_large_page_size; npages -= (moea64_large_page_size >> PAGE_SHIFT) - 1; } else { moea64_kenter(va, pa); pa += PAGE_SIZE; va += PAGE_SIZE; } } } static void moea64_page_array_startup(long pages) { long dom_pages[MAXMEMDOM]; vm_paddr_t pa; vm_offset_t va, vm_page_base; vm_size_t needed, size; long page; int domain; int i; vm_page_base = 0xd000000000000000ULL; /* Short-circuit single-domain systems. */ if (vm_ndomains == 1) { size = round_page(pages * sizeof(struct vm_page)); pa = vm_phys_early_alloc(0, size); vm_page_base = moea64_map(&vm_page_base, pa, pa + size, VM_PROT_READ | VM_PROT_WRITE); vm_page_array_size = pages; vm_page_array = (vm_page_t)vm_page_base; return; } page = 0; for (i = 0; i < MAXMEMDOM; i++) dom_pages[i] = 0; /* Now get the number of pages required per domain. */ for (i = 0; i < vm_phys_nsegs; i++) { domain = vm_phys_segs[i].domain; KASSERT(domain < MAXMEMDOM, ("Invalid vm_phys_segs NUMA domain %d!\n", domain)); /* Get size of vm_page_array needed for this segment. */ size = btoc(vm_phys_segs[i].end - vm_phys_segs[i].start); dom_pages[domain] += size; } for (i = 0; phys_avail[i + 1] != 0; i+= 2) { domain = _vm_phys_domain(phys_avail[i]); KASSERT(domain < MAXMEMDOM, ("Invalid phys_avail NUMA domain %d!\n", domain)); size = btoc(phys_avail[i + 1] - phys_avail[i]); dom_pages[domain] += size; } /* * Map in chunks that can get us all 16MB pages. There will be some * overlap between domains, but that's acceptable for now. */ vm_page_array_size = 0; va = vm_page_base; for (i = 0; i < MAXMEMDOM && vm_page_array_size < pages; i++) { if (dom_pages[i] == 0) continue; size = ulmin(pages - vm_page_array_size, dom_pages[i]); size = round_page(size * sizeof(struct vm_page)); needed = size; size = roundup2(size, moea64_large_page_size); pa = vm_phys_early_alloc(i, size); vm_page_array_size += size / sizeof(struct vm_page); moea64_map_range(va, pa, size >> PAGE_SHIFT); /* Scoot up domain 0, to reduce the domain page overlap. */ if (i == 0) vm_page_base += size - needed; va += size; } vm_page_array = (vm_page_t)vm_page_base; vm_page_array_size = pages; } #endif static int64_t moea64_null_method(void) { return (0); } static int64_t moea64_pte_replace_default(struct pvo_entry *pvo, int flags) { int64_t refchg; refchg = moea64_pte_unset(pvo); moea64_pte_insert(pvo); return (refchg); } struct moea64_funcs *moea64_ops; #define DEFINE_OEA64_IFUNC(ret, func, args, def) \ DEFINE_IFUNC(, ret, moea64_##func, args) { \ moea64_##func##_t f; \ if (moea64_ops == NULL) \ return ((moea64_##func##_t)def); \ f = moea64_ops->func; \ return (f != NULL ? f : (moea64_##func##_t)def);\ } DEFINE_OEA64_IFUNC(int64_t, pte_replace, (struct pvo_entry *, int), moea64_pte_replace_default) DEFINE_OEA64_IFUNC(int64_t, pte_insert, (struct pvo_entry *), moea64_null_method) DEFINE_OEA64_IFUNC(int64_t, pte_unset, (struct pvo_entry *), moea64_null_method) DEFINE_OEA64_IFUNC(int64_t, pte_clear, (struct pvo_entry *, uint64_t), moea64_null_method) DEFINE_OEA64_IFUNC(int64_t, pte_synch, (struct pvo_entry *), moea64_null_method) Index: head/sys/powerpc/aim/mmu_radix.c =================================================================== --- head/sys/powerpc/aim/mmu_radix.c (revision 366710) +++ head/sys/powerpc/aim/mmu_radix.c (revision 366711) @@ -1,6390 +1,6391 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2018 Matthew Macy * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INVARIANTS #include #endif #define PPC_BITLSHIFT(bit) (sizeof(long)*NBBY - 1 - (bit)) #define PPC_BIT(bit) (1UL << PPC_BITLSHIFT(bit)) #define PPC_BITLSHIFT_VAL(val, bit) ((val) << PPC_BITLSHIFT(bit)) #include "opt_ddb.h" #ifdef DDB static void pmap_pte_walk(pml1_entry_t *l1, vm_offset_t va); #endif #define PG_W RPTE_WIRED #define PG_V RPTE_VALID #define PG_MANAGED RPTE_MANAGED #define PG_PROMOTED RPTE_PROMOTED #define PG_M RPTE_C #define PG_A RPTE_R #define PG_X RPTE_EAA_X #define PG_RW RPTE_EAA_W #define PG_PTE_CACHE RPTE_ATTR_MASK #define RPTE_SHIFT 9 #define NLS_MASK ((1UL<<5)-1) #define RPTE_ENTRIES (1UL<> L3_PAGE_SIZE_SHIFT); } static __inline vm_pindex_t pmap_pml3e_index(vm_offset_t va) { return ((va >> L3_PAGE_SIZE_SHIFT) & RPTE_MASK); } static __inline vm_pindex_t pmap_pml2e_index(vm_offset_t va) { return ((va >> L2_PAGE_SIZE_SHIFT) & RPTE_MASK); } static __inline vm_pindex_t pmap_pml1e_index(vm_offset_t va) { return ((va & PG_FRAME) >> L1_PAGE_SIZE_SHIFT); } /* Return various clipped indexes for a given VA */ static __inline vm_pindex_t pmap_pte_index(vm_offset_t va) { return ((va >> PAGE_SHIFT) & RPTE_MASK); } /* Return a pointer to the PT slot that corresponds to a VA */ static __inline pt_entry_t * pmap_l3e_to_pte(pt_entry_t *l3e, vm_offset_t va) { pt_entry_t *pte; vm_paddr_t ptepa; ptepa = (*l3e & NLB_MASK); pte = (pt_entry_t *)PHYS_TO_DMAP(ptepa); return (&pte[pmap_pte_index(va)]); } /* Return a pointer to the PD slot that corresponds to a VA */ static __inline pt_entry_t * pmap_l2e_to_l3e(pt_entry_t *l2e, vm_offset_t va) { pt_entry_t *l3e; vm_paddr_t l3pa; l3pa = (*l2e & NLB_MASK); l3e = (pml3_entry_t *)PHYS_TO_DMAP(l3pa); return (&l3e[pmap_pml3e_index(va)]); } /* Return a pointer to the PD slot that corresponds to a VA */ static __inline pt_entry_t * pmap_l1e_to_l2e(pt_entry_t *l1e, vm_offset_t va) { pt_entry_t *l2e; vm_paddr_t l2pa; l2pa = (*l1e & NLB_MASK); l2e = (pml2_entry_t *)PHYS_TO_DMAP(l2pa); return (&l2e[pmap_pml2e_index(va)]); } static __inline pml1_entry_t * pmap_pml1e(pmap_t pmap, vm_offset_t va) { return (&pmap->pm_pml1[pmap_pml1e_index(va)]); } static pt_entry_t * pmap_pml2e(pmap_t pmap, vm_offset_t va) { pt_entry_t *l1e; l1e = pmap_pml1e(pmap, va); if (l1e == NULL || (*l1e & RPTE_VALID) == 0) return (NULL); return (pmap_l1e_to_l2e(l1e, va)); } static __inline pt_entry_t * pmap_pml3e(pmap_t pmap, vm_offset_t va) { pt_entry_t *l2e; l2e = pmap_pml2e(pmap, va); if (l2e == NULL || (*l2e & RPTE_VALID) == 0) return (NULL); return (pmap_l2e_to_l3e(l2e, va)); } static __inline pt_entry_t * pmap_pte(pmap_t pmap, vm_offset_t va) { pt_entry_t *l3e; l3e = pmap_pml3e(pmap, va); if (l3e == NULL || (*l3e & RPTE_VALID) == 0) return (NULL); return (pmap_l3e_to_pte(l3e, va)); } int nkpt = 64; SYSCTL_INT(_machdep, OID_AUTO, nkpt, CTLFLAG_RD, &nkpt, 0, "Number of kernel page table pages allocated on bootup"); vm_paddr_t dmaplimit; SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters"); static int pg_ps_enabled = 1; SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &pg_ps_enabled, 0, "Are large page mappings enabled?"); #ifdef INVARIANTS #define VERBOSE_PMAP 0 #define VERBOSE_PROTECT 0 static int pmap_logging; SYSCTL_INT(_vm_pmap, OID_AUTO, pmap_logging, CTLFLAG_RWTUN, &pmap_logging, 0, "verbose debug logging"); #endif static u_int64_t KPTphys; /* phys addr of kernel level 1 */ //static vm_paddr_t KERNend; /* phys addr of end of bootstrap data */ static vm_offset_t qframe = 0; static struct mtx qframe_mtx; void mmu_radix_activate(struct thread *); void mmu_radix_advise(pmap_t, vm_offset_t, vm_offset_t, int); void mmu_radix_align_superpage(vm_object_t, vm_ooffset_t, vm_offset_t *, vm_size_t); void mmu_radix_clear_modify(vm_page_t); void mmu_radix_copy(pmap_t, pmap_t, vm_offset_t, vm_size_t, vm_offset_t); int mmu_radix_decode_kernel_ptr(vm_offset_t, int *, vm_offset_t *); int mmu_radix_enter(pmap_t, vm_offset_t, vm_page_t, vm_prot_t, u_int, int8_t); void mmu_radix_enter_object(pmap_t, vm_offset_t, vm_offset_t, vm_page_t, vm_prot_t); void mmu_radix_enter_quick(pmap_t, vm_offset_t, vm_page_t, vm_prot_t); vm_paddr_t mmu_radix_extract(pmap_t pmap, vm_offset_t va); vm_page_t mmu_radix_extract_and_hold(pmap_t, vm_offset_t, vm_prot_t); void mmu_radix_kenter(vm_offset_t, vm_paddr_t); vm_paddr_t mmu_radix_kextract(vm_offset_t); void mmu_radix_kremove(vm_offset_t); boolean_t mmu_radix_is_modified(vm_page_t); boolean_t mmu_radix_is_prefaultable(pmap_t, vm_offset_t); boolean_t mmu_radix_is_referenced(vm_page_t); void mmu_radix_object_init_pt(pmap_t, vm_offset_t, vm_object_t, vm_pindex_t, vm_size_t); boolean_t mmu_radix_page_exists_quick(pmap_t, vm_page_t); void mmu_radix_page_init(vm_page_t); boolean_t mmu_radix_page_is_mapped(vm_page_t m); void mmu_radix_page_set_memattr(vm_page_t, vm_memattr_t); int mmu_radix_page_wired_mappings(vm_page_t); int mmu_radix_pinit(pmap_t); void mmu_radix_protect(pmap_t, vm_offset_t, vm_offset_t, vm_prot_t); bool mmu_radix_ps_enabled(pmap_t); void mmu_radix_qenter(vm_offset_t, vm_page_t *, int); void mmu_radix_qremove(vm_offset_t, int); vm_offset_t mmu_radix_quick_enter_page(vm_page_t); void mmu_radix_quick_remove_page(vm_offset_t); boolean_t mmu_radix_ts_referenced(vm_page_t); void mmu_radix_release(pmap_t); void mmu_radix_remove(pmap_t, vm_offset_t, vm_offset_t); void mmu_radix_remove_all(vm_page_t); void mmu_radix_remove_pages(pmap_t); void mmu_radix_remove_write(vm_page_t); void mmu_radix_unwire(pmap_t, vm_offset_t, vm_offset_t); void mmu_radix_zero_page(vm_page_t); void mmu_radix_zero_page_area(vm_page_t, int, int); int mmu_radix_change_attr(vm_offset_t, vm_size_t, vm_memattr_t); void mmu_radix_page_array_startup(long pages); #include "mmu_oea64.h" /* * Kernel MMU interface */ static void mmu_radix_bootstrap(vm_offset_t, vm_offset_t); static void mmu_radix_copy_page(vm_page_t, vm_page_t); static void mmu_radix_copy_pages(vm_page_t *ma, vm_offset_t a_offset, vm_page_t *mb, vm_offset_t b_offset, int xfersize); static void mmu_radix_growkernel(vm_offset_t); static void mmu_radix_init(void); static int mmu_radix_mincore(pmap_t, vm_offset_t, vm_paddr_t *); static vm_offset_t mmu_radix_map(vm_offset_t *, vm_paddr_t, vm_paddr_t, int); static void mmu_radix_pinit0(pmap_t); static void *mmu_radix_mapdev(vm_paddr_t, vm_size_t); static void *mmu_radix_mapdev_attr(vm_paddr_t, vm_size_t, vm_memattr_t); static void mmu_radix_unmapdev(vm_offset_t, vm_size_t); static void mmu_radix_kenter_attr(vm_offset_t, vm_paddr_t, vm_memattr_t ma); static boolean_t mmu_radix_dev_direct_mapped(vm_paddr_t, vm_size_t); static void mmu_radix_dumpsys_map(vm_paddr_t pa, size_t sz, void **va); static void mmu_radix_scan_init(void); static void mmu_radix_cpu_bootstrap(int ap); static void mmu_radix_tlbie_all(void); static struct pmap_funcs mmu_radix_methods = { .bootstrap = mmu_radix_bootstrap, .copy_page = mmu_radix_copy_page, .copy_pages = mmu_radix_copy_pages, .cpu_bootstrap = mmu_radix_cpu_bootstrap, .growkernel = mmu_radix_growkernel, .init = mmu_radix_init, .map = mmu_radix_map, .mincore = mmu_radix_mincore, .pinit = mmu_radix_pinit, .pinit0 = mmu_radix_pinit0, .mapdev = mmu_radix_mapdev, .mapdev_attr = mmu_radix_mapdev_attr, .unmapdev = mmu_radix_unmapdev, .kenter_attr = mmu_radix_kenter_attr, .dev_direct_mapped = mmu_radix_dev_direct_mapped, .dumpsys_pa_init = mmu_radix_scan_init, .dumpsys_map_chunk = mmu_radix_dumpsys_map, .page_is_mapped = mmu_radix_page_is_mapped, .ps_enabled = mmu_radix_ps_enabled, .object_init_pt = mmu_radix_object_init_pt, .protect = mmu_radix_protect, /* pmap dispatcher interface */ .clear_modify = mmu_radix_clear_modify, .copy = mmu_radix_copy, .enter = mmu_radix_enter, .enter_object = mmu_radix_enter_object, .enter_quick = mmu_radix_enter_quick, .extract = mmu_radix_extract, .extract_and_hold = mmu_radix_extract_and_hold, .is_modified = mmu_radix_is_modified, .is_prefaultable = mmu_radix_is_prefaultable, .is_referenced = mmu_radix_is_referenced, .ts_referenced = mmu_radix_ts_referenced, .page_exists_quick = mmu_radix_page_exists_quick, .page_init = mmu_radix_page_init, .page_wired_mappings = mmu_radix_page_wired_mappings, .qenter = mmu_radix_qenter, .qremove = mmu_radix_qremove, .release = mmu_radix_release, .remove = mmu_radix_remove, .remove_all = mmu_radix_remove_all, .remove_write = mmu_radix_remove_write, .unwire = mmu_radix_unwire, .zero_page = mmu_radix_zero_page, .zero_page_area = mmu_radix_zero_page_area, .activate = mmu_radix_activate, .quick_enter_page = mmu_radix_quick_enter_page, .quick_remove_page = mmu_radix_quick_remove_page, .page_set_memattr = mmu_radix_page_set_memattr, .page_array_startup = mmu_radix_page_array_startup, /* Internal interfaces */ .kenter = mmu_radix_kenter, .kextract = mmu_radix_kextract, .kremove = mmu_radix_kremove, .change_attr = mmu_radix_change_attr, .decode_kernel_ptr = mmu_radix_decode_kernel_ptr, .tlbie_all = mmu_radix_tlbie_all, }; MMU_DEF(mmu_radix, MMU_TYPE_RADIX, mmu_radix_methods); static boolean_t pmap_demote_l3e_locked(pmap_t pmap, pml3_entry_t *l3e, vm_offset_t va, struct rwlock **lockp); static boolean_t pmap_demote_l3e(pmap_t pmap, pml3_entry_t *pde, vm_offset_t va); static int pmap_unuse_pt(pmap_t, vm_offset_t, pml3_entry_t, struct spglist *); static int pmap_remove_l3e(pmap_t pmap, pml3_entry_t *pdq, vm_offset_t sva, struct spglist *free, struct rwlock **lockp); static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva, pml3_entry_t ptepde, struct spglist *free, struct rwlock **lockp); static vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va); static bool pmap_remove_page(pmap_t pmap, vm_offset_t va, pml3_entry_t *pde, struct spglist *free); static bool pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, pml3_entry_t *l3e, struct spglist *free, struct rwlock **lockp); static bool pmap_pv_insert_l3e(pmap_t pmap, vm_offset_t va, pml3_entry_t l3e, u_int flags, struct rwlock **lockp); #if VM_NRESERVLEVEL > 0 static void pmap_pv_promote_l3e(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, struct rwlock **lockp); #endif static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va); static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte); static vm_page_t mmu_radix_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp, bool *invalidate); static bool pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, struct rwlock **lockp); static int pmap_enter_l3e(pmap_t pmap, vm_offset_t va, pml3_entry_t newpde, u_int flags, vm_page_t m, struct rwlock **lockp); static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp); static void free_pv_chunk(struct pv_chunk *pc); static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp); static vm_page_t pmap_allocl3e(pmap_t pmap, vm_offset_t va, struct rwlock **lockp); static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, struct rwlock **lockp); static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free); static boolean_t pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free); static void pmap_invalidate_page(pmap_t pmap, vm_offset_t start); static void pmap_invalidate_all(pmap_t pmap); static int pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode, bool flush); /* * Internal flags for pmap_enter()'s helper functions. */ #define PMAP_ENTER_NORECLAIM 0x1000000 /* Don't reclaim PV entries. */ #define PMAP_ENTER_NOREPLACE 0x2000000 /* Don't replace mappings. */ #define UNIMPLEMENTED() panic("%s not implemented", __func__) #define UNTESTED() panic("%s not yet tested", __func__) /* Number of supported PID bits */ static unsigned int isa3_pid_bits; /* PID to start allocating from */ static unsigned int isa3_base_pid; #define PROCTAB_SIZE_SHIFT (isa3_pid_bits + 4) #define PROCTAB_ENTRIES (1ul << isa3_pid_bits) /* * Map of physical memory regions. */ static struct mem_region *regions, *pregions; static struct numa_mem_region *numa_pregions; static u_int phys_avail_count; static int regions_sz, pregions_sz, numa_pregions_sz; static struct pate *isa3_parttab; static struct prte *isa3_proctab; static vmem_t *asid_arena; extern void bs_remap_earlyboot(void); #define RADIX_PGD_SIZE_SHIFT 16 #define RADIX_PGD_SIZE (1UL << RADIX_PGD_SIZE_SHIFT) #define RADIX_PGD_INDEX_SHIFT (RADIX_PGD_SIZE_SHIFT-3) #define NL2EPG (PAGE_SIZE/sizeof(pml2_entry_t)) #define NL3EPG (PAGE_SIZE/sizeof(pml3_entry_t)) #define NUPML1E (RADIX_PGD_SIZE/sizeof(uint64_t)) /* number of userland PML1 pages */ #define NUPDPE (NUPML1E * NL2EPG)/* number of userland PDP pages */ #define NUPDE (NUPDPE * NL3EPG) /* number of userland PD entries */ /* POWER9 only permits a 64k partition table size. */ #define PARTTAB_SIZE_SHIFT 16 #define PARTTAB_SIZE (1UL << PARTTAB_SIZE_SHIFT) #define PARTTAB_HR (1UL << 63) /* host uses radix */ #define PARTTAB_GR (1UL << 63) /* guest uses radix must match host */ /* TLB flush actions. Used as argument to tlbiel_all() */ enum { TLB_INVAL_SCOPE_LPID = 0, /* invalidate TLBs for current LPID */ TLB_INVAL_SCOPE_GLOBAL = 1, /* invalidate all TLBs */ }; #define NPV_LIST_LOCKS MAXCPU static int pmap_initialized; static vm_paddr_t proctab0pa; static vm_paddr_t parttab_phys; CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE); /* * Data for the pv entry allocation mechanism. * Updates to pv_invl_gen are protected by the pv_list_locks[] * elements, but reads are not. */ static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks); static struct mtx __exclusive_cache_line pv_chunks_mutex; static struct rwlock __exclusive_cache_line pv_list_locks[NPV_LIST_LOCKS]; static struct md_page *pv_table; static struct md_page pv_dummy; #ifdef PV_STATS #define PV_STAT(x) do { x ; } while (0) #else #define PV_STAT(x) do { } while (0) #endif #define pa_radix_index(pa) ((pa) >> L3_PAGE_SIZE_SHIFT) #define pa_to_pvh(pa) (&pv_table[pa_radix_index(pa)]) #define PHYS_TO_PV_LIST_LOCK(pa) \ (&pv_list_locks[pa_radix_index(pa) % NPV_LIST_LOCKS]) #define CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa) do { \ struct rwlock **_lockp = (lockp); \ struct rwlock *_new_lock; \ \ _new_lock = PHYS_TO_PV_LIST_LOCK(pa); \ if (_new_lock != *_lockp) { \ if (*_lockp != NULL) \ rw_wunlock(*_lockp); \ *_lockp = _new_lock; \ rw_wlock(*_lockp); \ } \ } while (0) #define CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m) \ CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m)) #define RELEASE_PV_LIST_LOCK(lockp) do { \ struct rwlock **_lockp = (lockp); \ \ if (*_lockp != NULL) { \ rw_wunlock(*_lockp); \ *_lockp = NULL; \ } \ } while (0) #define VM_PAGE_TO_PV_LIST_LOCK(m) \ PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m)) /* * We support 52 bits, hence: * bits 52 - 31 = 21, 0b10101 * RTS encoding details * bits 0 - 3 of rts -> bits 6 - 8 unsigned long * bits 4 - 5 of rts -> bits 62 - 63 of unsigned long */ #define RTS_SIZE ((0x2UL << 61) | (0x5UL << 5)) static int powernv_enabled = 1; static __always_inline void tlbiel_radix_set_isa300(uint32_t set, uint32_t is, uint32_t pid, uint32_t ric, uint32_t prs) { uint64_t rb; uint64_t rs; rb = PPC_BITLSHIFT_VAL(set, 51) | PPC_BITLSHIFT_VAL(is, 53); rs = PPC_BITLSHIFT_VAL((uint64_t)pid, 31); __asm __volatile(PPC_TLBIEL(%0, %1, %2, %3, 1) : : "r"(rb), "r"(rs), "i"(ric), "i"(prs) : "memory"); } static void tlbiel_flush_isa3(uint32_t num_sets, uint32_t is) { uint32_t set; __asm __volatile("ptesync": : :"memory"); /* * Flush the first set of the TLB, and the entire Page Walk Cache * and partition table entries. Then flush the remaining sets of the * TLB. */ tlbiel_radix_set_isa300(0, is, 0, RIC_FLUSH_ALL, 0); for (set = 1; set < num_sets; set++) tlbiel_radix_set_isa300(set, is, 0, RIC_FLUSH_TLB, 0); /* Do the same for process scoped entries. */ tlbiel_radix_set_isa300(0, is, 0, RIC_FLUSH_ALL, 1); for (set = 1; set < num_sets; set++) tlbiel_radix_set_isa300(set, is, 0, RIC_FLUSH_TLB, 1); __asm __volatile("ptesync": : :"memory"); } static void mmu_radix_tlbiel_flush(int scope) { int is; MPASS(scope == TLB_INVAL_SCOPE_LPID || scope == TLB_INVAL_SCOPE_GLOBAL); is = scope + 2; tlbiel_flush_isa3(POWER9_TLB_SETS_RADIX, is); __asm __volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory"); } static void mmu_radix_tlbie_all() { /* TODO: LPID invalidate */ mmu_radix_tlbiel_flush(TLB_INVAL_SCOPE_GLOBAL); } static void mmu_radix_init_amor(void) { /* * In HV mode, we init AMOR (Authority Mask Override Register) so that * the hypervisor and guest can setup IAMR (Instruction Authority Mask * Register), enable key 0 and set it to 1. * * AMOR = 0b1100 .... 0000 (Mask for key 0 is 11) */ mtspr(SPR_AMOR, (3ul << 62)); } static void mmu_radix_init_iamr(void) { /* * Radix always uses key0 of the IAMR to determine if an access is * allowed. We set bit 0 (IBM bit 1) of key0, to prevent instruction * fetch. */ mtspr(SPR_IAMR, (1ul << 62)); } static void mmu_radix_pid_set(pmap_t pmap) { mtspr(SPR_PID, pmap->pm_pid); isync(); } /* Quick sort callout for comparing physical addresses. */ static int pa_cmp(const void *a, const void *b) { const vm_paddr_t *pa = a, *pb = b; if (*pa < *pb) return (-1); else if (*pa > *pb) return (1); else return (0); } #define pte_load_store(ptep, pte) atomic_swap_long(ptep, pte) #define pte_load_clear(ptep) atomic_swap_long(ptep, 0) #define pte_store(ptep, pte) do { \ MPASS((pte) & (RPTE_EAA_R | RPTE_EAA_W | RPTE_EAA_X)); \ *(u_long *)(ptep) = (u_long)((pte) | PG_V | RPTE_LEAF); \ } while (0) /* * NB: should only be used for adding directories - not for direct mappings */ #define pde_store(ptep, pa) do { \ *(u_long *)(ptep) = (u_long)(pa|RPTE_VALID|RPTE_SHIFT); \ } while (0) #define pte_clear(ptep) do { \ *(u_long *)(ptep) = (u_long)(0); \ } while (0) #define PMAP_PDE_SUPERPAGE (1 << 8) /* supports 2MB superpages */ /* * Promotion to a 2MB (PDE) page mapping requires that the corresponding 4KB * (PTE) page mappings have identical settings for the following fields: */ #define PG_PTE_PROMOTE (PG_X | PG_MANAGED | PG_W | PG_PTE_CACHE | \ PG_M | PG_A | RPTE_EAA_MASK | PG_V) static __inline void pmap_resident_count_inc(pmap_t pmap, int count) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); pmap->pm_stats.resident_count += count; } static __inline void pmap_resident_count_dec(pmap_t pmap, int count) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT(pmap->pm_stats.resident_count >= count, ("pmap %p resident count underflow %ld %d", pmap, pmap->pm_stats.resident_count, count)); pmap->pm_stats.resident_count -= count; } static void pagezero(vm_offset_t va) { va = trunc_page(va); bzero((void *)va, PAGE_SIZE); } static uint64_t allocpages(int n) { u_int64_t ret; ret = moea64_bootstrap_alloc(n * PAGE_SIZE, PAGE_SIZE); for (int i = 0; i < n; i++) pagezero(PHYS_TO_DMAP(ret + i * PAGE_SIZE)); return (ret); } static pt_entry_t * kvtopte(vm_offset_t va) { pt_entry_t *l3e; l3e = pmap_pml3e(kernel_pmap, va); if ((*l3e & RPTE_VALID) == 0) return (NULL); return (pmap_l3e_to_pte(l3e, va)); } void mmu_radix_kenter(vm_offset_t va, vm_paddr_t pa) { pt_entry_t *pte; pte = kvtopte(va); MPASS(pte != NULL); *pte = pa | RPTE_VALID | RPTE_LEAF | RPTE_EAA_R | RPTE_EAA_W | \ RPTE_EAA_P | PG_M | PG_A; } bool mmu_radix_ps_enabled(pmap_t pmap) { return (pg_ps_enabled && (pmap->pm_flags & PMAP_PDE_SUPERPAGE) != 0); } static pt_entry_t * pmap_nofault_pte(pmap_t pmap, vm_offset_t va, int *is_l3e) { pml3_entry_t *l3e; pt_entry_t *pte; va &= PG_PS_FRAME; l3e = pmap_pml3e(pmap, va); if (l3e == NULL || (*l3e & PG_V) == 0) return (NULL); if (*l3e & RPTE_LEAF) { *is_l3e = 1; return (l3e); } *is_l3e = 0; va &= PG_FRAME; pte = pmap_l3e_to_pte(l3e, va); if (pte == NULL || (*pte & PG_V) == 0) return (NULL); return (pte); } int pmap_nofault(pmap_t pmap, vm_offset_t va, vm_prot_t flags) { pt_entry_t *pte; pt_entry_t startpte, origpte, newpte; vm_page_t m; int is_l3e; startpte = 0; retry: if ((pte = pmap_nofault_pte(pmap, va, &is_l3e)) == NULL) return (KERN_INVALID_ADDRESS); origpte = newpte = *pte; if (startpte == 0) { startpte = origpte; if (((flags & VM_PROT_WRITE) && (startpte & PG_M)) || ((flags & VM_PROT_READ) && (startpte & PG_A))) { pmap_invalidate_all(pmap); #ifdef INVARIANTS if (VERBOSE_PMAP || pmap_logging) printf("%s(%p, %#lx, %#x) (%#lx) -- invalidate all\n", __func__, pmap, va, flags, origpte); #endif return (KERN_FAILURE); } } #ifdef INVARIANTS if (VERBOSE_PMAP || pmap_logging) printf("%s(%p, %#lx, %#x) (%#lx)\n", __func__, pmap, va, flags, origpte); #endif PMAP_LOCK(pmap); if ((pte = pmap_nofault_pte(pmap, va, &is_l3e)) == NULL || *pte != origpte) { PMAP_UNLOCK(pmap); return (KERN_FAILURE); } m = PHYS_TO_VM_PAGE(newpte & PG_FRAME); MPASS(m != NULL); switch (flags) { case VM_PROT_READ: if ((newpte & (RPTE_EAA_R|RPTE_EAA_X)) == 0) goto protfail; newpte |= PG_A; vm_page_aflag_set(m, PGA_REFERENCED); break; case VM_PROT_WRITE: if ((newpte & RPTE_EAA_W) == 0) goto protfail; if (is_l3e) goto protfail; newpte |= PG_M; vm_page_dirty(m); break; case VM_PROT_EXECUTE: if ((newpte & RPTE_EAA_X) == 0) goto protfail; newpte |= PG_A; vm_page_aflag_set(m, PGA_REFERENCED); break; } if (!atomic_cmpset_long(pte, origpte, newpte)) goto retry; ptesync(); PMAP_UNLOCK(pmap); if (startpte == newpte) return (KERN_FAILURE); return (0); protfail: PMAP_UNLOCK(pmap); return (KERN_PROTECTION_FAILURE); } /* * Returns TRUE if the given page is mapped individually or as part of * a 2mpage. Otherwise, returns FALSE. */ boolean_t mmu_radix_page_is_mapped(vm_page_t m) { struct rwlock *lock; boolean_t rv; if ((m->oflags & VPO_UNMANAGED) != 0) return (FALSE); lock = VM_PAGE_TO_PV_LIST_LOCK(m); rw_rlock(lock); rv = !TAILQ_EMPTY(&m->md.pv_list) || ((m->flags & PG_FICTITIOUS) == 0 && !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list)); rw_runlock(lock); return (rv); } /* * Determine the appropriate bits to set in a PTE or PDE for a specified * caching mode. */ static int pmap_cache_bits(vm_memattr_t ma) { if (ma != VM_MEMATTR_DEFAULT) { switch (ma) { case VM_MEMATTR_UNCACHEABLE: return (RPTE_ATTR_GUARDEDIO); case VM_MEMATTR_CACHEABLE: return (RPTE_ATTR_MEM); case VM_MEMATTR_WRITE_BACK: case VM_MEMATTR_PREFETCHABLE: case VM_MEMATTR_WRITE_COMBINING: return (RPTE_ATTR_UNGUARDEDIO); } } return (0); } static void pmap_invalidate_page(pmap_t pmap, vm_offset_t start) { ptesync(); if (pmap == kernel_pmap) radix_tlbie_invlpg_kernel_4k(start); else radix_tlbie_invlpg_user_4k(pmap->pm_pid, start); ttusync(); } static void pmap_invalidate_page_2m(pmap_t pmap, vm_offset_t start) { ptesync(); if (pmap == kernel_pmap) radix_tlbie_invlpg_kernel_2m(start); else radix_tlbie_invlpg_user_2m(pmap->pm_pid, start); ttusync(); } static void pmap_invalidate_pwc(pmap_t pmap) { ptesync(); if (pmap == kernel_pmap) radix_tlbie_invlpwc_kernel(); else radix_tlbie_invlpwc_user(pmap->pm_pid); ttusync(); } static void pmap_invalidate_range(pmap_t pmap, vm_offset_t start, vm_offset_t end) { if (((start - end) >> PAGE_SHIFT) > 8) { pmap_invalidate_all(pmap); return; } ptesync(); if (pmap == kernel_pmap) { while (start < end) { radix_tlbie_invlpg_kernel_4k(start); start += PAGE_SIZE; } } else { while (start < end) { radix_tlbie_invlpg_user_4k(pmap->pm_pid, start); start += PAGE_SIZE; } } ttusync(); } static void pmap_invalidate_all(pmap_t pmap) { ptesync(); if (pmap == kernel_pmap) radix_tlbie_flush_kernel(); else radix_tlbie_flush_user(pmap->pm_pid); ttusync(); } static void pmap_invalidate_l3e_page(pmap_t pmap, vm_offset_t va, pml3_entry_t l3e) { /* * When the PDE has PG_PROMOTED set, the 2MB page mapping was created * by a promotion that did not invalidate the 512 4KB page mappings * that might exist in the TLB. Consequently, at this point, the TLB * may hold both 4KB and 2MB page mappings for the address range [va, * va + L3_PAGE_SIZE). Therefore, the entire range must be invalidated here. * In contrast, when PG_PROMOTED is clear, the TLB will not hold any * 4KB page mappings for the address range [va, va + L3_PAGE_SIZE), and so a * single INVLPG suffices to invalidate the 2MB page mapping from the * TLB. */ ptesync(); if ((l3e & PG_PROMOTED) != 0) pmap_invalidate_range(pmap, va, va + L3_PAGE_SIZE - 1); else pmap_invalidate_page_2m(pmap, va); pmap_invalidate_pwc(pmap); } static __inline struct pv_chunk * pv_to_chunk(pv_entry_t pv) { return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK)); } #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap) #define PC_FREE0 0xfffffffffffffffful #define PC_FREE1 0x3ffffffffffffffful static const uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1 }; /* * Ensure that the number of spare PV entries in the specified pmap meets or * exceeds the given count, "needed". * * The given PV list lock may be released. */ static void reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp) { struct pch new_tail; struct pv_chunk *pc; vm_page_t m; int avail, free; bool reclaimed; PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL")); /* * Newly allocated PV chunks must be stored in a private list until * the required number of PV chunks have been allocated. Otherwise, * reclaim_pv_chunk() could recycle one of these chunks. In * contrast, these chunks must be added to the pmap upon allocation. */ TAILQ_INIT(&new_tail); retry: avail = 0; TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) { // if ((cpu_feature2 & CPUID2_POPCNT) == 0) bit_count((bitstr_t *)pc->pc_map, 0, sizeof(pc->pc_map) * NBBY, &free); #if 0 free = popcnt_pc_map_pq(pc->pc_map); #endif if (free == 0) break; avail += free; if (avail >= needed) break; } for (reclaimed = false; avail < needed; avail += _NPCPV) { m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED); if (m == NULL) { m = reclaim_pv_chunk(pmap, lockp); if (m == NULL) goto retry; reclaimed = true; } PV_STAT(atomic_add_int(&pc_chunk_count, 1)); PV_STAT(atomic_add_int(&pc_chunk_allocs, 1)); pc = (void *)PHYS_TO_DMAP(m->phys_addr); pc->pc_pmap = pmap; pc->pc_map[0] = PC_FREE0; pc->pc_map[1] = PC_FREE1; TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru); PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV)); /* * The reclaim might have freed a chunk from the current pmap. * If that chunk contained available entries, we need to * re-count the number of available entries. */ if (reclaimed) goto retry; } if (!TAILQ_EMPTY(&new_tail)) { mtx_lock(&pv_chunks_mutex); TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru); mtx_unlock(&pv_chunks_mutex); } } /* * First find and then remove the pv entry for the specified pmap and virtual * address from the specified pv list. Returns the pv entry if found and NULL * otherwise. This operation can be performed on pv lists for either 4KB or * 2MB page mappings. */ static __inline pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) { pv_entry_t pv; TAILQ_FOREACH(pv, &pvh->pv_list, pv_link) { #ifdef INVARIANTS if (PV_PMAP(pv) == NULL) { printf("corrupted pv_chunk/pv %p\n", pv); printf("pv_chunk: %64D\n", pv_to_chunk(pv), ":"); } MPASS(PV_PMAP(pv) != NULL); MPASS(pv->pv_va != 0); #endif if (pmap == PV_PMAP(pv) && va == pv->pv_va) { TAILQ_REMOVE(&pvh->pv_list, pv, pv_link); pvh->pv_gen++; break; } } return (pv); } /* * After demotion from a 2MB page mapping to 512 4KB page mappings, * destroy the pv entry for the 2MB page mapping and reinstantiate the pv * entries for each of the 4KB page mappings. */ static void pmap_pv_demote_l3e(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, struct rwlock **lockp) { struct md_page *pvh; struct pv_chunk *pc; pv_entry_t pv; vm_offset_t va_last; vm_page_t m; int bit, field; PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT((pa & L3_PAGE_MASK) == 0, ("pmap_pv_demote_pde: pa is not 2mpage aligned")); CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); /* * Transfer the 2mpage's pv entry for this mapping to the first * page's pv list. Once this transfer begins, the pv list lock * must not be released until the last pv entry is reinstantiated. */ pvh = pa_to_pvh(pa); va = trunc_2mpage(va); pv = pmap_pvh_remove(pvh, pmap, va); KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found")); m = PHYS_TO_VM_PAGE(pa); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_link); m->md.pv_gen++; /* Instantiate the remaining NPTEPG - 1 pv entries. */ PV_STAT(atomic_add_long(&pv_entry_allocs, NPTEPG - 1)); va_last = va + L3_PAGE_SIZE - PAGE_SIZE; for (;;) { pc = TAILQ_FIRST(&pmap->pm_pvchunk); KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0 , ("pmap_pv_demote_pde: missing spare")); for (field = 0; field < _NPCM; field++) { while (pc->pc_map[field]) { bit = cnttzd(pc->pc_map[field]); pc->pc_map[field] &= ~(1ul << bit); pv = &pc->pc_pventry[field * 64 + bit]; va += PAGE_SIZE; pv->pv_va = va; m++; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_pv_demote_pde: page %p is not managed", m)); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_link); m->md.pv_gen++; if (va == va_last) goto out; } } TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); } out: if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); } PV_STAT(atomic_add_long(&pv_entry_count, NPTEPG - 1)); PV_STAT(atomic_subtract_int(&pv_entry_spare, NPTEPG - 1)); } static void reclaim_pv_chunk_leave_pmap(pmap_t pmap, pmap_t locked_pmap) { if (pmap == NULL) return; pmap_invalidate_all(pmap); if (pmap != locked_pmap) PMAP_UNLOCK(pmap); } /* * We are in a serious low memory condition. Resort to * drastic measures to free some pages so we can allocate * another pv entry chunk. * * Returns NULL if PV entries were reclaimed from the specified pmap. * * We do not, however, unmap 2mpages because subsequent accesses will * allocate per-page pv entries until repromotion occurs, thereby * exacerbating the shortage of free pv entries. */ static int active_reclaims = 0; static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp) { struct pv_chunk *pc, *pc_marker, *pc_marker_end; struct pv_chunk_header pc_marker_b, pc_marker_end_b; struct md_page *pvh; pml3_entry_t *l3e; pmap_t next_pmap, pmap; pt_entry_t *pte, tpte; pv_entry_t pv; vm_offset_t va; vm_page_t m, m_pc; struct spglist free; uint64_t inuse; int bit, field, freed; PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED); KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL")); pmap = NULL; m_pc = NULL; SLIST_INIT(&free); bzero(&pc_marker_b, sizeof(pc_marker_b)); bzero(&pc_marker_end_b, sizeof(pc_marker_end_b)); pc_marker = (struct pv_chunk *)&pc_marker_b; pc_marker_end = (struct pv_chunk *)&pc_marker_end_b; mtx_lock(&pv_chunks_mutex); active_reclaims++; TAILQ_INSERT_HEAD(&pv_chunks, pc_marker, pc_lru); TAILQ_INSERT_TAIL(&pv_chunks, pc_marker_end, pc_lru); while ((pc = TAILQ_NEXT(pc_marker, pc_lru)) != pc_marker_end && SLIST_EMPTY(&free)) { next_pmap = pc->pc_pmap; if (next_pmap == NULL) { /* * The next chunk is a marker. However, it is * not our marker, so active_reclaims must be * > 1. Consequently, the next_chunk code * will not rotate the pv_chunks list. */ goto next_chunk; } mtx_unlock(&pv_chunks_mutex); /* * A pv_chunk can only be removed from the pc_lru list * when both pc_chunks_mutex is owned and the * corresponding pmap is locked. */ if (pmap != next_pmap) { reclaim_pv_chunk_leave_pmap(pmap, locked_pmap); pmap = next_pmap; /* Avoid deadlock and lock recursion. */ if (pmap > locked_pmap) { RELEASE_PV_LIST_LOCK(lockp); PMAP_LOCK(pmap); mtx_lock(&pv_chunks_mutex); continue; } else if (pmap != locked_pmap) { if (PMAP_TRYLOCK(pmap)) { mtx_lock(&pv_chunks_mutex); continue; } else { pmap = NULL; /* pmap is not locked */ mtx_lock(&pv_chunks_mutex); pc = TAILQ_NEXT(pc_marker, pc_lru); if (pc == NULL || pc->pc_pmap != next_pmap) continue; goto next_chunk; } } } /* * Destroy every non-wired, 4 KB page mapping in the chunk. */ freed = 0; for (field = 0; field < _NPCM; field++) { for (inuse = ~pc->pc_map[field] & pc_freemask[field]; inuse != 0; inuse &= ~(1UL << bit)) { bit = cnttzd(inuse); pv = &pc->pc_pventry[field * 64 + bit]; va = pv->pv_va; l3e = pmap_pml3e(pmap, va); if ((*l3e & RPTE_LEAF) != 0) continue; pte = pmap_l3e_to_pte(l3e, va); if ((*pte & PG_W) != 0) continue; tpte = pte_load_clear(pte); m = PHYS_TO_VM_PAGE(tpte & PG_FRAME); if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); if ((tpte & PG_A) != 0) vm_page_aflag_set(m, PGA_REFERENCED); CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); TAILQ_REMOVE(&m->md.pv_list, pv, pv_link); m->md.pv_gen++; if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); if (TAILQ_EMPTY(&pvh->pv_list)) { vm_page_aflag_clear(m, PGA_WRITEABLE); } } pc->pc_map[field] |= 1UL << bit; pmap_unuse_pt(pmap, va, *l3e, &free); freed++; } } if (freed == 0) { mtx_lock(&pv_chunks_mutex); goto next_chunk; } /* Every freed mapping is for a 4 KB page. */ pmap_resident_count_dec(pmap, freed); PV_STAT(atomic_add_long(&pv_entry_frees, freed)); PV_STAT(atomic_add_int(&pv_entry_spare, freed)); PV_STAT(atomic_subtract_long(&pv_entry_count, freed)); TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); if (pc->pc_map[0] == PC_FREE0 && pc->pc_map[1] == PC_FREE1) { PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV)); PV_STAT(atomic_subtract_int(&pc_chunk_count, 1)); PV_STAT(atomic_add_int(&pc_chunk_frees, 1)); /* Entire chunk is free; return it. */ m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); mtx_lock(&pv_chunks_mutex); TAILQ_REMOVE(&pv_chunks, pc, pc_lru); break; } TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); mtx_lock(&pv_chunks_mutex); /* One freed pv entry in locked_pmap is sufficient. */ if (pmap == locked_pmap) break; next_chunk: TAILQ_REMOVE(&pv_chunks, pc_marker, pc_lru); TAILQ_INSERT_AFTER(&pv_chunks, pc, pc_marker, pc_lru); if (active_reclaims == 1 && pmap != NULL) { /* * Rotate the pv chunks list so that we do not * scan the same pv chunks that could not be * freed (because they contained a wired * and/or superpage mapping) on every * invocation of reclaim_pv_chunk(). */ while ((pc = TAILQ_FIRST(&pv_chunks)) != pc_marker) { MPASS(pc->pc_pmap != NULL); TAILQ_REMOVE(&pv_chunks, pc, pc_lru); TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); } } } TAILQ_REMOVE(&pv_chunks, pc_marker, pc_lru); TAILQ_REMOVE(&pv_chunks, pc_marker_end, pc_lru); active_reclaims--; mtx_unlock(&pv_chunks_mutex); reclaim_pv_chunk_leave_pmap(pmap, locked_pmap); if (m_pc == NULL && !SLIST_EMPTY(&free)) { m_pc = SLIST_FIRST(&free); SLIST_REMOVE_HEAD(&free, plinks.s.ss); /* Recycle a freed page table page. */ m_pc->ref_count = 1; } vm_page_free_pages_toq(&free, true); return (m_pc); } /* * free the pv_entry back to the free list */ static void free_pv_entry(pmap_t pmap, pv_entry_t pv) { struct pv_chunk *pc; int idx, field, bit; #ifdef VERBOSE_PV if (pmap != kernel_pmap) printf("%s(%p, %p)\n", __func__, pmap, pv); #endif PMAP_LOCK_ASSERT(pmap, MA_OWNED); PV_STAT(atomic_add_long(&pv_entry_frees, 1)); PV_STAT(atomic_add_int(&pv_entry_spare, 1)); PV_STAT(atomic_subtract_long(&pv_entry_count, 1)); pc = pv_to_chunk(pv); idx = pv - &pc->pc_pventry[0]; field = idx / 64; bit = idx % 64; pc->pc_map[field] |= 1ul << bit; if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1) { /* 98% of the time, pc is already at the head of the list. */ if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); } return; } TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); free_pv_chunk(pc); } static void free_pv_chunk(struct pv_chunk *pc) { vm_page_t m; mtx_lock(&pv_chunks_mutex); TAILQ_REMOVE(&pv_chunks, pc, pc_lru); mtx_unlock(&pv_chunks_mutex); PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV)); PV_STAT(atomic_subtract_int(&pc_chunk_count, 1)); PV_STAT(atomic_add_int(&pc_chunk_frees, 1)); /* entire chunk is free, return it */ m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); vm_page_unwire_noq(m); vm_page_free(m); } /* * Returns a new PV entry, allocating a new PV chunk from the system when * needed. If this PV chunk allocation fails and a PV list lock pointer was * given, a PV chunk is reclaimed from an arbitrary pmap. Otherwise, NULL is * returned. * * The given PV list lock may be released. */ static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp) { int bit, field; pv_entry_t pv; struct pv_chunk *pc; vm_page_t m; PMAP_LOCK_ASSERT(pmap, MA_OWNED); PV_STAT(atomic_add_long(&pv_entry_allocs, 1)); retry: pc = TAILQ_FIRST(&pmap->pm_pvchunk); if (pc != NULL) { for (field = 0; field < _NPCM; field++) { if (pc->pc_map[field]) { bit = cnttzd(pc->pc_map[field]); break; } } if (field < _NPCM) { pv = &pc->pc_pventry[field * 64 + bit]; pc->pc_map[field] &= ~(1ul << bit); /* If this was the last item, move it to tail */ if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); } PV_STAT(atomic_add_long(&pv_entry_count, 1)); PV_STAT(atomic_subtract_int(&pv_entry_spare, 1)); MPASS(PV_PMAP(pv) != NULL); return (pv); } } /* No free items, allocate another chunk */ m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED); if (m == NULL) { if (lockp == NULL) { PV_STAT(pc_chunk_tryfail++); return (NULL); } m = reclaim_pv_chunk(pmap, lockp); if (m == NULL) goto retry; } PV_STAT(atomic_add_int(&pc_chunk_count, 1)); PV_STAT(atomic_add_int(&pc_chunk_allocs, 1)); pc = (void *)PHYS_TO_DMAP(m->phys_addr); pc->pc_pmap = pmap; pc->pc_map[0] = PC_FREE0 & ~1ul; /* preallocated bit 0 */ pc->pc_map[1] = PC_FREE1; mtx_lock(&pv_chunks_mutex); TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); mtx_unlock(&pv_chunks_mutex); pv = &pc->pc_pventry[0]; TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); PV_STAT(atomic_add_long(&pv_entry_count, 1)); PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1)); MPASS(PV_PMAP(pv) != NULL); return (pv); } #if VM_NRESERVLEVEL > 0 /* * After promotion from 512 4KB page mappings to a single 2MB page mapping, * replace the many pv entries for the 4KB page mappings by a single pv entry * for the 2MB page mapping. */ static void pmap_pv_promote_l3e(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, struct rwlock **lockp) { struct md_page *pvh; pv_entry_t pv; vm_offset_t va_last; vm_page_t m; KASSERT((pa & L3_PAGE_MASK) == 0, ("pmap_pv_promote_pde: pa is not 2mpage aligned")); CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); /* * Transfer the first page's pv entry for this mapping to the 2mpage's * pv list. Aside from avoiding the cost of a call to get_pv_entry(), * a transfer avoids the possibility that get_pv_entry() calls * reclaim_pv_chunk() and that reclaim_pv_chunk() removes one of the * mappings that is being promoted. */ m = PHYS_TO_VM_PAGE(pa); va = trunc_2mpage(va); pv = pmap_pvh_remove(&m->md, pmap, va); KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found")); pvh = pa_to_pvh(pa); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_link); pvh->pv_gen++; /* Free the remaining NPTEPG - 1 pv entries. */ va_last = va + L3_PAGE_SIZE - PAGE_SIZE; do { m++; va += PAGE_SIZE; pmap_pvh_free(&m->md, pmap, va); } while (va < va_last); } #endif /* VM_NRESERVLEVEL > 0 */ /* * First find and then destroy the pv entry for the specified pmap and virtual * address. This operation can be performed on pv lists for either 4KB or 2MB * page mappings. */ static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) { pv_entry_t pv; pv = pmap_pvh_remove(pvh, pmap, va); KASSERT(pv != NULL, ("pmap_pvh_free: pv not found")); free_pv_entry(pmap, pv); } /* * Conditionally create the PV entry for a 4KB page mapping if the required * memory can be allocated without resorting to reclamation. */ static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m, struct rwlock **lockp) { pv_entry_t pv; PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* Pass NULL instead of the lock pointer to disable reclamation. */ if ((pv = get_pv_entry(pmap, NULL)) != NULL) { pv->pv_va = va; CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_link); m->md.pv_gen++; return (TRUE); } else return (FALSE); } vm_paddr_t phys_avail_debug[2 * VM_PHYSSEG_MAX]; #ifdef INVARIANTS static void validate_addr(vm_paddr_t addr, vm_size_t size) { vm_paddr_t end = addr + size; bool found = false; for (int i = 0; i < 2 * phys_avail_count; i += 2) { if (addr >= phys_avail_debug[i] && end <= phys_avail_debug[i + 1]) { found = true; break; } } KASSERT(found, ("%#lx-%#lx outside of initial phys_avail array", addr, end)); } #else static void validate_addr(vm_paddr_t addr, vm_size_t size) {} #endif #define DMAP_PAGE_BITS (RPTE_VALID | RPTE_LEAF | RPTE_EAA_MASK | PG_M | PG_A) static vm_paddr_t alloc_pt_page(void) { vm_paddr_t page; page = allocpages(1); pagezero(PHYS_TO_DMAP(page)); return (page); } static void mmu_radix_dmap_range(vm_paddr_t start, vm_paddr_t end) { pt_entry_t *pte, pteval; vm_paddr_t page; if (bootverbose) printf("%s %lx -> %lx\n", __func__, start, end); while (start < end) { pteval = start | DMAP_PAGE_BITS; pte = pmap_pml1e(kernel_pmap, PHYS_TO_DMAP(start)); if ((*pte & RPTE_VALID) == 0) { page = alloc_pt_page(); pde_store(pte, page); } pte = pmap_l1e_to_l2e(pte, PHYS_TO_DMAP(start)); if ((start & L2_PAGE_MASK) == 0 && end - start >= L2_PAGE_SIZE) { start += L2_PAGE_SIZE; goto done; } else if ((*pte & RPTE_VALID) == 0) { page = alloc_pt_page(); pde_store(pte, page); } pte = pmap_l2e_to_l3e(pte, PHYS_TO_DMAP(start)); if ((start & L3_PAGE_MASK) == 0 && end - start >= L3_PAGE_SIZE) { start += L3_PAGE_SIZE; goto done; } else if ((*pte & RPTE_VALID) == 0) { page = alloc_pt_page(); pde_store(pte, page); } pte = pmap_l3e_to_pte(pte, PHYS_TO_DMAP(start)); start += PAGE_SIZE; done: pte_store(pte, pteval); } } static void mmu_radix_dmap_populate(vm_size_t hwphyssz) { vm_paddr_t start, end; for (int i = 0; i < pregions_sz; i++) { start = pregions[i].mr_start; end = start + pregions[i].mr_size; if (hwphyssz && start >= hwphyssz) break; if (hwphyssz && hwphyssz < end) end = hwphyssz; mmu_radix_dmap_range(start, end); } } static void mmu_radix_setup_pagetables(vm_size_t hwphyssz) { vm_paddr_t ptpages, pages; pt_entry_t *pte; vm_paddr_t l1phys; bzero(kernel_pmap, sizeof(struct pmap)); PMAP_LOCK_INIT(kernel_pmap); ptpages = allocpages(2); l1phys = moea64_bootstrap_alloc(RADIX_PGD_SIZE, RADIX_PGD_SIZE); validate_addr(l1phys, RADIX_PGD_SIZE); if (bootverbose) printf("l1phys=%lx\n", l1phys); MPASS((l1phys & (RADIX_PGD_SIZE-1)) == 0); for (int i = 0; i < RADIX_PGD_SIZE/PAGE_SIZE; i++) pagezero(PHYS_TO_DMAP(l1phys + i * PAGE_SIZE)); kernel_pmap->pm_pml1 = (pml1_entry_t *)PHYS_TO_DMAP(l1phys); mmu_radix_dmap_populate(hwphyssz); /* * Create page tables for first 128MB of KVA */ pages = ptpages; pte = pmap_pml1e(kernel_pmap, VM_MIN_KERNEL_ADDRESS); *pte = (pages | RPTE_VALID | RPTE_SHIFT); pages += PAGE_SIZE; pte = pmap_l1e_to_l2e(pte, VM_MIN_KERNEL_ADDRESS); *pte = (pages | RPTE_VALID | RPTE_SHIFT); pages += PAGE_SIZE; pte = pmap_l2e_to_l3e(pte, VM_MIN_KERNEL_ADDRESS); /* * the kernel page table pages need to be preserved in * phys_avail and not overlap with previous allocations */ pages = allocpages(nkpt); if (bootverbose) { printf("phys_avail after dmap populate and nkpt allocation\n"); for (int j = 0; j < 2 * phys_avail_count; j+=2) printf("phys_avail[%d]=%08lx - phys_avail[%d]=%08lx\n", j, phys_avail[j], j + 1, phys_avail[j + 1]); } KPTphys = pages; for (int i = 0; i < nkpt; i++, pte++, pages += PAGE_SIZE) *pte = (pages | RPTE_VALID | RPTE_SHIFT); kernel_vm_end = VM_MIN_KERNEL_ADDRESS + nkpt * L3_PAGE_SIZE; if (bootverbose) printf("kernel_pmap pml1 %p\n", kernel_pmap->pm_pml1); /* * Add a physical memory segment (vm_phys_seg) corresponding to the * preallocated kernel page table pages so that vm_page structures * representing these pages will be created. The vm_page structures * are required for promotion of the corresponding kernel virtual * addresses to superpage mappings. */ vm_phys_add_seg(KPTphys, KPTphys + ptoa(nkpt)); } static void mmu_radix_early_bootstrap(vm_offset_t start, vm_offset_t end) { vm_paddr_t kpstart, kpend; vm_size_t physsz, hwphyssz; //uint64_t l2virt; int rm_pavail, proctab_size; int i, j; kpstart = start & ~DMAP_BASE_ADDRESS; kpend = end & ~DMAP_BASE_ADDRESS; /* Get physical memory regions from firmware */ mem_regions(&pregions, &pregions_sz, ®ions, ®ions_sz); CTR0(KTR_PMAP, "mmu_radix_early_bootstrap: physical memory"); if (2 * VM_PHYSSEG_MAX < regions_sz) panic("mmu_radix_early_bootstrap: phys_avail too small"); if (bootverbose) for (int i = 0; i < regions_sz; i++) printf("regions[%d].mr_start=%lx regions[%d].mr_size=%lx\n", i, regions[i].mr_start, i, regions[i].mr_size); /* * XXX workaround a simulator bug */ for (int i = 0; i < regions_sz; i++) if (regions[i].mr_start & PAGE_MASK) { regions[i].mr_start += PAGE_MASK; regions[i].mr_start &= ~PAGE_MASK; regions[i].mr_size &= ~PAGE_MASK; } if (bootverbose) for (int i = 0; i < pregions_sz; i++) printf("pregions[%d].mr_start=%lx pregions[%d].mr_size=%lx\n", i, pregions[i].mr_start, i, pregions[i].mr_size); phys_avail_count = 0; physsz = 0; hwphyssz = 0; TUNABLE_ULONG_FETCH("hw.physmem", (u_long *) &hwphyssz); for (i = 0, j = 0; i < regions_sz; i++) { if (bootverbose) printf("regions[%d].mr_start=%016lx regions[%d].mr_size=%016lx\n", i, regions[i].mr_start, i, regions[i].mr_size); if (regions[i].mr_size < PAGE_SIZE) continue; if (hwphyssz != 0 && (physsz + regions[i].mr_size) >= hwphyssz) { if (physsz < hwphyssz) { phys_avail[j] = regions[i].mr_start; phys_avail[j + 1] = regions[i].mr_start + (hwphyssz - physsz); physsz = hwphyssz; phys_avail_count++; dump_avail[j] = phys_avail[j]; dump_avail[j + 1] = phys_avail[j + 1]; } break; } phys_avail[j] = regions[i].mr_start; phys_avail[j + 1] = regions[i].mr_start + regions[i].mr_size; dump_avail[j] = phys_avail[j]; dump_avail[j + 1] = phys_avail[j + 1]; phys_avail_count++; physsz += regions[i].mr_size; j += 2; } /* Check for overlap with the kernel and exception vectors */ rm_pavail = 0; for (j = 0; j < 2 * phys_avail_count; j+=2) { if (phys_avail[j] < EXC_LAST) phys_avail[j] += EXC_LAST; if (phys_avail[j] >= kpstart && phys_avail[j + 1] <= kpend) { phys_avail[j] = phys_avail[j + 1] = ~0; rm_pavail++; continue; } if (kpstart >= phys_avail[j] && kpstart < phys_avail[j + 1]) { if (kpend < phys_avail[j + 1]) { phys_avail[2 * phys_avail_count] = (kpend & ~PAGE_MASK) + PAGE_SIZE; phys_avail[2 * phys_avail_count + 1] = phys_avail[j + 1]; phys_avail_count++; } phys_avail[j + 1] = kpstart & ~PAGE_MASK; } if (kpend >= phys_avail[j] && kpend < phys_avail[j + 1]) { if (kpstart > phys_avail[j]) { phys_avail[2 * phys_avail_count] = phys_avail[j]; phys_avail[2 * phys_avail_count + 1] = kpstart & ~PAGE_MASK; phys_avail_count++; } phys_avail[j] = (kpend & ~PAGE_MASK) + PAGE_SIZE; } } qsort(phys_avail, 2 * phys_avail_count, sizeof(phys_avail[0]), pa_cmp); for (i = 0; i < 2 * phys_avail_count; i++) phys_avail_debug[i] = phys_avail[i]; /* Remove physical available regions marked for removal (~0) */ if (rm_pavail) { phys_avail_count -= rm_pavail; for (i = 2 * phys_avail_count; i < 2*(phys_avail_count + rm_pavail); i+=2) phys_avail[i] = phys_avail[i + 1] = 0; } if (bootverbose) { printf("phys_avail ranges after filtering:\n"); for (j = 0; j < 2 * phys_avail_count; j+=2) printf("phys_avail[%d]=%08lx - phys_avail[%d]=%08lx\n", j, phys_avail[j], j + 1, phys_avail[j + 1]); } physmem = btoc(physsz); /* XXX assume we're running non-virtualized and * we don't support BHYVE */ if (isa3_pid_bits == 0) isa3_pid_bits = 20; parttab_phys = moea64_bootstrap_alloc(PARTTAB_SIZE, PARTTAB_SIZE); validate_addr(parttab_phys, PARTTAB_SIZE); for (int i = 0; i < PARTTAB_SIZE/PAGE_SIZE; i++) pagezero(PHYS_TO_DMAP(parttab_phys + i * PAGE_SIZE)); proctab_size = 1UL << PROCTAB_SIZE_SHIFT; proctab0pa = moea64_bootstrap_alloc(proctab_size, proctab_size); validate_addr(proctab0pa, proctab_size); for (int i = 0; i < proctab_size/PAGE_SIZE; i++) pagezero(PHYS_TO_DMAP(proctab0pa + i * PAGE_SIZE)); mmu_radix_setup_pagetables(hwphyssz); } static void mmu_radix_late_bootstrap(vm_offset_t start, vm_offset_t end) { int i; vm_paddr_t pa; void *dpcpu; vm_offset_t va; /* * Set up the Open Firmware pmap and add its mappings if not in real * mode. */ if (bootverbose) printf("%s enter\n", __func__); /* * Calculate the last available physical address, and reserve the * vm_page_array (upper bound). */ Maxmem = 0; for (i = 0; phys_avail[i + 2] != 0; i += 2) Maxmem = MAX(Maxmem, powerpc_btop(phys_avail[i + 1])); /* * Set the start and end of kva. */ virtual_avail = VM_MIN_KERNEL_ADDRESS; virtual_end = VM_MAX_SAFE_KERNEL_ADDRESS; /* * Remap any early IO mappings (console framebuffer, etc.) */ bs_remap_earlyboot(); /* * Allocate a kernel stack with a guard page for thread0 and map it * into the kernel page map. */ pa = allocpages(kstack_pages); va = virtual_avail + KSTACK_GUARD_PAGES * PAGE_SIZE; virtual_avail = va + kstack_pages * PAGE_SIZE; CTR2(KTR_PMAP, "moea64_bootstrap: kstack0 at %#x (%#x)", pa, va); thread0.td_kstack = va; for (i = 0; i < kstack_pages; i++) { mmu_radix_kenter(va, pa); pa += PAGE_SIZE; va += PAGE_SIZE; } thread0.td_kstack_pages = kstack_pages; /* * Allocate virtual address space for the message buffer. */ pa = msgbuf_phys = allocpages((msgbufsize + PAGE_MASK) >> PAGE_SHIFT); msgbufp = (struct msgbuf *)PHYS_TO_DMAP(pa); /* * Allocate virtual address space for the dynamic percpu area. */ pa = allocpages(DPCPU_SIZE >> PAGE_SHIFT); dpcpu = (void *)PHYS_TO_DMAP(pa); dpcpu_init(dpcpu, curcpu); /* * Reserve some special page table entries/VA space for temporary * mapping of pages. */ } static void mmu_parttab_init(void) { uint64_t ptcr; isa3_parttab = (struct pate *)PHYS_TO_DMAP(parttab_phys); if (bootverbose) printf("%s parttab: %p\n", __func__, isa3_parttab); ptcr = parttab_phys | (PARTTAB_SIZE_SHIFT-12); if (bootverbose) printf("setting ptcr %lx\n", ptcr); mtspr(SPR_PTCR, ptcr); } static void mmu_parttab_update(uint64_t lpid, uint64_t pagetab, uint64_t proctab) { uint64_t prev; if (bootverbose) printf("%s isa3_parttab %p lpid %lx pagetab %lx proctab %lx\n", __func__, isa3_parttab, lpid, pagetab, proctab); prev = be64toh(isa3_parttab[lpid].pagetab); isa3_parttab[lpid].pagetab = htobe64(pagetab); isa3_parttab[lpid].proctab = htobe64(proctab); if (prev & PARTTAB_HR) { __asm __volatile(PPC_TLBIE_5(%0,%1,2,0,1) : : "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid)); __asm __volatile(PPC_TLBIE_5(%0,%1,2,1,1) : : "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid)); } else { __asm __volatile(PPC_TLBIE_5(%0,%1,2,0,0) : : "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid)); } ttusync(); } static void mmu_radix_parttab_init(void) { uint64_t pagetab; mmu_parttab_init(); pagetab = RTS_SIZE | DMAP_TO_PHYS((vm_offset_t)kernel_pmap->pm_pml1) | \ RADIX_PGD_INDEX_SHIFT | PARTTAB_HR; mmu_parttab_update(0, pagetab, 0); } static void mmu_radix_proctab_register(vm_paddr_t proctabpa, uint64_t table_size) { uint64_t pagetab, proctab; pagetab = be64toh(isa3_parttab[0].pagetab); proctab = proctabpa | table_size | PARTTAB_GR; mmu_parttab_update(0, pagetab, proctab); } static void mmu_radix_proctab_init(void) { isa3_base_pid = 1; isa3_proctab = (void*)PHYS_TO_DMAP(proctab0pa); isa3_proctab->proctab0 = htobe64(RTS_SIZE | DMAP_TO_PHYS((vm_offset_t)kernel_pmap->pm_pml1) | RADIX_PGD_INDEX_SHIFT); mmu_radix_proctab_register(proctab0pa, PROCTAB_SIZE_SHIFT - 12); __asm __volatile("ptesync" : : : "memory"); __asm __volatile(PPC_TLBIE_5(%0,%1,2,1,1) : : "r" (TLBIEL_INVAL_SET_LPID), "r" (0)); __asm __volatile("eieio; tlbsync; ptesync" : : : "memory"); if (bootverbose) printf("process table %p and kernel radix PDE: %p\n", isa3_proctab, kernel_pmap->pm_pml1); mtmsr(mfmsr() | PSL_DR ); mtmsr(mfmsr() & ~PSL_DR); kernel_pmap->pm_pid = isa3_base_pid; isa3_base_pid++; } void mmu_radix_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice) { struct rwlock *lock; pml1_entry_t *l1e; pml2_entry_t *l2e; pml3_entry_t oldl3e, *l3e; pt_entry_t *pte; vm_offset_t va, va_next; vm_page_t m; boolean_t anychanged; if (advice != MADV_DONTNEED && advice != MADV_FREE) return; anychanged = FALSE; PMAP_LOCK(pmap); for (; sva < eva; sva = va_next) { l1e = pmap_pml1e(pmap, sva); if ((*l1e & PG_V) == 0) { va_next = (sva + L1_PAGE_SIZE) & ~L1_PAGE_MASK; if (va_next < sva) va_next = eva; continue; } l2e = pmap_l1e_to_l2e(l1e, sva); if ((*l2e & PG_V) == 0) { va_next = (sva + L2_PAGE_SIZE) & ~L2_PAGE_MASK; if (va_next < sva) va_next = eva; continue; } va_next = (sva + L3_PAGE_SIZE) & ~L3_PAGE_MASK; if (va_next < sva) va_next = eva; l3e = pmap_l2e_to_l3e(l2e, sva); oldl3e = *l3e; if ((oldl3e & PG_V) == 0) continue; else if ((oldl3e & RPTE_LEAF) != 0) { if ((oldl3e & PG_MANAGED) == 0) continue; lock = NULL; if (!pmap_demote_l3e_locked(pmap, l3e, sva, &lock)) { if (lock != NULL) rw_wunlock(lock); /* * The large page mapping was destroyed. */ continue; } /* * Unless the page mappings are wired, remove the * mapping to a single page so that a subsequent * access may repromote. Since the underlying page * table page is fully populated, this removal never * frees a page table page. */ if ((oldl3e & PG_W) == 0) { pte = pmap_l3e_to_pte(l3e, sva); KASSERT((*pte & PG_V) != 0, ("pmap_advise: invalid PTE")); pmap_remove_pte(pmap, pte, sva, *l3e, NULL, &lock); anychanged = TRUE; } if (lock != NULL) rw_wunlock(lock); } if (va_next > eva) va_next = eva; va = va_next; for (pte = pmap_l3e_to_pte(l3e, sva); sva != va_next; pte++, sva += PAGE_SIZE) { MPASS(pte == pmap_pte(pmap, sva)); if ((*pte & (PG_MANAGED | PG_V)) != (PG_MANAGED | PG_V)) goto maybe_invlrng; else if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { if (advice == MADV_DONTNEED) { /* * Future calls to pmap_is_modified() * can be avoided by making the page * dirty now. */ m = PHYS_TO_VM_PAGE(*pte & PG_FRAME); vm_page_dirty(m); } atomic_clear_long(pte, PG_M | PG_A); } else if ((*pte & PG_A) != 0) atomic_clear_long(pte, PG_A); else goto maybe_invlrng; anychanged = TRUE; continue; maybe_invlrng: if (va != va_next) { anychanged = true; va = va_next; } } if (va != va_next) anychanged = true; } if (anychanged) pmap_invalidate_all(pmap); PMAP_UNLOCK(pmap); } /* * Routines used in machine-dependent code */ static void mmu_radix_bootstrap(vm_offset_t start, vm_offset_t end) { uint64_t lpcr; if (bootverbose) printf("%s\n", __func__); hw_direct_map = 1; mmu_radix_early_bootstrap(start, end); if (bootverbose) printf("early bootstrap complete\n"); if (powernv_enabled) { lpcr = mfspr(SPR_LPCR); mtspr(SPR_LPCR, lpcr | LPCR_UPRT | LPCR_HR); mmu_radix_parttab_init(); mmu_radix_init_amor(); if (bootverbose) printf("powernv init complete\n"); } mmu_radix_init_iamr(); mmu_radix_proctab_init(); mmu_radix_pid_set(kernel_pmap); /* XXX assume CPU_FTR_HVMODE */ mmu_radix_tlbiel_flush(TLB_INVAL_SCOPE_GLOBAL); mmu_radix_late_bootstrap(start, end); numa_mem_regions(&numa_pregions, &numa_pregions_sz); if (bootverbose) printf("%s done\n", __func__); pmap_bootstrapped = 1; dmaplimit = roundup2(powerpc_ptob(Maxmem), L2_PAGE_SIZE); PCPU_SET(flags, PCPU_GET(flags) | PC_FLAG_NOSRS); } static void mmu_radix_cpu_bootstrap(int ap) { uint64_t lpcr; uint64_t ptcr; if (powernv_enabled) { lpcr = mfspr(SPR_LPCR); mtspr(SPR_LPCR, lpcr | LPCR_UPRT | LPCR_HR); ptcr = parttab_phys | (PARTTAB_SIZE_SHIFT-12); mtspr(SPR_PTCR, ptcr); mmu_radix_init_amor(); } mmu_radix_init_iamr(); mmu_radix_pid_set(kernel_pmap); mmu_radix_tlbiel_flush(TLB_INVAL_SCOPE_GLOBAL); } static SYSCTL_NODE(_vm_pmap, OID_AUTO, l3e, CTLFLAG_RD, 0, "2MB page mapping counters"); static u_long pmap_l3e_demotions; SYSCTL_ULONG(_vm_pmap_l3e, OID_AUTO, demotions, CTLFLAG_RD, &pmap_l3e_demotions, 0, "2MB page demotions"); static u_long pmap_l3e_mappings; SYSCTL_ULONG(_vm_pmap_l3e, OID_AUTO, mappings, CTLFLAG_RD, &pmap_l3e_mappings, 0, "2MB page mappings"); static u_long pmap_l3e_p_failures; SYSCTL_ULONG(_vm_pmap_l3e, OID_AUTO, p_failures, CTLFLAG_RD, &pmap_l3e_p_failures, 0, "2MB page promotion failures"); static u_long pmap_l3e_promotions; SYSCTL_ULONG(_vm_pmap_l3e, OID_AUTO, promotions, CTLFLAG_RD, &pmap_l3e_promotions, 0, "2MB page promotions"); static SYSCTL_NODE(_vm_pmap, OID_AUTO, l2e, CTLFLAG_RD, 0, "1GB page mapping counters"); static u_long pmap_l2e_demotions; SYSCTL_ULONG(_vm_pmap_l2e, OID_AUTO, demotions, CTLFLAG_RD, &pmap_l2e_demotions, 0, "1GB page demotions"); void mmu_radix_clear_modify(vm_page_t m) { struct md_page *pvh; pmap_t pmap; pv_entry_t next_pv, pv; pml3_entry_t oldl3e, *l3e; pt_entry_t oldpte, *pte; struct rwlock *lock; vm_offset_t va; int md_gen, pvh_gen; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_clear_modify: page %p is not managed", m)); vm_page_assert_busied(m); CTR2(KTR_PMAP, "%s(%p)", __func__, m); /* * If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set. * If the object containing the page is locked and the page is not * exclusive busied, then PGA_WRITEABLE cannot be concurrently set. */ if ((m->a.flags & PGA_WRITEABLE) == 0) return; pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(VM_PAGE_TO_PHYS(m)); lock = VM_PAGE_TO_PV_LIST_LOCK(m); rw_wlock(lock); restart: TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_link, next_pv) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } va = pv->pv_va; l3e = pmap_pml3e(pmap, va); oldl3e = *l3e; if ((oldl3e & PG_RW) != 0) { if (pmap_demote_l3e_locked(pmap, l3e, va, &lock)) { if ((oldl3e & PG_W) == 0) { /* * Write protect the mapping to a * single page so that a subsequent * write access may repromote. */ va += VM_PAGE_TO_PHYS(m) - (oldl3e & PG_PS_FRAME); pte = pmap_l3e_to_pte(l3e, va); oldpte = *pte; if ((oldpte & PG_V) != 0) { while (!atomic_cmpset_long(pte, oldpte, (oldpte | RPTE_EAA_R) & ~(PG_M | PG_RW))) oldpte = *pte; vm_page_dirty(m); pmap_invalidate_page(pmap, va); } } } } PMAP_UNLOCK(pmap); } TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; pvh_gen = pvh->pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } l3e = pmap_pml3e(pmap, pv->pv_va); KASSERT((*l3e & RPTE_LEAF) == 0, ("pmap_clear_modify: found" " a 2mpage in page %p's pv list", m)); pte = pmap_l3e_to_pte(l3e, pv->pv_va); if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { atomic_clear_long(pte, PG_M); pmap_invalidate_page(pmap, pv->pv_va); } PMAP_UNLOCK(pmap); } rw_wunlock(lock); } void mmu_radix_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, vm_offset_t src_addr) { struct rwlock *lock; struct spglist free; vm_offset_t addr; vm_offset_t end_addr = src_addr + len; vm_offset_t va_next; vm_page_t dst_pdpg, dstmpte, srcmpte; bool invalidate_all; CTR6(KTR_PMAP, "%s(dst_pmap=%p, src_pmap=%p, dst_addr=%lx, len=%lu, src_addr=%lx)\n", __func__, dst_pmap, src_pmap, dst_addr, len, src_addr); if (dst_addr != src_addr) return; lock = NULL; invalidate_all = false; if (dst_pmap < src_pmap) { PMAP_LOCK(dst_pmap); PMAP_LOCK(src_pmap); } else { PMAP_LOCK(src_pmap); PMAP_LOCK(dst_pmap); } for (addr = src_addr; addr < end_addr; addr = va_next) { pml1_entry_t *l1e; pml2_entry_t *l2e; pml3_entry_t srcptepaddr, *l3e; pt_entry_t *src_pte, *dst_pte; l1e = pmap_pml1e(src_pmap, addr); if ((*l1e & PG_V) == 0) { va_next = (addr + L1_PAGE_SIZE) & ~L1_PAGE_MASK; if (va_next < addr) va_next = end_addr; continue; } l2e = pmap_l1e_to_l2e(l1e, addr); if ((*l2e & PG_V) == 0) { va_next = (addr + L2_PAGE_SIZE) & ~L2_PAGE_MASK; if (va_next < addr) va_next = end_addr; continue; } va_next = (addr + L3_PAGE_SIZE) & ~L3_PAGE_MASK; if (va_next < addr) va_next = end_addr; l3e = pmap_l2e_to_l3e(l2e, addr); srcptepaddr = *l3e; if (srcptepaddr == 0) continue; if (srcptepaddr & RPTE_LEAF) { if ((addr & L3_PAGE_MASK) != 0 || addr + L3_PAGE_SIZE > end_addr) continue; dst_pdpg = pmap_allocl3e(dst_pmap, addr, NULL); if (dst_pdpg == NULL) break; l3e = (pml3_entry_t *) PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dst_pdpg)); l3e = &l3e[pmap_pml3e_index(addr)]; if (*l3e == 0 && ((srcptepaddr & PG_MANAGED) == 0 || pmap_pv_insert_l3e(dst_pmap, addr, srcptepaddr, PMAP_ENTER_NORECLAIM, &lock))) { *l3e = srcptepaddr & ~PG_W; pmap_resident_count_inc(dst_pmap, L3_PAGE_SIZE / PAGE_SIZE); atomic_add_long(&pmap_l3e_mappings, 1); } else dst_pdpg->ref_count--; continue; } srcptepaddr &= PG_FRAME; srcmpte = PHYS_TO_VM_PAGE(srcptepaddr); KASSERT(srcmpte->ref_count > 0, ("pmap_copy: source page table page is unused")); if (va_next > end_addr) va_next = end_addr; src_pte = (pt_entry_t *)PHYS_TO_DMAP(srcptepaddr); src_pte = &src_pte[pmap_pte_index(addr)]; dstmpte = NULL; while (addr < va_next) { pt_entry_t ptetemp; ptetemp = *src_pte; /* * we only virtual copy managed pages */ if ((ptetemp & PG_MANAGED) != 0) { if (dstmpte != NULL && dstmpte->pindex == pmap_l3e_pindex(addr)) dstmpte->ref_count++; else if ((dstmpte = pmap_allocpte(dst_pmap, addr, NULL)) == NULL) goto out; dst_pte = (pt_entry_t *) PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte)); dst_pte = &dst_pte[pmap_pte_index(addr)]; if (*dst_pte == 0 && pmap_try_insert_pv_entry(dst_pmap, addr, PHYS_TO_VM_PAGE(ptetemp & PG_FRAME), &lock)) { /* * Clear the wired, modified, and * accessed (referenced) bits * during the copy. */ *dst_pte = ptetemp & ~(PG_W | PG_M | PG_A); pmap_resident_count_inc(dst_pmap, 1); } else { SLIST_INIT(&free); if (pmap_unwire_ptp(dst_pmap, addr, dstmpte, &free)) { /* * Although "addr" is not * mapped, paging-structure * caches could nonetheless * have entries that refer to * the freed page table pages. * Invalidate those entries. */ invalidate_all = true; vm_page_free_pages_toq(&free, true); } goto out; } if (dstmpte->ref_count >= srcmpte->ref_count) break; } addr += PAGE_SIZE; if (__predict_false((addr & L3_PAGE_MASK) == 0)) src_pte = pmap_pte(src_pmap, addr); else src_pte++; } } out: if (invalidate_all) pmap_invalidate_all(dst_pmap); if (lock != NULL) rw_wunlock(lock); PMAP_UNLOCK(src_pmap); PMAP_UNLOCK(dst_pmap); } static void mmu_radix_copy_page(vm_page_t msrc, vm_page_t mdst) { vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc)); vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst)); CTR3(KTR_PMAP, "%s(%p, %p)", __func__, src, dst); /* * XXX slow */ bcopy((void *)src, (void *)dst, PAGE_SIZE); } static void mmu_radix_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], vm_offset_t b_offset, int xfersize) { CTR6(KTR_PMAP, "%s(%p, %#x, %p, %#x, %#x)", __func__, ma, a_offset, mb, b_offset, xfersize); UNIMPLEMENTED(); } #if VM_NRESERVLEVEL > 0 /* * Tries to promote the 512, contiguous 4KB page mappings that are within a * single page table page (PTP) to a single 2MB page mapping. For promotion * to occur, two conditions must be met: (1) the 4KB page mappings must map * aligned, contiguous physical memory and (2) the 4KB page mappings must have * identical characteristics. */ static int pmap_promote_l3e(pmap_t pmap, pml3_entry_t *pde, vm_offset_t va, struct rwlock **lockp) { pml3_entry_t newpde; pt_entry_t *firstpte, oldpte, pa, *pte; vm_page_t mpte; PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * Examine the first PTE in the specified PTP. Abort if this PTE is * either invalid, unused, or does not map the first 4KB physical page * within a 2MB page. */ firstpte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME); setpde: newpde = *firstpte; if ((newpde & ((PG_FRAME & L3_PAGE_MASK) | PG_A | PG_V)) != (PG_A | PG_V)) { CTR2(KTR_PMAP, "pmap_promote_l3e: failure for va %#lx" " in pmap %p", va, pmap); goto fail; } if ((newpde & (PG_M | PG_RW)) == PG_RW) { /* * When PG_M is already clear, PG_RW can be cleared without * a TLB invalidation. */ if (!atomic_cmpset_long(firstpte, newpde, (newpde | RPTE_EAA_R) & ~RPTE_EAA_W)) goto setpde; newpde &= ~RPTE_EAA_W; } /* * Examine each of the other PTEs in the specified PTP. Abort if this * PTE maps an unexpected 4KB physical page or does not have identical * characteristics to the first PTE. */ pa = (newpde & (PG_PS_FRAME | PG_A | PG_V)) + L3_PAGE_SIZE - PAGE_SIZE; for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) { setpte: oldpte = *pte; if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) { CTR2(KTR_PMAP, "pmap_promote_l3e: failure for va %#lx" " in pmap %p", va, pmap); goto fail; } if ((oldpte & (PG_M | PG_RW)) == PG_RW) { /* * When PG_M is already clear, PG_RW can be cleared * without a TLB invalidation. */ if (!atomic_cmpset_long(pte, oldpte, (oldpte | RPTE_EAA_R) & ~RPTE_EAA_W)) goto setpte; oldpte &= ~RPTE_EAA_W; CTR2(KTR_PMAP, "pmap_promote_l3e: protect for va %#lx" " in pmap %p", (oldpte & PG_FRAME & L3_PAGE_MASK) | (va & ~L3_PAGE_MASK), pmap); } if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) { CTR2(KTR_PMAP, "pmap_promote_l3e: failure for va %#lx" " in pmap %p", va, pmap); goto fail; } pa -= PAGE_SIZE; } /* * Save the page table page in its current state until the PDE * mapping the superpage is demoted by pmap_demote_pde() or * destroyed by pmap_remove_pde(). */ mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); KASSERT(mpte >= vm_page_array && mpte < &vm_page_array[vm_page_array_size], ("pmap_promote_l3e: page table page is out of range")); KASSERT(mpte->pindex == pmap_l3e_pindex(va), ("pmap_promote_l3e: page table page's pindex is wrong")); if (pmap_insert_pt_page(pmap, mpte)) { CTR2(KTR_PMAP, "pmap_promote_l3e: failure for va %#lx in pmap %p", va, pmap); goto fail; } /* * Promote the pv entries. */ if ((newpde & PG_MANAGED) != 0) pmap_pv_promote_l3e(pmap, va, newpde & PG_PS_FRAME, lockp); pte_store(pde, PG_PROMOTED | newpde); atomic_add_long(&pmap_l3e_promotions, 1); CTR2(KTR_PMAP, "pmap_promote_l3e: success for va %#lx" " in pmap %p", va, pmap); return (0); fail: atomic_add_long(&pmap_l3e_p_failures, 1); return (KERN_FAILURE); } #endif /* VM_NRESERVLEVEL > 0 */ int mmu_radix_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, u_int flags, int8_t psind) { struct rwlock *lock; pml3_entry_t *l3e; pt_entry_t *pte; pt_entry_t newpte, origpte; pv_entry_t pv; vm_paddr_t opa, pa; vm_page_t mpte, om; int rv, retrycount; boolean_t nosleep, invalidate_all, invalidate_page; va = trunc_page(va); retrycount = 0; invalidate_page = invalidate_all = false; CTR6(KTR_PMAP, "pmap_enter(%p, %#lx, %p, %#x, %#x, %d)", pmap, va, m, prot, flags, psind); KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig")); KASSERT((m->oflags & VPO_UNMANAGED) != 0 || va < kmi.clean_sva || va >= kmi.clean_eva, ("pmap_enter: managed mapping within the clean submap")); if ((m->oflags & VPO_UNMANAGED) == 0) VM_PAGE_OBJECT_BUSY_ASSERT(m); KASSERT((flags & PMAP_ENTER_RESERVED) == 0, ("pmap_enter: flags %u has reserved bits set", flags)); pa = VM_PAGE_TO_PHYS(m); newpte = (pt_entry_t)(pa | PG_A | PG_V | RPTE_LEAF); if ((flags & VM_PROT_WRITE) != 0) newpte |= PG_M; if ((flags & VM_PROT_READ) != 0) newpte |= PG_A; if (prot & VM_PROT_READ) newpte |= RPTE_EAA_R; if ((prot & VM_PROT_WRITE) != 0) newpte |= RPTE_EAA_W; KASSERT((newpte & (PG_M | PG_RW)) != PG_M, ("pmap_enter: flags includes VM_PROT_WRITE but prot doesn't")); if (prot & VM_PROT_EXECUTE) newpte |= PG_X; if ((flags & PMAP_ENTER_WIRED) != 0) newpte |= PG_W; if (va >= DMAP_MIN_ADDRESS) newpte |= RPTE_EAA_P; newpte |= pmap_cache_bits(m->md.mdpg_cache_attrs); /* * Set modified bit gratuitously for writeable mappings if * the page is unmanaged. We do not want to take a fault * to do the dirty bit accounting for these mappings. */ if ((m->oflags & VPO_UNMANAGED) != 0) { if ((newpte & PG_RW) != 0) newpte |= PG_M; } else newpte |= PG_MANAGED; lock = NULL; PMAP_LOCK(pmap); if (psind == 1) { /* Assert the required virtual and physical alignment. */ KASSERT((va & L3_PAGE_MASK) == 0, ("pmap_enter: va unaligned")); KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind")); rv = pmap_enter_l3e(pmap, va, newpte | RPTE_LEAF, flags, m, &lock); goto out; } mpte = NULL; /* * In the case that a page table page is not * resident, we are creating it here. */ retry: l3e = pmap_pml3e(pmap, va); if (l3e != NULL && (*l3e & PG_V) != 0 && ((*l3e & RPTE_LEAF) == 0 || pmap_demote_l3e_locked(pmap, l3e, va, &lock))) { pte = pmap_l3e_to_pte(l3e, va); if (va < VM_MAXUSER_ADDRESS && mpte == NULL) { mpte = PHYS_TO_VM_PAGE(*l3e & PG_FRAME); mpte->ref_count++; } } else if (va < VM_MAXUSER_ADDRESS) { /* * Here if the pte page isn't mapped, or if it has been * deallocated. */ nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0; mpte = _pmap_allocpte(pmap, pmap_l3e_pindex(va), nosleep ? NULL : &lock); if (mpte == NULL && nosleep) { rv = KERN_RESOURCE_SHORTAGE; goto out; } if (__predict_false(retrycount++ == 6)) panic("too many retries"); invalidate_all = true; goto retry; } else panic("pmap_enter: invalid page directory va=%#lx", va); origpte = *pte; pv = NULL; /* * Is the specified virtual address already mapped? */ if ((origpte & PG_V) != 0) { #ifdef INVARIANTS if (VERBOSE_PMAP || pmap_logging) { printf("cow fault pmap_enter(%p, %#lx, %p, %#x, %x, %d) --" " asid=%lu curpid=%d name=%s origpte0x%lx\n", pmap, va, m, prot, flags, psind, pmap->pm_pid, curproc->p_pid, curproc->p_comm, origpte); pmap_pte_walk(pmap->pm_pml1, va); } #endif /* * Wiring change, just update stats. We don't worry about * wiring PT pages as they remain resident as long as there * are valid mappings in them. Hence, if a user page is wired, * the PT page will be also. */ if ((newpte & PG_W) != 0 && (origpte & PG_W) == 0) pmap->pm_stats.wired_count++; else if ((newpte & PG_W) == 0 && (origpte & PG_W) != 0) pmap->pm_stats.wired_count--; /* * Remove the extra PT page reference. */ if (mpte != NULL) { mpte->ref_count--; KASSERT(mpte->ref_count > 0, ("pmap_enter: missing reference to page table page," " va: 0x%lx", va)); } /* * Has the physical page changed? */ opa = origpte & PG_FRAME; if (opa == pa) { /* * No, might be a protection or wiring change. */ if ((origpte & PG_MANAGED) != 0 && (newpte & PG_RW) != 0) vm_page_aflag_set(m, PGA_WRITEABLE); if (((origpte ^ newpte) & ~(PG_M | PG_A)) == 0) { if ((newpte & (PG_A|PG_M)) != (origpte & (PG_A|PG_M))) { if (!atomic_cmpset_long(pte, origpte, newpte)) goto retry; if ((newpte & PG_M) != (origpte & PG_M)) vm_page_dirty(m); if ((newpte & PG_A) != (origpte & PG_A)) vm_page_aflag_set(m, PGA_REFERENCED); ptesync(); } else invalidate_all = true; if (((origpte ^ newpte) & ~(PG_M | PG_A)) == 0) goto unchanged; } goto validate; } /* * The physical page has changed. Temporarily invalidate * the mapping. This ensures that all threads sharing the * pmap keep a consistent view of the mapping, which is * necessary for the correct handling of COW faults. It * also permits reuse of the old mapping's PV entry, * avoiding an allocation. * * For consistency, handle unmanaged mappings the same way. */ origpte = pte_load_clear(pte); KASSERT((origpte & PG_FRAME) == opa, ("pmap_enter: unexpected pa update for %#lx", va)); if ((origpte & PG_MANAGED) != 0) { om = PHYS_TO_VM_PAGE(opa); /* * The pmap lock is sufficient to synchronize with * concurrent calls to pmap_page_test_mappings() and * pmap_ts_referenced(). */ if ((origpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(om); if ((origpte & PG_A) != 0) vm_page_aflag_set(om, PGA_REFERENCED); CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa); pv = pmap_pvh_remove(&om->md, pmap, va); if ((newpte & PG_MANAGED) == 0) free_pv_entry(pmap, pv); #ifdef INVARIANTS else if (origpte & PG_MANAGED) { if (pv == NULL) { pmap_page_print_mappings(om); MPASS(pv != NULL); } } #endif if ((om->a.flags & PGA_WRITEABLE) != 0 && TAILQ_EMPTY(&om->md.pv_list) && ((om->flags & PG_FICTITIOUS) != 0 || TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list))) vm_page_aflag_clear(om, PGA_WRITEABLE); } if ((origpte & PG_A) != 0) invalidate_page = true; origpte = 0; } else { if (pmap != kernel_pmap) { #ifdef INVARIANTS if (VERBOSE_PMAP || pmap_logging) printf("pmap_enter(%p, %#lx, %p, %#x, %x, %d) -- asid=%lu curpid=%d name=%s\n", pmap, va, m, prot, flags, psind, pmap->pm_pid, curproc->p_pid, curproc->p_comm); #endif } /* * Increment the counters. */ if ((newpte & PG_W) != 0) pmap->pm_stats.wired_count++; pmap_resident_count_inc(pmap, 1); } /* * Enter on the PV list if part of our managed memory. */ if ((newpte & PG_MANAGED) != 0) { if (pv == NULL) { pv = get_pv_entry(pmap, &lock); pv->pv_va = va; } #ifdef VERBOSE_PV else printf("reassigning pv: %p to pmap: %p\n", pv, pmap); #endif CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_link); m->md.pv_gen++; if ((newpte & PG_RW) != 0) vm_page_aflag_set(m, PGA_WRITEABLE); } /* * Update the PTE. */ if ((origpte & PG_V) != 0) { validate: origpte = pte_load_store(pte, newpte); KASSERT((origpte & PG_FRAME) == pa, ("pmap_enter: unexpected pa update for %#lx", va)); if ((newpte & PG_M) == 0 && (origpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { if ((origpte & PG_MANAGED) != 0) vm_page_dirty(m); invalidate_page = true; /* * Although the PTE may still have PG_RW set, TLB * invalidation may nonetheless be required because * the PTE no longer has PG_M set. */ } else if ((origpte & PG_X) != 0 || (newpte & PG_X) == 0) { /* * Removing capabilities requires invalidation on POWER */ invalidate_page = true; goto unchanged; } if ((origpte & PG_A) != 0) invalidate_page = true; } else { pte_store(pte, newpte); ptesync(); } unchanged: #if VM_NRESERVLEVEL > 0 /* * If both the page table page and the reservation are fully * populated, then attempt promotion. */ if ((mpte == NULL || mpte->ref_count == NPTEPG) && mmu_radix_ps_enabled(pmap) && (m->flags & PG_FICTITIOUS) == 0 && vm_reserv_level_iffullpop(m) == 0 && pmap_promote_l3e(pmap, l3e, va, &lock) == 0) invalidate_all = true; #endif if (invalidate_all) pmap_invalidate_all(pmap); else if (invalidate_page) pmap_invalidate_page(pmap, va); rv = KERN_SUCCESS; out: if (lock != NULL) rw_wunlock(lock); PMAP_UNLOCK(pmap); return (rv); } /* * Tries to create a read- and/or execute-only 2MB page mapping. Returns true * if successful. Returns false if (1) a page table page cannot be allocated * without sleeping, (2) a mapping already exists at the specified virtual * address, or (3) a PV entry cannot be allocated without reclaiming another * PV entry. */ static bool pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, struct rwlock **lockp) { pml3_entry_t newpde; PMAP_LOCK_ASSERT(pmap, MA_OWNED); newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.mdpg_cache_attrs) | RPTE_LEAF | PG_V; if ((m->oflags & VPO_UNMANAGED) == 0) newpde |= PG_MANAGED; if (prot & VM_PROT_EXECUTE) newpde |= PG_X; if (prot & VM_PROT_READ) newpde |= RPTE_EAA_R; if (va >= DMAP_MIN_ADDRESS) newpde |= RPTE_EAA_P; return (pmap_enter_l3e(pmap, va, newpde, PMAP_ENTER_NOSLEEP | PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, NULL, lockp) == KERN_SUCCESS); } /* * Tries to create the specified 2MB page mapping. Returns KERN_SUCCESS if * the mapping was created, and either KERN_FAILURE or KERN_RESOURCE_SHORTAGE * otherwise. Returns KERN_FAILURE if PMAP_ENTER_NOREPLACE was specified and * a mapping already exists at the specified virtual address. Returns * KERN_RESOURCE_SHORTAGE if PMAP_ENTER_NOSLEEP was specified and a page table * page allocation failed. Returns KERN_RESOURCE_SHORTAGE if * PMAP_ENTER_NORECLAIM was specified and a PV entry allocation failed. * * The parameter "m" is only used when creating a managed, writeable mapping. */ static int pmap_enter_l3e(pmap_t pmap, vm_offset_t va, pml3_entry_t newpde, u_int flags, vm_page_t m, struct rwlock **lockp) { struct spglist free; pml3_entry_t oldl3e, *l3e; vm_page_t mt, pdpg; KASSERT((newpde & (PG_M | PG_RW)) != PG_RW, ("pmap_enter_pde: newpde is missing PG_M")); PMAP_LOCK_ASSERT(pmap, MA_OWNED); if ((pdpg = pmap_allocl3e(pmap, va, (flags & PMAP_ENTER_NOSLEEP) != 0 ? NULL : lockp)) == NULL) { CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" " in pmap %p", va, pmap); return (KERN_RESOURCE_SHORTAGE); } l3e = (pml3_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg)); l3e = &l3e[pmap_pml3e_index(va)]; oldl3e = *l3e; if ((oldl3e & PG_V) != 0) { KASSERT(pdpg->ref_count > 1, ("pmap_enter_pde: pdpg's wire count is too low")); if ((flags & PMAP_ENTER_NOREPLACE) != 0) { pdpg->ref_count--; CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" " in pmap %p", va, pmap); return (KERN_FAILURE); } /* Break the existing mapping(s). */ SLIST_INIT(&free); if ((oldl3e & RPTE_LEAF) != 0) { /* * The reference to the PD page that was acquired by * pmap_allocl3e() ensures that it won't be freed. * However, if the PDE resulted from a promotion, then * a reserved PT page could be freed. */ (void)pmap_remove_l3e(pmap, l3e, va, &free, lockp); } else { if (pmap_remove_ptes(pmap, va, va + L3_PAGE_SIZE, l3e, &free, lockp)) pmap_invalidate_all(pmap); } vm_page_free_pages_toq(&free, true); if (va >= VM_MAXUSER_ADDRESS) { mt = PHYS_TO_VM_PAGE(*l3e & PG_FRAME); if (pmap_insert_pt_page(pmap, mt)) { /* * XXX Currently, this can't happen because * we do not perform pmap_enter(psind == 1) * on the kernel pmap. */ panic("pmap_enter_pde: trie insert failed"); } } else KASSERT(*l3e == 0, ("pmap_enter_pde: non-zero pde %p", l3e)); } if ((newpde & PG_MANAGED) != 0) { /* * Abort this mapping if its PV entry could not be created. */ if (!pmap_pv_insert_l3e(pmap, va, newpde, flags, lockp)) { SLIST_INIT(&free); if (pmap_unwire_ptp(pmap, va, pdpg, &free)) { /* * Although "va" is not mapped, paging- * structure caches could nonetheless have * entries that refer to the freed page table * pages. Invalidate those entries. */ pmap_invalidate_page(pmap, va); vm_page_free_pages_toq(&free, true); } CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" " in pmap %p", va, pmap); return (KERN_RESOURCE_SHORTAGE); } if ((newpde & PG_RW) != 0) { for (mt = m; mt < &m[L3_PAGE_SIZE / PAGE_SIZE]; mt++) vm_page_aflag_set(mt, PGA_WRITEABLE); } } /* * Increment counters. */ if ((newpde & PG_W) != 0) pmap->pm_stats.wired_count += L3_PAGE_SIZE / PAGE_SIZE; pmap_resident_count_inc(pmap, L3_PAGE_SIZE / PAGE_SIZE); /* * Map the superpage. (This is not a promoted mapping; there will not * be any lingering 4KB page mappings in the TLB.) */ pte_store(l3e, newpde); atomic_add_long(&pmap_l3e_mappings, 1); CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx" " in pmap %p", va, pmap); return (KERN_SUCCESS); } void mmu_radix_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, vm_page_t m_start, vm_prot_t prot) { struct rwlock *lock; vm_offset_t va; vm_page_t m, mpte; vm_pindex_t diff, psize; bool invalidate; VM_OBJECT_ASSERT_LOCKED(m_start->object); CTR6(KTR_PMAP, "%s(%p, %#x, %#x, %p, %#x)", __func__, pmap, start, end, m_start, prot); invalidate = false; psize = atop(end - start); mpte = NULL; m = m_start; lock = NULL; PMAP_LOCK(pmap); while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { va = start + ptoa(diff); if ((va & L3_PAGE_MASK) == 0 && va + L3_PAGE_SIZE <= end && m->psind == 1 && mmu_radix_ps_enabled(pmap) && pmap_enter_2mpage(pmap, va, m, prot, &lock)) m = &m[L3_PAGE_SIZE / PAGE_SIZE - 1]; else mpte = mmu_radix_enter_quick_locked(pmap, va, m, prot, mpte, &lock, &invalidate); m = TAILQ_NEXT(m, listq); } ptesync(); if (lock != NULL) rw_wunlock(lock); if (invalidate) pmap_invalidate_all(pmap); PMAP_UNLOCK(pmap); } static vm_page_t mmu_radix_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp, bool *invalidate) { struct spglist free; pt_entry_t *pte; vm_paddr_t pa; KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva || (m->oflags & VPO_UNMANAGED) != 0, ("mmu_radix_enter_quick_locked: managed mapping within the clean submap")); PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * In the case that a page table page is not * resident, we are creating it here. */ if (va < VM_MAXUSER_ADDRESS) { vm_pindex_t ptepindex; pml3_entry_t *ptepa; /* * Calculate pagetable page index */ ptepindex = pmap_l3e_pindex(va); if (mpte && (mpte->pindex == ptepindex)) { mpte->ref_count++; } else { /* * Get the page directory entry */ ptepa = pmap_pml3e(pmap, va); /* * If the page table page is mapped, we just increment * the hold count, and activate it. Otherwise, we * attempt to allocate a page table page. If this * attempt fails, we don't retry. Instead, we give up. */ if (ptepa && (*ptepa & PG_V) != 0) { if (*ptepa & RPTE_LEAF) return (NULL); mpte = PHYS_TO_VM_PAGE(*ptepa & PG_FRAME); mpte->ref_count++; } else { /* * Pass NULL instead of the PV list lock * pointer, because we don't intend to sleep. */ mpte = _pmap_allocpte(pmap, ptepindex, NULL); if (mpte == NULL) return (mpte); } } pte = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte)); pte = &pte[pmap_pte_index(va)]; } else { mpte = NULL; pte = pmap_pte(pmap, va); } if (*pte) { if (mpte != NULL) { mpte->ref_count--; mpte = NULL; } return (mpte); } /* * Enter on the PV list if part of our managed memory. */ if ((m->oflags & VPO_UNMANAGED) == 0 && !pmap_try_insert_pv_entry(pmap, va, m, lockp)) { if (mpte != NULL) { SLIST_INIT(&free); if (pmap_unwire_ptp(pmap, va, mpte, &free)) { /* * Although "va" is not mapped, paging- * structure caches could nonetheless have * entries that refer to the freed page table * pages. Invalidate those entries. */ *invalidate = true; vm_page_free_pages_toq(&free, true); } mpte = NULL; } return (mpte); } /* * Increment counters */ pmap_resident_count_inc(pmap, 1); pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.mdpg_cache_attrs); if (prot & VM_PROT_EXECUTE) pa |= PG_X; else pa |= RPTE_EAA_R; if ((m->oflags & VPO_UNMANAGED) == 0) pa |= PG_MANAGED; pte_store(pte, pa); return (mpte); } void mmu_radix_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) { struct rwlock *lock; bool invalidate; lock = NULL; invalidate = false; PMAP_LOCK(pmap); mmu_radix_enter_quick_locked(pmap, va, m, prot, NULL, &lock, &invalidate); ptesync(); if (lock != NULL) rw_wunlock(lock); if (invalidate) pmap_invalidate_all(pmap); PMAP_UNLOCK(pmap); } vm_paddr_t mmu_radix_extract(pmap_t pmap, vm_offset_t va) { pml3_entry_t *l3e; pt_entry_t *pte; vm_paddr_t pa; l3e = pmap_pml3e(pmap, va); if (__predict_false(l3e == NULL)) return (0); if (*l3e & RPTE_LEAF) { pa = (*l3e & PG_PS_FRAME) | (va & L3_PAGE_MASK); pa |= (va & L3_PAGE_MASK); } else { /* * Beware of a concurrent promotion that changes the * PDE at this point! For example, vtopte() must not * be used to access the PTE because it would use the * new PDE. It is, however, safe to use the old PDE * because the page table page is preserved by the * promotion. */ pte = pmap_l3e_to_pte(l3e, va); if (__predict_false(pte == NULL)) return (0); pa = *pte; pa = (pa & PG_FRAME) | (va & PAGE_MASK); pa |= (va & PAGE_MASK); } return (pa); } vm_page_t mmu_radix_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) { pml3_entry_t l3e, *l3ep; pt_entry_t pte; vm_paddr_t pa; vm_page_t m; pa = 0; m = NULL; CTR4(KTR_PMAP, "%s(%p, %#x, %#x)", __func__, pmap, va, prot); PMAP_LOCK(pmap); l3ep = pmap_pml3e(pmap, va); if (l3ep != NULL && (l3e = *l3ep)) { if (l3e & RPTE_LEAF) { if ((l3e & PG_RW) || (prot & VM_PROT_WRITE) == 0) m = PHYS_TO_VM_PAGE((l3e & PG_PS_FRAME) | (va & L3_PAGE_MASK)); } else { pte = *pmap_l3e_to_pte(l3ep, va); if ((pte & PG_V) && ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) m = PHYS_TO_VM_PAGE(pte & PG_FRAME); } if (m != NULL && !vm_page_wire_mapped(m)) m = NULL; } PMAP_UNLOCK(pmap); return (m); } static void mmu_radix_growkernel(vm_offset_t addr) { vm_paddr_t paddr; vm_page_t nkpg; pml3_entry_t *l3e; pml2_entry_t *l2e; CTR2(KTR_PMAP, "%s(%#x)", __func__, addr); if (VM_MIN_KERNEL_ADDRESS < addr && addr < (VM_MIN_KERNEL_ADDRESS + nkpt * L3_PAGE_SIZE)) return; addr = roundup2(addr, L3_PAGE_SIZE); if (addr - 1 >= vm_map_max(kernel_map)) addr = vm_map_max(kernel_map); while (kernel_vm_end < addr) { l2e = pmap_pml2e(kernel_pmap, kernel_vm_end); if ((*l2e & PG_V) == 0) { /* We need a new PDP entry */ nkpg = vm_page_alloc(NULL, kernel_vm_end >> L2_PAGE_SIZE_SHIFT, VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); if (nkpg == NULL) panic("pmap_growkernel: no memory to grow kernel"); if ((nkpg->flags & PG_ZERO) == 0) mmu_radix_zero_page(nkpg); paddr = VM_PAGE_TO_PHYS(nkpg); pde_store(l2e, paddr); continue; /* try again */ } l3e = pmap_l2e_to_l3e(l2e, kernel_vm_end); if ((*l3e & PG_V) != 0) { kernel_vm_end = (kernel_vm_end + L3_PAGE_SIZE) & ~L3_PAGE_MASK; if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { kernel_vm_end = vm_map_max(kernel_map); break; } continue; } nkpg = vm_page_alloc(NULL, pmap_l3e_pindex(kernel_vm_end), VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); if (nkpg == NULL) panic("pmap_growkernel: no memory to grow kernel"); if ((nkpg->flags & PG_ZERO) == 0) mmu_radix_zero_page(nkpg); paddr = VM_PAGE_TO_PHYS(nkpg); pde_store(l3e, paddr); kernel_vm_end = (kernel_vm_end + L3_PAGE_SIZE) & ~L3_PAGE_MASK; if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { kernel_vm_end = vm_map_max(kernel_map); break; } } ptesync(); } static MALLOC_DEFINE(M_RADIX_PGD, "radix_pgd", "radix page table root directory"); static uma_zone_t zone_radix_pgd; static int radix_pgd_import(void *arg __unused, void **store, int count, int domain __unused, int flags) { for (int i = 0; i < count; i++) { vm_page_t m = vm_page_alloc_contig(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO | VM_ALLOC_WAITOK, RADIX_PGD_SIZE/PAGE_SIZE, 0, (vm_paddr_t)-1, RADIX_PGD_SIZE, L1_PAGE_SIZE, VM_MEMATTR_DEFAULT); /* XXX zero on alloc here so we don't have to later */ store[i] = (void *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); } return (count); } static void radix_pgd_release(void *arg __unused, void **store, int count) { vm_page_t m; struct spglist free; int page_count; SLIST_INIT(&free); page_count = RADIX_PGD_SIZE/PAGE_SIZE; for (int i = 0; i < count; i++) { /* * XXX selectively remove dmap and KVA entries so we don't * need to bzero */ m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)store[i])); for (int j = page_count-1; j >= 0; j--) { vm_page_unwire_noq(&m[j]); SLIST_INSERT_HEAD(&free, &m[j], plinks.s.ss); } vm_page_free_pages_toq(&free, false); } } static void mmu_radix_init() { vm_page_t mpte; vm_size_t s; int error, i, pv_npg; /* L1TF, reserve page @0 unconditionally */ vm_page_blacklist_add(0, bootverbose); zone_radix_pgd = uma_zcache_create("radix_pgd_cache", RADIX_PGD_SIZE, NULL, NULL, #ifdef INVARIANTS trash_init, trash_fini, #else NULL, NULL, #endif radix_pgd_import, radix_pgd_release, NULL, UMA_ZONE_NOBUCKET); /* * Initialize the vm page array entries for the kernel pmap's * page table pages. */ PMAP_LOCK(kernel_pmap); for (i = 0; i < nkpt; i++) { mpte = PHYS_TO_VM_PAGE(KPTphys + (i << PAGE_SHIFT)); KASSERT(mpte >= vm_page_array && mpte < &vm_page_array[vm_page_array_size], ("pmap_init: page table page is out of range size: %lu", vm_page_array_size)); mpte->pindex = pmap_l3e_pindex(VM_MIN_KERNEL_ADDRESS) + i; mpte->phys_addr = KPTphys + (i << PAGE_SHIFT); MPASS(PHYS_TO_VM_PAGE(mpte->phys_addr) == mpte); //pmap_insert_pt_page(kernel_pmap, mpte); mpte->ref_count = 1; } PMAP_UNLOCK(kernel_pmap); vm_wire_add(nkpt); CTR1(KTR_PMAP, "%s()", __func__); TAILQ_INIT(&pv_dummy.pv_list); /* * Are large page mappings enabled? */ TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled); if (pg_ps_enabled) { KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0, ("pmap_init: can't assign to pagesizes[1]")); pagesizes[1] = L3_PAGE_SIZE; } /* * Initialize the pv chunk list mutex. */ mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF); /* * Initialize the pool of pv list locks. */ for (i = 0; i < NPV_LIST_LOCKS; i++) rw_init(&pv_list_locks[i], "pmap pv list"); /* * Calculate the size of the pv head table for superpages. */ pv_npg = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, L3_PAGE_SIZE); /* * Allocate memory for the pv head table for superpages. */ s = (vm_size_t)(pv_npg * sizeof(struct md_page)); s = round_page(s); pv_table = (struct md_page *)kmem_malloc(s, M_WAITOK | M_ZERO); for (i = 0; i < pv_npg; i++) TAILQ_INIT(&pv_table[i].pv_list); TAILQ_INIT(&pv_dummy.pv_list); pmap_initialized = 1; mtx_init(&qframe_mtx, "qfrmlk", NULL, MTX_SPIN); error = vmem_alloc(kernel_arena, PAGE_SIZE, M_BESTFIT | M_WAITOK, (vmem_addr_t *)&qframe); if (error != 0) panic("qframe allocation failed"); asid_arena = vmem_create("ASID", isa3_base_pid + 1, (1<md.pv_list, pv_link) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; rw_runlock(lock); PMAP_LOCK(pmap); rw_rlock(lock); if (md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } pte = pmap_pte(pmap, pv->pv_va); mask = 0; if (modified) mask |= PG_RW | PG_M; if (accessed) mask |= PG_V | PG_A; rv = (*pte & mask) == mask; PMAP_UNLOCK(pmap); if (rv) goto out; } if ((m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH(pv, &pvh->pv_list, pv_link) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; pvh_gen = pvh->pv_gen; rw_runlock(lock); PMAP_LOCK(pmap); rw_rlock(lock); if (md_gen != m->md.pv_gen || pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } pte = pmap_pml3e(pmap, pv->pv_va); mask = 0; if (modified) mask |= PG_RW | PG_M; if (accessed) mask |= PG_V | PG_A; rv = (*pte & mask) == mask; PMAP_UNLOCK(pmap); if (rv) goto out; } } out: rw_runlock(lock); return (rv); } /* * pmap_is_modified: * * Return whether or not the specified physical page was modified * in any physical maps. */ boolean_t mmu_radix_is_modified(vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_is_modified: page %p is not managed", m)); CTR2(KTR_PMAP, "%s(%p)", __func__, m); /* * If the page is not busied then this check is racy. */ if (!pmap_page_is_write_mapped(m)) return (FALSE); return (pmap_page_test_mappings(m, FALSE, TRUE)); } boolean_t mmu_radix_is_prefaultable(pmap_t pmap, vm_offset_t addr) { pml3_entry_t *l3e; pt_entry_t *pte; boolean_t rv; CTR3(KTR_PMAP, "%s(%p, %#x)", __func__, pmap, addr); rv = FALSE; PMAP_LOCK(pmap); l3e = pmap_pml3e(pmap, addr); if (l3e != NULL && (*l3e & (RPTE_LEAF | PG_V)) == PG_V) { pte = pmap_l3e_to_pte(l3e, addr); rv = (*pte & PG_V) == 0; } PMAP_UNLOCK(pmap); return (rv); } boolean_t mmu_radix_is_referenced(vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_is_referenced: page %p is not managed", m)); CTR2(KTR_PMAP, "%s(%p)", __func__, m); return (pmap_page_test_mappings(m, TRUE, FALSE)); } /* * pmap_ts_referenced: * * Return a count of reference bits for a page, clearing those bits. * It is not necessary for every reference bit to be cleared, but it * is necessary that 0 only be returned when there are truly no * reference bits set. * * As an optimization, update the page's dirty field if a modified bit is * found while counting reference bits. This opportunistic update can be * performed at low cost and can eliminate the need for some future calls * to pmap_is_modified(). However, since this function stops after * finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some * dirty pages. Those dirty pages will only be detected by a future call * to pmap_is_modified(). * * A DI block is not needed within this function, because * invalidations are performed before the PV list lock is * released. */ boolean_t mmu_radix_ts_referenced(vm_page_t m) { struct md_page *pvh; pv_entry_t pv, pvf; pmap_t pmap; struct rwlock *lock; pml3_entry_t oldl3e, *l3e; pt_entry_t *pte; vm_paddr_t pa; int cleared, md_gen, not_cleared, pvh_gen; struct spglist free; CTR2(KTR_PMAP, "%s(%p)", __func__, m); KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_ts_referenced: page %p is not managed", m)); SLIST_INIT(&free); cleared = 0; pa = VM_PAGE_TO_PHYS(m); lock = PHYS_TO_PV_LIST_LOCK(pa); pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(pa); rw_wlock(lock); retry: not_cleared = 0; if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL) goto small_mappings; pv = pvf; do { if (pvf == NULL) pvf = pv; pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); goto retry; } } l3e = pmap_pml3e(pmap, pv->pv_va); oldl3e = *l3e; if ((oldl3e & (PG_M | PG_RW)) == (PG_M | PG_RW)) { /* * Although "oldpde" is mapping a 2MB page, because * this function is called at a 4KB page granularity, * we only update the 4KB page under test. */ vm_page_dirty(m); } if ((oldl3e & PG_A) != 0) { /* * Since this reference bit is shared by 512 4KB * pages, it should not be cleared every time it is * tested. Apply a simple "hash" function on the * physical page number, the virtual superpage number, * and the pmap address to select one 4KB page out of * the 512 on which testing the reference bit will * result in clearing that reference bit. This * function is designed to avoid the selection of the * same 4KB page for every 2MB page mapping. * * On demotion, a mapping that hasn't been referenced * is simply destroyed. To avoid the possibility of a * subsequent page fault on a demoted wired mapping, * always leave its reference bit set. Moreover, * since the superpage is wired, the current state of * its reference bit won't affect page replacement. */ if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> L3_PAGE_SIZE_SHIFT) ^ (uintptr_t)pmap) & (NPTEPG - 1)) == 0 && (oldl3e & PG_W) == 0) { atomic_clear_long(l3e, PG_A); pmap_invalidate_page(pmap, pv->pv_va); cleared++; KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), ("inconsistent pv lock %p %p for page %p", lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); } else not_cleared++; } PMAP_UNLOCK(pmap); /* Rotate the PV list if it has more than one entry. */ if (pv != NULL && TAILQ_NEXT(pv, pv_link) != NULL) { TAILQ_REMOVE(&pvh->pv_list, pv, pv_link); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_link); pvh->pv_gen++; } if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX) goto out; } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf); small_mappings: if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL) goto out; pv = pvf; do { if (pvf == NULL) pvf = pv; pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; md_gen = m->md.pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); goto retry; } } l3e = pmap_pml3e(pmap, pv->pv_va); KASSERT((*l3e & RPTE_LEAF) == 0, ("pmap_ts_referenced: found a 2mpage in page %p's pv list", m)); pte = pmap_l3e_to_pte(l3e, pv->pv_va); if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); if ((*pte & PG_A) != 0) { atomic_clear_long(pte, PG_A); pmap_invalidate_page(pmap, pv->pv_va); cleared++; } PMAP_UNLOCK(pmap); /* Rotate the PV list if it has more than one entry. */ if (pv != NULL && TAILQ_NEXT(pv, pv_link) != NULL) { TAILQ_REMOVE(&m->md.pv_list, pv, pv_link); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_link); m->md.pv_gen++; } } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared + not_cleared < PMAP_TS_REFERENCED_MAX); out: rw_wunlock(lock); vm_page_free_pages_toq(&free, true); return (cleared + not_cleared); } static vm_offset_t mmu_radix_map(vm_offset_t *virt __unused, vm_paddr_t start, vm_paddr_t end, int prot __unused) { CTR5(KTR_PMAP, "%s(%p, %#x, %#x, %#x)", __func__, virt, start, end, prot); return (PHYS_TO_DMAP(start)); } void mmu_radix_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, vm_pindex_t pindex, vm_size_t size) { pml3_entry_t *l3e; vm_paddr_t pa, ptepa; vm_page_t p, pdpg; vm_memattr_t ma; CTR6(KTR_PMAP, "%s(%p, %#x, %p, %u, %#x)", __func__, pmap, addr, object, pindex, size); VM_OBJECT_ASSERT_WLOCKED(object); KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, ("pmap_object_init_pt: non-device object")); /* NB: size can be logically ored with addr here */ if ((addr & L3_PAGE_MASK) == 0 && (size & L3_PAGE_MASK) == 0) { if (!mmu_radix_ps_enabled(pmap)) return; if (!vm_object_populate(object, pindex, pindex + atop(size))) return; p = vm_page_lookup(object, pindex); KASSERT(p->valid == VM_PAGE_BITS_ALL, ("pmap_object_init_pt: invalid page %p", p)); ma = p->md.mdpg_cache_attrs; /* * Abort the mapping if the first page is not physically * aligned to a 2MB page boundary. */ ptepa = VM_PAGE_TO_PHYS(p); if (ptepa & L3_PAGE_MASK) return; /* * Skip the first page. Abort the mapping if the rest of * the pages are not physically contiguous or have differing * memory attributes. */ p = TAILQ_NEXT(p, listq); for (pa = ptepa + PAGE_SIZE; pa < ptepa + size; pa += PAGE_SIZE) { KASSERT(p->valid == VM_PAGE_BITS_ALL, ("pmap_object_init_pt: invalid page %p", p)); if (pa != VM_PAGE_TO_PHYS(p) || ma != p->md.mdpg_cache_attrs) return; p = TAILQ_NEXT(p, listq); } PMAP_LOCK(pmap); for (pa = ptepa | pmap_cache_bits(ma); pa < ptepa + size; pa += L3_PAGE_SIZE) { pdpg = pmap_allocl3e(pmap, addr, NULL); if (pdpg == NULL) { /* * The creation of mappings below is only an * optimization. If a page directory page * cannot be allocated without blocking, * continue on to the next mapping rather than * blocking. */ addr += L3_PAGE_SIZE; continue; } l3e = (pml3_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg)); l3e = &l3e[pmap_pml3e_index(addr)]; if ((*l3e & PG_V) == 0) { pa |= PG_M | PG_A | PG_RW; pte_store(l3e, pa); pmap_resident_count_inc(pmap, L3_PAGE_SIZE / PAGE_SIZE); atomic_add_long(&pmap_l3e_mappings, 1); } else { /* Continue on if the PDE is already valid. */ pdpg->ref_count--; KASSERT(pdpg->ref_count > 0, ("pmap_object_init_pt: missing reference " "to page directory page, va: 0x%lx", addr)); } addr += L3_PAGE_SIZE; } ptesync(); PMAP_UNLOCK(pmap); } } boolean_t mmu_radix_page_exists_quick(pmap_t pmap, vm_page_t m) { struct md_page *pvh; struct rwlock *lock; pv_entry_t pv; int loops = 0; boolean_t rv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_page_exists_quick: page %p is not managed", m)); CTR3(KTR_PMAP, "%s(%p, %p)", __func__, pmap, m); rv = FALSE; lock = VM_PAGE_TO_PV_LIST_LOCK(m); rw_rlock(lock); TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) { if (PV_PMAP(pv) == pmap) { rv = TRUE; break; } loops++; if (loops >= 16) break; } if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH(pv, &pvh->pv_list, pv_link) { if (PV_PMAP(pv) == pmap) { rv = TRUE; break; } loops++; if (loops >= 16) break; } } rw_runlock(lock); return (rv); } void mmu_radix_page_init(vm_page_t m) { CTR2(KTR_PMAP, "%s(%p)", __func__, m); TAILQ_INIT(&m->md.pv_list); m->md.mdpg_cache_attrs = VM_MEMATTR_DEFAULT; } int mmu_radix_page_wired_mappings(vm_page_t m) { struct rwlock *lock; struct md_page *pvh; pmap_t pmap; pt_entry_t *pte; pv_entry_t pv; int count, md_gen, pvh_gen; if ((m->oflags & VPO_UNMANAGED) != 0) return (0); CTR2(KTR_PMAP, "%s(%p)", __func__, m); lock = VM_PAGE_TO_PV_LIST_LOCK(m); rw_rlock(lock); restart: count = 0; TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; rw_runlock(lock); PMAP_LOCK(pmap); rw_rlock(lock); if (md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } pte = pmap_pte(pmap, pv->pv_va); if ((*pte & PG_W) != 0) count++; PMAP_UNLOCK(pmap); } if ((m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH(pv, &pvh->pv_list, pv_link) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; pvh_gen = pvh->pv_gen; rw_runlock(lock); PMAP_LOCK(pmap); rw_rlock(lock); if (md_gen != m->md.pv_gen || pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } pte = pmap_pml3e(pmap, pv->pv_va); if ((*pte & PG_W) != 0) count++; PMAP_UNLOCK(pmap); } } rw_runlock(lock); return (count); } static void mmu_radix_update_proctab(int pid, pml1_entry_t l1pa) { isa3_proctab[pid].proctab0 = htobe64(RTS_SIZE | l1pa | RADIX_PGD_INDEX_SHIFT); } int mmu_radix_pinit(pmap_t pmap) { vmem_addr_t pid; vm_paddr_t l1pa; CTR2(KTR_PMAP, "%s(%p)", __func__, pmap); /* * allocate the page directory page */ pmap->pm_pml1 = uma_zalloc(zone_radix_pgd, M_WAITOK); for (int j = 0; j < RADIX_PGD_SIZE_SHIFT; j++) pagezero((vm_offset_t)pmap->pm_pml1 + j * PAGE_SIZE); pmap->pm_radix.rt_root = 0; TAILQ_INIT(&pmap->pm_pvchunk); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); pmap->pm_flags = PMAP_PDE_SUPERPAGE; vmem_alloc(asid_arena, 1, M_FIRSTFIT|M_WAITOK, &pid); pmap->pm_pid = pid; l1pa = DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml1); mmu_radix_update_proctab(pid, l1pa); __asm __volatile("ptesync;isync" : : : "memory"); return (1); } /* * This routine is called if the desired page table page does not exist. * * If page table page allocation fails, this routine may sleep before * returning NULL. It sleeps only if a lock pointer was given. * * Note: If a page allocation fails at page table level two or three, * one or two pages may be held during the wait, only to be released * afterwards. This conservative approach is easily argued to avoid * race conditions. */ static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp) { vm_page_t m, pdppg, pdpg; PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * Allocate a page table page. */ if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) { if (lockp != NULL) { RELEASE_PV_LIST_LOCK(lockp); PMAP_UNLOCK(pmap); vm_wait(NULL); PMAP_LOCK(pmap); } /* * Indicate the need to retry. While waiting, the page table * page may have been allocated. */ return (NULL); } if ((m->flags & PG_ZERO) == 0) mmu_radix_zero_page(m); /* * Map the pagetable page into the process address space, if * it isn't already there. */ if (ptepindex >= (NUPDE + NUPDPE)) { pml1_entry_t *l1e; vm_pindex_t pml1index; /* Wire up a new PDPE page */ pml1index = ptepindex - (NUPDE + NUPDPE); l1e = &pmap->pm_pml1[pml1index]; pde_store(l1e, VM_PAGE_TO_PHYS(m)); } else if (ptepindex >= NUPDE) { vm_pindex_t pml1index; vm_pindex_t pdpindex; pml1_entry_t *l1e; pml2_entry_t *l2e; /* Wire up a new l2e page */ pdpindex = ptepindex - NUPDE; pml1index = pdpindex >> RPTE_SHIFT; l1e = &pmap->pm_pml1[pml1index]; if ((*l1e & PG_V) == 0) { /* Have to allocate a new pdp, recurse */ if (_pmap_allocpte(pmap, NUPDE + NUPDPE + pml1index, lockp) == NULL) { vm_page_unwire_noq(m); vm_page_free_zero(m); return (NULL); } } else { /* Add reference to l2e page */ pdppg = PHYS_TO_VM_PAGE(*l1e & PG_FRAME); pdppg->ref_count++; } l2e = (pml2_entry_t *)PHYS_TO_DMAP(*l1e & PG_FRAME); /* Now find the pdp page */ l2e = &l2e[pdpindex & RPTE_MASK]; pde_store(l2e, VM_PAGE_TO_PHYS(m)); } else { vm_pindex_t pml1index; vm_pindex_t pdpindex; pml1_entry_t *l1e; pml2_entry_t *l2e; pml3_entry_t *l3e; /* Wire up a new PTE page */ pdpindex = ptepindex >> RPTE_SHIFT; pml1index = pdpindex >> RPTE_SHIFT; /* First, find the pdp and check that its valid. */ l1e = &pmap->pm_pml1[pml1index]; if ((*l1e & PG_V) == 0) { /* Have to allocate a new pd, recurse */ if (_pmap_allocpte(pmap, NUPDE + pdpindex, lockp) == NULL) { vm_page_unwire_noq(m); vm_page_free_zero(m); return (NULL); } l2e = (pml2_entry_t *)PHYS_TO_DMAP(*l1e & PG_FRAME); l2e = &l2e[pdpindex & RPTE_MASK]; } else { l2e = (pml2_entry_t *)PHYS_TO_DMAP(*l1e & PG_FRAME); l2e = &l2e[pdpindex & RPTE_MASK]; if ((*l2e & PG_V) == 0) { /* Have to allocate a new pd, recurse */ if (_pmap_allocpte(pmap, NUPDE + pdpindex, lockp) == NULL) { vm_page_unwire_noq(m); vm_page_free_zero(m); return (NULL); } } else { /* Add reference to the pd page */ pdpg = PHYS_TO_VM_PAGE(*l2e & PG_FRAME); pdpg->ref_count++; } } l3e = (pml3_entry_t *)PHYS_TO_DMAP(*l2e & PG_FRAME); /* Now we know where the page directory page is */ l3e = &l3e[ptepindex & RPTE_MASK]; pde_store(l3e, VM_PAGE_TO_PHYS(m)); } pmap_resident_count_inc(pmap, 1); return (m); } static vm_page_t pmap_allocl3e(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) { vm_pindex_t pdpindex, ptepindex; pml2_entry_t *pdpe; vm_page_t pdpg; retry: pdpe = pmap_pml2e(pmap, va); if (pdpe != NULL && (*pdpe & PG_V) != 0) { /* Add a reference to the pd page. */ pdpg = PHYS_TO_VM_PAGE(*pdpe & PG_FRAME); pdpg->ref_count++; } else { /* Allocate a pd page. */ ptepindex = pmap_l3e_pindex(va); pdpindex = ptepindex >> RPTE_SHIFT; pdpg = _pmap_allocpte(pmap, NUPDE + pdpindex, lockp); if (pdpg == NULL && lockp != NULL) goto retry; } return (pdpg); } static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) { vm_pindex_t ptepindex; pml3_entry_t *pd; vm_page_t m; /* * Calculate pagetable page index */ ptepindex = pmap_l3e_pindex(va); retry: /* * Get the page directory entry */ pd = pmap_pml3e(pmap, va); /* * This supports switching from a 2MB page to a * normal 4K page. */ if (pd != NULL && (*pd & (RPTE_LEAF | PG_V)) == (RPTE_LEAF | PG_V)) { if (!pmap_demote_l3e_locked(pmap, pd, va, lockp)) { /* * Invalidation of the 2MB page mapping may have caused * the deallocation of the underlying PD page. */ pd = NULL; } } /* * If the page table page is mapped, we just increment the * hold count, and activate it. */ if (pd != NULL && (*pd & PG_V) != 0) { m = PHYS_TO_VM_PAGE(*pd & PG_FRAME); m->ref_count++; } else { /* * Here if the pte page isn't mapped, or if it has been * deallocated. */ m = _pmap_allocpte(pmap, ptepindex, lockp); if (m == NULL && lockp != NULL) goto retry; } return (m); } static void mmu_radix_pinit0(pmap_t pmap) { CTR2(KTR_PMAP, "%s(%p)", __func__, pmap); PMAP_LOCK_INIT(pmap); pmap->pm_pml1 = kernel_pmap->pm_pml1; pmap->pm_pid = kernel_pmap->pm_pid; pmap->pm_radix.rt_root = 0; TAILQ_INIT(&pmap->pm_pvchunk); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); kernel_pmap->pm_flags = pmap->pm_flags = PMAP_PDE_SUPERPAGE; } /* * pmap_protect_l3e: do the things to protect a 2mpage in a process */ static boolean_t pmap_protect_l3e(pmap_t pmap, pt_entry_t *l3e, vm_offset_t sva, vm_prot_t prot) { pt_entry_t newpde, oldpde; vm_offset_t eva, va; vm_page_t m; boolean_t anychanged; PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT((sva & L3_PAGE_MASK) == 0, ("pmap_protect_l3e: sva is not 2mpage aligned")); anychanged = FALSE; retry: oldpde = newpde = *l3e; if ((oldpde & (PG_MANAGED | PG_M | PG_RW)) == (PG_MANAGED | PG_M | PG_RW)) { eva = sva + L3_PAGE_SIZE; for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); va < eva; va += PAGE_SIZE, m++) vm_page_dirty(m); } if ((prot & VM_PROT_WRITE) == 0) { newpde &= ~(PG_RW | PG_M); newpde |= RPTE_EAA_R; } if (prot & VM_PROT_EXECUTE) newpde |= PG_X; if (newpde != oldpde) { /* * As an optimization to future operations on this PDE, clear * PG_PROMOTED. The impending invalidation will remove any * lingering 4KB page mappings from the TLB. */ if (!atomic_cmpset_long(l3e, oldpde, newpde & ~PG_PROMOTED)) goto retry; anychanged = TRUE; } return (anychanged); } void mmu_radix_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) { vm_offset_t va_next; pml1_entry_t *l1e; pml2_entry_t *l2e; pml3_entry_t ptpaddr, *l3e; pt_entry_t *pte; boolean_t anychanged; CTR5(KTR_PMAP, "%s(%p, %#x, %#x, %#x)", __func__, pmap, sva, eva, prot); KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot)); if (prot == VM_PROT_NONE) { mmu_radix_remove(pmap, sva, eva); return; } if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) == (VM_PROT_WRITE|VM_PROT_EXECUTE)) return; #ifdef INVARIANTS if (VERBOSE_PROTECT || pmap_logging) printf("pmap_protect(%p, %#lx, %#lx, %x) - asid: %lu\n", pmap, sva, eva, prot, pmap->pm_pid); #endif anychanged = FALSE; PMAP_LOCK(pmap); for (; sva < eva; sva = va_next) { l1e = pmap_pml1e(pmap, sva); if ((*l1e & PG_V) == 0) { va_next = (sva + L1_PAGE_SIZE) & ~L1_PAGE_MASK; if (va_next < sva) va_next = eva; continue; } l2e = pmap_l1e_to_l2e(l1e, sva); if ((*l2e & PG_V) == 0) { va_next = (sva + L2_PAGE_SIZE) & ~L2_PAGE_MASK; if (va_next < sva) va_next = eva; continue; } va_next = (sva + L3_PAGE_SIZE) & ~L3_PAGE_MASK; if (va_next < sva) va_next = eva; l3e = pmap_l2e_to_l3e(l2e, sva); ptpaddr = *l3e; /* * Weed out invalid mappings. */ if (ptpaddr == 0) continue; /* * Check for large page. */ if ((ptpaddr & RPTE_LEAF) != 0) { /* * Are we protecting the entire large page? If not, * demote the mapping and fall through. */ if (sva + L3_PAGE_SIZE == va_next && eva >= va_next) { if (pmap_protect_l3e(pmap, l3e, sva, prot)) anychanged = TRUE; continue; } else if (!pmap_demote_l3e(pmap, l3e, sva)) { /* * The large page mapping was destroyed. */ continue; } } if (va_next > eva) va_next = eva; for (pte = pmap_l3e_to_pte(l3e, sva); sva != va_next; pte++, sva += PAGE_SIZE) { pt_entry_t obits, pbits; vm_page_t m; retry: MPASS(pte == pmap_pte(pmap, sva)); obits = pbits = *pte; if ((pbits & PG_V) == 0) continue; if ((prot & VM_PROT_WRITE) == 0) { if ((pbits & (PG_MANAGED | PG_M | PG_RW)) == (PG_MANAGED | PG_M | PG_RW)) { m = PHYS_TO_VM_PAGE(pbits & PG_FRAME); vm_page_dirty(m); } pbits &= ~(PG_RW | PG_M); pbits |= RPTE_EAA_R; } if (prot & VM_PROT_EXECUTE) pbits |= PG_X; if (pbits != obits) { if (!atomic_cmpset_long(pte, obits, pbits)) goto retry; if (obits & (PG_A|PG_M)) { anychanged = TRUE; #ifdef INVARIANTS if (VERBOSE_PROTECT || pmap_logging) printf("%#lx %#lx -> %#lx\n", sva, obits, pbits); #endif } } } } if (anychanged) pmap_invalidate_all(pmap); PMAP_UNLOCK(pmap); } void mmu_radix_qenter(vm_offset_t sva, vm_page_t *ma, int count) { CTR4(KTR_PMAP, "%s(%#x, %p, %d)", __func__, sva, ma, count); pt_entry_t oldpte, pa, *pte; vm_page_t m; uint64_t cache_bits, attr_bits; vm_offset_t va; oldpte = 0; attr_bits = RPTE_EAA_R | RPTE_EAA_W | RPTE_EAA_P | PG_M | PG_A; va = sva; pte = kvtopte(va); while (va < sva + PAGE_SIZE * count) { if (__predict_false((va & L3_PAGE_MASK) == 0)) pte = kvtopte(va); MPASS(pte == pmap_pte(kernel_pmap, va)); /* * XXX there has to be a more efficient way than traversing * the page table every time - but go for correctness for * today */ m = *ma++; cache_bits = pmap_cache_bits(m->md.mdpg_cache_attrs); pa = VM_PAGE_TO_PHYS(m) | cache_bits | attr_bits; if (*pte != pa) { oldpte |= *pte; pte_store(pte, pa); } va += PAGE_SIZE; pte++; } if (__predict_false((oldpte & RPTE_VALID) != 0)) pmap_invalidate_range(kernel_pmap, sva, sva + count * PAGE_SIZE); else ptesync(); } void mmu_radix_qremove(vm_offset_t sva, int count) { vm_offset_t va; pt_entry_t *pte; CTR3(KTR_PMAP, "%s(%#x, %d)", __func__, sva, count); KASSERT(sva >= VM_MIN_KERNEL_ADDRESS, ("usermode or dmap va %lx", sva)); va = sva; pte = kvtopte(va); while (va < sva + PAGE_SIZE * count) { if (__predict_false((va & L3_PAGE_MASK) == 0)) pte = kvtopte(va); pte_clear(pte); pte++; va += PAGE_SIZE; } pmap_invalidate_range(kernel_pmap, sva, va); } /*************************************************** * Page table page management routines..... ***************************************************/ /* * Schedule the specified unused page table page to be freed. Specifically, * add the page to the specified list of pages that will be released to the * physical memory manager after the TLB has been updated. */ static __inline void pmap_add_delayed_free_list(vm_page_t m, struct spglist *free, boolean_t set_PG_ZERO) { if (set_PG_ZERO) m->flags |= PG_ZERO; else m->flags &= ~PG_ZERO; SLIST_INSERT_HEAD(free, m, plinks.s.ss); } /* * Inserts the specified page table page into the specified pmap's collection * of idle page table pages. Each of a pmap's page table pages is responsible * for mapping a distinct range of virtual addresses. The pmap's collection is * ordered by this virtual address range. */ static __inline int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); return (vm_radix_insert(&pmap->pm_radix, mpte)); } /* * Removes the page table page mapping the specified virtual address from the * specified pmap's collection of idle page table pages, and returns it. * Otherwise, returns NULL if there is no page table page corresponding to the * specified virtual address. */ static __inline vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); return (vm_radix_remove(&pmap->pm_radix, pmap_l3e_pindex(va))); } /* * Decrements a page table page's wire count, which is used to record the * number of valid page table entries within the page. If the wire count * drops to zero, then the page table page is unmapped. Returns TRUE if the * page table page was unmapped and FALSE otherwise. */ static inline boolean_t pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) { --m->ref_count; if (m->ref_count == 0) { _pmap_unwire_ptp(pmap, va, m, free); return (TRUE); } else return (FALSE); } static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * unmap the page table page */ if (m->pindex >= (NUPDE + NUPDPE)) { /* PDP page */ pml1_entry_t *pml1; pml1 = pmap_pml1e(pmap, va); *pml1 = 0; } else if (m->pindex >= NUPDE) { /* PD page */ pml2_entry_t *l2e; l2e = pmap_pml2e(pmap, va); *l2e = 0; } else { /* PTE page */ pml3_entry_t *l3e; l3e = pmap_pml3e(pmap, va); *l3e = 0; } pmap_resident_count_dec(pmap, 1); if (m->pindex < NUPDE) { /* We just released a PT, unhold the matching PD */ vm_page_t pdpg; pdpg = PHYS_TO_VM_PAGE(*pmap_pml2e(pmap, va) & PG_FRAME); pmap_unwire_ptp(pmap, va, pdpg, free); } if (m->pindex >= NUPDE && m->pindex < (NUPDE + NUPDPE)) { /* We just released a PD, unhold the matching PDP */ vm_page_t pdppg; pdppg = PHYS_TO_VM_PAGE(*pmap_pml1e(pmap, va) & PG_FRAME); pmap_unwire_ptp(pmap, va, pdppg, free); } /* * Put page on a list so that it is released after * *ALL* TLB shootdown is done */ pmap_add_delayed_free_list(m, free, TRUE); } /* * After removing a page table entry, this routine is used to * conditionally free the page, and manage the hold/wire counts. */ static int pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pml3_entry_t ptepde, struct spglist *free) { vm_page_t mpte; if (va >= VM_MAXUSER_ADDRESS) return (0); KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0")); mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME); return (pmap_unwire_ptp(pmap, va, mpte, free)); } void mmu_radix_release(pmap_t pmap) { CTR2(KTR_PMAP, "%s(%p)", __func__, pmap); KASSERT(pmap->pm_stats.resident_count == 0, ("pmap_release: pmap resident count %ld != 0", pmap->pm_stats.resident_count)); KASSERT(vm_radix_is_empty(&pmap->pm_radix), ("pmap_release: pmap has reserved page table page(s)")); pmap_invalidate_all(pmap); isa3_proctab[pmap->pm_pid].proctab0 = 0; uma_zfree(zone_radix_pgd, pmap->pm_pml1); vmem_free(asid_arena, pmap->pm_pid, 1); } /* * Create the PV entry for a 2MB page mapping. Always returns true unless the * flag PMAP_ENTER_NORECLAIM is specified. If that flag is specified, returns * false if the PV entry cannot be allocated without resorting to reclamation. */ static bool pmap_pv_insert_l3e(pmap_t pmap, vm_offset_t va, pml3_entry_t pde, u_int flags, struct rwlock **lockp) { struct md_page *pvh; pv_entry_t pv; vm_paddr_t pa; PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* Pass NULL instead of the lock pointer to disable reclamation. */ if ((pv = get_pv_entry(pmap, (flags & PMAP_ENTER_NORECLAIM) != 0 ? NULL : lockp)) == NULL) return (false); pv->pv_va = va; pa = pde & PG_PS_FRAME; CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); pvh = pa_to_pvh(pa); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_link); pvh->pv_gen++; return (true); } /* * Fills a page table page with mappings to consecutive physical pages. */ static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte) { pt_entry_t *pte; for (pte = firstpte; pte < firstpte + NPTEPG; pte++) { *pte = newpte; newpte += PAGE_SIZE; } } static boolean_t pmap_demote_l3e(pmap_t pmap, pml3_entry_t *pde, vm_offset_t va) { struct rwlock *lock; boolean_t rv; lock = NULL; rv = pmap_demote_l3e_locked(pmap, pde, va, &lock); if (lock != NULL) rw_wunlock(lock); return (rv); } static boolean_t pmap_demote_l3e_locked(pmap_t pmap, pml3_entry_t *l3e, vm_offset_t va, struct rwlock **lockp) { pml3_entry_t oldpde; pt_entry_t *firstpte; vm_paddr_t mptepa; vm_page_t mpte; struct spglist free; vm_offset_t sva; PMAP_LOCK_ASSERT(pmap, MA_OWNED); oldpde = *l3e; KASSERT((oldpde & (RPTE_LEAF | PG_V)) == (RPTE_LEAF | PG_V), ("pmap_demote_l3e: oldpde is missing RPTE_LEAF and/or PG_V %lx", oldpde)); if ((oldpde & PG_A) == 0 || (mpte = pmap_remove_pt_page(pmap, va)) == NULL) { KASSERT((oldpde & PG_W) == 0, ("pmap_demote_l3e: page table page for a wired mapping" " is missing")); /* * Invalidate the 2MB page mapping and return "failure" if the * mapping was never accessed or the allocation of the new * page table page fails. If the 2MB page mapping belongs to * the direct map region of the kernel's address space, then * the page allocation request specifies the highest possible * priority (VM_ALLOC_INTERRUPT). Otherwise, the priority is * normal. Page table pages are preallocated for every other * part of the kernel address space, so the direct map region * is the only part of the kernel address space that must be * handled here. */ if ((oldpde & PG_A) == 0 || (mpte = vm_page_alloc(NULL, pmap_l3e_pindex(va), (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS ? VM_ALLOC_INTERRUPT : VM_ALLOC_NORMAL) | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) { SLIST_INIT(&free); sva = trunc_2mpage(va); pmap_remove_l3e(pmap, l3e, sva, &free, lockp); pmap_invalidate_l3e_page(pmap, sva, oldpde); vm_page_free_pages_toq(&free, true); CTR2(KTR_PMAP, "pmap_demote_l3e: failure for va %#lx" " in pmap %p", va, pmap); return (FALSE); } if (va < VM_MAXUSER_ADDRESS) pmap_resident_count_inc(pmap, 1); } mptepa = VM_PAGE_TO_PHYS(mpte); firstpte = (pt_entry_t *)PHYS_TO_DMAP(mptepa); KASSERT((oldpde & PG_A) != 0, ("pmap_demote_l3e: oldpde is missing PG_A")); KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW, ("pmap_demote_l3e: oldpde is missing PG_M")); /* * If the page table page is new, initialize it. */ if (mpte->ref_count == 1) { mpte->ref_count = NPTEPG; pmap_fill_ptp(firstpte, oldpde); } KASSERT((*firstpte & PG_FRAME) == (oldpde & PG_FRAME), ("pmap_demote_l3e: firstpte and newpte map different physical" " addresses")); /* * If the mapping has changed attributes, update the page table * entries. */ if ((*firstpte & PG_PTE_PROMOTE) != (oldpde & PG_PTE_PROMOTE)) pmap_fill_ptp(firstpte, oldpde); /* * The spare PV entries must be reserved prior to demoting the * mapping, that is, prior to changing the PDE. Otherwise, the state * of the PDE and the PV lists will be inconsistent, which can result * in reclaim_pv_chunk() attempting to remove a PV entry from the * wrong PV list and pmap_pv_demote_l3e() failing to find the expected * PV entry for the 2MB page mapping that is being demoted. */ if ((oldpde & PG_MANAGED) != 0) reserve_pv_entries(pmap, NPTEPG - 1, lockp); /* * Demote the mapping. This pmap is locked. The old PDE has * PG_A set. If the old PDE has PG_RW set, it also has PG_M * set. Thus, there is no danger of a race with another * processor changing the setting of PG_A and/or PG_M between * the read above and the store below. */ pde_store(l3e, mptepa); ptesync(); /* * Demote the PV entry. */ if ((oldpde & PG_MANAGED) != 0) pmap_pv_demote_l3e(pmap, va, oldpde & PG_PS_FRAME, lockp); atomic_add_long(&pmap_l3e_demotions, 1); CTR2(KTR_PMAP, "pmap_demote_l3e: success for va %#lx" " in pmap %p", va, pmap); return (TRUE); } /* * pmap_remove_kernel_pde: Remove a kernel superpage mapping. */ static void pmap_remove_kernel_l3e(pmap_t pmap, pml3_entry_t *l3e, vm_offset_t va) { vm_paddr_t mptepa; vm_page_t mpte; KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap)); PMAP_LOCK_ASSERT(pmap, MA_OWNED); mpte = pmap_remove_pt_page(pmap, va); if (mpte == NULL) panic("pmap_remove_kernel_pde: Missing pt page."); mptepa = VM_PAGE_TO_PHYS(mpte); /* * Initialize the page table page. */ pagezero(PHYS_TO_DMAP(mptepa)); /* * Demote the mapping. */ pde_store(l3e, mptepa); ptesync(); } /* * pmap_remove_l3e: do the things to unmap a superpage in a process */ static int pmap_remove_l3e(pmap_t pmap, pml3_entry_t *pdq, vm_offset_t sva, struct spglist *free, struct rwlock **lockp) { struct md_page *pvh; pml3_entry_t oldpde; vm_offset_t eva, va; vm_page_t m, mpte; PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT((sva & L3_PAGE_MASK) == 0, ("pmap_remove_l3e: sva is not 2mpage aligned")); oldpde = pte_load_clear(pdq); if (oldpde & PG_W) pmap->pm_stats.wired_count -= (L3_PAGE_SIZE / PAGE_SIZE); pmap_resident_count_dec(pmap, L3_PAGE_SIZE / PAGE_SIZE); if (oldpde & PG_MANAGED) { CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, oldpde & PG_PS_FRAME); pvh = pa_to_pvh(oldpde & PG_PS_FRAME); pmap_pvh_free(pvh, pmap, sva); eva = sva + L3_PAGE_SIZE; for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); va < eva; va += PAGE_SIZE, m++) { if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); if (oldpde & PG_A) vm_page_aflag_set(m, PGA_REFERENCED); if (TAILQ_EMPTY(&m->md.pv_list) && TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } } if (pmap == kernel_pmap) { pmap_remove_kernel_l3e(pmap, pdq, sva); } else { mpte = pmap_remove_pt_page(pmap, sva); if (mpte != NULL) { pmap_resident_count_dec(pmap, 1); KASSERT(mpte->ref_count == NPTEPG, ("pmap_remove_l3e: pte page wire count error")); mpte->ref_count = 0; pmap_add_delayed_free_list(mpte, free, FALSE); } } return (pmap_unuse_pt(pmap, sva, *pmap_pml2e(pmap, sva), free)); } /* * pmap_remove_pte: do the things to unmap a page in a process */ static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va, pml3_entry_t ptepde, struct spglist *free, struct rwlock **lockp) { struct md_page *pvh; pt_entry_t oldpte; vm_page_t m; PMAP_LOCK_ASSERT(pmap, MA_OWNED); oldpte = pte_load_clear(ptq); if (oldpte & RPTE_WIRED) pmap->pm_stats.wired_count -= 1; pmap_resident_count_dec(pmap, 1); if (oldpte & RPTE_MANAGED) { m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME); if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); if (oldpte & PG_A) vm_page_aflag_set(m, PGA_REFERENCED); CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); pmap_pvh_free(&m->md, pmap, va); if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); if (TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } } return (pmap_unuse_pt(pmap, va, ptepde, free)); } /* * Remove a single page from a process address space */ static bool pmap_remove_page(pmap_t pmap, vm_offset_t va, pml3_entry_t *l3e, struct spglist *free) { struct rwlock *lock; pt_entry_t *pte; bool invalidate_all; PMAP_LOCK_ASSERT(pmap, MA_OWNED); if ((*l3e & RPTE_VALID) == 0) { return (false); } pte = pmap_l3e_to_pte(l3e, va); if ((*pte & RPTE_VALID) == 0) { return (false); } lock = NULL; invalidate_all = pmap_remove_pte(pmap, pte, va, *l3e, free, &lock); if (lock != NULL) rw_wunlock(lock); if (!invalidate_all) pmap_invalidate_page(pmap, va); return (invalidate_all); } /* * Removes the specified range of addresses from the page table page. */ static bool pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, pml3_entry_t *l3e, struct spglist *free, struct rwlock **lockp) { pt_entry_t *pte; vm_offset_t va; bool anyvalid; PMAP_LOCK_ASSERT(pmap, MA_OWNED); anyvalid = false; va = eva; for (pte = pmap_l3e_to_pte(l3e, sva); sva != eva; pte++, sva += PAGE_SIZE) { MPASS(pte == pmap_pte(pmap, sva)); if (*pte == 0) { if (va != eva) { anyvalid = true; va = eva; } continue; } if (va == eva) va = sva; if (pmap_remove_pte(pmap, pte, sva, *l3e, free, lockp)) { anyvalid = true; sva += PAGE_SIZE; break; } } if (anyvalid) pmap_invalidate_all(pmap); else if (va != eva) pmap_invalidate_range(pmap, va, sva); return (anyvalid); } void mmu_radix_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { struct rwlock *lock; vm_offset_t va_next; pml1_entry_t *l1e; pml2_entry_t *l2e; pml3_entry_t ptpaddr, *l3e; struct spglist free; bool anyvalid; CTR4(KTR_PMAP, "%s(%p, %#x, %#x)", __func__, pmap, sva, eva); /* * Perform an unsynchronized read. This is, however, safe. */ if (pmap->pm_stats.resident_count == 0) return; anyvalid = false; SLIST_INIT(&free); /* XXX something fishy here */ sva = (sva + PAGE_MASK) & ~PAGE_MASK; eva = (eva + PAGE_MASK) & ~PAGE_MASK; PMAP_LOCK(pmap); /* * special handling of removing one page. a very * common operation and easy to short circuit some * code. */ if (sva + PAGE_SIZE == eva) { l3e = pmap_pml3e(pmap, sva); if (l3e && (*l3e & RPTE_LEAF) == 0) { anyvalid = pmap_remove_page(pmap, sva, l3e, &free); goto out; } } lock = NULL; for (; sva < eva; sva = va_next) { if (pmap->pm_stats.resident_count == 0) break; l1e = pmap_pml1e(pmap, sva); if (l1e == NULL || (*l1e & PG_V) == 0) { va_next = (sva + L1_PAGE_SIZE) & ~L1_PAGE_MASK; if (va_next < sva) va_next = eva; continue; } l2e = pmap_l1e_to_l2e(l1e, sva); if (l2e == NULL || (*l2e & PG_V) == 0) { va_next = (sva + L2_PAGE_SIZE) & ~L2_PAGE_MASK; if (va_next < sva) va_next = eva; continue; } /* * Calculate index for next page table. */ va_next = (sva + L3_PAGE_SIZE) & ~L3_PAGE_MASK; if (va_next < sva) va_next = eva; l3e = pmap_l2e_to_l3e(l2e, sva); ptpaddr = *l3e; /* * Weed out invalid mappings. */ if (ptpaddr == 0) continue; /* * Check for large page. */ if ((ptpaddr & RPTE_LEAF) != 0) { /* * Are we removing the entire large page? If not, * demote the mapping and fall through. */ if (sva + L3_PAGE_SIZE == va_next && eva >= va_next) { pmap_remove_l3e(pmap, l3e, sva, &free, &lock); continue; } else if (!pmap_demote_l3e_locked(pmap, l3e, sva, &lock)) { /* The large page mapping was destroyed. */ continue; } else ptpaddr = *l3e; } /* * Limit our scan to either the end of the va represented * by the current page table page, or to the end of the * range being removed. */ if (va_next > eva) va_next = eva; if (pmap_remove_ptes(pmap, sva, va_next, l3e, &free, &lock)) anyvalid = true; } if (lock != NULL) rw_wunlock(lock); out: if (anyvalid) pmap_invalidate_all(pmap); PMAP_UNLOCK(pmap); vm_page_free_pages_toq(&free, true); } void mmu_radix_remove_all(vm_page_t m) { struct md_page *pvh; pv_entry_t pv; pmap_t pmap; struct rwlock *lock; pt_entry_t *pte, tpte; pml3_entry_t *l3e; vm_offset_t va; struct spglist free; int pvh_gen, md_gen; CTR2(KTR_PMAP, "%s(%p)", __func__, m); KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_remove_all: page %p is not managed", m)); SLIST_INIT(&free); lock = VM_PAGE_TO_PV_LIST_LOCK(m); pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(VM_PAGE_TO_PHYS(m)); retry: rw_wlock(lock); while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen) { rw_wunlock(lock); PMAP_UNLOCK(pmap); goto retry; } } va = pv->pv_va; l3e = pmap_pml3e(pmap, va); (void)pmap_demote_l3e_locked(pmap, l3e, va, &lock); PMAP_UNLOCK(pmap); } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; md_gen = m->md.pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { rw_wunlock(lock); PMAP_UNLOCK(pmap); goto retry; } } pmap_resident_count_dec(pmap, 1); l3e = pmap_pml3e(pmap, pv->pv_va); KASSERT((*l3e & RPTE_LEAF) == 0, ("pmap_remove_all: found" " a 2mpage in page %p's pv list", m)); pte = pmap_l3e_to_pte(l3e, pv->pv_va); tpte = pte_load_clear(pte); if (tpte & PG_W) pmap->pm_stats.wired_count--; if (tpte & PG_A) vm_page_aflag_set(m, PGA_REFERENCED); /* * Update the vm_page_t clean and reference bits. */ if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); pmap_unuse_pt(pmap, pv->pv_va, *l3e, &free); pmap_invalidate_page(pmap, pv->pv_va); TAILQ_REMOVE(&m->md.pv_list, pv, pv_link); m->md.pv_gen++; free_pv_entry(pmap, pv); PMAP_UNLOCK(pmap); } vm_page_aflag_clear(m, PGA_WRITEABLE); rw_wunlock(lock); vm_page_free_pages_toq(&free, true); } /* * Destroy all managed, non-wired mappings in the given user-space * pmap. This pmap cannot be active on any processor besides the * caller. * * This function cannot be applied to the kernel pmap. Moreover, it * is not intended for general use. It is only to be used during * process termination. Consequently, it can be implemented in ways * that make it faster than pmap_remove(). First, it can more quickly * destroy mappings by iterating over the pmap's collection of PV * entries, rather than searching the page table. Second, it doesn't * have to test and clear the page table entries atomically, because * no processor is currently accessing the user address space. In * particular, a page table entry's dirty bit won't change state once * this function starts. * * Although this function destroys all of the pmap's managed, * non-wired mappings, it can delay and batch the invalidation of TLB * entries without calling pmap_delayed_invl_started() and * pmap_delayed_invl_finished(). Because the pmap is not active on * any other processor, none of these TLB entries will ever be used * before their eventual invalidation. Consequently, there is no need * for either pmap_remove_all() or pmap_remove_write() to wait for * that eventual TLB invalidation. */ void mmu_radix_remove_pages(pmap_t pmap) { CTR2(KTR_PMAP, "%s(%p)", __func__, pmap); pml3_entry_t ptel3e; pt_entry_t *pte, tpte; struct spglist free; vm_page_t m, mpte, mt; pv_entry_t pv; struct md_page *pvh; struct pv_chunk *pc, *npc; struct rwlock *lock; int64_t bit; uint64_t inuse, bitmask; int allfree, field, freed, idx; boolean_t superpage; vm_paddr_t pa; /* * Assert that the given pmap is only active on the current * CPU. Unfortunately, we cannot block another CPU from * activating the pmap while this function is executing. */ KASSERT(pmap->pm_pid == mfspr(SPR_PID), ("non-current asid %lu - expected %lu", pmap->pm_pid, mfspr(SPR_PID))); lock = NULL; SLIST_INIT(&free); PMAP_LOCK(pmap); TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { allfree = 1; freed = 0; for (field = 0; field < _NPCM; field++) { inuse = ~pc->pc_map[field] & pc_freemask[field]; while (inuse != 0) { bit = cnttzd(inuse); bitmask = 1UL << bit; idx = field * 64 + bit; pv = &pc->pc_pventry[idx]; inuse &= ~bitmask; pte = pmap_pml2e(pmap, pv->pv_va); ptel3e = *pte; pte = pmap_l2e_to_l3e(pte, pv->pv_va); tpte = *pte; if ((tpte & (RPTE_LEAF | PG_V)) == PG_V) { superpage = FALSE; ptel3e = tpte; pte = (pt_entry_t *)PHYS_TO_DMAP(tpte & PG_FRAME); pte = &pte[pmap_pte_index(pv->pv_va)]; tpte = *pte; } else { /* * Keep track whether 'tpte' is a * superpage explicitly instead of * relying on RPTE_LEAF being set. * * This is because RPTE_LEAF is numerically * identical to PG_PTE_PAT and thus a * regular page could be mistaken for * a superpage. */ superpage = TRUE; } if ((tpte & PG_V) == 0) { panic("bad pte va %lx pte %lx", pv->pv_va, tpte); } /* * We cannot remove wired pages from a process' mapping at this time */ if (tpte & PG_W) { allfree = 0; continue; } if (superpage) pa = tpte & PG_PS_FRAME; else pa = tpte & PG_FRAME; m = PHYS_TO_VM_PAGE(pa); KASSERT(m->phys_addr == pa, ("vm_page_t %p phys_addr mismatch %016jx %016jx", m, (uintmax_t)m->phys_addr, (uintmax_t)tpte)); KASSERT((m->flags & PG_FICTITIOUS) != 0 || m < &vm_page_array[vm_page_array_size], ("pmap_remove_pages: bad tpte %#jx", (uintmax_t)tpte)); pte_clear(pte); /* * Update the vm_page_t clean/reference bits. */ if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { if (superpage) { for (mt = m; mt < &m[L3_PAGE_SIZE / PAGE_SIZE]; mt++) vm_page_dirty(mt); } else vm_page_dirty(m); } CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m); /* Mark free */ pc->pc_map[field] |= bitmask; if (superpage) { pmap_resident_count_dec(pmap, L3_PAGE_SIZE / PAGE_SIZE); pvh = pa_to_pvh(tpte & PG_PS_FRAME); TAILQ_REMOVE(&pvh->pv_list, pv, pv_link); pvh->pv_gen++; if (TAILQ_EMPTY(&pvh->pv_list)) { for (mt = m; mt < &m[L3_PAGE_SIZE / PAGE_SIZE]; mt++) if ((mt->a.flags & PGA_WRITEABLE) != 0 && TAILQ_EMPTY(&mt->md.pv_list)) vm_page_aflag_clear(mt, PGA_WRITEABLE); } mpte = pmap_remove_pt_page(pmap, pv->pv_va); if (mpte != NULL) { pmap_resident_count_dec(pmap, 1); KASSERT(mpte->ref_count == NPTEPG, ("pmap_remove_pages: pte page wire count error")); mpte->ref_count = 0; pmap_add_delayed_free_list(mpte, &free, FALSE); } } else { pmap_resident_count_dec(pmap, 1); #ifdef VERBOSE_PV printf("freeing pv (%p, %p)\n", pmap, pv); #endif TAILQ_REMOVE(&m->md.pv_list, pv, pv_link); m->md.pv_gen++; if ((m->a.flags & PGA_WRITEABLE) != 0 && TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); if (TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } } pmap_unuse_pt(pmap, pv->pv_va, ptel3e, &free); freed++; } } PV_STAT(atomic_add_long(&pv_entry_frees, freed)); PV_STAT(atomic_add_int(&pv_entry_spare, freed)); PV_STAT(atomic_subtract_long(&pv_entry_count, freed)); if (allfree) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); free_pv_chunk(pc); } } if (lock != NULL) rw_wunlock(lock); pmap_invalidate_all(pmap); PMAP_UNLOCK(pmap); vm_page_free_pages_toq(&free, true); } void mmu_radix_remove_write(vm_page_t m) { struct md_page *pvh; pmap_t pmap; struct rwlock *lock; pv_entry_t next_pv, pv; pml3_entry_t *l3e; pt_entry_t oldpte, *pte; int pvh_gen, md_gen; CTR2(KTR_PMAP, "%s(%p)", __func__, m); KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_remove_write: page %p is not managed", m)); vm_page_assert_busied(m); if (!pmap_page_is_write_mapped(m)) return; lock = VM_PAGE_TO_PV_LIST_LOCK(m); pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(VM_PAGE_TO_PHYS(m)); retry_pv_loop: rw_wlock(lock); TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_link, next_pv) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); rw_wunlock(lock); goto retry_pv_loop; } } l3e = pmap_pml3e(pmap, pv->pv_va); if ((*l3e & PG_RW) != 0) (void)pmap_demote_l3e_locked(pmap, l3e, pv->pv_va, &lock); KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), ("inconsistent pv lock %p %p for page %p", lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); PMAP_UNLOCK(pmap); } TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; md_gen = m->md.pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); rw_wunlock(lock); goto retry_pv_loop; } } l3e = pmap_pml3e(pmap, pv->pv_va); KASSERT((*l3e & RPTE_LEAF) == 0, ("pmap_remove_write: found a 2mpage in page %p's pv list", m)); pte = pmap_l3e_to_pte(l3e, pv->pv_va); retry: oldpte = *pte; if (oldpte & PG_RW) { if (!atomic_cmpset_long(pte, oldpte, (oldpte | RPTE_EAA_R) & ~(PG_RW | PG_M))) goto retry; if ((oldpte & PG_M) != 0) vm_page_dirty(m); pmap_invalidate_page(pmap, pv->pv_va); } PMAP_UNLOCK(pmap); } rw_wunlock(lock); vm_page_aflag_clear(m, PGA_WRITEABLE); } /* * Clear the wired attribute from the mappings for the specified range of * addresses in the given pmap. Every valid mapping within that range * must have the wired attribute set. In contrast, invalid mappings * cannot have the wired attribute set, so they are ignored. * * The wired attribute of the page table entry is not a hardware * feature, so there is no need to invalidate any TLB entries. * Since pmap_demote_l3e() for the wired entry must never fail, * pmap_delayed_invl_started()/finished() calls around the * function are not needed. */ void mmu_radix_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t va_next; pml1_entry_t *l1e; pml2_entry_t *l2e; pml3_entry_t *l3e; pt_entry_t *pte; CTR4(KTR_PMAP, "%s(%p, %#x, %#x)", __func__, pmap, sva, eva); PMAP_LOCK(pmap); for (; sva < eva; sva = va_next) { l1e = pmap_pml1e(pmap, sva); if ((*l1e & PG_V) == 0) { va_next = (sva + L1_PAGE_SIZE) & ~L1_PAGE_MASK; if (va_next < sva) va_next = eva; continue; } l2e = pmap_l1e_to_l2e(l1e, sva); if ((*l2e & PG_V) == 0) { va_next = (sva + L2_PAGE_SIZE) & ~L2_PAGE_MASK; if (va_next < sva) va_next = eva; continue; } va_next = (sva + L3_PAGE_SIZE) & ~L3_PAGE_MASK; if (va_next < sva) va_next = eva; l3e = pmap_l2e_to_l3e(l2e, sva); if ((*l3e & PG_V) == 0) continue; if ((*l3e & RPTE_LEAF) != 0) { if ((*l3e & PG_W) == 0) panic("pmap_unwire: pde %#jx is missing PG_W", (uintmax_t)*l3e); /* * Are we unwiring the entire large page? If not, * demote the mapping and fall through. */ if (sva + L3_PAGE_SIZE == va_next && eva >= va_next) { atomic_clear_long(l3e, PG_W); pmap->pm_stats.wired_count -= L3_PAGE_SIZE / PAGE_SIZE; continue; } else if (!pmap_demote_l3e(pmap, l3e, sva)) panic("pmap_unwire: demotion failed"); } if (va_next > eva) va_next = eva; for (pte = pmap_l3e_to_pte(l3e, sva); sva != va_next; pte++, sva += PAGE_SIZE) { MPASS(pte == pmap_pte(pmap, sva)); if ((*pte & PG_V) == 0) continue; if ((*pte & PG_W) == 0) panic("pmap_unwire: pte %#jx is missing PG_W", (uintmax_t)*pte); /* * PG_W must be cleared atomically. Although the pmap * lock synchronizes access to PG_W, another processor * could be setting PG_M and/or PG_A concurrently. */ atomic_clear_long(pte, PG_W); pmap->pm_stats.wired_count--; } } PMAP_UNLOCK(pmap); } void mmu_radix_zero_page(vm_page_t m) { vm_offset_t addr; CTR2(KTR_PMAP, "%s(%p)", __func__, m); addr = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); pagezero(addr); } void mmu_radix_zero_page_area(vm_page_t m, int off, int size) { caddr_t addr; CTR4(KTR_PMAP, "%s(%p, %d, %d)", __func__, m, off, size); MPASS(off + size <= PAGE_SIZE); addr = (caddr_t)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); memset(addr + off, 0, size); } static int mmu_radix_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa) { pml3_entry_t *l3ep; pt_entry_t pte; vm_paddr_t pa; int val; CTR3(KTR_PMAP, "%s(%p, %#x)", __func__, pmap, addr); PMAP_LOCK(pmap); l3ep = pmap_pml3e(pmap, addr); if (l3ep != NULL && (*l3ep & PG_V)) { if (*l3ep & RPTE_LEAF) { pte = *l3ep; /* Compute the physical address of the 4KB page. */ pa = ((*l3ep & PG_PS_FRAME) | (addr & L3_PAGE_MASK)) & PG_FRAME; val = MINCORE_PSIND(1); } else { pte = *pmap_l3e_to_pte(l3ep, addr); pa = pte & PG_FRAME; val = 0; } } else { pte = 0; pa = 0; val = 0; } if ((pte & PG_V) != 0) { val |= MINCORE_INCORE; if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; if ((pte & PG_A) != 0) val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; } if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && (pte & (PG_MANAGED | PG_V)) == (PG_MANAGED | PG_V)) { *locked_pa = pa; } PMAP_UNLOCK(pmap); return (val); } void mmu_radix_activate(struct thread *td) { pmap_t pmap; uint32_t curpid; CTR2(KTR_PMAP, "%s(%p)", __func__, td); critical_enter(); pmap = vmspace_pmap(td->td_proc->p_vmspace); curpid = mfspr(SPR_PID); if (pmap->pm_pid > isa3_base_pid && curpid != pmap->pm_pid) { mmu_radix_pid_set(pmap); } critical_exit(); } /* * Increase the starting virtual address of the given mapping if a * different alignment might result in more superpage mappings. */ void mmu_radix_align_superpage(vm_object_t object, vm_ooffset_t offset, vm_offset_t *addr, vm_size_t size) { CTR5(KTR_PMAP, "%s(%p, %#x, %p, %#x)", __func__, object, offset, addr, size); vm_offset_t superpage_offset; if (size < L3_PAGE_SIZE) return; if (object != NULL && (object->flags & OBJ_COLORED) != 0) offset += ptoa(object->pg_color); superpage_offset = offset & L3_PAGE_MASK; if (size - ((L3_PAGE_SIZE - superpage_offset) & L3_PAGE_MASK) < L3_PAGE_SIZE || (*addr & L3_PAGE_MASK) == superpage_offset) return; if ((*addr & L3_PAGE_MASK) < superpage_offset) *addr = (*addr & ~L3_PAGE_MASK) + superpage_offset; else *addr = ((*addr + L3_PAGE_MASK) & ~L3_PAGE_MASK) + superpage_offset; } static void * mmu_radix_mapdev_attr(vm_paddr_t pa, vm_size_t size, vm_memattr_t attr) { vm_offset_t va, tmpva, ppa, offset; ppa = trunc_page(pa); offset = pa & PAGE_MASK; size = roundup2(offset + size, PAGE_SIZE); if (pa < powerpc_ptob(Maxmem)) panic("bad pa: %#lx less than Maxmem %#lx\n", pa, powerpc_ptob(Maxmem)); va = kva_alloc(size); if (bootverbose) printf("%s(%#lx, %lu, %d)\n", __func__, pa, size, attr); KASSERT(size > 0, ("%s(%#lx, %lu, %d)", __func__, pa, size, attr)); if (!va) panic("%s: Couldn't alloc kernel virtual memory", __func__); for (tmpva = va; size > 0;) { mmu_radix_kenter_attr(tmpva, ppa, attr); size -= PAGE_SIZE; tmpva += PAGE_SIZE; ppa += PAGE_SIZE; } ptesync(); return ((void *)(va + offset)); } static void * mmu_radix_mapdev(vm_paddr_t pa, vm_size_t size) { CTR3(KTR_PMAP, "%s(%#x, %#x)", __func__, pa, size); return (mmu_radix_mapdev_attr(pa, size, VM_MEMATTR_DEFAULT)); } void mmu_radix_page_set_memattr(vm_page_t m, vm_memattr_t ma) { CTR3(KTR_PMAP, "%s(%p, %#x)", __func__, m, ma); m->md.mdpg_cache_attrs = ma; /* * If "m" is a normal page, update its direct mapping. This update * can be relied upon to perform any cache operations that are * required for data coherence. */ if ((m->flags & PG_FICTITIOUS) == 0 && mmu_radix_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), PAGE_SIZE, m->md.mdpg_cache_attrs)) panic("memory attribute change on the direct map failed"); } static void mmu_radix_unmapdev(vm_offset_t va, vm_size_t size) { vm_offset_t offset; CTR3(KTR_PMAP, "%s(%#x, %#x)", __func__, va, size); /* If we gave a direct map region in pmap_mapdev, do nothing */ if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) return; offset = va & PAGE_MASK; size = round_page(offset + size); va = trunc_page(va); if (pmap_initialized) { mmu_radix_qremove(va, atop(size)); kva_free(va, size); } } static __inline void pmap_pte_attr(pt_entry_t *pte, uint64_t cache_bits, uint64_t mask) { uint64_t opte, npte; /* * The cache mode bits are all in the low 32-bits of the * PTE, so we can just spin on updating the low 32-bits. */ do { opte = *pte; npte = opte & ~mask; npte |= cache_bits; } while (npte != opte && !atomic_cmpset_long(pte, opte, npte)); } /* * Tries to demote a 1GB page mapping. */ static boolean_t pmap_demote_l2e(pmap_t pmap, pml2_entry_t *l2e, vm_offset_t va) { pml2_entry_t oldpdpe; pml3_entry_t *firstpde, newpde, *pde; vm_paddr_t pdpgpa; vm_page_t pdpg; PMAP_LOCK_ASSERT(pmap, MA_OWNED); oldpdpe = *l2e; KASSERT((oldpdpe & (RPTE_LEAF | PG_V)) == (RPTE_LEAF | PG_V), ("pmap_demote_pdpe: oldpdpe is missing PG_PS and/or PG_V")); pdpg = vm_page_alloc(NULL, va >> L2_PAGE_SIZE_SHIFT, VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED); if (pdpg == NULL) { CTR2(KTR_PMAP, "pmap_demote_pdpe: failure for va %#lx" " in pmap %p", va, pmap); return (FALSE); } pdpgpa = VM_PAGE_TO_PHYS(pdpg); firstpde = (pml3_entry_t *)PHYS_TO_DMAP(pdpgpa); KASSERT((oldpdpe & PG_A) != 0, ("pmap_demote_pdpe: oldpdpe is missing PG_A")); KASSERT((oldpdpe & (PG_M | PG_RW)) != PG_RW, ("pmap_demote_pdpe: oldpdpe is missing PG_M")); newpde = oldpdpe; /* * Initialize the page directory page. */ for (pde = firstpde; pde < firstpde + NPDEPG; pde++) { *pde = newpde; newpde += L3_PAGE_SIZE; } /* * Demote the mapping. */ pde_store(l2e, pdpgpa); /* * Flush PWC --- XXX revisit */ pmap_invalidate_all(pmap); pmap_l2e_demotions++; CTR2(KTR_PMAP, "pmap_demote_pdpe: success for va %#lx" " in pmap %p", va, pmap); return (TRUE); } vm_paddr_t mmu_radix_kextract(vm_offset_t va) { pml3_entry_t l3e; vm_paddr_t pa; CTR2(KTR_PMAP, "%s(%#x)", __func__, va); if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) { pa = DMAP_TO_PHYS(va); } else { l3e = *pmap_pml3e(kernel_pmap, va); if (l3e & RPTE_LEAF) { pa = (l3e & PG_PS_FRAME) | (va & L3_PAGE_MASK); pa |= (va & L3_PAGE_MASK); } else { /* * Beware of a concurrent promotion that changes the * PDE at this point! For example, vtopte() must not * be used to access the PTE because it would use the * new PDE. It is, however, safe to use the old PDE * because the page table page is preserved by the * promotion. */ pa = *pmap_l3e_to_pte(&l3e, va); pa = (pa & PG_FRAME) | (va & PAGE_MASK); pa |= (va & PAGE_MASK); } } return (pa); } static pt_entry_t mmu_radix_calc_wimg(vm_paddr_t pa, vm_memattr_t ma) { if (ma != VM_MEMATTR_DEFAULT) { return pmap_cache_bits(ma); } /* * Assume the page is cache inhibited and access is guarded unless * it's in our available memory array. */ for (int i = 0; i < pregions_sz; i++) { if ((pa >= pregions[i].mr_start) && (pa < (pregions[i].mr_start + pregions[i].mr_size))) return (RPTE_ATTR_MEM); } return (RPTE_ATTR_GUARDEDIO); } static void mmu_radix_kenter_attr(vm_offset_t va, vm_paddr_t pa, vm_memattr_t ma) { pt_entry_t *pte, pteval; uint64_t cache_bits; pte = kvtopte(va); MPASS(pte != NULL); pteval = pa | RPTE_EAA_R | RPTE_EAA_W | RPTE_EAA_P | PG_M | PG_A; cache_bits = mmu_radix_calc_wimg(pa, ma); pte_store(pte, pteval | cache_bits); } void mmu_radix_kremove(vm_offset_t va) { pt_entry_t *pte; CTR2(KTR_PMAP, "%s(%#x)", __func__, va); pte = kvtopte(va); pte_clear(pte); } int mmu_radix_decode_kernel_ptr(vm_offset_t addr, int *is_user, vm_offset_t *decoded) { CTR2(KTR_PMAP, "%s(%#jx)", __func__, (uintmax_t)addr); *decoded = addr; *is_user = (addr < VM_MAXUSER_ADDRESS); return (0); } static boolean_t mmu_radix_dev_direct_mapped(vm_paddr_t pa, vm_size_t size) { CTR3(KTR_PMAP, "%s(%#x, %#x)", __func__, pa, size); return (mem_valid(pa, size)); } static void mmu_radix_scan_init() { CTR1(KTR_PMAP, "%s()", __func__); UNIMPLEMENTED(); } static void mmu_radix_dumpsys_map(vm_paddr_t pa, size_t sz, void **va) { CTR4(KTR_PMAP, "%s(%#jx, %#zx, %p)", __func__, (uintmax_t)pa, sz, va); UNIMPLEMENTED(); } vm_offset_t mmu_radix_quick_enter_page(vm_page_t m) { vm_paddr_t paddr; CTR2(KTR_PMAP, "%s(%p)", __func__, m); paddr = VM_PAGE_TO_PHYS(m); return (PHYS_TO_DMAP(paddr)); } void mmu_radix_quick_remove_page(vm_offset_t addr __unused) { /* no work to do here */ CTR2(KTR_PMAP, "%s(%#x)", __func__, addr); } static void pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva) { cpu_flush_dcache((void *)sva, eva - sva); } int mmu_radix_change_attr(vm_offset_t va, vm_size_t size, vm_memattr_t mode) { int error; CTR4(KTR_PMAP, "%s(%#x, %#zx, %d)", __func__, va, size, mode); PMAP_LOCK(kernel_pmap); error = pmap_change_attr_locked(va, size, mode, true); PMAP_UNLOCK(kernel_pmap); return (error); } static int pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode, bool flush) { vm_offset_t base, offset, tmpva; vm_paddr_t pa_start, pa_end, pa_end1; pml2_entry_t *l2e; pml3_entry_t *l3e; pt_entry_t *pte; int cache_bits, error; boolean_t changed; PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED); base = trunc_page(va); offset = va & PAGE_MASK; size = round_page(offset + size); /* * Only supported on kernel virtual addresses, including the direct * map but excluding the recursive map. */ if (base < DMAP_MIN_ADDRESS) return (EINVAL); cache_bits = pmap_cache_bits(mode); changed = FALSE; /* * Pages that aren't mapped aren't supported. Also break down 2MB pages * into 4KB pages if required. */ for (tmpva = base; tmpva < base + size; ) { l2e = pmap_pml2e(kernel_pmap, tmpva); if (l2e == NULL || *l2e == 0) return (EINVAL); if (*l2e & RPTE_LEAF) { /* * If the current 1GB page already has the required * memory type, then we need not demote this page. Just * increment tmpva to the next 1GB page frame. */ if ((*l2e & RPTE_ATTR_MASK) == cache_bits) { tmpva = trunc_1gpage(tmpva) + L2_PAGE_SIZE; continue; } /* * If the current offset aligns with a 1GB page frame * and there is at least 1GB left within the range, then * we need not break down this page into 2MB pages. */ if ((tmpva & L2_PAGE_MASK) == 0 && tmpva + L2_PAGE_MASK < base + size) { tmpva += L2_PAGE_MASK; continue; } if (!pmap_demote_l2e(kernel_pmap, l2e, tmpva)) return (ENOMEM); } l3e = pmap_l2e_to_l3e(l2e, tmpva); KASSERT(l3e != NULL, ("no l3e entry for %#lx in %p\n", tmpva, l2e)); if (*l3e == 0) return (EINVAL); if (*l3e & RPTE_LEAF) { /* * If the current 2MB page already has the required * memory type, then we need not demote this page. Just * increment tmpva to the next 2MB page frame. */ if ((*l3e & RPTE_ATTR_MASK) == cache_bits) { tmpva = trunc_2mpage(tmpva) + L3_PAGE_SIZE; continue; } /* * If the current offset aligns with a 2MB page frame * and there is at least 2MB left within the range, then * we need not break down this page into 4KB pages. */ if ((tmpva & L3_PAGE_MASK) == 0 && tmpva + L3_PAGE_MASK < base + size) { tmpva += L3_PAGE_SIZE; continue; } if (!pmap_demote_l3e(kernel_pmap, l3e, tmpva)) return (ENOMEM); } pte = pmap_l3e_to_pte(l3e, tmpva); if (*pte == 0) return (EINVAL); tmpva += PAGE_SIZE; } error = 0; /* * Ok, all the pages exist, so run through them updating their * cache mode if required. */ pa_start = pa_end = 0; for (tmpva = base; tmpva < base + size; ) { l2e = pmap_pml2e(kernel_pmap, tmpva); if (*l2e & RPTE_LEAF) { if ((*l2e & RPTE_ATTR_MASK) != cache_bits) { pmap_pte_attr(l2e, cache_bits, RPTE_ATTR_MASK); changed = TRUE; } if (tmpva >= VM_MIN_KERNEL_ADDRESS && (*l2e & PG_PS_FRAME) < dmaplimit) { if (pa_start == pa_end) { /* Start physical address run. */ pa_start = *l2e & PG_PS_FRAME; pa_end = pa_start + L2_PAGE_SIZE; } else if (pa_end == (*l2e & PG_PS_FRAME)) pa_end += L2_PAGE_SIZE; else { /* Run ended, update direct map. */ error = pmap_change_attr_locked( PHYS_TO_DMAP(pa_start), pa_end - pa_start, mode, flush); if (error != 0) break; /* Start physical address run. */ pa_start = *l2e & PG_PS_FRAME; pa_end = pa_start + L2_PAGE_SIZE; } } tmpva = trunc_1gpage(tmpva) + L2_PAGE_SIZE; continue; } l3e = pmap_l2e_to_l3e(l2e, tmpva); if (*l3e & RPTE_LEAF) { if ((*l3e & RPTE_ATTR_MASK) != cache_bits) { pmap_pte_attr(l3e, cache_bits, RPTE_ATTR_MASK); changed = TRUE; } if (tmpva >= VM_MIN_KERNEL_ADDRESS && (*l3e & PG_PS_FRAME) < dmaplimit) { if (pa_start == pa_end) { /* Start physical address run. */ pa_start = *l3e & PG_PS_FRAME; pa_end = pa_start + L3_PAGE_SIZE; } else if (pa_end == (*l3e & PG_PS_FRAME)) pa_end += L3_PAGE_SIZE; else { /* Run ended, update direct map. */ error = pmap_change_attr_locked( PHYS_TO_DMAP(pa_start), pa_end - pa_start, mode, flush); if (error != 0) break; /* Start physical address run. */ pa_start = *l3e & PG_PS_FRAME; pa_end = pa_start + L3_PAGE_SIZE; } } tmpva = trunc_2mpage(tmpva) + L3_PAGE_SIZE; } else { pte = pmap_l3e_to_pte(l3e, tmpva); if ((*pte & RPTE_ATTR_MASK) != cache_bits) { pmap_pte_attr(pte, cache_bits, RPTE_ATTR_MASK); changed = TRUE; } if (tmpva >= VM_MIN_KERNEL_ADDRESS && (*pte & PG_FRAME) < dmaplimit) { if (pa_start == pa_end) { /* Start physical address run. */ pa_start = *pte & PG_FRAME; pa_end = pa_start + PAGE_SIZE; } else if (pa_end == (*pte & PG_FRAME)) pa_end += PAGE_SIZE; else { /* Run ended, update direct map. */ error = pmap_change_attr_locked( PHYS_TO_DMAP(pa_start), pa_end - pa_start, mode, flush); if (error != 0) break; /* Start physical address run. */ pa_start = *pte & PG_FRAME; pa_end = pa_start + PAGE_SIZE; } } tmpva += PAGE_SIZE; } } if (error == 0 && pa_start != pa_end && pa_start < dmaplimit) { pa_end1 = MIN(pa_end, dmaplimit); if (pa_start != pa_end1) error = pmap_change_attr_locked(PHYS_TO_DMAP(pa_start), pa_end1 - pa_start, mode, flush); } /* * Flush CPU caches if required to make sure any data isn't cached that * shouldn't be, etc. */ if (changed) { pmap_invalidate_all(kernel_pmap); if (flush) pmap_invalidate_cache_range(base, tmpva); } return (error); } /* * Allocate physical memory for the vm_page array and map it into KVA, * attempting to back the vm_pages with domain-local memory. */ void mmu_radix_page_array_startup(long pages) { #ifdef notyet pml2_entry_t *l2e; pml3_entry_t *pde; pml3_entry_t newl3; vm_offset_t va; long pfn; int domain, i; #endif vm_paddr_t pa; vm_offset_t start, end; vm_page_array_size = pages; start = VM_MIN_KERNEL_ADDRESS; end = start + pages * sizeof(struct vm_page); pa = vm_phys_early_alloc(0, end - start); start = mmu_radix_map(&start, pa, end - start, VM_MEMATTR_DEFAULT); #ifdef notyet /* TODO: NUMA vm_page_array. Blocked out until then (copied from amd64). */ for (va = start; va < end; va += L3_PAGE_SIZE) { pfn = first_page + (va - start) / sizeof(struct vm_page); domain = _vm_phys_domain(ptoa(pfn)); l2e = pmap_pml2e(kernel_pmap, va); if ((*l2e & PG_V) == 0) { pa = vm_phys_early_alloc(domain, PAGE_SIZE); dump_add_page(pa); pagezero(PHYS_TO_DMAP(pa)); pde_store(l2e, (pml2_entry_t)pa); } pde = pmap_l2e_to_l3e(l2e, va); if ((*pde & PG_V) != 0) panic("Unexpected pde %p", pde); pa = vm_phys_early_alloc(domain, L3_PAGE_SIZE); for (i = 0; i < NPDEPG; i++) dump_add_page(pa + i * PAGE_SIZE); newl3 = (pml3_entry_t)(pa | RPTE_EAA_P | RPTE_EAA_R | RPTE_EAA_W); pte_store(pde, newl3); } #endif vm_page_array = (vm_page_t)start; } #ifdef DDB #include #include static void pmap_pte_walk(pml1_entry_t *l1, vm_offset_t va) { pml1_entry_t *l1e; pml2_entry_t *l2e; pml3_entry_t *l3e; pt_entry_t *pte; l1e = &l1[pmap_pml1e_index(va)]; db_printf("VA %#016lx l1e %#016lx", va, *l1e); if ((*l1e & PG_V) == 0) { db_printf("\n"); return; } l2e = pmap_l1e_to_l2e(l1e, va); db_printf(" l2e %#016lx", *l2e); if ((*l2e & PG_V) == 0 || (*l2e & RPTE_LEAF) != 0) { db_printf("\n"); return; } l3e = pmap_l2e_to_l3e(l2e, va); db_printf(" l3e %#016lx", *l3e); if ((*l3e & PG_V) == 0 || (*l3e & RPTE_LEAF) != 0) { db_printf("\n"); return; } pte = pmap_l3e_to_pte(l3e, va); db_printf(" pte %#016lx\n", *pte); } void pmap_page_print_mappings(vm_page_t m) { pmap_t pmap; pv_entry_t pv; db_printf("page %p(%lx)\n", m, m->phys_addr); /* need to elide locks if running in ddb */ TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) { db_printf("pv: %p ", pv); db_printf("va: %#016lx ", pv->pv_va); pmap = PV_PMAP(pv); db_printf("pmap %p ", pmap); if (pmap != NULL) { db_printf("asid: %lu\n", pmap->pm_pid); pmap_pte_walk(pmap->pm_pml1, pv->pv_va); } } } DB_SHOW_COMMAND(pte, pmap_print_pte) { vm_offset_t va; pmap_t pmap; if (!have_addr) { db_printf("show pte addr\n"); return; } va = (vm_offset_t)addr; if (va >= DMAP_MIN_ADDRESS) pmap = kernel_pmap; else if (kdb_thread != NULL) pmap = vmspace_pmap(kdb_thread->td_proc->p_vmspace); else pmap = vmspace_pmap(curthread->td_proc->p_vmspace); pmap_pte_walk(pmap->pm_pml1, va); } #endif Index: head/sys/powerpc/booke/pmap.c =================================================================== --- head/sys/powerpc/booke/pmap.c (revision 366710) +++ head/sys/powerpc/booke/pmap.c (revision 366711) @@ -1,3107 +1,3108 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (C) 2007-2009 Semihalf, Rafal Jaworowski * Copyright (C) 2006 Semihalf, Marian Balakowicz * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN * NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * Some hw specific parts of this pmap were derived or influenced * by NetBSD's ibm4xx pmap module. More generic code is shared with * a few other pmap modules from the FreeBSD tree. */ /* * VM layout notes: * * Kernel and user threads run within one common virtual address space * defined by AS=0. * * 32-bit pmap: * Virtual address space layout: * ----------------------------- * 0x0000_0000 - 0x7fff_ffff : user process * 0x8000_0000 - 0xbfff_ffff : pmap_mapdev()-ed area (PCI/PCIE etc.) * 0xc000_0000 - 0xc0ff_ffff : kernel reserved * 0xc000_0000 - data_end : kernel code+data, env, metadata etc. * 0xc100_0000 - 0xffff_ffff : KVA * 0xc100_0000 - 0xc100_3fff : reserved for page zero/copy * 0xc100_4000 - 0xc200_3fff : reserved for ptbl bufs * 0xc200_4000 - 0xc200_8fff : guard page + kstack0 * 0xc200_9000 - 0xfeef_ffff : actual free KVA space * * 64-bit pmap: * Virtual address space layout: * ----------------------------- * 0x0000_0000_0000_0000 - 0xbfff_ffff_ffff_ffff : user process * 0x0000_0000_0000_0000 - 0x8fff_ffff_ffff_ffff : text, data, heap, maps, libraries * 0x9000_0000_0000_0000 - 0xafff_ffff_ffff_ffff : mmio region * 0xb000_0000_0000_0000 - 0xbfff_ffff_ffff_ffff : stack * 0xc000_0000_0000_0000 - 0xcfff_ffff_ffff_ffff : kernel reserved * 0xc000_0000_0000_0000 - endkernel-1 : kernel code & data * endkernel - msgbufp-1 : flat device tree * msgbufp - kernel_pdir-1 : message buffer * kernel_pdir - kernel_pp2d-1 : kernel page directory * kernel_pp2d - . : kernel pointers to page directory * pmap_zero_copy_min - crashdumpmap-1 : reserved for page zero/copy * crashdumpmap - ptbl_buf_pool_vabase-1 : reserved for ptbl bufs * ptbl_buf_pool_vabase - virtual_avail-1 : user page directories and page tables * virtual_avail - 0xcfff_ffff_ffff_ffff : actual free KVA space * 0xd000_0000_0000_0000 - 0xdfff_ffff_ffff_ffff : coprocessor region * 0xe000_0000_0000_0000 - 0xefff_ffff_ffff_ffff : mmio region * 0xf000_0000_0000_0000 - 0xffff_ffff_ffff_ffff : direct map * 0xf000_0000_0000_0000 - +Maxmem : physmem map * - 0xffff_ffff_ffff_ffff : device direct map */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_kstack_pages.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #include -#include #include #include #include #include +#include #include #include #include #include #include #include #include #include #include #include #include #include #define SPARSE_MAPDEV /* Use power-of-two mappings in mmu_booke_mapdev(), to save entries. */ #define POW2_MAPPINGS #ifdef DEBUG #define debugf(fmt, args...) printf(fmt, ##args) #else #define debugf(fmt, args...) #endif #ifdef __powerpc64__ #define PRI0ptrX "016lx" #else #define PRI0ptrX "08x" #endif #define TODO panic("%s: not implemented", __func__); extern unsigned char _etext[]; extern unsigned char _end[]; extern uint32_t *bootinfo; vm_paddr_t kernload; vm_offset_t kernstart; vm_size_t kernsize; /* Message buffer and tables. */ static vm_offset_t data_start; static vm_size_t data_end; /* Phys/avail memory regions. */ static struct mem_region *availmem_regions; static int availmem_regions_sz; static struct mem_region *physmem_regions; static int physmem_regions_sz; #ifndef __powerpc64__ /* Reserved KVA space and mutex for mmu_booke_zero_page. */ static vm_offset_t zero_page_va; static struct mtx zero_page_mutex; /* Reserved KVA space and mutex for mmu_booke_copy_page. */ static vm_offset_t copy_page_src_va; static vm_offset_t copy_page_dst_va; static struct mtx copy_page_mutex; #endif static struct mtx tlbivax_mutex; /**************************************************************************/ /* PMAP */ /**************************************************************************/ static int mmu_booke_enter_locked(pmap_t, vm_offset_t, vm_page_t, vm_prot_t, u_int flags, int8_t psind); unsigned int kptbl_min; /* Index of the first kernel ptbl. */ static uma_zone_t ptbl_root_zone; /* * If user pmap is processed with mmu_booke_remove and the resident count * drops to 0, there are no more pages to remove, so we need not continue. */ #define PMAP_REMOVE_DONE(pmap) \ ((pmap) != kernel_pmap && (pmap)->pm_stats.resident_count == 0) #if defined(COMPAT_FREEBSD32) || !defined(__powerpc64__) extern int elf32_nxstack; #endif /**************************************************************************/ /* TLB and TID handling */ /**************************************************************************/ /* Translation ID busy table */ static volatile pmap_t tidbusy[MAXCPU][TID_MAX + 1]; /* * TLB0 capabilities (entry, way numbers etc.). These can vary between e500 * core revisions and should be read from h/w registers during early config. */ uint32_t tlb0_entries; uint32_t tlb0_ways; uint32_t tlb0_entries_per_way; uint32_t tlb1_entries; #define TLB0_ENTRIES (tlb0_entries) #define TLB0_WAYS (tlb0_ways) #define TLB0_ENTRIES_PER_WAY (tlb0_entries_per_way) #define TLB1_ENTRIES (tlb1_entries) static tlbtid_t tid_alloc(struct pmap *); #ifdef DDB #ifdef __powerpc64__ static void tlb_print_entry(int, uint32_t, uint64_t, uint32_t, uint32_t); #else static void tlb_print_entry(int, uint32_t, uint32_t, uint32_t, uint32_t); #endif #endif static void tlb1_read_entry(tlb_entry_t *, unsigned int); static void tlb1_write_entry(tlb_entry_t *, unsigned int); static int tlb1_iomapped(int, vm_paddr_t, vm_size_t, vm_offset_t *); static vm_size_t tlb1_mapin_region(vm_offset_t, vm_paddr_t, vm_size_t, int); static __inline uint32_t tlb_calc_wimg(vm_paddr_t pa, vm_memattr_t ma); static vm_size_t tsize2size(unsigned int); static unsigned int size2tsize(vm_size_t); static unsigned long ilog2(unsigned long); static void set_mas4_defaults(void); static inline void tlb0_flush_entry(vm_offset_t); static inline unsigned int tlb0_tableidx(vm_offset_t, unsigned int); /**************************************************************************/ /* Page table management */ /**************************************************************************/ static struct rwlock_padalign pvh_global_lock; /* Data for the pv entry allocation mechanism */ static uma_zone_t pvzone; static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0; #define PV_ENTRY_ZONE_MIN 2048 /* min pv entries in uma zone */ #ifndef PMAP_SHPGPERPROC #define PMAP_SHPGPERPROC 200 #endif static vm_paddr_t pte_vatopa(pmap_t, vm_offset_t); static int pte_enter(pmap_t, vm_page_t, vm_offset_t, uint32_t, boolean_t); static int pte_remove(pmap_t, vm_offset_t, uint8_t); static pte_t *pte_find(pmap_t, vm_offset_t); static void kernel_pte_alloc(vm_offset_t, vm_offset_t); static pv_entry_t pv_alloc(void); static void pv_free(pv_entry_t); static void pv_insert(pmap_t, vm_offset_t, vm_page_t); static void pv_remove(pmap_t, vm_offset_t, vm_page_t); static void booke_pmap_init_qpages(void); static inline void tlb_miss_lock(void); static inline void tlb_miss_unlock(void); #ifdef SMP extern tlb_entry_t __boot_tlb1[]; void pmap_bootstrap_ap(volatile uint32_t *); #endif /* * Kernel MMU interface */ static void mmu_booke_clear_modify(vm_page_t); static void mmu_booke_copy(pmap_t, pmap_t, vm_offset_t, vm_size_t, vm_offset_t); static void mmu_booke_copy_page(vm_page_t, vm_page_t); static void mmu_booke_copy_pages(vm_page_t *, vm_offset_t, vm_page_t *, vm_offset_t, int); static int mmu_booke_enter(pmap_t, vm_offset_t, vm_page_t, vm_prot_t, u_int flags, int8_t psind); static void mmu_booke_enter_object(pmap_t, vm_offset_t, vm_offset_t, vm_page_t, vm_prot_t); static void mmu_booke_enter_quick(pmap_t, vm_offset_t, vm_page_t, vm_prot_t); static vm_paddr_t mmu_booke_extract(pmap_t, vm_offset_t); static vm_page_t mmu_booke_extract_and_hold(pmap_t, vm_offset_t, vm_prot_t); static void mmu_booke_init(void); static boolean_t mmu_booke_is_modified(vm_page_t); static boolean_t mmu_booke_is_prefaultable(pmap_t, vm_offset_t); static boolean_t mmu_booke_is_referenced(vm_page_t); static int mmu_booke_ts_referenced(vm_page_t); static vm_offset_t mmu_booke_map(vm_offset_t *, vm_paddr_t, vm_paddr_t, int); static int mmu_booke_mincore(pmap_t, vm_offset_t, vm_paddr_t *); static void mmu_booke_object_init_pt(pmap_t, vm_offset_t, vm_object_t, vm_pindex_t, vm_size_t); static boolean_t mmu_booke_page_exists_quick(pmap_t, vm_page_t); static void mmu_booke_page_init(vm_page_t); static int mmu_booke_page_wired_mappings(vm_page_t); static int mmu_booke_pinit(pmap_t); static void mmu_booke_pinit0(pmap_t); static void mmu_booke_protect(pmap_t, vm_offset_t, vm_offset_t, vm_prot_t); static void mmu_booke_qenter(vm_offset_t, vm_page_t *, int); static void mmu_booke_qremove(vm_offset_t, int); static void mmu_booke_release(pmap_t); static void mmu_booke_remove(pmap_t, vm_offset_t, vm_offset_t); static void mmu_booke_remove_all(vm_page_t); static void mmu_booke_remove_write(vm_page_t); static void mmu_booke_unwire(pmap_t, vm_offset_t, vm_offset_t); static void mmu_booke_zero_page(vm_page_t); static void mmu_booke_zero_page_area(vm_page_t, int, int); static void mmu_booke_activate(struct thread *); static void mmu_booke_deactivate(struct thread *); static void mmu_booke_bootstrap(vm_offset_t, vm_offset_t); static void *mmu_booke_mapdev(vm_paddr_t, vm_size_t); static void *mmu_booke_mapdev_attr(vm_paddr_t, vm_size_t, vm_memattr_t); static void mmu_booke_unmapdev(vm_offset_t, vm_size_t); static vm_paddr_t mmu_booke_kextract(vm_offset_t); static void mmu_booke_kenter(vm_offset_t, vm_paddr_t); static void mmu_booke_kenter_attr(vm_offset_t, vm_paddr_t, vm_memattr_t); static void mmu_booke_kremove(vm_offset_t); static boolean_t mmu_booke_dev_direct_mapped(vm_paddr_t, vm_size_t); static void mmu_booke_sync_icache(pmap_t, vm_offset_t, vm_size_t); static void mmu_booke_dumpsys_map(vm_paddr_t pa, size_t, void **); static void mmu_booke_dumpsys_unmap(vm_paddr_t pa, size_t, void *); static void mmu_booke_scan_init(void); static vm_offset_t mmu_booke_quick_enter_page(vm_page_t m); static void mmu_booke_quick_remove_page(vm_offset_t addr); static int mmu_booke_change_attr(vm_offset_t addr, vm_size_t sz, vm_memattr_t mode); static int mmu_booke_decode_kernel_ptr(vm_offset_t addr, int *is_user, vm_offset_t *decoded_addr); static void mmu_booke_page_array_startup(long); static boolean_t mmu_booke_page_is_mapped(vm_page_t m); static struct pmap_funcs mmu_booke_methods = { /* pmap dispatcher interface */ .clear_modify = mmu_booke_clear_modify, .copy = mmu_booke_copy, .copy_page = mmu_booke_copy_page, .copy_pages = mmu_booke_copy_pages, .enter = mmu_booke_enter, .enter_object = mmu_booke_enter_object, .enter_quick = mmu_booke_enter_quick, .extract = mmu_booke_extract, .extract_and_hold = mmu_booke_extract_and_hold, .init = mmu_booke_init, .is_modified = mmu_booke_is_modified, .is_prefaultable = mmu_booke_is_prefaultable, .is_referenced = mmu_booke_is_referenced, .ts_referenced = mmu_booke_ts_referenced, .map = mmu_booke_map, .mincore = mmu_booke_mincore, .object_init_pt = mmu_booke_object_init_pt, .page_exists_quick = mmu_booke_page_exists_quick, .page_init = mmu_booke_page_init, .page_wired_mappings = mmu_booke_page_wired_mappings, .pinit = mmu_booke_pinit, .pinit0 = mmu_booke_pinit0, .protect = mmu_booke_protect, .qenter = mmu_booke_qenter, .qremove = mmu_booke_qremove, .release = mmu_booke_release, .remove = mmu_booke_remove, .remove_all = mmu_booke_remove_all, .remove_write = mmu_booke_remove_write, .sync_icache = mmu_booke_sync_icache, .unwire = mmu_booke_unwire, .zero_page = mmu_booke_zero_page, .zero_page_area = mmu_booke_zero_page_area, .activate = mmu_booke_activate, .deactivate = mmu_booke_deactivate, .quick_enter_page = mmu_booke_quick_enter_page, .quick_remove_page = mmu_booke_quick_remove_page, .page_array_startup = mmu_booke_page_array_startup, .page_is_mapped = mmu_booke_page_is_mapped, /* Internal interfaces */ .bootstrap = mmu_booke_bootstrap, .dev_direct_mapped = mmu_booke_dev_direct_mapped, .mapdev = mmu_booke_mapdev, .mapdev_attr = mmu_booke_mapdev_attr, .kenter = mmu_booke_kenter, .kenter_attr = mmu_booke_kenter_attr, .kextract = mmu_booke_kextract, .kremove = mmu_booke_kremove, .unmapdev = mmu_booke_unmapdev, .change_attr = mmu_booke_change_attr, .decode_kernel_ptr = mmu_booke_decode_kernel_ptr, /* dumpsys() support */ .dumpsys_map_chunk = mmu_booke_dumpsys_map, .dumpsys_unmap_chunk = mmu_booke_dumpsys_unmap, .dumpsys_pa_init = mmu_booke_scan_init, }; MMU_DEF(booke_mmu, MMU_TYPE_BOOKE, mmu_booke_methods); #ifdef __powerpc64__ #include "pmap_64.c" #else #include "pmap_32.c" #endif static vm_offset_t tlb1_map_base = VM_MAPDEV_BASE; static __inline uint32_t tlb_calc_wimg(vm_paddr_t pa, vm_memattr_t ma) { uint32_t attrib; int i; if (ma != VM_MEMATTR_DEFAULT) { switch (ma) { case VM_MEMATTR_UNCACHEABLE: return (MAS2_I | MAS2_G); case VM_MEMATTR_WRITE_COMBINING: case VM_MEMATTR_WRITE_BACK: case VM_MEMATTR_PREFETCHABLE: return (MAS2_I); case VM_MEMATTR_WRITE_THROUGH: return (MAS2_W | MAS2_M); case VM_MEMATTR_CACHEABLE: return (MAS2_M); } } /* * Assume the page is cache inhibited and access is guarded unless * it's in our available memory array. */ attrib = _TLB_ENTRY_IO; for (i = 0; i < physmem_regions_sz; i++) { if ((pa >= physmem_regions[i].mr_start) && (pa < (physmem_regions[i].mr_start + physmem_regions[i].mr_size))) { attrib = _TLB_ENTRY_MEM; break; } } return (attrib); } static inline void tlb_miss_lock(void) { #ifdef SMP struct pcpu *pc; if (!smp_started) return; STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) { if (pc != pcpup) { CTR3(KTR_PMAP, "%s: tlb miss LOCK of CPU=%d, " "tlb_lock=%p", __func__, pc->pc_cpuid, pc->pc_booke.tlb_lock); KASSERT((pc->pc_cpuid != PCPU_GET(cpuid)), ("tlb_miss_lock: tried to lock self")); tlb_lock(pc->pc_booke.tlb_lock); CTR1(KTR_PMAP, "%s: locked", __func__); } } #endif } static inline void tlb_miss_unlock(void) { #ifdef SMP struct pcpu *pc; if (!smp_started) return; STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) { if (pc != pcpup) { CTR2(KTR_PMAP, "%s: tlb miss UNLOCK of CPU=%d", __func__, pc->pc_cpuid); tlb_unlock(pc->pc_booke.tlb_lock); CTR1(KTR_PMAP, "%s: unlocked", __func__); } } #endif } /* Return number of entries in TLB0. */ static __inline void tlb0_get_tlbconf(void) { uint32_t tlb0_cfg; tlb0_cfg = mfspr(SPR_TLB0CFG); tlb0_entries = tlb0_cfg & TLBCFG_NENTRY_MASK; tlb0_ways = (tlb0_cfg & TLBCFG_ASSOC_MASK) >> TLBCFG_ASSOC_SHIFT; tlb0_entries_per_way = tlb0_entries / tlb0_ways; } /* Return number of entries in TLB1. */ static __inline void tlb1_get_tlbconf(void) { uint32_t tlb1_cfg; tlb1_cfg = mfspr(SPR_TLB1CFG); tlb1_entries = tlb1_cfg & TLBCFG_NENTRY_MASK; } /**************************************************************************/ /* Page table related */ /**************************************************************************/ /* Allocate pv_entry structure. */ pv_entry_t pv_alloc(void) { pv_entry_t pv; pv_entry_count++; if (pv_entry_count > pv_entry_high_water) pagedaemon_wakeup(0); /* XXX powerpc NUMA */ pv = uma_zalloc(pvzone, M_NOWAIT); return (pv); } /* Free pv_entry structure. */ static __inline void pv_free(pv_entry_t pve) { pv_entry_count--; uma_zfree(pvzone, pve); } /* Allocate and initialize pv_entry structure. */ static void pv_insert(pmap_t pmap, vm_offset_t va, vm_page_t m) { pv_entry_t pve; //int su = (pmap == kernel_pmap); //debugf("pv_insert: s (su = %d pmap = 0x%08x va = 0x%08x m = 0x%08x)\n", su, // (u_int32_t)pmap, va, (u_int32_t)m); pve = pv_alloc(); if (pve == NULL) panic("pv_insert: no pv entries!"); pve->pv_pmap = pmap; pve->pv_va = va; /* add to pv_list */ PMAP_LOCK_ASSERT(pmap, MA_OWNED); rw_assert(&pvh_global_lock, RA_WLOCKED); TAILQ_INSERT_TAIL(&m->md.pv_list, pve, pv_link); //debugf("pv_insert: e\n"); } /* Destroy pv entry. */ static void pv_remove(pmap_t pmap, vm_offset_t va, vm_page_t m) { pv_entry_t pve; //int su = (pmap == kernel_pmap); //debugf("pv_remove: s (su = %d pmap = 0x%08x va = 0x%08x)\n", su, (u_int32_t)pmap, va); PMAP_LOCK_ASSERT(pmap, MA_OWNED); rw_assert(&pvh_global_lock, RA_WLOCKED); /* find pv entry */ TAILQ_FOREACH(pve, &m->md.pv_list, pv_link) { if ((pmap == pve->pv_pmap) && (va == pve->pv_va)) { /* remove from pv_list */ TAILQ_REMOVE(&m->md.pv_list, pve, pv_link); if (TAILQ_EMPTY(&m->md.pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); /* free pv entry struct */ pv_free(pve); break; } } //debugf("pv_remove: e\n"); } /**************************************************************************/ /* PMAP related */ /**************************************************************************/ /* * This is called during booke_init, before the system is really initialized. */ static void mmu_booke_bootstrap(vm_offset_t start, vm_offset_t kernelend) { vm_paddr_t phys_kernelend; struct mem_region *mp, *mp1; int cnt, i, j; vm_paddr_t s, e, sz; vm_paddr_t physsz, hwphyssz; u_int phys_avail_count; vm_size_t kstack0_sz; vm_paddr_t kstack0_phys; vm_offset_t kstack0; void *dpcpu; debugf("mmu_booke_bootstrap: entered\n"); /* Set interesting system properties */ #ifdef __powerpc64__ hw_direct_map = 1; #else hw_direct_map = 0; #endif #if defined(COMPAT_FREEBSD32) || !defined(__powerpc64__) elf32_nxstack = 1; #endif /* Initialize invalidation mutex */ mtx_init(&tlbivax_mutex, "tlbivax", NULL, MTX_SPIN); /* Read TLB0 size and associativity. */ tlb0_get_tlbconf(); /* * Align kernel start and end address (kernel image). * Note that kernel end does not necessarily relate to kernsize. * kernsize is the size of the kernel that is actually mapped. */ data_start = round_page(kernelend); data_end = data_start; /* Allocate the dynamic per-cpu area. */ dpcpu = (void *)data_end; data_end += DPCPU_SIZE; /* Allocate space for the message buffer. */ msgbufp = (struct msgbuf *)data_end; data_end += msgbufsize; debugf(" msgbufp at 0x%"PRI0ptrX" end = 0x%"PRI0ptrX"\n", (uintptr_t)msgbufp, data_end); data_end = round_page(data_end); data_end = round_page(mmu_booke_alloc_kernel_pgtables(data_end)); /* Retrieve phys/avail mem regions */ mem_regions(&physmem_regions, &physmem_regions_sz, &availmem_regions, &availmem_regions_sz); if (PHYS_AVAIL_ENTRIES < availmem_regions_sz) panic("mmu_booke_bootstrap: phys_avail too small"); data_end = round_page(data_end); vm_page_array = (vm_page_t)data_end; /* * Get a rough idea (upper bound) on the size of the page array. The * vm_page_array will not handle any more pages than we have in the * avail_regions array, and most likely much less. */ sz = 0; for (mp = availmem_regions; mp->mr_size; mp++) { sz += mp->mr_size; } sz = (round_page(sz) / (PAGE_SIZE + sizeof(struct vm_page))); data_end += round_page(sz * sizeof(struct vm_page)); /* Pre-round up to 1MB. This wastes some space, but saves TLB entries */ data_end = roundup2(data_end, 1 << 20); debugf(" data_end: 0x%"PRI0ptrX"\n", data_end); debugf(" kernstart: %#zx\n", kernstart); debugf(" kernsize: %#zx\n", kernsize); if (data_end - kernstart > kernsize) { kernsize += tlb1_mapin_region(kernstart + kernsize, kernload + kernsize, (data_end - kernstart) - kernsize, _TLB_ENTRY_MEM); } data_end = kernstart + kernsize; debugf(" updated data_end: 0x%"PRI0ptrX"\n", data_end); /* * Clear the structures - note we can only do it safely after the * possible additional TLB1 translations are in place (above) so that * all range up to the currently calculated 'data_end' is covered. */ bzero((void *)data_start, data_end - data_start); dpcpu_init(dpcpu, 0); /*******************************************************/ /* Set the start and end of kva. */ /*******************************************************/ virtual_avail = round_page(data_end); virtual_end = VM_MAX_KERNEL_ADDRESS; #ifndef __powerpc64__ /* Allocate KVA space for page zero/copy operations. */ zero_page_va = virtual_avail; virtual_avail += PAGE_SIZE; copy_page_src_va = virtual_avail; virtual_avail += PAGE_SIZE; copy_page_dst_va = virtual_avail; virtual_avail += PAGE_SIZE; debugf("zero_page_va = 0x%"PRI0ptrX"\n", zero_page_va); debugf("copy_page_src_va = 0x%"PRI0ptrX"\n", copy_page_src_va); debugf("copy_page_dst_va = 0x%"PRI0ptrX"\n", copy_page_dst_va); /* Initialize page zero/copy mutexes. */ mtx_init(&zero_page_mutex, "mmu_booke_zero_page", NULL, MTX_DEF); mtx_init(©_page_mutex, "mmu_booke_copy_page", NULL, MTX_DEF); /* Allocate KVA space for ptbl bufs. */ ptbl_buf_pool_vabase = virtual_avail; virtual_avail += PTBL_BUFS * PTBL_PAGES * PAGE_SIZE; debugf("ptbl_buf_pool_vabase = 0x%"PRI0ptrX" end = 0x%"PRI0ptrX"\n", ptbl_buf_pool_vabase, virtual_avail); #endif /* Calculate corresponding physical addresses for the kernel region. */ phys_kernelend = kernload + kernsize; debugf("kernel image and allocated data:\n"); debugf(" kernload = 0x%09jx\n", (uintmax_t)kernload); debugf(" kernstart = 0x%"PRI0ptrX"\n", kernstart); debugf(" kernsize = 0x%"PRI0ptrX"\n", kernsize); /* * Remove kernel physical address range from avail regions list. Page * align all regions. Non-page aligned memory isn't very interesting * to us. Also, sort the entries for ascending addresses. */ sz = 0; cnt = availmem_regions_sz; debugf("processing avail regions:\n"); for (mp = availmem_regions; mp->mr_size; mp++) { s = mp->mr_start; e = mp->mr_start + mp->mr_size; debugf(" %09jx-%09jx -> ", (uintmax_t)s, (uintmax_t)e); /* Check whether this region holds all of the kernel. */ if (s < kernload && e > phys_kernelend) { availmem_regions[cnt].mr_start = phys_kernelend; availmem_regions[cnt++].mr_size = e - phys_kernelend; e = kernload; } /* Look whether this regions starts within the kernel. */ if (s >= kernload && s < phys_kernelend) { if (e <= phys_kernelend) goto empty; s = phys_kernelend; } /* Now look whether this region ends within the kernel. */ if (e > kernload && e <= phys_kernelend) { if (s >= kernload) goto empty; e = kernload; } /* Now page align the start and size of the region. */ s = round_page(s); e = trunc_page(e); if (e < s) e = s; sz = e - s; debugf("%09jx-%09jx = %jx\n", (uintmax_t)s, (uintmax_t)e, (uintmax_t)sz); /* Check whether some memory is left here. */ if (sz == 0) { empty: memmove(mp, mp + 1, (cnt - (mp - availmem_regions)) * sizeof(*mp)); cnt--; mp--; continue; } /* Do an insertion sort. */ for (mp1 = availmem_regions; mp1 < mp; mp1++) if (s < mp1->mr_start) break; if (mp1 < mp) { memmove(mp1 + 1, mp1, (char *)mp - (char *)mp1); mp1->mr_start = s; mp1->mr_size = sz; } else { mp->mr_start = s; mp->mr_size = sz; } } availmem_regions_sz = cnt; /*******************************************************/ /* Steal physical memory for kernel stack from the end */ /* of the first avail region */ /*******************************************************/ kstack0_sz = kstack_pages * PAGE_SIZE; kstack0_phys = availmem_regions[0].mr_start + availmem_regions[0].mr_size; kstack0_phys -= kstack0_sz; availmem_regions[0].mr_size -= kstack0_sz; /*******************************************************/ /* Fill in phys_avail table, based on availmem_regions */ /*******************************************************/ phys_avail_count = 0; physsz = 0; hwphyssz = 0; TUNABLE_ULONG_FETCH("hw.physmem", (u_long *) &hwphyssz); debugf("fill in phys_avail:\n"); for (i = 0, j = 0; i < availmem_regions_sz; i++, j += 2) { debugf(" region: 0x%jx - 0x%jx (0x%jx)\n", (uintmax_t)availmem_regions[i].mr_start, (uintmax_t)availmem_regions[i].mr_start + availmem_regions[i].mr_size, (uintmax_t)availmem_regions[i].mr_size); if (hwphyssz != 0 && (physsz + availmem_regions[i].mr_size) >= hwphyssz) { debugf(" hw.physmem adjust\n"); if (physsz < hwphyssz) { phys_avail[j] = availmem_regions[i].mr_start; phys_avail[j + 1] = availmem_regions[i].mr_start + hwphyssz - physsz; physsz = hwphyssz; phys_avail_count++; dump_avail[j] = phys_avail[j]; dump_avail[j + 1] = phys_avail[j + 1]; } break; } phys_avail[j] = availmem_regions[i].mr_start; phys_avail[j + 1] = availmem_regions[i].mr_start + availmem_regions[i].mr_size; phys_avail_count++; physsz += availmem_regions[i].mr_size; dump_avail[j] = phys_avail[j]; dump_avail[j + 1] = phys_avail[j + 1]; } physmem = btoc(physsz); /* Calculate the last available physical address. */ for (i = 0; phys_avail[i + 2] != 0; i += 2) ; Maxmem = powerpc_btop(phys_avail[i + 1]); debugf("Maxmem = 0x%08lx\n", Maxmem); debugf("phys_avail_count = %d\n", phys_avail_count); debugf("physsz = 0x%09jx physmem = %jd (0x%09jx)\n", (uintmax_t)physsz, (uintmax_t)physmem, (uintmax_t)physmem); #ifdef __powerpc64__ /* * Map the physical memory contiguously in TLB1. * Round so it fits into a single mapping. */ tlb1_mapin_region(DMAP_BASE_ADDRESS, 0, phys_avail[i + 1], _TLB_ENTRY_MEM); #endif /*******************************************************/ /* Initialize (statically allocated) kernel pmap. */ /*******************************************************/ PMAP_LOCK_INIT(kernel_pmap); debugf("kernel_pmap = 0x%"PRI0ptrX"\n", (uintptr_t)kernel_pmap); kernel_pte_alloc(virtual_avail, kernstart); for (i = 0; i < MAXCPU; i++) { kernel_pmap->pm_tid[i] = TID_KERNEL; /* Initialize each CPU's tidbusy entry 0 with kernel_pmap */ tidbusy[i][TID_KERNEL] = kernel_pmap; } /* Mark kernel_pmap active on all CPUs */ CPU_FILL(&kernel_pmap->pm_active); /* * Initialize the global pv list lock. */ rw_init(&pvh_global_lock, "pmap pv global"); /*******************************************************/ /* Final setup */ /*******************************************************/ /* Enter kstack0 into kernel map, provide guard page */ kstack0 = virtual_avail + KSTACK_GUARD_PAGES * PAGE_SIZE; thread0.td_kstack = kstack0; thread0.td_kstack_pages = kstack_pages; debugf("kstack_sz = 0x%08jx\n", (uintmax_t)kstack0_sz); debugf("kstack0_phys at 0x%09jx - 0x%09jx\n", (uintmax_t)kstack0_phys, (uintmax_t)kstack0_phys + kstack0_sz); debugf("kstack0 at 0x%"PRI0ptrX" - 0x%"PRI0ptrX"\n", kstack0, kstack0 + kstack0_sz); virtual_avail += KSTACK_GUARD_PAGES * PAGE_SIZE + kstack0_sz; for (i = 0; i < kstack_pages; i++) { mmu_booke_kenter(kstack0, kstack0_phys); kstack0 += PAGE_SIZE; kstack0_phys += PAGE_SIZE; } pmap_bootstrapped = 1; debugf("virtual_avail = %"PRI0ptrX"\n", virtual_avail); debugf("virtual_end = %"PRI0ptrX"\n", virtual_end); debugf("mmu_booke_bootstrap: exit\n"); } #ifdef SMP void tlb1_ap_prep(void) { tlb_entry_t *e, tmp; unsigned int i; /* Prepare TLB1 image for AP processors */ e = __boot_tlb1; for (i = 0; i < TLB1_ENTRIES; i++) { tlb1_read_entry(&tmp, i); if ((tmp.mas1 & MAS1_VALID) && (tmp.mas2 & _TLB_ENTRY_SHARED)) memcpy(e++, &tmp, sizeof(tmp)); } } void pmap_bootstrap_ap(volatile uint32_t *trcp __unused) { int i; /* * Finish TLB1 configuration: the BSP already set up its TLB1 and we * have the snapshot of its contents in the s/w __boot_tlb1[] table * created by tlb1_ap_prep(), so use these values directly to * (re)program AP's TLB1 hardware. * * Start at index 1 because index 0 has the kernel map. */ for (i = 1; i < TLB1_ENTRIES; i++) { if (__boot_tlb1[i].mas1 & MAS1_VALID) tlb1_write_entry(&__boot_tlb1[i], i); } set_mas4_defaults(); } #endif static void booke_pmap_init_qpages(void) { struct pcpu *pc; int i; CPU_FOREACH(i) { pc = pcpu_find(i); pc->pc_qmap_addr = kva_alloc(PAGE_SIZE); if (pc->pc_qmap_addr == 0) panic("pmap_init_qpages: unable to allocate KVA"); } } SYSINIT(qpages_init, SI_SUB_CPU, SI_ORDER_ANY, booke_pmap_init_qpages, NULL); /* * Get the physical page address for the given pmap/virtual address. */ static vm_paddr_t mmu_booke_extract(pmap_t pmap, vm_offset_t va) { vm_paddr_t pa; PMAP_LOCK(pmap); pa = pte_vatopa(pmap, va); PMAP_UNLOCK(pmap); return (pa); } /* * Extract the physical page address associated with the given * kernel virtual address. */ static vm_paddr_t mmu_booke_kextract(vm_offset_t va) { tlb_entry_t e; vm_paddr_t p = 0; int i; #ifdef __powerpc64__ if (va >= DMAP_BASE_ADDRESS && va <= DMAP_MAX_ADDRESS) return (DMAP_TO_PHYS(va)); #endif if (va >= VM_MIN_KERNEL_ADDRESS && va <= VM_MAX_KERNEL_ADDRESS) p = pte_vatopa(kernel_pmap, va); if (p == 0) { /* Check TLB1 mappings */ for (i = 0; i < TLB1_ENTRIES; i++) { tlb1_read_entry(&e, i); if (!(e.mas1 & MAS1_VALID)) continue; if (va >= e.virt && va < e.virt + e.size) return (e.phys + (va - e.virt)); } } return (p); } /* * Initialize the pmap module. * Called by vm_init, to initialize any structures that the pmap * system needs to map virtual memory. */ static void mmu_booke_init() { int shpgperproc = PMAP_SHPGPERPROC; /* * Initialize the address space (zone) for the pv entries. Set a * high water mark so that the system can recover from excessive * numbers of pv entries. */ pvzone = uma_zcreate("PV ENTRY", sizeof(struct pv_entry), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM | UMA_ZONE_NOFREE); TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc); pv_entry_max = shpgperproc * maxproc + vm_cnt.v_page_count; TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max); pv_entry_high_water = 9 * (pv_entry_max / 10); uma_zone_reserve_kva(pvzone, pv_entry_max); /* Pre-fill pvzone with initial number of pv entries. */ uma_prealloc(pvzone, PV_ENTRY_ZONE_MIN); /* Create a UMA zone for page table roots. */ ptbl_root_zone = uma_zcreate("pmap root", PMAP_ROOT_SIZE, NULL, NULL, NULL, NULL, UMA_ALIGN_CACHE, UMA_ZONE_VM); /* Initialize ptbl allocation. */ ptbl_init(); } /* * Map a list of wired pages into kernel virtual address space. This is * intended for temporary mappings which do not need page modification or * references recorded. Existing mappings in the region are overwritten. */ static void mmu_booke_qenter(vm_offset_t sva, vm_page_t *m, int count) { vm_offset_t va; va = sva; while (count-- > 0) { mmu_booke_kenter(va, VM_PAGE_TO_PHYS(*m)); va += PAGE_SIZE; m++; } } /* * Remove page mappings from kernel virtual address space. Intended for * temporary mappings entered by mmu_booke_qenter. */ static void mmu_booke_qremove(vm_offset_t sva, int count) { vm_offset_t va; va = sva; while (count-- > 0) { mmu_booke_kremove(va); va += PAGE_SIZE; } } /* * Map a wired page into kernel virtual address space. */ static void mmu_booke_kenter(vm_offset_t va, vm_paddr_t pa) { mmu_booke_kenter_attr(va, pa, VM_MEMATTR_DEFAULT); } static void mmu_booke_kenter_attr(vm_offset_t va, vm_paddr_t pa, vm_memattr_t ma) { uint32_t flags; pte_t *pte; KASSERT(((va >= VM_MIN_KERNEL_ADDRESS) && (va <= VM_MAX_KERNEL_ADDRESS)), ("mmu_booke_kenter: invalid va")); flags = PTE_SR | PTE_SW | PTE_SX | PTE_WIRED | PTE_VALID; flags |= tlb_calc_wimg(pa, ma) << PTE_MAS2_SHIFT; flags |= PTE_PS_4KB; pte = pte_find(kernel_pmap, va); KASSERT((pte != NULL), ("mmu_booke_kenter: invalid va. NULL PTE")); mtx_lock_spin(&tlbivax_mutex); tlb_miss_lock(); if (PTE_ISVALID(pte)) { CTR1(KTR_PMAP, "%s: replacing entry!", __func__); /* Flush entry from TLB0 */ tlb0_flush_entry(va); } *pte = PTE_RPN_FROM_PA(pa) | flags; //debugf("mmu_booke_kenter: pdir_idx = %d ptbl_idx = %d va=0x%08x " // "pa=0x%08x rpn=0x%08x flags=0x%08x\n", // pdir_idx, ptbl_idx, va, pa, pte->rpn, pte->flags); /* Flush the real memory from the instruction cache. */ if ((flags & (PTE_I | PTE_G)) == 0) __syncicache((void *)va, PAGE_SIZE); tlb_miss_unlock(); mtx_unlock_spin(&tlbivax_mutex); } /* * Remove a page from kernel page table. */ static void mmu_booke_kremove(vm_offset_t va) { pte_t *pte; CTR2(KTR_PMAP,"%s: s (va = 0x%"PRI0ptrX")\n", __func__, va); KASSERT(((va >= VM_MIN_KERNEL_ADDRESS) && (va <= VM_MAX_KERNEL_ADDRESS)), ("mmu_booke_kremove: invalid va")); pte = pte_find(kernel_pmap, va); if (!PTE_ISVALID(pte)) { CTR1(KTR_PMAP, "%s: invalid pte", __func__); return; } mtx_lock_spin(&tlbivax_mutex); tlb_miss_lock(); /* Invalidate entry in TLB0, update PTE. */ tlb0_flush_entry(va); *pte = 0; tlb_miss_unlock(); mtx_unlock_spin(&tlbivax_mutex); } /* * Figure out where a given kernel pointer (usually in a fault) points * to from the VM's perspective, potentially remapping into userland's * address space. */ static int mmu_booke_decode_kernel_ptr(vm_offset_t addr, int *is_user, vm_offset_t *decoded_addr) { if (trunc_page(addr) <= VM_MAXUSER_ADDRESS) *is_user = 1; else *is_user = 0; *decoded_addr = addr; return (0); } static boolean_t mmu_booke_page_is_mapped(vm_page_t m) { return (!TAILQ_EMPTY(&(m)->md.pv_list)); } /* * Initialize pmap associated with process 0. */ static void mmu_booke_pinit0(pmap_t pmap) { PMAP_LOCK_INIT(pmap); mmu_booke_pinit(pmap); PCPU_SET(curpmap, pmap); } /* * Insert the given physical page at the specified virtual address in the * target physical map with the protection requested. If specified the page * will be wired down. */ static int mmu_booke_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, u_int flags, int8_t psind) { int error; rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); error = mmu_booke_enter_locked(pmap, va, m, prot, flags, psind); PMAP_UNLOCK(pmap); rw_wunlock(&pvh_global_lock); return (error); } static int mmu_booke_enter_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, u_int pmap_flags, int8_t psind __unused) { pte_t *pte; vm_paddr_t pa; pte_t flags; int error, su, sync; pa = VM_PAGE_TO_PHYS(m); su = (pmap == kernel_pmap); sync = 0; //debugf("mmu_booke_enter_locked: s (pmap=0x%08x su=%d tid=%d m=0x%08x va=0x%08x " // "pa=0x%08x prot=0x%08x flags=%#x)\n", // (u_int32_t)pmap, su, pmap->pm_tid, // (u_int32_t)m, va, pa, prot, flags); if (su) { KASSERT(((va >= virtual_avail) && (va <= VM_MAX_KERNEL_ADDRESS)), ("mmu_booke_enter_locked: kernel pmap, non kernel va")); } else { KASSERT((va <= VM_MAXUSER_ADDRESS), ("mmu_booke_enter_locked: user pmap, non user va")); } if ((m->oflags & VPO_UNMANAGED) == 0) { if ((pmap_flags & PMAP_ENTER_QUICK_LOCKED) == 0) VM_PAGE_OBJECT_BUSY_ASSERT(m); else VM_OBJECT_ASSERT_LOCKED(m->object); } PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * If there is an existing mapping, and the physical address has not * changed, must be protection or wiring change. */ if (((pte = pte_find(pmap, va)) != NULL) && (PTE_ISVALID(pte)) && (PTE_PA(pte) == pa)) { /* * Before actually updating pte->flags we calculate and * prepare its new value in a helper var. */ flags = *pte; flags &= ~(PTE_UW | PTE_UX | PTE_SW | PTE_SX | PTE_MODIFIED); /* Wiring change, just update stats. */ if ((pmap_flags & PMAP_ENTER_WIRED) != 0) { if (!PTE_ISWIRED(pte)) { flags |= PTE_WIRED; pmap->pm_stats.wired_count++; } } else { if (PTE_ISWIRED(pte)) { flags &= ~PTE_WIRED; pmap->pm_stats.wired_count--; } } if (prot & VM_PROT_WRITE) { /* Add write permissions. */ flags |= PTE_SW; if (!su) flags |= PTE_UW; if ((flags & PTE_MANAGED) != 0) vm_page_aflag_set(m, PGA_WRITEABLE); } else { /* Handle modified pages, sense modify status. */ /* * The PTE_MODIFIED flag could be set by underlying * TLB misses since we last read it (above), possibly * other CPUs could update it so we check in the PTE * directly rather than rely on that saved local flags * copy. */ if (PTE_ISMODIFIED(pte)) vm_page_dirty(m); } if (prot & VM_PROT_EXECUTE) { flags |= PTE_SX; if (!su) flags |= PTE_UX; /* * Check existing flags for execute permissions: if we * are turning execute permissions on, icache should * be flushed. */ if ((*pte & (PTE_UX | PTE_SX)) == 0) sync++; } flags &= ~PTE_REFERENCED; /* * The new flags value is all calculated -- only now actually * update the PTE. */ mtx_lock_spin(&tlbivax_mutex); tlb_miss_lock(); tlb0_flush_entry(va); *pte &= ~PTE_FLAGS_MASK; *pte |= flags; tlb_miss_unlock(); mtx_unlock_spin(&tlbivax_mutex); } else { /* * If there is an existing mapping, but it's for a different * physical address, pte_enter() will delete the old mapping. */ //if ((pte != NULL) && PTE_ISVALID(pte)) // debugf("mmu_booke_enter_locked: replace\n"); //else // debugf("mmu_booke_enter_locked: new\n"); /* Now set up the flags and install the new mapping. */ flags = (PTE_SR | PTE_VALID); flags |= PTE_M; if (!su) flags |= PTE_UR; if (prot & VM_PROT_WRITE) { flags |= PTE_SW; if (!su) flags |= PTE_UW; if ((m->oflags & VPO_UNMANAGED) == 0) vm_page_aflag_set(m, PGA_WRITEABLE); } if (prot & VM_PROT_EXECUTE) { flags |= PTE_SX; if (!su) flags |= PTE_UX; } /* If its wired update stats. */ if ((pmap_flags & PMAP_ENTER_WIRED) != 0) flags |= PTE_WIRED; error = pte_enter(pmap, m, va, flags, (pmap_flags & PMAP_ENTER_NOSLEEP) != 0); if (error != 0) return (KERN_RESOURCE_SHORTAGE); if ((flags & PMAP_ENTER_WIRED) != 0) pmap->pm_stats.wired_count++; /* Flush the real memory from the instruction cache. */ if (prot & VM_PROT_EXECUTE) sync++; } if (sync && (su || pmap == PCPU_GET(curpmap))) { __syncicache((void *)va, PAGE_SIZE); sync = 0; } return (KERN_SUCCESS); } /* * Maps a sequence of resident pages belonging to the same object. * The sequence begins with the given page m_start. This page is * mapped at the given virtual address start. Each subsequent page is * mapped at a virtual address that is offset from start by the same * amount as the page is offset from m_start within the object. The * last page in the sequence is the page with the largest offset from * m_start that can be mapped at a virtual address less than the given * virtual address end. Not every virtual page between start and end * is mapped; only those for which a resident page exists with the * corresponding offset from m_start are mapped. */ static void mmu_booke_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, vm_page_t m_start, vm_prot_t prot) { vm_page_t m; vm_pindex_t diff, psize; VM_OBJECT_ASSERT_LOCKED(m_start->object); psize = atop(end - start); m = m_start; rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { mmu_booke_enter_locked(pmap, start + ptoa(diff), m, prot & (VM_PROT_READ | VM_PROT_EXECUTE), PMAP_ENTER_NOSLEEP | PMAP_ENTER_QUICK_LOCKED, 0); m = TAILQ_NEXT(m, listq); } PMAP_UNLOCK(pmap); rw_wunlock(&pvh_global_lock); } static void mmu_booke_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) { rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); mmu_booke_enter_locked(pmap, va, m, prot & (VM_PROT_READ | VM_PROT_EXECUTE), PMAP_ENTER_NOSLEEP | PMAP_ENTER_QUICK_LOCKED, 0); PMAP_UNLOCK(pmap); rw_wunlock(&pvh_global_lock); } /* * Remove the given range of addresses from the specified map. * * It is assumed that the start and end are properly rounded to the page size. */ static void mmu_booke_remove(pmap_t pmap, vm_offset_t va, vm_offset_t endva) { pte_t *pte; uint8_t hold_flag; int su = (pmap == kernel_pmap); //debugf("mmu_booke_remove: s (su = %d pmap=0x%08x tid=%d va=0x%08x endva=0x%08x)\n", // su, (u_int32_t)pmap, pmap->pm_tid, va, endva); if (su) { KASSERT(((va >= virtual_avail) && (va <= VM_MAX_KERNEL_ADDRESS)), ("mmu_booke_remove: kernel pmap, non kernel va")); } else { KASSERT((va <= VM_MAXUSER_ADDRESS), ("mmu_booke_remove: user pmap, non user va")); } if (PMAP_REMOVE_DONE(pmap)) { //debugf("mmu_booke_remove: e (empty)\n"); return; } hold_flag = PTBL_HOLD_FLAG(pmap); //debugf("mmu_booke_remove: hold_flag = %d\n", hold_flag); rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); for (; va < endva; va += PAGE_SIZE) { pte = pte_find_next(pmap, &va); if ((pte == NULL) || !PTE_ISVALID(pte)) break; if (va >= endva) break; pte_remove(pmap, va, hold_flag); } PMAP_UNLOCK(pmap); rw_wunlock(&pvh_global_lock); //debugf("mmu_booke_remove: e\n"); } /* * Remove physical page from all pmaps in which it resides. */ static void mmu_booke_remove_all(vm_page_t m) { pv_entry_t pv, pvn; uint8_t hold_flag; rw_wlock(&pvh_global_lock); TAILQ_FOREACH_SAFE(pv, &m->md.pv_list, pv_link, pvn) { PMAP_LOCK(pv->pv_pmap); hold_flag = PTBL_HOLD_FLAG(pv->pv_pmap); pte_remove(pv->pv_pmap, pv->pv_va, hold_flag); PMAP_UNLOCK(pv->pv_pmap); } vm_page_aflag_clear(m, PGA_WRITEABLE); rw_wunlock(&pvh_global_lock); } /* * Map a range of physical addresses into kernel virtual address space. */ static vm_offset_t mmu_booke_map(vm_offset_t *virt, vm_paddr_t pa_start, vm_paddr_t pa_end, int prot) { vm_offset_t sva = *virt; vm_offset_t va = sva; #ifdef __powerpc64__ /* XXX: Handle memory not starting at 0x0. */ if (pa_end < ctob(Maxmem)) return (PHYS_TO_DMAP(pa_start)); #endif while (pa_start < pa_end) { mmu_booke_kenter(va, pa_start); va += PAGE_SIZE; pa_start += PAGE_SIZE; } *virt = va; return (sva); } /* * The pmap must be activated before it's address space can be accessed in any * way. */ static void mmu_booke_activate(struct thread *td) { pmap_t pmap; u_int cpuid; pmap = &td->td_proc->p_vmspace->vm_pmap; CTR5(KTR_PMAP, "%s: s (td = %p, proc = '%s', id = %d, pmap = 0x%"PRI0ptrX")", __func__, td, td->td_proc->p_comm, td->td_proc->p_pid, pmap); KASSERT((pmap != kernel_pmap), ("mmu_booke_activate: kernel_pmap!")); sched_pin(); cpuid = PCPU_GET(cpuid); CPU_SET_ATOMIC(cpuid, &pmap->pm_active); PCPU_SET(curpmap, pmap); if (pmap->pm_tid[cpuid] == TID_NONE) tid_alloc(pmap); /* Load PID0 register with pmap tid value. */ mtspr(SPR_PID0, pmap->pm_tid[cpuid]); __asm __volatile("isync"); mtspr(SPR_DBCR0, td->td_pcb->pcb_cpu.booke.dbcr0); sched_unpin(); CTR3(KTR_PMAP, "%s: e (tid = %d for '%s')", __func__, pmap->pm_tid[PCPU_GET(cpuid)], td->td_proc->p_comm); } /* * Deactivate the specified process's address space. */ static void mmu_booke_deactivate(struct thread *td) { pmap_t pmap; pmap = &td->td_proc->p_vmspace->vm_pmap; CTR5(KTR_PMAP, "%s: td=%p, proc = '%s', id = %d, pmap = 0x%"PRI0ptrX, __func__, td, td->td_proc->p_comm, td->td_proc->p_pid, pmap); td->td_pcb->pcb_cpu.booke.dbcr0 = mfspr(SPR_DBCR0); CPU_CLR_ATOMIC(PCPU_GET(cpuid), &pmap->pm_active); PCPU_SET(curpmap, NULL); } /* * Copy the range specified by src_addr/len * from the source map to the range dst_addr/len * in the destination map. * * This routine is only advisory and need not do anything. */ static void mmu_booke_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, vm_offset_t src_addr) { } /* * Set the physical protection on the specified range of this map as requested. */ static void mmu_booke_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) { vm_offset_t va; vm_page_t m; pte_t *pte; if ((prot & VM_PROT_READ) == VM_PROT_NONE) { mmu_booke_remove(pmap, sva, eva); return; } if (prot & VM_PROT_WRITE) return; PMAP_LOCK(pmap); for (va = sva; va < eva; va += PAGE_SIZE) { if ((pte = pte_find(pmap, va)) != NULL) { if (PTE_ISVALID(pte)) { m = PHYS_TO_VM_PAGE(PTE_PA(pte)); mtx_lock_spin(&tlbivax_mutex); tlb_miss_lock(); /* Handle modified pages. */ if (PTE_ISMODIFIED(pte) && PTE_ISMANAGED(pte)) vm_page_dirty(m); tlb0_flush_entry(va); *pte &= ~(PTE_UW | PTE_SW | PTE_MODIFIED); tlb_miss_unlock(); mtx_unlock_spin(&tlbivax_mutex); } } } PMAP_UNLOCK(pmap); } /* * Clear the write and modified bits in each of the given page's mappings. */ static void mmu_booke_remove_write(vm_page_t m) { pv_entry_t pv; pte_t *pte; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("mmu_booke_remove_write: page %p is not managed", m)); vm_page_assert_busied(m); if (!pmap_page_is_write_mapped(m)) return; rw_wlock(&pvh_global_lock); TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) { PMAP_LOCK(pv->pv_pmap); if ((pte = pte_find(pv->pv_pmap, pv->pv_va)) != NULL) { if (PTE_ISVALID(pte)) { m = PHYS_TO_VM_PAGE(PTE_PA(pte)); mtx_lock_spin(&tlbivax_mutex); tlb_miss_lock(); /* Handle modified pages. */ if (PTE_ISMODIFIED(pte)) vm_page_dirty(m); /* Flush mapping from TLB0. */ *pte &= ~(PTE_UW | PTE_SW | PTE_MODIFIED); tlb_miss_unlock(); mtx_unlock_spin(&tlbivax_mutex); } } PMAP_UNLOCK(pv->pv_pmap); } vm_page_aflag_clear(m, PGA_WRITEABLE); rw_wunlock(&pvh_global_lock); } /* * Atomically extract and hold the physical page with the given * pmap and virtual address pair if that mapping permits the given * protection. */ static vm_page_t mmu_booke_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) { pte_t *pte; vm_page_t m; uint32_t pte_wbit; m = NULL; PMAP_LOCK(pmap); pte = pte_find(pmap, va); if ((pte != NULL) && PTE_ISVALID(pte)) { if (pmap == kernel_pmap) pte_wbit = PTE_SW; else pte_wbit = PTE_UW; if ((*pte & pte_wbit) != 0 || (prot & VM_PROT_WRITE) == 0) { m = PHYS_TO_VM_PAGE(PTE_PA(pte)); if (!vm_page_wire_mapped(m)) m = NULL; } } PMAP_UNLOCK(pmap); return (m); } /* * Initialize a vm_page's machine-dependent fields. */ static void mmu_booke_page_init(vm_page_t m) { m->md.pv_tracked = 0; TAILQ_INIT(&m->md.pv_list); } /* * Return whether or not the specified physical page was modified * in any of physical maps. */ static boolean_t mmu_booke_is_modified(vm_page_t m) { pte_t *pte; pv_entry_t pv; boolean_t rv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("mmu_booke_is_modified: page %p is not managed", m)); rv = FALSE; /* * If the page is not busied then this check is racy. */ if (!pmap_page_is_write_mapped(m)) return (FALSE); rw_wlock(&pvh_global_lock); TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) { PMAP_LOCK(pv->pv_pmap); if ((pte = pte_find(pv->pv_pmap, pv->pv_va)) != NULL && PTE_ISVALID(pte)) { if (PTE_ISMODIFIED(pte)) rv = TRUE; } PMAP_UNLOCK(pv->pv_pmap); if (rv) break; } rw_wunlock(&pvh_global_lock); return (rv); } /* * Return whether or not the specified virtual address is eligible * for prefault. */ static boolean_t mmu_booke_is_prefaultable(pmap_t pmap, vm_offset_t addr) { return (FALSE); } /* * Return whether or not the specified physical page was referenced * in any physical maps. */ static boolean_t mmu_booke_is_referenced(vm_page_t m) { pte_t *pte; pv_entry_t pv; boolean_t rv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("mmu_booke_is_referenced: page %p is not managed", m)); rv = FALSE; rw_wlock(&pvh_global_lock); TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) { PMAP_LOCK(pv->pv_pmap); if ((pte = pte_find(pv->pv_pmap, pv->pv_va)) != NULL && PTE_ISVALID(pte)) { if (PTE_ISREFERENCED(pte)) rv = TRUE; } PMAP_UNLOCK(pv->pv_pmap); if (rv) break; } rw_wunlock(&pvh_global_lock); return (rv); } /* * Clear the modify bits on the specified physical page. */ static void mmu_booke_clear_modify(vm_page_t m) { pte_t *pte; pv_entry_t pv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("mmu_booke_clear_modify: page %p is not managed", m)); vm_page_assert_busied(m); if (!pmap_page_is_write_mapped(m)) return; rw_wlock(&pvh_global_lock); TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) { PMAP_LOCK(pv->pv_pmap); if ((pte = pte_find(pv->pv_pmap, pv->pv_va)) != NULL && PTE_ISVALID(pte)) { mtx_lock_spin(&tlbivax_mutex); tlb_miss_lock(); if (*pte & (PTE_SW | PTE_UW | PTE_MODIFIED)) { tlb0_flush_entry(pv->pv_va); *pte &= ~(PTE_SW | PTE_UW | PTE_MODIFIED | PTE_REFERENCED); } tlb_miss_unlock(); mtx_unlock_spin(&tlbivax_mutex); } PMAP_UNLOCK(pv->pv_pmap); } rw_wunlock(&pvh_global_lock); } /* * Return a count of reference bits for a page, clearing those bits. * It is not necessary for every reference bit to be cleared, but it * is necessary that 0 only be returned when there are truly no * reference bits set. * * As an optimization, update the page's dirty field if a modified bit is * found while counting reference bits. This opportunistic update can be * performed at low cost and can eliminate the need for some future calls * to pmap_is_modified(). However, since this function stops after * finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some * dirty pages. Those dirty pages will only be detected by a future call * to pmap_is_modified(). */ static int mmu_booke_ts_referenced(vm_page_t m) { pte_t *pte; pv_entry_t pv; int count; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("mmu_booke_ts_referenced: page %p is not managed", m)); count = 0; rw_wlock(&pvh_global_lock); TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) { PMAP_LOCK(pv->pv_pmap); if ((pte = pte_find(pv->pv_pmap, pv->pv_va)) != NULL && PTE_ISVALID(pte)) { if (PTE_ISMODIFIED(pte)) vm_page_dirty(m); if (PTE_ISREFERENCED(pte)) { mtx_lock_spin(&tlbivax_mutex); tlb_miss_lock(); tlb0_flush_entry(pv->pv_va); *pte &= ~PTE_REFERENCED; tlb_miss_unlock(); mtx_unlock_spin(&tlbivax_mutex); if (++count >= PMAP_TS_REFERENCED_MAX) { PMAP_UNLOCK(pv->pv_pmap); break; } } } PMAP_UNLOCK(pv->pv_pmap); } rw_wunlock(&pvh_global_lock); return (count); } /* * Clear the wired attribute from the mappings for the specified range of * addresses in the given pmap. Every valid mapping within that range must * have the wired attribute set. In contrast, invalid mappings cannot have * the wired attribute set, so they are ignored. * * The wired attribute of the page table entry is not a hardware feature, so * there is no need to invalidate any TLB entries. */ static void mmu_booke_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t va; pte_t *pte; PMAP_LOCK(pmap); for (va = sva; va < eva; va += PAGE_SIZE) { if ((pte = pte_find(pmap, va)) != NULL && PTE_ISVALID(pte)) { if (!PTE_ISWIRED(pte)) panic("mmu_booke_unwire: pte %p isn't wired", pte); *pte &= ~PTE_WIRED; pmap->pm_stats.wired_count--; } } PMAP_UNLOCK(pmap); } /* * Return true if the pmap's pv is one of the first 16 pvs linked to from this * page. This count may be changed upwards or downwards in the future; it is * only necessary that true be returned for a small subset of pmaps for proper * page aging. */ static boolean_t mmu_booke_page_exists_quick(pmap_t pmap, vm_page_t m) { pv_entry_t pv; int loops; boolean_t rv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("mmu_booke_page_exists_quick: page %p is not managed", m)); loops = 0; rv = FALSE; rw_wlock(&pvh_global_lock); TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) { if (pv->pv_pmap == pmap) { rv = TRUE; break; } if (++loops >= 16) break; } rw_wunlock(&pvh_global_lock); return (rv); } /* * Return the number of managed mappings to the given physical page that are * wired. */ static int mmu_booke_page_wired_mappings(vm_page_t m) { pv_entry_t pv; pte_t *pte; int count = 0; if ((m->oflags & VPO_UNMANAGED) != 0) return (count); rw_wlock(&pvh_global_lock); TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) { PMAP_LOCK(pv->pv_pmap); if ((pte = pte_find(pv->pv_pmap, pv->pv_va)) != NULL) if (PTE_ISVALID(pte) && PTE_ISWIRED(pte)) count++; PMAP_UNLOCK(pv->pv_pmap); } rw_wunlock(&pvh_global_lock); return (count); } static int mmu_booke_dev_direct_mapped(vm_paddr_t pa, vm_size_t size) { int i; vm_offset_t va; /* * This currently does not work for entries that * overlap TLB1 entries. */ for (i = 0; i < TLB1_ENTRIES; i ++) { if (tlb1_iomapped(i, pa, size, &va) == 0) return (0); } return (EFAULT); } void mmu_booke_dumpsys_map(vm_paddr_t pa, size_t sz, void **va) { vm_paddr_t ppa; vm_offset_t ofs; vm_size_t gran; /* Minidumps are based on virtual memory addresses. */ if (do_minidump) { *va = (void *)(vm_offset_t)pa; return; } /* Raw physical memory dumps don't have a virtual address. */ /* We always map a 256MB page at 256M. */ gran = 256 * 1024 * 1024; ppa = rounddown2(pa, gran); ofs = pa - ppa; *va = (void *)gran; tlb1_set_entry((vm_offset_t)va, ppa, gran, _TLB_ENTRY_IO); if (sz > (gran - ofs)) tlb1_set_entry((vm_offset_t)(va + gran), ppa + gran, gran, _TLB_ENTRY_IO); } void mmu_booke_dumpsys_unmap(vm_paddr_t pa, size_t sz, void *va) { vm_paddr_t ppa; vm_offset_t ofs; vm_size_t gran; tlb_entry_t e; int i; /* Minidumps are based on virtual memory addresses. */ /* Nothing to do... */ if (do_minidump) return; for (i = 0; i < TLB1_ENTRIES; i++) { tlb1_read_entry(&e, i); if (!(e.mas1 & MAS1_VALID)) break; } /* Raw physical memory dumps don't have a virtual address. */ i--; e.mas1 = 0; e.mas2 = 0; e.mas3 = 0; tlb1_write_entry(&e, i); gran = 256 * 1024 * 1024; ppa = rounddown2(pa, gran); ofs = pa - ppa; if (sz > (gran - ofs)) { i--; e.mas1 = 0; e.mas2 = 0; e.mas3 = 0; tlb1_write_entry(&e, i); } } extern struct dump_pa dump_map[PHYS_AVAIL_SZ + 1]; void mmu_booke_scan_init() { vm_offset_t va; pte_t *pte; int i; if (!do_minidump) { /* Initialize phys. segments for dumpsys(). */ memset(&dump_map, 0, sizeof(dump_map)); mem_regions(&physmem_regions, &physmem_regions_sz, &availmem_regions, &availmem_regions_sz); for (i = 0; i < physmem_regions_sz; i++) { dump_map[i].pa_start = physmem_regions[i].mr_start; dump_map[i].pa_size = physmem_regions[i].mr_size; } return; } /* Virtual segments for minidumps: */ memset(&dump_map, 0, sizeof(dump_map)); /* 1st: kernel .data and .bss. */ dump_map[0].pa_start = trunc_page((uintptr_t)_etext); dump_map[0].pa_size = round_page((uintptr_t)_end) - dump_map[0].pa_start; /* 2nd: msgbuf and tables (see pmap_bootstrap()). */ dump_map[1].pa_start = data_start; dump_map[1].pa_size = data_end - data_start; /* 3rd: kernel VM. */ va = dump_map[1].pa_start + dump_map[1].pa_size; /* Find start of next chunk (from va). */ while (va < virtual_end) { /* Don't dump the buffer cache. */ if (va >= kmi.buffer_sva && va < kmi.buffer_eva) { va = kmi.buffer_eva; continue; } pte = pte_find(kernel_pmap, va); if (pte != NULL && PTE_ISVALID(pte)) break; va += PAGE_SIZE; } if (va < virtual_end) { dump_map[2].pa_start = va; va += PAGE_SIZE; /* Find last page in chunk. */ while (va < virtual_end) { /* Don't run into the buffer cache. */ if (va == kmi.buffer_sva) break; pte = pte_find(kernel_pmap, va); if (pte == NULL || !PTE_ISVALID(pte)) break; va += PAGE_SIZE; } dump_map[2].pa_size = va - dump_map[2].pa_start; } } /* * Map a set of physical memory pages into the kernel virtual address space. * Return a pointer to where it is mapped. This routine is intended to be used * for mapping device memory, NOT real memory. */ static void * mmu_booke_mapdev(vm_paddr_t pa, vm_size_t size) { return (mmu_booke_mapdev_attr(pa, size, VM_MEMATTR_DEFAULT)); } static int tlb1_find_pa(vm_paddr_t pa, tlb_entry_t *e) { int i; for (i = 0; i < TLB1_ENTRIES; i++) { tlb1_read_entry(e, i); if ((e->mas1 & MAS1_VALID) == 0) continue; if (e->phys == pa) return (i); } return (-1); } static void * mmu_booke_mapdev_attr(vm_paddr_t pa, vm_size_t size, vm_memattr_t ma) { tlb_entry_t e; vm_paddr_t tmppa; #ifndef __powerpc64__ uintptr_t tmpva; #endif uintptr_t va, retva; vm_size_t sz; int i; int wimge; /* * Check if this is premapped in TLB1. */ sz = size; tmppa = pa; va = ~0; wimge = tlb_calc_wimg(pa, ma); for (i = 0; i < TLB1_ENTRIES; i++) { tlb1_read_entry(&e, i); if (!(e.mas1 & MAS1_VALID)) continue; if (wimge != (e.mas2 & (MAS2_WIMGE_MASK & ~_TLB_ENTRY_SHARED))) continue; if (tmppa >= e.phys && tmppa < e.phys + e.size) { va = e.virt + (pa - e.phys); tmppa = e.phys + e.size; sz -= MIN(sz, e.size - (pa - e.phys)); while (sz > 0 && (i = tlb1_find_pa(tmppa, &e)) != -1) { if (wimge != (e.mas2 & (MAS2_WIMGE_MASK & ~_TLB_ENTRY_SHARED))) break; sz -= MIN(sz, e.size); tmppa = e.phys + e.size; } if (sz != 0) break; return ((void *)va); } } size = roundup(size, PAGE_SIZE); #ifdef __powerpc64__ KASSERT(pa < VM_MAPDEV_PA_MAX, ("Unsupported physical address! %lx", pa)); va = VM_MAPDEV_BASE + pa; retva = va; #ifdef POW2_MAPPINGS /* * Align the mapping to a power of 2 size, taking into account that we * may need to increase the size multiple times to satisfy the size and * alignment requirements. * * This works in the general case because it's very rare (near never?) * to have different access properties (WIMG) within a single * power-of-two region. If a design does call for that, POW2_MAPPINGS * can be undefined, and exact mappings will be used instead. */ sz = size; size = roundup2(size, 1 << ilog2(size)); while (rounddown2(va, size) + size < va + sz) size <<= 1; va = rounddown2(va, size); pa = rounddown2(pa, size); #endif #else /* * The device mapping area is between VM_MAXUSER_ADDRESS and * VM_MIN_KERNEL_ADDRESS. This gives 1GB of device addressing. */ #ifdef SPARSE_MAPDEV /* * With a sparse mapdev, align to the largest starting region. This * could feasibly be optimized for a 'best-fit' alignment, but that * calculation could be very costly. * Align to the smaller of: * - first set bit in overlap of (pa & size mask) * - largest size envelope * * It's possible the device mapping may start at a PA that's not larger * than the size mask, so we need to offset in to maximize the TLB entry * range and minimize the number of used TLB entries. */ do { tmpva = tlb1_map_base; sz = ffsl((~((1 << flsl(size-1)) - 1)) & pa); sz = sz ? min(roundup(sz + 3, 4), flsl(size) - 1) : flsl(size) - 1; va = roundup(tlb1_map_base, 1 << sz) | (((1 << sz) - 1) & pa); } while (!atomic_cmpset_int(&tlb1_map_base, tmpva, va + size)); #endif va = atomic_fetchadd_int(&tlb1_map_base, size); retva = va; #endif if (tlb1_mapin_region(va, pa, size, tlb_calc_wimg(pa, ma)) != size) return (NULL); return ((void *)retva); } /* * 'Unmap' a range mapped by mmu_booke_mapdev(). */ static void mmu_booke_unmapdev(vm_offset_t va, vm_size_t size) { #ifdef SUPPORTS_SHRINKING_TLB1 vm_offset_t base, offset; /* * Unmap only if this is inside kernel virtual space. */ if ((va >= VM_MIN_KERNEL_ADDRESS) && (va <= VM_MAX_KERNEL_ADDRESS)) { base = trunc_page(va); offset = va & PAGE_MASK; size = roundup(offset + size, PAGE_SIZE); mmu_booke_qremove(base, atop(size)); kva_free(base, size); } #endif } /* * mmu_booke_object_init_pt preloads the ptes for a given object into the * specified pmap. This eliminates the blast of soft faults on process startup * and immediately after an mmap. */ static void mmu_booke_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, vm_pindex_t pindex, vm_size_t size) { VM_OBJECT_ASSERT_WLOCKED(object); KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, ("mmu_booke_object_init_pt: non-device object")); } /* * Perform the pmap work for mincore. */ static int mmu_booke_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *pap) { /* XXX: this should be implemented at some point */ return (0); } static int mmu_booke_change_attr(vm_offset_t addr, vm_size_t sz, vm_memattr_t mode) { vm_offset_t va; pte_t *pte; int i, j; tlb_entry_t e; addr = trunc_page(addr); /* Only allow changes to mapped kernel addresses. This includes: * - KVA * - DMAP (powerpc64) * - Device mappings */ if (addr <= VM_MAXUSER_ADDRESS || #ifdef __powerpc64__ (addr >= tlb1_map_base && addr < DMAP_BASE_ADDRESS) || (addr > DMAP_MAX_ADDRESS && addr < VM_MIN_KERNEL_ADDRESS) || #else (addr >= tlb1_map_base && addr < VM_MIN_KERNEL_ADDRESS) || #endif (addr > VM_MAX_KERNEL_ADDRESS)) return (EINVAL); /* Check TLB1 mappings */ for (i = 0; i < TLB1_ENTRIES; i++) { tlb1_read_entry(&e, i); if (!(e.mas1 & MAS1_VALID)) continue; if (addr >= e.virt && addr < e.virt + e.size) break; } if (i < TLB1_ENTRIES) { /* Only allow full mappings to be modified for now. */ /* Validate the range. */ for (j = i, va = addr; va < addr + sz; va += e.size, j++) { tlb1_read_entry(&e, j); if (va != e.virt || (sz - (va - addr) < e.size)) return (EINVAL); } for (va = addr; va < addr + sz; va += e.size, i++) { tlb1_read_entry(&e, i); e.mas2 &= ~MAS2_WIMGE_MASK; e.mas2 |= tlb_calc_wimg(e.phys, mode); /* * Write it out to the TLB. Should really re-sync with other * cores. */ tlb1_write_entry(&e, i); } return (0); } /* Not in TLB1, try through pmap */ /* First validate the range. */ for (va = addr; va < addr + sz; va += PAGE_SIZE) { pte = pte_find(kernel_pmap, va); if (pte == NULL || !PTE_ISVALID(pte)) return (EINVAL); } mtx_lock_spin(&tlbivax_mutex); tlb_miss_lock(); for (va = addr; va < addr + sz; va += PAGE_SIZE) { pte = pte_find(kernel_pmap, va); *pte &= ~(PTE_MAS2_MASK << PTE_MAS2_SHIFT); *pte |= tlb_calc_wimg(PTE_PA(pte), mode) << PTE_MAS2_SHIFT; tlb0_flush_entry(va); } tlb_miss_unlock(); mtx_unlock_spin(&tlbivax_mutex); return (0); } static void mmu_booke_page_array_startup(long pages) { vm_page_array_size = pages; } /**************************************************************************/ /* TID handling */ /**************************************************************************/ /* * Allocate a TID. If necessary, steal one from someone else. * The new TID is flushed from the TLB before returning. */ static tlbtid_t tid_alloc(pmap_t pmap) { tlbtid_t tid; int thiscpu; KASSERT((pmap != kernel_pmap), ("tid_alloc: kernel pmap")); CTR2(KTR_PMAP, "%s: s (pmap = %p)", __func__, pmap); thiscpu = PCPU_GET(cpuid); tid = PCPU_GET(booke.tid_next); if (tid > TID_MAX) tid = TID_MIN; PCPU_SET(booke.tid_next, tid + 1); /* If we are stealing TID then clear the relevant pmap's field */ if (tidbusy[thiscpu][tid] != NULL) { CTR2(KTR_PMAP, "%s: warning: stealing tid %d", __func__, tid); tidbusy[thiscpu][tid]->pm_tid[thiscpu] = TID_NONE; /* Flush all entries from TLB0 matching this TID. */ tid_flush(tid); } tidbusy[thiscpu][tid] = pmap; pmap->pm_tid[thiscpu] = tid; __asm __volatile("msync; isync"); CTR3(KTR_PMAP, "%s: e (%02d next = %02d)", __func__, tid, PCPU_GET(booke.tid_next)); return (tid); } /**************************************************************************/ /* TLB0 handling */ /**************************************************************************/ /* Convert TLB0 va and way number to tlb0[] table index. */ static inline unsigned int tlb0_tableidx(vm_offset_t va, unsigned int way) { unsigned int idx; idx = (way * TLB0_ENTRIES_PER_WAY); idx += (va & MAS2_TLB0_ENTRY_IDX_MASK) >> MAS2_TLB0_ENTRY_IDX_SHIFT; return (idx); } /* * Invalidate TLB0 entry. */ static inline void tlb0_flush_entry(vm_offset_t va) { CTR2(KTR_PMAP, "%s: s va=0x%08x", __func__, va); mtx_assert(&tlbivax_mutex, MA_OWNED); __asm __volatile("tlbivax 0, %0" :: "r"(va & MAS2_EPN_MASK)); __asm __volatile("isync; msync"); __asm __volatile("tlbsync; msync"); CTR1(KTR_PMAP, "%s: e", __func__); } /**************************************************************************/ /* TLB1 handling */ /**************************************************************************/ /* * TLB1 mapping notes: * * TLB1[0] Kernel text and data. * TLB1[1-15] Additional kernel text and data mappings (if required), PCI * windows, other devices mappings. */ /* * Read an entry from given TLB1 slot. */ void tlb1_read_entry(tlb_entry_t *entry, unsigned int slot) { register_t msr; uint32_t mas0; KASSERT((entry != NULL), ("%s(): Entry is NULL!", __func__)); msr = mfmsr(); __asm __volatile("wrteei 0"); mas0 = MAS0_TLBSEL(1) | MAS0_ESEL(slot); mtspr(SPR_MAS0, mas0); __asm __volatile("isync; tlbre"); entry->mas1 = mfspr(SPR_MAS1); entry->mas2 = mfspr(SPR_MAS2); entry->mas3 = mfspr(SPR_MAS3); switch ((mfpvr() >> 16) & 0xFFFF) { case FSL_E500v2: case FSL_E500mc: case FSL_E5500: case FSL_E6500: entry->mas7 = mfspr(SPR_MAS7); break; default: entry->mas7 = 0; break; } __asm __volatile("wrtee %0" :: "r"(msr)); entry->virt = entry->mas2 & MAS2_EPN_MASK; entry->phys = ((vm_paddr_t)(entry->mas7 & MAS7_RPN) << 32) | (entry->mas3 & MAS3_RPN); entry->size = tsize2size((entry->mas1 & MAS1_TSIZE_MASK) >> MAS1_TSIZE_SHIFT); } struct tlbwrite_args { tlb_entry_t *e; unsigned int idx; }; static uint32_t tlb1_find_free(void) { tlb_entry_t e; int i; for (i = 0; i < TLB1_ENTRIES; i++) { tlb1_read_entry(&e, i); if ((e.mas1 & MAS1_VALID) == 0) return (i); } return (-1); } static void tlb1_purge_va_range(vm_offset_t va, vm_size_t size) { tlb_entry_t e; int i; for (i = 0; i < TLB1_ENTRIES; i++) { tlb1_read_entry(&e, i); if ((e.mas1 & MAS1_VALID) == 0) continue; if ((e.mas2 & MAS2_EPN_MASK) >= va && (e.mas2 & MAS2_EPN_MASK) < va + size) { mtspr(SPR_MAS1, e.mas1 & ~MAS1_VALID); __asm __volatile("isync; tlbwe; isync; msync"); } } } static void tlb1_write_entry_int(void *arg) { struct tlbwrite_args *args = arg; uint32_t idx, mas0; idx = args->idx; if (idx == -1) { tlb1_purge_va_range(args->e->virt, args->e->size); idx = tlb1_find_free(); if (idx == -1) panic("No free TLB1 entries!\n"); } /* Select entry */ mas0 = MAS0_TLBSEL(1) | MAS0_ESEL(idx); mtspr(SPR_MAS0, mas0); mtspr(SPR_MAS1, args->e->mas1); mtspr(SPR_MAS2, args->e->mas2); mtspr(SPR_MAS3, args->e->mas3); switch ((mfpvr() >> 16) & 0xFFFF) { case FSL_E500mc: case FSL_E5500: case FSL_E6500: mtspr(SPR_MAS8, 0); /* FALLTHROUGH */ case FSL_E500v2: mtspr(SPR_MAS7, args->e->mas7); break; default: break; } __asm __volatile("isync; tlbwe; isync; msync"); } static void tlb1_write_entry_sync(void *arg) { /* Empty synchronization point for smp_rendezvous(). */ } /* * Write given entry to TLB1 hardware. */ static void tlb1_write_entry(tlb_entry_t *e, unsigned int idx) { struct tlbwrite_args args; args.e = e; args.idx = idx; #ifdef SMP if ((e->mas2 & _TLB_ENTRY_SHARED) && smp_started) { mb(); smp_rendezvous(tlb1_write_entry_sync, tlb1_write_entry_int, tlb1_write_entry_sync, &args); } else #endif { register_t msr; msr = mfmsr(); __asm __volatile("wrteei 0"); tlb1_write_entry_int(&args); __asm __volatile("wrtee %0" :: "r"(msr)); } } /* * Convert TLB TSIZE value to mapped region size. */ static vm_size_t tsize2size(unsigned int tsize) { /* * size = 4^tsize KB * size = 4^tsize * 2^10 = 2^(2 * tsize - 10) */ return ((1 << (2 * tsize)) * 1024); } /* * Convert region size (must be power of 4) to TLB TSIZE value. */ static unsigned int size2tsize(vm_size_t size) { return (ilog2(size) / 2 - 5); } /* * Register permanent kernel mapping in TLB1. * * Entries are created starting from index 0 (current free entry is * kept in tlb1_idx) and are not supposed to be invalidated. */ int tlb1_set_entry(vm_offset_t va, vm_paddr_t pa, vm_size_t size, uint32_t flags) { tlb_entry_t e; uint32_t ts, tid; int tsize, index; /* First try to update an existing entry. */ for (index = 0; index < TLB1_ENTRIES; index++) { tlb1_read_entry(&e, index); /* Check if we're just updating the flags, and update them. */ if (e.phys == pa && e.virt == va && e.size == size) { e.mas2 = (va & MAS2_EPN_MASK) | flags; tlb1_write_entry(&e, index); return (0); } } /* Convert size to TSIZE */ tsize = size2tsize(size); tid = (TID_KERNEL << MAS1_TID_SHIFT) & MAS1_TID_MASK; /* XXX TS is hard coded to 0 for now as we only use single address space */ ts = (0 << MAS1_TS_SHIFT) & MAS1_TS_MASK; e.phys = pa; e.virt = va; e.size = size; e.mas1 = MAS1_VALID | MAS1_IPROT | ts | tid; e.mas1 |= ((tsize << MAS1_TSIZE_SHIFT) & MAS1_TSIZE_MASK); e.mas2 = (va & MAS2_EPN_MASK) | flags; /* Set supervisor RWX permission bits */ e.mas3 = (pa & MAS3_RPN) | MAS3_SR | MAS3_SW | MAS3_SX; e.mas7 = (pa >> 32) & MAS7_RPN; tlb1_write_entry(&e, -1); return (0); } /* * Map in contiguous RAM region into the TLB1. */ static vm_size_t tlb1_mapin_region(vm_offset_t va, vm_paddr_t pa, vm_size_t size, int wimge) { vm_offset_t base; vm_size_t mapped, sz, ssize; mapped = 0; base = va; ssize = size; while (size > 0) { sz = 1UL << (ilog2(size) & ~1); /* Align size to PA */ if (pa % sz != 0) { do { sz >>= 2; } while (pa % sz != 0); } /* Now align from there to VA */ if (va % sz != 0) { do { sz >>= 2; } while (va % sz != 0); } #ifdef __powerpc64__ /* * Clamp TLB1 entries to 4G. * * While the e6500 supports up to 1TB mappings, the e5500 * only supports up to 4G mappings. (0b1011) * * If any e6500 machines capable of supporting a very * large amount of memory appear in the future, we can * revisit this. * * For now, though, since we have plenty of space in TLB1, * always avoid creating entries larger than 4GB. */ sz = MIN(sz, 1UL << 32); #endif if (bootverbose) printf("Wiring VA=%p to PA=%jx (size=%lx)\n", (void *)va, (uintmax_t)pa, (long)sz); if (tlb1_set_entry(va, pa, sz, _TLB_ENTRY_SHARED | wimge) < 0) return (mapped); size -= sz; pa += sz; va += sz; } mapped = (va - base); if (bootverbose) printf("mapped size 0x%"PRIxPTR" (wasted space 0x%"PRIxPTR")\n", mapped, mapped - ssize); return (mapped); } /* * TLB1 initialization routine, to be called after the very first * assembler level setup done in locore.S. */ void tlb1_init() { vm_offset_t mas2; uint32_t mas0, mas1, mas3, mas7; uint32_t tsz; tlb1_get_tlbconf(); mas0 = MAS0_TLBSEL(1) | MAS0_ESEL(0); mtspr(SPR_MAS0, mas0); __asm __volatile("isync; tlbre"); mas1 = mfspr(SPR_MAS1); mas2 = mfspr(SPR_MAS2); mas3 = mfspr(SPR_MAS3); mas7 = mfspr(SPR_MAS7); kernload = ((vm_paddr_t)(mas7 & MAS7_RPN) << 32) | (mas3 & MAS3_RPN); tsz = (mas1 & MAS1_TSIZE_MASK) >> MAS1_TSIZE_SHIFT; kernsize += (tsz > 0) ? tsize2size(tsz) : 0; kernstart = trunc_page(mas2); /* Setup TLB miss defaults */ set_mas4_defaults(); } /* * pmap_early_io_unmap() should be used in short conjunction with * pmap_early_io_map(), as in the following snippet: * * x = pmap_early_io_map(...); * * pmap_early_io_unmap(x, size); * * And avoiding more allocations between. */ void pmap_early_io_unmap(vm_offset_t va, vm_size_t size) { int i; tlb_entry_t e; vm_size_t isize; size = roundup(size, PAGE_SIZE); isize = size; for (i = 0; i < TLB1_ENTRIES && size > 0; i++) { tlb1_read_entry(&e, i); if (!(e.mas1 & MAS1_VALID)) continue; if (va <= e.virt && (va + isize) >= (e.virt + e.size)) { size -= e.size; e.mas1 &= ~MAS1_VALID; tlb1_write_entry(&e, i); } } if (tlb1_map_base == va + isize) tlb1_map_base -= isize; } vm_offset_t pmap_early_io_map(vm_paddr_t pa, vm_size_t size) { vm_paddr_t pa_base; vm_offset_t va, sz; int i; tlb_entry_t e; KASSERT(!pmap_bootstrapped, ("Do not use after PMAP is up!")); for (i = 0; i < TLB1_ENTRIES; i++) { tlb1_read_entry(&e, i); if (!(e.mas1 & MAS1_VALID)) continue; if (pa >= e.phys && (pa + size) <= (e.phys + e.size)) return (e.virt + (pa - e.phys)); } pa_base = rounddown(pa, PAGE_SIZE); size = roundup(size + (pa - pa_base), PAGE_SIZE); tlb1_map_base = roundup2(tlb1_map_base, 1 << (ilog2(size) & ~1)); va = tlb1_map_base + (pa - pa_base); do { sz = 1 << (ilog2(size) & ~1); tlb1_set_entry(tlb1_map_base, pa_base, sz, _TLB_ENTRY_SHARED | _TLB_ENTRY_IO); size -= sz; pa_base += sz; tlb1_map_base += sz; } while (size > 0); return (va); } void pmap_track_page(pmap_t pmap, vm_offset_t va) { vm_paddr_t pa; vm_page_t page; struct pv_entry *pve; va = trunc_page(va); pa = pmap_kextract(va); page = PHYS_TO_VM_PAGE(pa); rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); TAILQ_FOREACH(pve, &page->md.pv_list, pv_link) { if ((pmap == pve->pv_pmap) && (va == pve->pv_va)) { goto out; } } page->md.pv_tracked = true; pv_insert(pmap, va, page); out: PMAP_UNLOCK(pmap); rw_wunlock(&pvh_global_lock); } /* * Setup MAS4 defaults. * These values are loaded to MAS0-2 on a TLB miss. */ static void set_mas4_defaults(void) { uint32_t mas4; /* Defaults: TLB0, PID0, TSIZED=4K */ mas4 = MAS4_TLBSELD0; mas4 |= (TLB_SIZE_4K << MAS4_TSIZED_SHIFT) & MAS4_TSIZED_MASK; #ifdef SMP mas4 |= MAS4_MD; #endif mtspr(SPR_MAS4, mas4); __asm __volatile("isync"); } /* * Return 0 if the physical IO range is encompassed by one of the * the TLB1 entries, otherwise return related error code. */ static int tlb1_iomapped(int i, vm_paddr_t pa, vm_size_t size, vm_offset_t *va) { uint32_t prot; vm_paddr_t pa_start; vm_paddr_t pa_end; unsigned int entry_tsize; vm_size_t entry_size; tlb_entry_t e; *va = (vm_offset_t)NULL; tlb1_read_entry(&e, i); /* Skip invalid entries */ if (!(e.mas1 & MAS1_VALID)) return (EINVAL); /* * The entry must be cache-inhibited, guarded, and r/w * so it can function as an i/o page */ prot = e.mas2 & (MAS2_I | MAS2_G); if (prot != (MAS2_I | MAS2_G)) return (EPERM); prot = e.mas3 & (MAS3_SR | MAS3_SW); if (prot != (MAS3_SR | MAS3_SW)) return (EPERM); /* The address should be within the entry range. */ entry_tsize = (e.mas1 & MAS1_TSIZE_MASK) >> MAS1_TSIZE_SHIFT; KASSERT((entry_tsize), ("tlb1_iomapped: invalid entry tsize")); entry_size = tsize2size(entry_tsize); pa_start = (((vm_paddr_t)e.mas7 & MAS7_RPN) << 32) | (e.mas3 & MAS3_RPN); pa_end = pa_start + entry_size; if ((pa < pa_start) || ((pa + size) > pa_end)) return (ERANGE); /* Return virtual address of this mapping. */ *va = (e.mas2 & MAS2_EPN_MASK) + (pa - pa_start); return (0); } #ifdef DDB /* Print out contents of the MAS registers for each TLB0 entry */ static void #ifdef __powerpc64__ tlb_print_entry(int i, uint32_t mas1, uint64_t mas2, uint32_t mas3, #else tlb_print_entry(int i, uint32_t mas1, uint32_t mas2, uint32_t mas3, #endif uint32_t mas7) { int as; char desc[3]; tlbtid_t tid; vm_size_t size; unsigned int tsize; desc[2] = '\0'; if (mas1 & MAS1_VALID) desc[0] = 'V'; else desc[0] = ' '; if (mas1 & MAS1_IPROT) desc[1] = 'P'; else desc[1] = ' '; as = (mas1 & MAS1_TS_MASK) ? 1 : 0; tid = MAS1_GETTID(mas1); tsize = (mas1 & MAS1_TSIZE_MASK) >> MAS1_TSIZE_SHIFT; size = 0; if (tsize) size = tsize2size(tsize); printf("%3d: (%s) [AS=%d] " "sz = 0x%jx tsz = %d tid = %d mas1 = 0x%08x " "mas2(va) = 0x%"PRI0ptrX" mas3(pa) = 0x%08x mas7 = 0x%08x\n", i, desc, as, (uintmax_t)size, tsize, tid, mas1, mas2, mas3, mas7); } DB_SHOW_COMMAND(tlb0, tlb0_print_tlbentries) { uint32_t mas0, mas1, mas3, mas7; #ifdef __powerpc64__ uint64_t mas2; #else uint32_t mas2; #endif int entryidx, way, idx; printf("TLB0 entries:\n"); for (way = 0; way < TLB0_WAYS; way ++) for (entryidx = 0; entryidx < TLB0_ENTRIES_PER_WAY; entryidx++) { mas0 = MAS0_TLBSEL(0) | MAS0_ESEL(way); mtspr(SPR_MAS0, mas0); mas2 = entryidx << MAS2_TLB0_ENTRY_IDX_SHIFT; mtspr(SPR_MAS2, mas2); __asm __volatile("isync; tlbre"); mas1 = mfspr(SPR_MAS1); mas2 = mfspr(SPR_MAS2); mas3 = mfspr(SPR_MAS3); mas7 = mfspr(SPR_MAS7); idx = tlb0_tableidx(mas2, way); tlb_print_entry(idx, mas1, mas2, mas3, mas7); } } /* * Print out contents of the MAS registers for each TLB1 entry */ DB_SHOW_COMMAND(tlb1, tlb1_print_tlbentries) { uint32_t mas0, mas1, mas3, mas7; #ifdef __powerpc64__ uint64_t mas2; #else uint32_t mas2; #endif int i; printf("TLB1 entries:\n"); for (i = 0; i < TLB1_ENTRIES; i++) { mas0 = MAS0_TLBSEL(1) | MAS0_ESEL(i); mtspr(SPR_MAS0, mas0); __asm __volatile("isync; tlbre"); mas1 = mfspr(SPR_MAS1); mas2 = mfspr(SPR_MAS2); mas3 = mfspr(SPR_MAS3); mas7 = mfspr(SPR_MAS7); tlb_print_entry(i, mas1, mas2, mas3, mas7); } } #endif Index: head/sys/powerpc/powerpc/minidump_machdep.c =================================================================== --- head/sys/powerpc/powerpc/minidump_machdep.c (revision 366710) +++ head/sys/powerpc/powerpc/minidump_machdep.c (revision 366711) @@ -1,396 +1,397 @@ /*- * Copyright (c) 2019 Leandro Lupori * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #include /* Debugging stuff */ #define MINIDUMP_DEBUG 0 #if MINIDUMP_DEBUG #define dprintf(fmt, ...) printf(fmt, ## __VA_ARGS__) #define DBG(...) __VA_ARGS__ static size_t total, dumptotal; static void dump_total(const char *id, size_t sz); #else #define dprintf(fmt, ...) #define DBG(...) #define dump_total(...) #endif extern vm_offset_t __startkernel, __endkernel; static int dump_retry_count = 5; SYSCTL_INT(_machdep, OID_AUTO, dump_retry_count, CTLFLAG_RWTUN, &dump_retry_count, 0, "Number of times dump has to retry before bailing out"); static struct kerneldumpheader kdh; static char pgbuf[PAGE_SIZE]; static struct { int min_per; int max_per; int visited; } progress_track[10] = { { 0, 10, 0}, { 10, 20, 0}, { 20, 30, 0}, { 30, 40, 0}, { 40, 50, 0}, { 50, 60, 0}, { 60, 70, 0}, { 70, 80, 0}, { 80, 90, 0}, { 90, 100, 0} }; static size_t counter, dumpsize, progress; /* Handle chunked writes. */ static size_t fragsz; int is_dumpable(vm_paddr_t pa) { vm_page_t m; int i; if ((m = vm_phys_paddr_to_vm_page(pa)) != NULL) return ((m->flags & PG_NODUMP) == 0); for (i = 0; dump_avail[i] != 0 || dump_avail[i + 1] != 0; i += 2) { if (pa >= dump_avail[i] && pa < dump_avail[i + 1]) return (1); } return (0); } static void pmap_kenter_temporary(vm_offset_t va, vm_paddr_t pa) { pmap_kremove(va); pmap_kenter(va, pa); } static void report_progress(void) { int sofar, i; sofar = 100 - ((progress * 100) / dumpsize); for (i = 0; i < nitems(progress_track); i++) { if (sofar < progress_track[i].min_per || sofar > progress_track[i].max_per) continue; if (progress_track[i].visited) return; progress_track[i].visited = 1; printf("..%d%%", sofar); return; } } static int blk_flush(struct dumperinfo *di) { int error; if (fragsz == 0) return (0); error = dump_append(di, crashdumpmap, 0, fragsz); DBG(dumptotal += fragsz;) fragsz = 0; return (error); } static int blk_write(struct dumperinfo *di, char *ptr, vm_paddr_t pa, size_t sz) { size_t len, maxdumpsz; int error, i, c; maxdumpsz = MIN(di->maxiosize, MAXDUMPPGS * PAGE_SIZE); if (maxdumpsz == 0) /* seatbelt */ maxdumpsz = PAGE_SIZE; error = 0; if ((sz % PAGE_SIZE) != 0) { printf("Size not page aligned\n"); return (EINVAL); } if (ptr != NULL && pa != 0) { printf("Can't have both va and pa!\n"); return (EINVAL); } if ((pa % PAGE_SIZE) != 0) { printf("Address not page aligned 0x%lx\n", pa); return (EINVAL); } if (ptr != NULL) { /* * If we're doing a virtual dump, flush any pre-existing * pa pages */ error = blk_flush(di); if (error) return (error); } while (sz) { len = maxdumpsz - fragsz; if (len > sz) len = sz; counter += len; progress -= len; if (counter >> 20) { report_progress(); counter &= (1<<20) - 1; } if (ptr) { error = dump_append(di, ptr, 0, len); if (error) return (error); DBG(dumptotal += len;) ptr += len; } else { for (i = 0; i < len; i += PAGE_SIZE) pmap_kenter_temporary( (vm_offset_t)crashdumpmap + fragsz + i, pa + i); fragsz += len; pa += len; if (fragsz == maxdumpsz) { error = blk_flush(di); if (error) return (error); } } sz -= len; /* Check for user abort. */ c = cncheckc(); if (c == 0x03) return (ECANCELED); if (c != -1) printf(" (CTRL-C to abort) "); } return (0); } static int dump_pmap(struct dumperinfo *di) { void *ctx; char *buf; u_long nbytes; int error; ctx = dumpsys_dump_pmap_init(sizeof(pgbuf) / PAGE_SIZE); for (;;) { buf = dumpsys_dump_pmap(ctx, pgbuf, &nbytes); if (buf == NULL) break; error = blk_write(di, buf, 0, nbytes); if (error) return (error); } return (0); } int minidumpsys(struct dumperinfo *di) { vm_paddr_t pa; int error, i, retry_count; uint32_t pmapsize; struct minidumphdr mdhdr; retry_count = 0; retry: retry_count++; fragsz = 0; DBG(total = dumptotal = 0;) /* Reset progress */ counter = 0; for (i = 0; i < nitems(progress_track); i++) progress_track[i].visited = 0; /* Build set of dumpable pages from kernel pmap */ pmapsize = dumpsys_scan_pmap(); if (pmapsize % PAGE_SIZE != 0) { printf("pmapsize not page aligned: 0x%x\n", pmapsize); return (EINVAL); } /* Calculate dump size */ dumpsize = PAGE_SIZE; /* header */ dumpsize += round_page(msgbufp->msg_size); dumpsize += round_page(sizeof(dump_avail)); dumpsize += round_page(BITSET_SIZE(vm_page_dump_pages)); dumpsize += pmapsize; VM_PAGE_DUMP_FOREACH(pa) { /* Clear out undumpable pages now if needed */ if (is_dumpable(pa)) dumpsize += PAGE_SIZE; else dump_drop_page(pa); } progress = dumpsize; /* Initialize mdhdr */ bzero(&mdhdr, sizeof(mdhdr)); strcpy(mdhdr.magic, MINIDUMP_MAGIC); strncpy(mdhdr.mmu_name, pmap_mmu_name(), sizeof(mdhdr.mmu_name) - 1); mdhdr.version = MINIDUMP_VERSION; mdhdr.msgbufsize = msgbufp->msg_size; mdhdr.bitmapsize = round_page(BITSET_SIZE(vm_page_dump_pages)); mdhdr.pmapsize = pmapsize; mdhdr.kernbase = VM_MIN_KERNEL_ADDRESS; mdhdr.kernend = VM_MAX_SAFE_KERNEL_ADDRESS; mdhdr.dmapbase = DMAP_BASE_ADDRESS; mdhdr.dmapend = DMAP_MAX_ADDRESS; mdhdr.hw_direct_map = hw_direct_map; mdhdr.startkernel = __startkernel; mdhdr.endkernel = __endkernel; mdhdr.dumpavailsize = round_page(sizeof(dump_avail)); dump_init_header(di, &kdh, KERNELDUMPMAGIC, KERNELDUMP_POWERPC_VERSION, dumpsize); error = dump_start(di, &kdh); if (error) goto fail; printf("Dumping %lu out of %ju MB:", dumpsize >> 20, ptoa((uintmax_t)physmem) / 1048576); /* Dump minidump header */ bzero(pgbuf, sizeof(pgbuf)); memcpy(pgbuf, &mdhdr, sizeof(mdhdr)); error = blk_write(di, pgbuf, 0, PAGE_SIZE); if (error) goto fail; dump_total("header", PAGE_SIZE); /* Dump msgbuf up front */ error = blk_write(di, (char *)msgbufp->msg_ptr, 0, round_page(msgbufp->msg_size)); dump_total("msgbuf", round_page(msgbufp->msg_size)); /* Dump dump_avail */ _Static_assert(sizeof(dump_avail) <= sizeof(pgbuf), "Large dump_avail not handled"); bzero(pgbuf, sizeof(mdhdr)); memcpy(pgbuf, dump_avail, sizeof(dump_avail)); error = blk_write(di, pgbuf, 0, PAGE_SIZE); if (error) goto fail; dump_total("dump_avail", round_page(sizeof(dump_avail))); /* Dump bitmap */ error = blk_write(di, (char *)vm_page_dump, 0, round_page(BITSET_SIZE(vm_page_dump_pages))); if (error) goto fail; dump_total("bitmap", round_page(BITSET_SIZE(vm_page_dump_pages))); /* Dump kernel page directory pages */ error = dump_pmap(di); if (error) goto fail; dump_total("pmap", pmapsize); /* Dump memory chunks */ VM_PAGE_DUMP_FOREACH(pa) { error = blk_write(di, 0, pa, PAGE_SIZE); if (error) goto fail; } error = blk_flush(di); if (error) goto fail; dump_total("mem_chunks", dumpsize - total); error = dump_finish(di, &kdh); if (error) goto fail; printf("\nDump complete\n"); return (0); fail: if (error < 0) error = -error; printf("\n"); if (error == ENOSPC) { printf("Dump map grown while dumping. "); if (retry_count < dump_retry_count) { printf("Retrying...\n"); goto retry; } printf("Dump failed.\n"); } else if (error == ECANCELED) printf("Dump aborted\n"); else if (error == E2BIG) printf("Dump failed. Partition too small.\n"); else printf("** DUMP FAILED (ERROR %d) **\n", error); return (error); } #if MINIDUMP_DEBUG static void dump_total(const char *id, size_t sz) { total += sz; dprintf("\n%s=%08lx/%08lx/%08lx\n", id, sz, total, dumptotal); } #endif Index: head/sys/powerpc/powerpc/uma_machdep.c =================================================================== --- head/sys/powerpc/powerpc/uma_machdep.c (revision 366710) +++ head/sys/powerpc/powerpc/uma_machdep.c (revision 366711) @@ -1,108 +1,109 @@ /*- * Copyright (c) 2003 The FreeBSD Project * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include +#include #include #include #include #include +#include #include #include #include #include -#include static int hw_uma_mdpages; SYSCTL_INT(_hw, OID_AUTO, uma_mdpages, CTLFLAG_RD, &hw_uma_mdpages, 0, "UMA MD pages in use"); void * uma_small_alloc(uma_zone_t zone, vm_size_t bytes, int domain, u_int8_t *flags, int wait) { void *va; vm_paddr_t pa; vm_page_t m; *flags = UMA_SLAB_PRIV; m = vm_page_alloc_domain(NULL, 0, domain, malloc2vm_flags(wait) | VM_ALLOC_WIRED | VM_ALLOC_NOOBJ); if (m == NULL) return (NULL); pa = VM_PAGE_TO_PHYS(m); #ifdef __powerpc64__ if ((wait & M_NODUMP) == 0) dump_add_page(pa); #endif if (!hw_direct_map) { pmap_kenter(pa, pa); va = (void *)(vm_offset_t)pa; } else { va = (void *)(vm_offset_t)PHYS_TO_DMAP(pa); } if ((wait & M_ZERO) && (m->flags & PG_ZERO) == 0) bzero(va, PAGE_SIZE); atomic_add_int(&hw_uma_mdpages, 1); return (va); } void uma_small_free(void *mem, vm_size_t size, u_int8_t flags) { vm_page_t m; if (hw_direct_map) m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)mem)); else { m = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)mem)); pmap_kremove((vm_offset_t)mem); } KASSERT(m != NULL, ("Freeing UMA block at %p with no associated page", mem)); #ifdef __powerpc64__ dump_drop_page(VM_PAGE_TO_PHYS(m)); #endif vm_page_unwire_noq(m); vm_page_free(m); atomic_subtract_int(&hw_uma_mdpages, 1); } Index: head/sys/riscv/riscv/minidump_machdep.c =================================================================== --- head/sys/riscv/riscv/minidump_machdep.c (revision 366710) +++ head/sys/riscv/riscv/minidump_machdep.c (revision 366711) @@ -1,400 +1,401 @@ /*- * Copyright (c) 2006 Peter Wemm * Copyright (c) 2015 The FreeBSD Foundation * All rights reserved. * Copyright (c) 2019 Mitchell Horne * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_watchdog.h" #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #include CTASSERT(sizeof(struct kerneldumpheader) == 512); static struct kerneldumpheader kdh; /* Handle chunked writes. */ static size_t fragsz; static void *dump_va; static size_t counter, progress, dumpsize; static uint64_t tmpbuffer[PAGE_SIZE / sizeof(uint64_t)]; static struct { int min_per; int max_per; int visited; } progress_track[10] = { { 0, 10, 0}, { 10, 20, 0}, { 20, 30, 0}, { 30, 40, 0}, { 40, 50, 0}, { 50, 60, 0}, { 60, 70, 0}, { 70, 80, 0}, { 80, 90, 0}, { 90, 100, 0} }; static void report_progress(size_t progress, size_t dumpsize) { int sofar, i; sofar = 100 - ((progress * 100) / dumpsize); for (i = 0; i < nitems(progress_track); i++) { if (sofar < progress_track[i].min_per || sofar > progress_track[i].max_per) continue; if (progress_track[i].visited) return; progress_track[i].visited = 1; printf("..%d%%", sofar); return; } } static bool is_dumpable(vm_paddr_t pa) { vm_page_t m; int i; if ((m = vm_phys_paddr_to_vm_page(pa)) != NULL) return ((m->flags & PG_NODUMP) == 0); for (i = 0; dump_avail[i] != 0 || dump_avail[i + 1] != 0; i += 2) { if (pa >= dump_avail[i] && pa < dump_avail[i + 1]) return (true); } return (false); } static int blk_flush(struct dumperinfo *di) { int error; if (fragsz == 0) return (0); error = dump_append(di, dump_va, 0, fragsz); fragsz = 0; return (error); } /* * Write a block of data to the dump file. * * Caller can provide data through a pointer or by specifying its * physical address. * * XXX writes using pa should be no larger than PAGE_SIZE. */ static int blk_write(struct dumperinfo *di, char *ptr, vm_paddr_t pa, size_t sz) { size_t len; int error, c; u_int maxdumpsz; maxdumpsz = min(di->maxiosize, MAXDUMPPGS * PAGE_SIZE); if (maxdumpsz == 0) /* seatbelt */ maxdumpsz = PAGE_SIZE; error = 0; if ((sz % PAGE_SIZE) != 0) { printf("size not page aligned\n"); return (EINVAL); } if (ptr != NULL && pa != 0) { printf("cant have both va and pa!\n"); return (EINVAL); } if ((((uintptr_t)pa) % PAGE_SIZE) != 0) { printf("address not page aligned %#lx\n", (uintptr_t)pa); return (EINVAL); } if (ptr != NULL) { /* * If we're doing a virtual dump, flush any * pre-existing pa pages. */ error = blk_flush(di); if (error != 0) return (error); } while (sz) { len = maxdumpsz - fragsz; if (len > sz) len = sz; counter += len; progress -= len; if (counter >> 22) { report_progress(progress, dumpsize); counter &= (1 << 22) - 1; } wdog_kern_pat(WD_LASTVAL); if (ptr) { error = dump_append(di, ptr, 0, len); if (error != 0) return (error); ptr += len; sz -= len; } else { dump_va = (void *)PHYS_TO_DMAP(pa); fragsz += len; pa += len; sz -= len; error = blk_flush(di); if (error != 0) return (error); } /* Check for user abort */ c = cncheckc(); if (c == 0x03) return (ECANCELED); if (c != -1) printf(" (CTRL-C to abort) "); } return (0); } int minidumpsys(struct dumperinfo *di) { pd_entry_t *l1, *l2; pt_entry_t *l3; struct minidumphdr mdhdr; uint32_t pmapsize; vm_offset_t va; vm_paddr_t pa; int error; int i; int retry_count; retry_count = 0; retry: retry_count++; error = 0; pmapsize = 0; /* Build set of dumpable pages from kernel pmap */ for (va = VM_MIN_KERNEL_ADDRESS; va < kernel_vm_end; va += L2_SIZE) { pmapsize += PAGE_SIZE; if (!pmap_get_tables(pmap_kernel(), va, &l1, &l2, &l3)) continue; /* We should always be using the l2 table for kvm */ if (l2 == NULL) continue; /* l2 may be a superpage */ if ((*l2 & PTE_RWX) != 0) { pa = (*l2 >> PTE_PPN1_S) << L2_SHIFT; for (i = 0; i < Ln_ENTRIES; i++, pa += PAGE_SIZE) { if (is_dumpable(pa)) dump_add_page(pa); } } else { for (i = 0; i < Ln_ENTRIES; i++) { if ((l3[i] & PTE_V) == 0) continue; pa = (l3[i] >> PTE_PPN0_S) * PAGE_SIZE; if (is_dumpable(pa)) dump_add_page(pa); } } } /* Calculate dump size */ dumpsize = pmapsize; dumpsize += round_page(msgbufp->msg_size); dumpsize += round_page(sizeof(dump_avail)); dumpsize += round_page(BITSET_SIZE(vm_page_dump_pages)); VM_PAGE_DUMP_FOREACH(pa) { /* Clear out undumpable pages now if needed */ if (is_dumpable(pa)) dumpsize += PAGE_SIZE; else dump_drop_page(pa); } dumpsize += PAGE_SIZE; progress = dumpsize; /* Initialize mdhdr */ bzero(&mdhdr, sizeof(mdhdr)); strcpy(mdhdr.magic, MINIDUMP_MAGIC); mdhdr.version = MINIDUMP_VERSION; mdhdr.msgbufsize = msgbufp->msg_size; mdhdr.bitmapsize = round_page(BITSET_SIZE(vm_page_dump_pages)); mdhdr.pmapsize = pmapsize; mdhdr.kernbase = KERNBASE; mdhdr.dmapphys = DMAP_MIN_PHYSADDR; mdhdr.dmapbase = DMAP_MIN_ADDRESS; mdhdr.dmapend = DMAP_MAX_ADDRESS; mdhdr.dumpavailsize = round_page(sizeof(dump_avail)); dump_init_header(di, &kdh, KERNELDUMPMAGIC, KERNELDUMP_RISCV_VERSION, dumpsize); error = dump_start(di, &kdh); if (error != 0) goto fail; printf("Dumping %llu out of %ju MB:", (long long)dumpsize >> 20, ptoa((uintmax_t)physmem) / 1048576); /* Dump minidump header */ bzero(&tmpbuffer, sizeof(tmpbuffer)); bcopy(&mdhdr, &tmpbuffer, sizeof(mdhdr)); error = blk_write(di, (char *)&tmpbuffer, 0, PAGE_SIZE); if (error) goto fail; /* Dump msgbuf up front */ error = blk_write(di, (char *)msgbufp->msg_ptr, 0, round_page(msgbufp->msg_size)); if (error) goto fail; /* Dump dump_avail */ _Static_assert(sizeof(dump_avail) <= sizeof(tmpbuffer), "Large dump_avail not handled"); bzero(tmpbuffer, sizeof(tmpbuffer)); memcpy(tmpbuffer, dump_avail, sizeof(dump_avail)); error = blk_write(di, (char *)&tmpbuffer, 0, PAGE_SIZE); if (error) goto fail; /* Dump bitmap */ error = blk_write(di, (char *)vm_page_dump, 0, round_page(BITSET_SIZE(vm_page_dump_pages))); if (error) goto fail; /* Dump kernel page directory pages */ bzero(&tmpbuffer, sizeof(tmpbuffer)); for (va = VM_MIN_KERNEL_ADDRESS; va < kernel_vm_end; va += L2_SIZE) { if (!pmap_get_tables(pmap_kernel(), va, &l1, &l2, &l3)) { /* We always write a page, even if it is zero */ error = blk_write(di, (char *)&tmpbuffer, 0, PAGE_SIZE); if (error) goto fail; /* Flush, in case we reuse tmpbuffer in the same block */ error = blk_flush(di); if (error) goto fail; } else if ((*l2 & PTE_RWX) != 0) { /* Generate fake l3 entries based on the l2 superpage */ for (i = 0; i < Ln_ENTRIES; i++) { tmpbuffer[i] = (*l2 | (i << PTE_PPN0_S)); } /* We always write a page, even if it is zero */ error = blk_write(di, (char *)&tmpbuffer, 0, PAGE_SIZE); if (error) goto fail; /* Flush, in case we reuse tmpbuffer in the same block */ error = blk_flush(di); if (error) goto fail; bzero(&tmpbuffer, sizeof(tmpbuffer)); } else { pa = (*l2 >> PTE_PPN0_S) * PAGE_SIZE; /* We always write a page, even if it is zero */ error = blk_write(di, NULL, pa, PAGE_SIZE); if (error) goto fail; } } /* Dump memory chunks */ /* XXX cluster it up and use blk_dump() */ VM_PAGE_DUMP_FOREACH(pa) { error = blk_write(di, 0, pa, PAGE_SIZE); if (error) goto fail; } error = blk_flush(di); if (error) goto fail; error = dump_finish(di, &kdh); if (error != 0) goto fail; printf("\nDump complete\n"); return (0); fail: if (error < 0) error = -error; printf("\n"); if (error == ENOSPC) { printf("Dump map grown while dumping. "); if (retry_count < 5) { printf("Retrying...\n"); goto retry; } printf("Dump failed.\n"); } else if (error == ECANCELED) printf("Dump aborted\n"); else if (error == E2BIG) { printf("Dump failed. Partition too small (about %lluMB were " "needed this time).\n", (long long)dumpsize >> 20); } else printf("** DUMP FAILED (ERROR %d) **\n", error); return (error); } Index: head/sys/riscv/riscv/pmap.c =================================================================== --- head/sys/riscv/riscv/pmap.c (revision 366710) +++ head/sys/riscv/riscv/pmap.c (revision 366711) @@ -1,4635 +1,4636 @@ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. * Copyright (c) 1994 David Greenman * All rights reserved. * Copyright (c) 2003 Peter Wemm * All rights reserved. * Copyright (c) 2005-2010 Alan L. Cox * All rights reserved. * Copyright (c) 2014 Andrew Turner * All rights reserved. * Copyright (c) 2014 The FreeBSD Foundation * All rights reserved. * Copyright (c) 2015-2018 Ruslan Bukin * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department and William Jolitz of UUNET Technologies Inc. * * Portions of this software were developed by Andrew Turner under * sponsorship from The FreeBSD Foundation. * * Portions of this software were developed by SRI International and the * University of Cambridge Computer Laboratory under DARPA/AFRL contract * FA8750-10-C-0237 ("CTSRD"), as part of the DARPA CRASH research programme. * * Portions of this software were developed by the University of Cambridge * Computer Laboratory as part of the CTSRD Project, with support from the * UK Higher Education Innovation Fund (HEIF). * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 */ /*- * Copyright (c) 2003 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Jake Burkholder, * Safeport Network Services, and Network Associates Laboratories, the * Security Research Division of Network Associates, Inc. under * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA * CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * Manages physical address maps. * * Since the information managed by this module is * also stored by the logical address mapping module, * this module may throw away valid virtual-to-physical * mappings at almost any time. However, invalidations * of virtual-to-physical mappings must be done as * requested. * * In order to cope with hardware architectures which * make virtual-to-physical map invalidates expensive, * this module may delay invalidate or reduced protection * operations until such time as they are actually * necessary. This module is given full information as * to which processors are currently using which maps, * and to when physical maps must be made correct. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #include #define NUL1E (Ln_ENTRIES * Ln_ENTRIES) #define NUL2E (Ln_ENTRIES * NUL1E) #if !defined(DIAGNOSTIC) #ifdef __GNUC_GNU_INLINE__ #define PMAP_INLINE __attribute__((__gnu_inline__)) inline #else #define PMAP_INLINE extern inline #endif #else #define PMAP_INLINE #endif #ifdef PV_STATS #define PV_STAT(x) do { x ; } while (0) #else #define PV_STAT(x) do { } while (0) #endif #define pmap_l2_pindex(v) ((v) >> L2_SHIFT) #define pa_to_pvh(pa) (&pv_table[pa_index(pa)]) #define NPV_LIST_LOCKS MAXCPU #define PHYS_TO_PV_LIST_LOCK(pa) \ (&pv_list_locks[pmap_l2_pindex(pa) % NPV_LIST_LOCKS]) #define CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa) do { \ struct rwlock **_lockp = (lockp); \ struct rwlock *_new_lock; \ \ _new_lock = PHYS_TO_PV_LIST_LOCK(pa); \ if (_new_lock != *_lockp) { \ if (*_lockp != NULL) \ rw_wunlock(*_lockp); \ *_lockp = _new_lock; \ rw_wlock(*_lockp); \ } \ } while (0) #define CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m) \ CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m)) #define RELEASE_PV_LIST_LOCK(lockp) do { \ struct rwlock **_lockp = (lockp); \ \ if (*_lockp != NULL) { \ rw_wunlock(*_lockp); \ *_lockp = NULL; \ } \ } while (0) #define VM_PAGE_TO_PV_LIST_LOCK(m) \ PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m)) /* The list of all the user pmaps */ LIST_HEAD(pmaplist, pmap); static struct pmaplist allpmaps = LIST_HEAD_INITIALIZER(); struct pmap kernel_pmap_store; vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ vm_offset_t kernel_vm_end = 0; vm_paddr_t dmap_phys_base; /* The start of the dmap region */ vm_paddr_t dmap_phys_max; /* The limit of the dmap region */ vm_offset_t dmap_max_addr; /* The virtual address limit of the dmap */ /* This code assumes all L1 DMAP entries will be used */ CTASSERT((DMAP_MIN_ADDRESS & ~L1_OFFSET) == DMAP_MIN_ADDRESS); CTASSERT((DMAP_MAX_ADDRESS & ~L1_OFFSET) == DMAP_MAX_ADDRESS); static struct rwlock_padalign pvh_global_lock; static struct mtx_padalign allpmaps_lock; static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "VM/pmap parameters"); static int superpages_enabled = 1; SYSCTL_INT(_vm_pmap, OID_AUTO, superpages_enabled, CTLFLAG_RDTUN, &superpages_enabled, 0, "Enable support for transparent superpages"); static SYSCTL_NODE(_vm_pmap, OID_AUTO, l2, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "2MB page mapping counters"); static u_long pmap_l2_demotions; SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, demotions, CTLFLAG_RD, &pmap_l2_demotions, 0, "2MB page demotions"); static u_long pmap_l2_mappings; SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, mappings, CTLFLAG_RD, &pmap_l2_mappings, 0, "2MB page mappings"); static u_long pmap_l2_p_failures; SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, p_failures, CTLFLAG_RD, &pmap_l2_p_failures, 0, "2MB page promotion failures"); static u_long pmap_l2_promotions; SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, promotions, CTLFLAG_RD, &pmap_l2_promotions, 0, "2MB page promotions"); /* * Data for the pv entry allocation mechanism */ static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks); static struct mtx pv_chunks_mutex; static struct rwlock pv_list_locks[NPV_LIST_LOCKS]; static struct md_page *pv_table; static struct md_page pv_dummy; extern cpuset_t all_harts; /* * Internal flags for pmap_enter()'s helper functions. */ #define PMAP_ENTER_NORECLAIM 0x1000000 /* Don't reclaim PV entries. */ #define PMAP_ENTER_NOREPLACE 0x2000000 /* Don't replace mappings. */ static void free_pv_chunk(struct pv_chunk *pc); static void free_pv_entry(pmap_t pmap, pv_entry_t pv); static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp); static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp); static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va); static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va); static bool pmap_demote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va); static bool pmap_demote_l2_locked(pmap_t pmap, pd_entry_t *l2, vm_offset_t va, struct rwlock **lockp); static int pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2, u_int flags, vm_page_t m, struct rwlock **lockp); static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp); static int pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t sva, pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp); static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m, struct rwlock **lockp); static vm_page_t _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp); static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free); static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *); #define pmap_clear(pte) pmap_store(pte, 0) #define pmap_clear_bits(pte, bits) atomic_clear_64(pte, bits) #define pmap_load_store(pte, entry) atomic_swap_64(pte, entry) #define pmap_load_clear(pte) pmap_load_store(pte, 0) #define pmap_load(pte) atomic_load_64(pte) #define pmap_store(pte, entry) atomic_store_64(pte, entry) #define pmap_store_bits(pte, bits) atomic_set_64(pte, bits) /********************/ /* Inline functions */ /********************/ static __inline void pagecopy(void *s, void *d) { memcpy(d, s, PAGE_SIZE); } static __inline void pagezero(void *p) { bzero(p, PAGE_SIZE); } #define pmap_l1_index(va) (((va) >> L1_SHIFT) & Ln_ADDR_MASK) #define pmap_l2_index(va) (((va) >> L2_SHIFT) & Ln_ADDR_MASK) #define pmap_l3_index(va) (((va) >> L3_SHIFT) & Ln_ADDR_MASK) #define PTE_TO_PHYS(pte) \ ((((pte) & ~PTE_HI_MASK) >> PTE_PPN0_S) * PAGE_SIZE) static __inline pd_entry_t * pmap_l1(pmap_t pmap, vm_offset_t va) { return (&pmap->pm_l1[pmap_l1_index(va)]); } static __inline pd_entry_t * pmap_l1_to_l2(pd_entry_t *l1, vm_offset_t va) { vm_paddr_t phys; pd_entry_t *l2; phys = PTE_TO_PHYS(pmap_load(l1)); l2 = (pd_entry_t *)PHYS_TO_DMAP(phys); return (&l2[pmap_l2_index(va)]); } static __inline pd_entry_t * pmap_l2(pmap_t pmap, vm_offset_t va) { pd_entry_t *l1; l1 = pmap_l1(pmap, va); if ((pmap_load(l1) & PTE_V) == 0) return (NULL); if ((pmap_load(l1) & PTE_RX) != 0) return (NULL); return (pmap_l1_to_l2(l1, va)); } static __inline pt_entry_t * pmap_l2_to_l3(pd_entry_t *l2, vm_offset_t va) { vm_paddr_t phys; pt_entry_t *l3; phys = PTE_TO_PHYS(pmap_load(l2)); l3 = (pd_entry_t *)PHYS_TO_DMAP(phys); return (&l3[pmap_l3_index(va)]); } static __inline pt_entry_t * pmap_l3(pmap_t pmap, vm_offset_t va) { pd_entry_t *l2; l2 = pmap_l2(pmap, va); if (l2 == NULL) return (NULL); if ((pmap_load(l2) & PTE_V) == 0) return (NULL); if ((pmap_load(l2) & PTE_RX) != 0) return (NULL); return (pmap_l2_to_l3(l2, va)); } static __inline void pmap_resident_count_inc(pmap_t pmap, int count) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); pmap->pm_stats.resident_count += count; } static __inline void pmap_resident_count_dec(pmap_t pmap, int count) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT(pmap->pm_stats.resident_count >= count, ("pmap %p resident count underflow %ld %d", pmap, pmap->pm_stats.resident_count, count)); pmap->pm_stats.resident_count -= count; } static void pmap_distribute_l1(struct pmap *pmap, vm_pindex_t l1index, pt_entry_t entry) { struct pmap *user_pmap; pd_entry_t *l1; /* Distribute new kernel L1 entry to all the user pmaps */ if (pmap != kernel_pmap) return; mtx_lock(&allpmaps_lock); LIST_FOREACH(user_pmap, &allpmaps, pm_list) { l1 = &user_pmap->pm_l1[l1index]; pmap_store(l1, entry); } mtx_unlock(&allpmaps_lock); } static pt_entry_t * pmap_early_page_idx(vm_offset_t l1pt, vm_offset_t va, u_int *l1_slot, u_int *l2_slot) { pt_entry_t *l2; pd_entry_t *l1; l1 = (pd_entry_t *)l1pt; *l1_slot = (va >> L1_SHIFT) & Ln_ADDR_MASK; /* Check locore has used a table L1 map */ KASSERT((l1[*l1_slot] & PTE_RX) == 0, ("Invalid bootstrap L1 table")); /* Find the address of the L2 table */ l2 = (pt_entry_t *)init_pt_va; *l2_slot = pmap_l2_index(va); return (l2); } static vm_paddr_t pmap_early_vtophys(vm_offset_t l1pt, vm_offset_t va) { u_int l1_slot, l2_slot; pt_entry_t *l2; vm_paddr_t ret; l2 = pmap_early_page_idx(l1pt, va, &l1_slot, &l2_slot); /* Check locore has used L2 superpages */ KASSERT((l2[l2_slot] & PTE_RX) != 0, ("Invalid bootstrap L2 table")); /* L2 is superpages */ ret = (l2[l2_slot] >> PTE_PPN1_S) << L2_SHIFT; ret += (va & L2_OFFSET); return (ret); } static void pmap_bootstrap_dmap(vm_offset_t kern_l1, vm_paddr_t min_pa, vm_paddr_t max_pa) { vm_offset_t va; vm_paddr_t pa; pd_entry_t *l1; u_int l1_slot; pt_entry_t entry; pn_t pn; pa = dmap_phys_base = min_pa & ~L1_OFFSET; va = DMAP_MIN_ADDRESS; l1 = (pd_entry_t *)kern_l1; l1_slot = pmap_l1_index(DMAP_MIN_ADDRESS); for (; va < DMAP_MAX_ADDRESS && pa < max_pa; pa += L1_SIZE, va += L1_SIZE, l1_slot++) { KASSERT(l1_slot < Ln_ENTRIES, ("Invalid L1 index")); /* superpages */ pn = (pa / PAGE_SIZE); entry = PTE_KERN; entry |= (pn << PTE_PPN0_S); pmap_store(&l1[l1_slot], entry); } /* Set the upper limit of the DMAP region */ dmap_phys_max = pa; dmap_max_addr = va; sfence_vma(); } static vm_offset_t pmap_bootstrap_l3(vm_offset_t l1pt, vm_offset_t va, vm_offset_t l3_start) { vm_offset_t l3pt; pt_entry_t entry; pd_entry_t *l2; vm_paddr_t pa; u_int l2_slot; pn_t pn; KASSERT((va & L2_OFFSET) == 0, ("Invalid virtual address")); l2 = pmap_l2(kernel_pmap, va); l2 = (pd_entry_t *)((uintptr_t)l2 & ~(PAGE_SIZE - 1)); l2_slot = pmap_l2_index(va); l3pt = l3_start; for (; va < VM_MAX_KERNEL_ADDRESS; l2_slot++, va += L2_SIZE) { KASSERT(l2_slot < Ln_ENTRIES, ("Invalid L2 index")); pa = pmap_early_vtophys(l1pt, l3pt); pn = (pa / PAGE_SIZE); entry = (PTE_V); entry |= (pn << PTE_PPN0_S); pmap_store(&l2[l2_slot], entry); l3pt += PAGE_SIZE; } /* Clean the L2 page table */ memset((void *)l3_start, 0, l3pt - l3_start); return (l3pt); } /* * Bootstrap the system enough to run with virtual memory. */ void pmap_bootstrap(vm_offset_t l1pt, vm_paddr_t kernstart, vm_size_t kernlen) { u_int l1_slot, l2_slot; vm_offset_t freemempos; vm_offset_t dpcpu, msgbufpv; vm_paddr_t max_pa, min_pa, pa; pt_entry_t *l2p; int i; printf("pmap_bootstrap %lx %lx %lx\n", l1pt, kernstart, kernlen); /* Set this early so we can use the pagetable walking functions */ kernel_pmap_store.pm_l1 = (pd_entry_t *)l1pt; PMAP_LOCK_INIT(kernel_pmap); rw_init(&pvh_global_lock, "pmap pv global"); CPU_FILL(&kernel_pmap->pm_active); /* Assume the address we were loaded to is a valid physical address. */ min_pa = max_pa = kernstart; physmap_idx = physmem_avail(physmap, nitems(physmap)); physmap_idx /= 2; /* * Find the minimum physical address. physmap is sorted, * but may contain empty ranges. */ for (i = 0; i < physmap_idx * 2; i += 2) { if (physmap[i] == physmap[i + 1]) continue; if (physmap[i] <= min_pa) min_pa = physmap[i]; if (physmap[i + 1] > max_pa) max_pa = physmap[i + 1]; } printf("physmap_idx %u\n", physmap_idx); printf("min_pa %lx\n", min_pa); printf("max_pa %lx\n", max_pa); /* Create a direct map region early so we can use it for pa -> va */ pmap_bootstrap_dmap(l1pt, min_pa, max_pa); /* * Read the page table to find out what is already mapped. * This assumes we have mapped a block of memory from KERNBASE * using a single L1 entry. */ (void)pmap_early_page_idx(l1pt, KERNBASE, &l1_slot, &l2_slot); /* Sanity check the index, KERNBASE should be the first VA */ KASSERT(l2_slot == 0, ("The L2 index is non-zero")); freemempos = roundup2(KERNBASE + kernlen, PAGE_SIZE); /* Create the l3 tables for the early devmap */ freemempos = pmap_bootstrap_l3(l1pt, VM_MAX_KERNEL_ADDRESS - L2_SIZE, freemempos); /* * Invalidate the mapping we created for the DTB. At this point a copy * has been created, and we no longer need it. We want to avoid the * possibility of an aliased mapping in the future. */ l2p = pmap_l2(kernel_pmap, VM_EARLY_DTB_ADDRESS); if ((pmap_load(l2p) & PTE_V) != 0) pmap_clear(l2p); sfence_vma(); #define alloc_pages(var, np) \ (var) = freemempos; \ freemempos += (np * PAGE_SIZE); \ memset((char *)(var), 0, ((np) * PAGE_SIZE)); /* Allocate dynamic per-cpu area. */ alloc_pages(dpcpu, DPCPU_SIZE / PAGE_SIZE); dpcpu_init((void *)dpcpu, 0); /* Allocate memory for the msgbuf, e.g. for /sbin/dmesg */ alloc_pages(msgbufpv, round_page(msgbufsize) / PAGE_SIZE); msgbufp = (void *)msgbufpv; virtual_avail = roundup2(freemempos, L2_SIZE); virtual_end = VM_MAX_KERNEL_ADDRESS - L2_SIZE; kernel_vm_end = virtual_avail; pa = pmap_early_vtophys(l1pt, freemempos); physmem_exclude_region(kernstart, pa - kernstart, EXFLAG_NOALLOC); } /* * Initialize a vm_page's machine-dependent fields. */ void pmap_page_init(vm_page_t m) { TAILQ_INIT(&m->md.pv_list); m->md.pv_memattr = VM_MEMATTR_WRITE_BACK; } /* * Initialize the pmap module. * Called by vm_init, to initialize any structures that the pmap * system needs to map virtual memory. */ void pmap_init(void) { vm_size_t s; int i, pv_npg; /* * Initialize the pv chunk and pmap list mutexes. */ mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF); mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_DEF); /* * Initialize the pool of pv list locks. */ for (i = 0; i < NPV_LIST_LOCKS; i++) rw_init(&pv_list_locks[i], "pmap pv list"); /* * Calculate the size of the pv head table for superpages. */ pv_npg = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, L2_SIZE); /* * Allocate memory for the pv head table for superpages. */ s = (vm_size_t)(pv_npg * sizeof(struct md_page)); s = round_page(s); pv_table = (struct md_page *)kmem_malloc(s, M_WAITOK | M_ZERO); for (i = 0; i < pv_npg; i++) TAILQ_INIT(&pv_table[i].pv_list); TAILQ_INIT(&pv_dummy.pv_list); if (superpages_enabled) pagesizes[1] = L2_SIZE; } #ifdef SMP /* * For SMP, these functions have to use IPIs for coherence. * * In general, the calling thread uses a plain fence to order the * writes to the page tables before invoking an SBI callback to invoke * sfence_vma() on remote CPUs. */ static void pmap_invalidate_page(pmap_t pmap, vm_offset_t va) { cpuset_t mask; sched_pin(); mask = pmap->pm_active; CPU_CLR(PCPU_GET(hart), &mask); fence(); if (!CPU_EMPTY(&mask) && smp_started) sbi_remote_sfence_vma(mask.__bits, va, 1); sfence_vma_page(va); sched_unpin(); } static void pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { cpuset_t mask; sched_pin(); mask = pmap->pm_active; CPU_CLR(PCPU_GET(hart), &mask); fence(); if (!CPU_EMPTY(&mask) && smp_started) sbi_remote_sfence_vma(mask.__bits, sva, eva - sva + 1); /* * Might consider a loop of sfence_vma_page() for a small * number of pages in the future. */ sfence_vma(); sched_unpin(); } static void pmap_invalidate_all(pmap_t pmap) { cpuset_t mask; sched_pin(); mask = pmap->pm_active; CPU_CLR(PCPU_GET(hart), &mask); /* * XXX: The SBI doc doesn't detail how to specify x0 as the * address to perform a global fence. BBL currently treats * all sfence_vma requests as global however. */ fence(); if (!CPU_EMPTY(&mask) && smp_started) sbi_remote_sfence_vma(mask.__bits, 0, 0); sfence_vma(); sched_unpin(); } #else /* * Normal, non-SMP, invalidation functions. * We inline these within pmap.c for speed. */ static __inline void pmap_invalidate_page(pmap_t pmap, vm_offset_t va) { sfence_vma_page(va); } static __inline void pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { /* * Might consider a loop of sfence_vma_page() for a small * number of pages in the future. */ sfence_vma(); } static __inline void pmap_invalidate_all(pmap_t pmap) { sfence_vma(); } #endif /* * Routine: pmap_extract * Function: * Extract the physical page address associated * with the given map/virtual_address pair. */ vm_paddr_t pmap_extract(pmap_t pmap, vm_offset_t va) { pd_entry_t *l2p, l2; pt_entry_t *l3p, l3; vm_paddr_t pa; pa = 0; PMAP_LOCK(pmap); /* * Start with the l2 tabel. We are unable to allocate * pages in the l1 table. */ l2p = pmap_l2(pmap, va); if (l2p != NULL) { l2 = pmap_load(l2p); if ((l2 & PTE_RX) == 0) { l3p = pmap_l2_to_l3(l2p, va); if (l3p != NULL) { l3 = pmap_load(l3p); pa = PTE_TO_PHYS(l3); pa |= (va & L3_OFFSET); } } else { /* L2 is superpages */ pa = (l2 >> PTE_PPN1_S) << L2_SHIFT; pa |= (va & L2_OFFSET); } } PMAP_UNLOCK(pmap); return (pa); } /* * Routine: pmap_extract_and_hold * Function: * Atomically extract and hold the physical page * with the given pmap and virtual address pair * if that mapping permits the given protection. */ vm_page_t pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) { pt_entry_t *l3p, l3; vm_paddr_t phys; vm_page_t m; m = NULL; PMAP_LOCK(pmap); l3p = pmap_l3(pmap, va); if (l3p != NULL && (l3 = pmap_load(l3p)) != 0) { if ((l3 & PTE_W) != 0 || (prot & VM_PROT_WRITE) == 0) { phys = PTE_TO_PHYS(l3); m = PHYS_TO_VM_PAGE(phys); if (!vm_page_wire_mapped(m)) m = NULL; } } PMAP_UNLOCK(pmap); return (m); } vm_paddr_t pmap_kextract(vm_offset_t va) { pd_entry_t *l2; pt_entry_t *l3; vm_paddr_t pa; if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) { pa = DMAP_TO_PHYS(va); } else { l2 = pmap_l2(kernel_pmap, va); if (l2 == NULL) panic("pmap_kextract: No l2"); if ((pmap_load(l2) & PTE_RX) != 0) { /* superpages */ pa = (pmap_load(l2) >> PTE_PPN1_S) << L2_SHIFT; pa |= (va & L2_OFFSET); return (pa); } l3 = pmap_l2_to_l3(l2, va); if (l3 == NULL) panic("pmap_kextract: No l3..."); pa = PTE_TO_PHYS(pmap_load(l3)); pa |= (va & PAGE_MASK); } return (pa); } /*************************************************** * Low level mapping routines..... ***************************************************/ void pmap_kenter_device(vm_offset_t sva, vm_size_t size, vm_paddr_t pa) { pt_entry_t entry; pt_entry_t *l3; vm_offset_t va; pn_t pn; KASSERT((pa & L3_OFFSET) == 0, ("pmap_kenter_device: Invalid physical address")); KASSERT((sva & L3_OFFSET) == 0, ("pmap_kenter_device: Invalid virtual address")); KASSERT((size & PAGE_MASK) == 0, ("pmap_kenter_device: Mapping is not page-sized")); va = sva; while (size != 0) { l3 = pmap_l3(kernel_pmap, va); KASSERT(l3 != NULL, ("Invalid page table, va: 0x%lx", va)); pn = (pa / PAGE_SIZE); entry = PTE_KERN; entry |= (pn << PTE_PPN0_S); pmap_store(l3, entry); va += PAGE_SIZE; pa += PAGE_SIZE; size -= PAGE_SIZE; } pmap_invalidate_range(kernel_pmap, sva, va); } /* * Remove a page from the kernel pagetables. * Note: not SMP coherent. */ PMAP_INLINE void pmap_kremove(vm_offset_t va) { pt_entry_t *l3; l3 = pmap_l3(kernel_pmap, va); KASSERT(l3 != NULL, ("pmap_kremove: Invalid address")); pmap_clear(l3); sfence_vma(); } void pmap_kremove_device(vm_offset_t sva, vm_size_t size) { pt_entry_t *l3; vm_offset_t va; KASSERT((sva & L3_OFFSET) == 0, ("pmap_kremove_device: Invalid virtual address")); KASSERT((size & PAGE_MASK) == 0, ("pmap_kremove_device: Mapping is not page-sized")); va = sva; while (size != 0) { l3 = pmap_l3(kernel_pmap, va); KASSERT(l3 != NULL, ("Invalid page table, va: 0x%lx", va)); pmap_clear(l3); va += PAGE_SIZE; size -= PAGE_SIZE; } pmap_invalidate_range(kernel_pmap, sva, va); } /* * Used to map a range of physical addresses into kernel * virtual address space. * * The value passed in '*virt' is a suggested virtual address for * the mapping. Architectures which can support a direct-mapped * physical to virtual region can return the appropriate address * within that region, leaving '*virt' unchanged. Other * architectures should map the pages starting at '*virt' and * update '*virt' with the first usable address after the mapped * region. */ vm_offset_t pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) { return PHYS_TO_DMAP(start); } /* * Add a list of wired pages to the kva * this routine is only used for temporary * kernel mappings that do not need to have * page modification or references recorded. * Note that old mappings are simply written * over. The page *must* be wired. * Note: SMP coherent. Uses a ranged shootdown IPI. */ void pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count) { pt_entry_t *l3, pa; vm_offset_t va; vm_page_t m; pt_entry_t entry; pn_t pn; int i; va = sva; for (i = 0; i < count; i++) { m = ma[i]; pa = VM_PAGE_TO_PHYS(m); pn = (pa / PAGE_SIZE); l3 = pmap_l3(kernel_pmap, va); entry = PTE_KERN; entry |= (pn << PTE_PPN0_S); pmap_store(l3, entry); va += L3_SIZE; } pmap_invalidate_range(kernel_pmap, sva, va); } /* * This routine tears out page mappings from the * kernel -- it is meant only for temporary mappings. * Note: SMP coherent. Uses a ranged shootdown IPI. */ void pmap_qremove(vm_offset_t sva, int count) { pt_entry_t *l3; vm_offset_t va; KASSERT(sva >= VM_MIN_KERNEL_ADDRESS, ("usermode va %lx", sva)); for (va = sva; count-- > 0; va += PAGE_SIZE) { l3 = pmap_l3(kernel_pmap, va); KASSERT(l3 != NULL, ("pmap_kremove: Invalid address")); pmap_clear(l3); } pmap_invalidate_range(kernel_pmap, sva, va); } bool pmap_ps_enabled(pmap_t pmap __unused) { return (superpages_enabled); } /*************************************************** * Page table page management routines..... ***************************************************/ /* * Schedule the specified unused page table page to be freed. Specifically, * add the page to the specified list of pages that will be released to the * physical memory manager after the TLB has been updated. */ static __inline void pmap_add_delayed_free_list(vm_page_t m, struct spglist *free, boolean_t set_PG_ZERO) { if (set_PG_ZERO) m->flags |= PG_ZERO; else m->flags &= ~PG_ZERO; SLIST_INSERT_HEAD(free, m, plinks.s.ss); } /* * Inserts the specified page table page into the specified pmap's collection * of idle page table pages. Each of a pmap's page table pages is responsible * for mapping a distinct range of virtual addresses. The pmap's collection is * ordered by this virtual address range. * * If "promoted" is false, then the page table page "ml3" must be zero filled. */ static __inline int pmap_insert_pt_page(pmap_t pmap, vm_page_t ml3, bool promoted) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); ml3->valid = promoted ? VM_PAGE_BITS_ALL : 0; return (vm_radix_insert(&pmap->pm_root, ml3)); } /* * Removes the page table page mapping the specified virtual address from the * specified pmap's collection of idle page table pages, and returns it. * Otherwise, returns NULL if there is no page table page corresponding to the * specified virtual address. */ static __inline vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); return (vm_radix_remove(&pmap->pm_root, pmap_l2_pindex(va))); } /* * Decrements a page table page's reference count, which is used to record the * number of valid page table entries within the page. If the reference count * drops to zero, then the page table page is unmapped. Returns TRUE if the * page table page was unmapped and FALSE otherwise. */ static inline boolean_t pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) { --m->ref_count; if (m->ref_count == 0) { _pmap_unwire_ptp(pmap, va, m, free); return (TRUE); } else { return (FALSE); } } static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) { vm_paddr_t phys; PMAP_LOCK_ASSERT(pmap, MA_OWNED); if (m->pindex >= NUL1E) { pd_entry_t *l1; l1 = pmap_l1(pmap, va); pmap_clear(l1); pmap_distribute_l1(pmap, pmap_l1_index(va), 0); } else { pd_entry_t *l2; l2 = pmap_l2(pmap, va); pmap_clear(l2); } pmap_resident_count_dec(pmap, 1); if (m->pindex < NUL1E) { pd_entry_t *l1; vm_page_t pdpg; l1 = pmap_l1(pmap, va); phys = PTE_TO_PHYS(pmap_load(l1)); pdpg = PHYS_TO_VM_PAGE(phys); pmap_unwire_ptp(pmap, va, pdpg, free); } pmap_invalidate_page(pmap, va); vm_wire_sub(1); /* * Put page on a list so that it is released after * *ALL* TLB shootdown is done */ pmap_add_delayed_free_list(m, free, TRUE); } /* * After removing a page table entry, this routine is used to * conditionally free the page, and manage the reference count. */ static int pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde, struct spglist *free) { vm_page_t mpte; if (va >= VM_MAXUSER_ADDRESS) return (0); KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0")); mpte = PHYS_TO_VM_PAGE(PTE_TO_PHYS(ptepde)); return (pmap_unwire_ptp(pmap, va, mpte, free)); } void pmap_pinit0(pmap_t pmap) { PMAP_LOCK_INIT(pmap); bzero(&pmap->pm_stats, sizeof(pmap->pm_stats)); pmap->pm_l1 = kernel_pmap->pm_l1; pmap->pm_satp = SATP_MODE_SV39 | (vtophys(pmap->pm_l1) >> PAGE_SHIFT); CPU_ZERO(&pmap->pm_active); pmap_activate_boot(pmap); } int pmap_pinit(pmap_t pmap) { vm_paddr_t l1phys; vm_page_t l1pt; /* * allocate the l1 page */ while ((l1pt = vm_page_alloc(NULL, 0xdeadbeef, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) vm_wait(NULL); l1phys = VM_PAGE_TO_PHYS(l1pt); pmap->pm_l1 = (pd_entry_t *)PHYS_TO_DMAP(l1phys); pmap->pm_satp = SATP_MODE_SV39 | (l1phys >> PAGE_SHIFT); if ((l1pt->flags & PG_ZERO) == 0) pagezero(pmap->pm_l1); bzero(&pmap->pm_stats, sizeof(pmap->pm_stats)); CPU_ZERO(&pmap->pm_active); /* Install kernel pagetables */ memcpy(pmap->pm_l1, kernel_pmap->pm_l1, PAGE_SIZE); /* Add to the list of all user pmaps */ mtx_lock(&allpmaps_lock); LIST_INSERT_HEAD(&allpmaps, pmap, pm_list); mtx_unlock(&allpmaps_lock); vm_radix_init(&pmap->pm_root); return (1); } /* * This routine is called if the desired page table page does not exist. * * If page table page allocation fails, this routine may sleep before * returning NULL. It sleeps only if a lock pointer was given. * * Note: If a page allocation fails at page table level two or three, * one or two pages may be held during the wait, only to be released * afterwards. This conservative approach is easily argued to avoid * race conditions. */ static vm_page_t _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp) { vm_page_t m, /*pdppg, */pdpg; pt_entry_t entry; vm_paddr_t phys; pn_t pn; PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * Allocate a page table page. */ if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) { if (lockp != NULL) { RELEASE_PV_LIST_LOCK(lockp); PMAP_UNLOCK(pmap); rw_runlock(&pvh_global_lock); vm_wait(NULL); rw_rlock(&pvh_global_lock); PMAP_LOCK(pmap); } /* * Indicate the need to retry. While waiting, the page table * page may have been allocated. */ return (NULL); } if ((m->flags & PG_ZERO) == 0) pmap_zero_page(m); /* * Map the pagetable page into the process address space, if * it isn't already there. */ if (ptepindex >= NUL1E) { pd_entry_t *l1; vm_pindex_t l1index; l1index = ptepindex - NUL1E; l1 = &pmap->pm_l1[l1index]; pn = (VM_PAGE_TO_PHYS(m) / PAGE_SIZE); entry = (PTE_V); entry |= (pn << PTE_PPN0_S); pmap_store(l1, entry); pmap_distribute_l1(pmap, l1index, entry); } else { vm_pindex_t l1index; pd_entry_t *l1, *l2; l1index = ptepindex >> (L1_SHIFT - L2_SHIFT); l1 = &pmap->pm_l1[l1index]; if (pmap_load(l1) == 0) { /* recurse for allocating page dir */ if (_pmap_alloc_l3(pmap, NUL1E + l1index, lockp) == NULL) { vm_page_unwire_noq(m); vm_page_free_zero(m); return (NULL); } } else { phys = PTE_TO_PHYS(pmap_load(l1)); pdpg = PHYS_TO_VM_PAGE(phys); pdpg->ref_count++; } phys = PTE_TO_PHYS(pmap_load(l1)); l2 = (pd_entry_t *)PHYS_TO_DMAP(phys); l2 = &l2[ptepindex & Ln_ADDR_MASK]; pn = (VM_PAGE_TO_PHYS(m) / PAGE_SIZE); entry = (PTE_V); entry |= (pn << PTE_PPN0_S); pmap_store(l2, entry); } pmap_resident_count_inc(pmap, 1); return (m); } static vm_page_t pmap_alloc_l2(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) { pd_entry_t *l1; vm_page_t l2pg; vm_pindex_t l2pindex; retry: l1 = pmap_l1(pmap, va); if (l1 != NULL && (pmap_load(l1) & PTE_RWX) == 0) { /* Add a reference to the L2 page. */ l2pg = PHYS_TO_VM_PAGE(PTE_TO_PHYS(pmap_load(l1))); l2pg->ref_count++; } else { /* Allocate a L2 page. */ l2pindex = pmap_l2_pindex(va) >> Ln_ENTRIES_SHIFT; l2pg = _pmap_alloc_l3(pmap, NUL2E + l2pindex, lockp); if (l2pg == NULL && lockp != NULL) goto retry; } return (l2pg); } static vm_page_t pmap_alloc_l3(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) { vm_pindex_t ptepindex; pd_entry_t *l2; vm_paddr_t phys; vm_page_t m; /* * Calculate pagetable page index */ ptepindex = pmap_l2_pindex(va); retry: /* * Get the page directory entry */ l2 = pmap_l2(pmap, va); /* * If the page table page is mapped, we just increment the * hold count, and activate it. */ if (l2 != NULL && pmap_load(l2) != 0) { phys = PTE_TO_PHYS(pmap_load(l2)); m = PHYS_TO_VM_PAGE(phys); m->ref_count++; } else { /* * Here if the pte page isn't mapped, or if it has been * deallocated. */ m = _pmap_alloc_l3(pmap, ptepindex, lockp); if (m == NULL && lockp != NULL) goto retry; } return (m); } /*************************************************** * Pmap allocation/deallocation routines. ***************************************************/ /* * Release any resources held by the given physical map. * Called when a pmap initialized by pmap_pinit is being released. * Should only be called if the map contains no valid mappings. */ void pmap_release(pmap_t pmap) { vm_page_t m; KASSERT(pmap->pm_stats.resident_count == 0, ("pmap_release: pmap resident count %ld != 0", pmap->pm_stats.resident_count)); KASSERT(CPU_EMPTY(&pmap->pm_active), ("releasing active pmap %p", pmap)); mtx_lock(&allpmaps_lock); LIST_REMOVE(pmap, pm_list); mtx_unlock(&allpmaps_lock); m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_l1)); vm_page_unwire_noq(m); vm_page_free(m); } static int kvm_size(SYSCTL_HANDLER_ARGS) { unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS; return sysctl_handle_long(oidp, &ksize, 0, req); } SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0, kvm_size, "LU", "Size of KVM"); static int kvm_free(SYSCTL_HANDLER_ARGS) { unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end; return sysctl_handle_long(oidp, &kfree, 0, req); } SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0, kvm_free, "LU", "Amount of KVM free"); /* * grow the number of kernel page table entries, if needed */ void pmap_growkernel(vm_offset_t addr) { vm_paddr_t paddr; vm_page_t nkpg; pd_entry_t *l1, *l2; pt_entry_t entry; pn_t pn; mtx_assert(&kernel_map->system_mtx, MA_OWNED); addr = roundup2(addr, L2_SIZE); if (addr - 1 >= vm_map_max(kernel_map)) addr = vm_map_max(kernel_map); while (kernel_vm_end < addr) { l1 = pmap_l1(kernel_pmap, kernel_vm_end); if (pmap_load(l1) == 0) { /* We need a new PDP entry */ nkpg = vm_page_alloc(NULL, kernel_vm_end >> L1_SHIFT, VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); if (nkpg == NULL) panic("pmap_growkernel: no memory to grow kernel"); if ((nkpg->flags & PG_ZERO) == 0) pmap_zero_page(nkpg); paddr = VM_PAGE_TO_PHYS(nkpg); pn = (paddr / PAGE_SIZE); entry = (PTE_V); entry |= (pn << PTE_PPN0_S); pmap_store(l1, entry); pmap_distribute_l1(kernel_pmap, pmap_l1_index(kernel_vm_end), entry); continue; /* try again */ } l2 = pmap_l1_to_l2(l1, kernel_vm_end); if ((pmap_load(l2) & PTE_V) != 0 && (pmap_load(l2) & PTE_RWX) == 0) { kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET; if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { kernel_vm_end = vm_map_max(kernel_map); break; } continue; } nkpg = vm_page_alloc(NULL, kernel_vm_end >> L2_SHIFT, VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); if (nkpg == NULL) panic("pmap_growkernel: no memory to grow kernel"); if ((nkpg->flags & PG_ZERO) == 0) { pmap_zero_page(nkpg); } paddr = VM_PAGE_TO_PHYS(nkpg); pn = (paddr / PAGE_SIZE); entry = (PTE_V); entry |= (pn << PTE_PPN0_S); pmap_store(l2, entry); pmap_invalidate_page(kernel_pmap, kernel_vm_end); kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET; if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { kernel_vm_end = vm_map_max(kernel_map); break; } } } /*************************************************** * page management routines. ***************************************************/ CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE); CTASSERT(_NPCM == 3); CTASSERT(_NPCPV == 168); static __inline struct pv_chunk * pv_to_chunk(pv_entry_t pv) { return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK)); } #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap) #define PC_FREE0 0xfffffffffffffffful #define PC_FREE1 0xfffffffffffffffful #define PC_FREE2 0x000000fffffffffful static const uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1, PC_FREE2 }; #if 0 #ifdef PV_STATS static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail; SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0, "Current number of pv entry chunks"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0, "Current number of pv entry chunks allocated"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0, "Current number of pv entry chunks frees"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0, "Number of times tried to get a chunk page but failed."); static long pv_entry_frees, pv_entry_allocs, pv_entry_count; static int pv_entry_spare; SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0, "Current number of pv entry frees"); SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0, "Current number of pv entry allocs"); SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0, "Current number of pv entries"); SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0, "Current number of spare pv entries"); #endif #endif /* 0 */ /* * We are in a serious low memory condition. Resort to * drastic measures to free some pages so we can allocate * another pv entry chunk. * * Returns NULL if PV entries were reclaimed from the specified pmap. * * We do not, however, unmap 2mpages because subsequent accesses will * allocate per-page pv entries until repromotion occurs, thereby * exacerbating the shortage of free pv entries. */ static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp) { panic("RISCVTODO: reclaim_pv_chunk"); } /* * free the pv_entry back to the free list */ static void free_pv_entry(pmap_t pmap, pv_entry_t pv) { struct pv_chunk *pc; int idx, field, bit; rw_assert(&pvh_global_lock, RA_LOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); PV_STAT(atomic_add_long(&pv_entry_frees, 1)); PV_STAT(atomic_add_int(&pv_entry_spare, 1)); PV_STAT(atomic_subtract_long(&pv_entry_count, 1)); pc = pv_to_chunk(pv); idx = pv - &pc->pc_pventry[0]; field = idx / 64; bit = idx % 64; pc->pc_map[field] |= 1ul << bit; if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1 || pc->pc_map[2] != PC_FREE2) { /* 98% of the time, pc is already at the head of the list. */ if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); } return; } TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); free_pv_chunk(pc); } static void free_pv_chunk(struct pv_chunk *pc) { vm_page_t m; mtx_lock(&pv_chunks_mutex); TAILQ_REMOVE(&pv_chunks, pc, pc_lru); mtx_unlock(&pv_chunks_mutex); PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV)); PV_STAT(atomic_subtract_int(&pc_chunk_count, 1)); PV_STAT(atomic_add_int(&pc_chunk_frees, 1)); /* entire chunk is free, return it */ m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); dump_drop_page(m->phys_addr); vm_page_unwire_noq(m); vm_page_free(m); } /* * Returns a new PV entry, allocating a new PV chunk from the system when * needed. If this PV chunk allocation fails and a PV list lock pointer was * given, a PV chunk is reclaimed from an arbitrary pmap. Otherwise, NULL is * returned. * * The given PV list lock may be released. */ static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp) { int bit, field; pv_entry_t pv; struct pv_chunk *pc; vm_page_t m; rw_assert(&pvh_global_lock, RA_LOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); PV_STAT(atomic_add_long(&pv_entry_allocs, 1)); retry: pc = TAILQ_FIRST(&pmap->pm_pvchunk); if (pc != NULL) { for (field = 0; field < _NPCM; field++) { if (pc->pc_map[field]) { bit = ffsl(pc->pc_map[field]) - 1; break; } } if (field < _NPCM) { pv = &pc->pc_pventry[field * 64 + bit]; pc->pc_map[field] &= ~(1ul << bit); /* If this was the last item, move it to tail */ if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); } PV_STAT(atomic_add_long(&pv_entry_count, 1)); PV_STAT(atomic_subtract_int(&pv_entry_spare, 1)); return (pv); } } /* No free items, allocate another chunk */ m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED); if (m == NULL) { if (lockp == NULL) { PV_STAT(pc_chunk_tryfail++); return (NULL); } m = reclaim_pv_chunk(pmap, lockp); if (m == NULL) goto retry; } PV_STAT(atomic_add_int(&pc_chunk_count, 1)); PV_STAT(atomic_add_int(&pc_chunk_allocs, 1)); dump_add_page(m->phys_addr); pc = (void *)PHYS_TO_DMAP(m->phys_addr); pc->pc_pmap = pmap; pc->pc_map[0] = PC_FREE0 & ~1ul; /* preallocated bit 0 */ pc->pc_map[1] = PC_FREE1; pc->pc_map[2] = PC_FREE2; mtx_lock(&pv_chunks_mutex); TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); mtx_unlock(&pv_chunks_mutex); pv = &pc->pc_pventry[0]; TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); PV_STAT(atomic_add_long(&pv_entry_count, 1)); PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1)); return (pv); } /* * Ensure that the number of spare PV entries in the specified pmap meets or * exceeds the given count, "needed". * * The given PV list lock may be released. */ static void reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp) { struct pch new_tail; struct pv_chunk *pc; vm_page_t m; int avail, free; bool reclaimed; rw_assert(&pvh_global_lock, RA_LOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL")); /* * Newly allocated PV chunks must be stored in a private list until * the required number of PV chunks have been allocated. Otherwise, * reclaim_pv_chunk() could recycle one of these chunks. In * contrast, these chunks must be added to the pmap upon allocation. */ TAILQ_INIT(&new_tail); retry: avail = 0; TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) { bit_count((bitstr_t *)pc->pc_map, 0, sizeof(pc->pc_map) * NBBY, &free); if (free == 0) break; avail += free; if (avail >= needed) break; } for (reclaimed = false; avail < needed; avail += _NPCPV) { m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED); if (m == NULL) { m = reclaim_pv_chunk(pmap, lockp); if (m == NULL) goto retry; reclaimed = true; } /* XXX PV STATS */ #if 0 dump_add_page(m->phys_addr); #endif pc = (void *)PHYS_TO_DMAP(m->phys_addr); pc->pc_pmap = pmap; pc->pc_map[0] = PC_FREE0; pc->pc_map[1] = PC_FREE1; pc->pc_map[2] = PC_FREE2; TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru); /* * The reclaim might have freed a chunk from the current pmap. * If that chunk contained available entries, we need to * re-count the number of available entries. */ if (reclaimed) goto retry; } if (!TAILQ_EMPTY(&new_tail)) { mtx_lock(&pv_chunks_mutex); TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru); mtx_unlock(&pv_chunks_mutex); } } /* * First find and then remove the pv entry for the specified pmap and virtual * address from the specified pv list. Returns the pv entry if found and NULL * otherwise. This operation can be performed on pv lists for either 4KB or * 2MB page mappings. */ static __inline pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) { pv_entry_t pv; rw_assert(&pvh_global_lock, RA_LOCKED); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { if (pmap == PV_PMAP(pv) && va == pv->pv_va) { TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); pvh->pv_gen++; break; } } return (pv); } /* * First find and then destroy the pv entry for the specified pmap and virtual * address. This operation can be performed on pv lists for either 4KB or 2MB * page mappings. */ static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) { pv_entry_t pv; pv = pmap_pvh_remove(pvh, pmap, va); KASSERT(pv != NULL, ("pmap_pvh_free: pv not found for %#lx", va)); free_pv_entry(pmap, pv); } /* * Conditionally create the PV entry for a 4KB page mapping if the required * memory can be allocated without resorting to reclamation. */ static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m, struct rwlock **lockp) { pv_entry_t pv; rw_assert(&pvh_global_lock, RA_LOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* Pass NULL instead of the lock pointer to disable reclamation. */ if ((pv = get_pv_entry(pmap, NULL)) != NULL) { pv->pv_va = va; CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; return (TRUE); } else return (FALSE); } /* * After demotion from a 2MB page mapping to 512 4KB page mappings, * destroy the pv entry for the 2MB page mapping and reinstantiate the pv * entries for each of the 4KB page mappings. */ static void __unused pmap_pv_demote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, struct rwlock **lockp) { struct md_page *pvh; struct pv_chunk *pc; pv_entry_t pv; vm_page_t m; vm_offset_t va_last; int bit, field; rw_assert(&pvh_global_lock, RA_LOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); /* * Transfer the 2mpage's pv entry for this mapping to the first * page's pv list. Once this transfer begins, the pv list lock * must not be released until the last pv entry is reinstantiated. */ pvh = pa_to_pvh(pa); va &= ~L2_OFFSET; pv = pmap_pvh_remove(pvh, pmap, va); KASSERT(pv != NULL, ("pmap_pv_demote_l2: pv not found")); m = PHYS_TO_VM_PAGE(pa); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; /* Instantiate the remaining 511 pv entries. */ va_last = va + L2_SIZE - PAGE_SIZE; for (;;) { pc = TAILQ_FIRST(&pmap->pm_pvchunk); KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0 || pc->pc_map[2] != 0, ("pmap_pv_demote_l2: missing spare")); for (field = 0; field < _NPCM; field++) { while (pc->pc_map[field] != 0) { bit = ffsl(pc->pc_map[field]) - 1; pc->pc_map[field] &= ~(1ul << bit); pv = &pc->pc_pventry[field * 64 + bit]; va += PAGE_SIZE; pv->pv_va = va; m++; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_pv_demote_l2: page %p is not managed", m)); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; if (va == va_last) goto out; } } TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); } out: if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); } /* XXX PV stats */ } #if VM_NRESERVLEVEL > 0 static void pmap_pv_promote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, struct rwlock **lockp) { struct md_page *pvh; pv_entry_t pv; vm_page_t m; vm_offset_t va_last; rw_assert(&pvh_global_lock, RA_LOCKED); KASSERT((va & L2_OFFSET) == 0, ("pmap_pv_promote_l2: misaligned va %#lx", va)); CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); m = PHYS_TO_VM_PAGE(pa); pv = pmap_pvh_remove(&m->md, pmap, va); KASSERT(pv != NULL, ("pmap_pv_promote_l2: pv for %#lx not found", va)); pvh = pa_to_pvh(pa); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); pvh->pv_gen++; va_last = va + L2_SIZE - PAGE_SIZE; do { m++; va += PAGE_SIZE; pmap_pvh_free(&m->md, pmap, va); } while (va < va_last); } #endif /* VM_NRESERVLEVEL > 0 */ /* * Create the PV entry for a 2MB page mapping. Always returns true unless the * flag PMAP_ENTER_NORECLAIM is specified. If that flag is specified, returns * false if the PV entry cannot be allocated without resorting to reclamation. */ static bool pmap_pv_insert_l2(pmap_t pmap, vm_offset_t va, pd_entry_t l2e, u_int flags, struct rwlock **lockp) { struct md_page *pvh; pv_entry_t pv; vm_paddr_t pa; PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* Pass NULL instead of the lock pointer to disable reclamation. */ if ((pv = get_pv_entry(pmap, (flags & PMAP_ENTER_NORECLAIM) != 0 ? NULL : lockp)) == NULL) return (false); pv->pv_va = va; pa = PTE_TO_PHYS(l2e); CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); pvh = pa_to_pvh(pa); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); pvh->pv_gen++; return (true); } static void pmap_remove_kernel_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va) { pt_entry_t newl2, oldl2; vm_page_t ml3; vm_paddr_t ml3pa; KASSERT(!VIRT_IN_DMAP(va), ("removing direct mapping of %#lx", va)); KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap)); PMAP_LOCK_ASSERT(pmap, MA_OWNED); ml3 = pmap_remove_pt_page(pmap, va); if (ml3 == NULL) panic("pmap_remove_kernel_l2: Missing pt page"); ml3pa = VM_PAGE_TO_PHYS(ml3); newl2 = ml3pa | PTE_V; /* * If this page table page was unmapped by a promotion, then it * contains valid mappings. Zero it to invalidate those mappings. */ if (ml3->valid != 0) pagezero((void *)PHYS_TO_DMAP(ml3pa)); /* * Demote the mapping. */ oldl2 = pmap_load_store(l2, newl2); KASSERT(oldl2 == 0, ("%s: found existing mapping at %p: %#lx", __func__, l2, oldl2)); } /* * pmap_remove_l2: Do the things to unmap a level 2 superpage. */ static int pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva, pd_entry_t l1e, struct spglist *free, struct rwlock **lockp) { struct md_page *pvh; pt_entry_t oldl2; vm_offset_t eva, va; vm_page_t m, ml3; PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT((sva & L2_OFFSET) == 0, ("pmap_remove_l2: sva is not aligned")); oldl2 = pmap_load_clear(l2); KASSERT((oldl2 & PTE_RWX) != 0, ("pmap_remove_l2: L2e %lx is not a superpage mapping", oldl2)); /* * The sfence.vma documentation states that it is sufficient to specify * a single address within a superpage mapping. However, since we do * not perform any invalidation upon promotion, TLBs may still be * caching 4KB mappings within the superpage, so we must invalidate the * entire range. */ pmap_invalidate_range(pmap, sva, sva + L2_SIZE); if ((oldl2 & PTE_SW_WIRED) != 0) pmap->pm_stats.wired_count -= L2_SIZE / PAGE_SIZE; pmap_resident_count_dec(pmap, L2_SIZE / PAGE_SIZE); if ((oldl2 & PTE_SW_MANAGED) != 0) { CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, PTE_TO_PHYS(oldl2)); pvh = pa_to_pvh(PTE_TO_PHYS(oldl2)); pmap_pvh_free(pvh, pmap, sva); eva = sva + L2_SIZE; for (va = sva, m = PHYS_TO_VM_PAGE(PTE_TO_PHYS(oldl2)); va < eva; va += PAGE_SIZE, m++) { if ((oldl2 & PTE_D) != 0) vm_page_dirty(m); if ((oldl2 & PTE_A) != 0) vm_page_aflag_set(m, PGA_REFERENCED); if (TAILQ_EMPTY(&m->md.pv_list) && TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } } if (pmap == kernel_pmap) { pmap_remove_kernel_l2(pmap, l2, sva); } else { ml3 = pmap_remove_pt_page(pmap, sva); if (ml3 != NULL) { KASSERT(ml3->valid == VM_PAGE_BITS_ALL, ("pmap_remove_l2: l3 page not promoted")); pmap_resident_count_dec(pmap, 1); KASSERT(ml3->ref_count == Ln_ENTRIES, ("pmap_remove_l2: l3 page ref count error")); ml3->ref_count = 1; vm_page_unwire_noq(ml3); pmap_add_delayed_free_list(ml3, free, FALSE); } } return (pmap_unuse_pt(pmap, sva, l1e, free)); } /* * pmap_remove_l3: do the things to unmap a page in a process */ static int pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t va, pd_entry_t l2e, struct spglist *free, struct rwlock **lockp) { struct md_page *pvh; pt_entry_t old_l3; vm_paddr_t phys; vm_page_t m; PMAP_LOCK_ASSERT(pmap, MA_OWNED); old_l3 = pmap_load_clear(l3); pmap_invalidate_page(pmap, va); if (old_l3 & PTE_SW_WIRED) pmap->pm_stats.wired_count -= 1; pmap_resident_count_dec(pmap, 1); if (old_l3 & PTE_SW_MANAGED) { phys = PTE_TO_PHYS(old_l3); m = PHYS_TO_VM_PAGE(phys); if ((old_l3 & PTE_D) != 0) vm_page_dirty(m); if (old_l3 & PTE_A) vm_page_aflag_set(m, PGA_REFERENCED); CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); pmap_pvh_free(&m->md, pmap, va); if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); if (TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } } return (pmap_unuse_pt(pmap, va, l2e, free)); } /* * Remove the given range of addresses from the specified map. * * It is assumed that the start and end are properly * rounded to the page size. */ void pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { struct spglist free; struct rwlock *lock; vm_offset_t va, va_next; pd_entry_t *l1, *l2, l2e; pt_entry_t *l3; /* * Perform an unsynchronized read. This is, however, safe. */ if (pmap->pm_stats.resident_count == 0) return; SLIST_INIT(&free); rw_rlock(&pvh_global_lock); PMAP_LOCK(pmap); lock = NULL; for (; sva < eva; sva = va_next) { if (pmap->pm_stats.resident_count == 0) break; l1 = pmap_l1(pmap, sva); if (pmap_load(l1) == 0) { va_next = (sva + L1_SIZE) & ~L1_OFFSET; if (va_next < sva) va_next = eva; continue; } /* * Calculate index for next page table. */ va_next = (sva + L2_SIZE) & ~L2_OFFSET; if (va_next < sva) va_next = eva; l2 = pmap_l1_to_l2(l1, sva); if (l2 == NULL) continue; if ((l2e = pmap_load(l2)) == 0) continue; if ((l2e & PTE_RWX) != 0) { if (sva + L2_SIZE == va_next && eva >= va_next) { (void)pmap_remove_l2(pmap, l2, sva, pmap_load(l1), &free, &lock); continue; } else if (!pmap_demote_l2_locked(pmap, l2, sva, &lock)) { /* * The large page mapping was destroyed. */ continue; } l2e = pmap_load(l2); } /* * Limit our scan to either the end of the va represented * by the current page table page, or to the end of the * range being removed. */ if (va_next > eva) va_next = eva; va = va_next; for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++, sva += L3_SIZE) { if (pmap_load(l3) == 0) { if (va != va_next) { pmap_invalidate_range(pmap, va, sva); va = va_next; } continue; } if (va == va_next) va = sva; if (pmap_remove_l3(pmap, l3, sva, l2e, &free, &lock)) { sva += L3_SIZE; break; } } if (va != va_next) pmap_invalidate_range(pmap, va, sva); } if (lock != NULL) rw_wunlock(lock); rw_runlock(&pvh_global_lock); PMAP_UNLOCK(pmap); vm_page_free_pages_toq(&free, false); } /* * Routine: pmap_remove_all * Function: * Removes this physical page from * all physical maps in which it resides. * Reflects back modify bits to the pager. * * Notes: * Original versions of this routine were very * inefficient because they iteratively called * pmap_remove (slow...) */ void pmap_remove_all(vm_page_t m) { struct spglist free; struct md_page *pvh; pmap_t pmap; pt_entry_t *l3, l3e; pd_entry_t *l2, l2e; pv_entry_t pv; vm_offset_t va; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_remove_all: page %p is not managed", m)); SLIST_INIT(&free); pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(VM_PAGE_TO_PHYS(m)); rw_wlock(&pvh_global_lock); while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); va = pv->pv_va; l2 = pmap_l2(pmap, va); (void)pmap_demote_l2(pmap, l2, va); PMAP_UNLOCK(pmap); } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pmap_resident_count_dec(pmap, 1); l2 = pmap_l2(pmap, pv->pv_va); KASSERT(l2 != NULL, ("pmap_remove_all: no l2 table found")); l2e = pmap_load(l2); KASSERT((l2e & PTE_RX) == 0, ("pmap_remove_all: found a superpage in %p's pv list", m)); l3 = pmap_l2_to_l3(l2, pv->pv_va); l3e = pmap_load_clear(l3); pmap_invalidate_page(pmap, pv->pv_va); if (l3e & PTE_SW_WIRED) pmap->pm_stats.wired_count--; if ((l3e & PTE_A) != 0) vm_page_aflag_set(m, PGA_REFERENCED); /* * Update the vm_page_t clean and reference bits. */ if ((l3e & PTE_D) != 0) vm_page_dirty(m); pmap_unuse_pt(pmap, pv->pv_va, pmap_load(l2), &free); TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; free_pv_entry(pmap, pv); PMAP_UNLOCK(pmap); } vm_page_aflag_clear(m, PGA_WRITEABLE); rw_wunlock(&pvh_global_lock); vm_page_free_pages_toq(&free, false); } /* * Set the physical protection on the * specified range of this map as requested. */ void pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) { pd_entry_t *l1, *l2, l2e; pt_entry_t *l3, l3e, mask; vm_page_t m, mt; vm_paddr_t pa; vm_offset_t va_next; bool anychanged, pv_lists_locked; if ((prot & VM_PROT_READ) == VM_PROT_NONE) { pmap_remove(pmap, sva, eva); return; } if ((prot & (VM_PROT_WRITE | VM_PROT_EXECUTE)) == (VM_PROT_WRITE | VM_PROT_EXECUTE)) return; anychanged = false; pv_lists_locked = false; mask = 0; if ((prot & VM_PROT_WRITE) == 0) mask |= PTE_W | PTE_D; if ((prot & VM_PROT_EXECUTE) == 0) mask |= PTE_X; resume: PMAP_LOCK(pmap); for (; sva < eva; sva = va_next) { l1 = pmap_l1(pmap, sva); if (pmap_load(l1) == 0) { va_next = (sva + L1_SIZE) & ~L1_OFFSET; if (va_next < sva) va_next = eva; continue; } va_next = (sva + L2_SIZE) & ~L2_OFFSET; if (va_next < sva) va_next = eva; l2 = pmap_l1_to_l2(l1, sva); if (l2 == NULL || (l2e = pmap_load(l2)) == 0) continue; if ((l2e & PTE_RWX) != 0) { if (sva + L2_SIZE == va_next && eva >= va_next) { retryl2: if ((prot & VM_PROT_WRITE) == 0 && (l2e & (PTE_SW_MANAGED | PTE_D)) == (PTE_SW_MANAGED | PTE_D)) { pa = PTE_TO_PHYS(l2e); m = PHYS_TO_VM_PAGE(pa); for (mt = m; mt < &m[Ln_ENTRIES]; mt++) vm_page_dirty(mt); } if (!atomic_fcmpset_long(l2, &l2e, l2e & ~mask)) goto retryl2; anychanged = true; continue; } else { if (!pv_lists_locked) { pv_lists_locked = true; if (!rw_try_rlock(&pvh_global_lock)) { if (anychanged) pmap_invalidate_all( pmap); PMAP_UNLOCK(pmap); rw_rlock(&pvh_global_lock); goto resume; } } if (!pmap_demote_l2(pmap, l2, sva)) { /* * The large page mapping was destroyed. */ continue; } } } if (va_next > eva) va_next = eva; for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++, sva += L3_SIZE) { l3e = pmap_load(l3); retryl3: if ((l3e & PTE_V) == 0) continue; if ((prot & VM_PROT_WRITE) == 0 && (l3e & (PTE_SW_MANAGED | PTE_D)) == (PTE_SW_MANAGED | PTE_D)) { m = PHYS_TO_VM_PAGE(PTE_TO_PHYS(l3e)); vm_page_dirty(m); } if (!atomic_fcmpset_long(l3, &l3e, l3e & ~mask)) goto retryl3; anychanged = true; } } if (anychanged) pmap_invalidate_all(pmap); if (pv_lists_locked) rw_runlock(&pvh_global_lock); PMAP_UNLOCK(pmap); } int pmap_fault_fixup(pmap_t pmap, vm_offset_t va, vm_prot_t ftype) { pd_entry_t *l2, l2e; pt_entry_t bits, *pte, oldpte; int rv; rv = 0; PMAP_LOCK(pmap); l2 = pmap_l2(pmap, va); if (l2 == NULL || ((l2e = pmap_load(l2)) & PTE_V) == 0) goto done; if ((l2e & PTE_RWX) == 0) { pte = pmap_l2_to_l3(l2, va); if (pte == NULL || ((oldpte = pmap_load(pte)) & PTE_V) == 0) goto done; } else { pte = l2; oldpte = l2e; } if ((pmap != kernel_pmap && (oldpte & PTE_U) == 0) || (ftype == VM_PROT_WRITE && (oldpte & PTE_W) == 0) || (ftype == VM_PROT_EXECUTE && (oldpte & PTE_X) == 0) || (ftype == VM_PROT_READ && (oldpte & PTE_R) == 0)) goto done; bits = PTE_A; if (ftype == VM_PROT_WRITE) bits |= PTE_D; /* * Spurious faults can occur if the implementation caches invalid * entries in the TLB, or if simultaneous accesses on multiple CPUs * race with each other. */ if ((oldpte & bits) != bits) pmap_store_bits(pte, bits); sfence_vma(); rv = 1; done: PMAP_UNLOCK(pmap); return (rv); } static bool pmap_demote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va) { struct rwlock *lock; bool rv; lock = NULL; rv = pmap_demote_l2_locked(pmap, l2, va, &lock); if (lock != NULL) rw_wunlock(lock); return (rv); } /* * Tries to demote a 2MB page mapping. If demotion fails, the 2MB page * mapping is invalidated. */ static bool pmap_demote_l2_locked(pmap_t pmap, pd_entry_t *l2, vm_offset_t va, struct rwlock **lockp) { struct spglist free; vm_page_t mpte; pd_entry_t newl2, oldl2; pt_entry_t *firstl3, newl3; vm_paddr_t mptepa; int i; PMAP_LOCK_ASSERT(pmap, MA_OWNED); oldl2 = pmap_load(l2); KASSERT((oldl2 & PTE_RWX) != 0, ("pmap_demote_l2_locked: oldl2 is not a leaf entry")); if ((oldl2 & PTE_A) == 0 || (mpte = pmap_remove_pt_page(pmap, va)) == NULL) { if ((oldl2 & PTE_A) == 0 || (mpte = vm_page_alloc(NULL, pmap_l2_pindex(va), (VIRT_IN_DMAP(va) ? VM_ALLOC_INTERRUPT : VM_ALLOC_NORMAL) | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) { SLIST_INIT(&free); (void)pmap_remove_l2(pmap, l2, va & ~L2_OFFSET, pmap_load(pmap_l1(pmap, va)), &free, lockp); vm_page_free_pages_toq(&free, true); CTR2(KTR_PMAP, "pmap_demote_l2_locked: " "failure for va %#lx in pmap %p", va, pmap); return (false); } if (va < VM_MAXUSER_ADDRESS) { mpte->ref_count = Ln_ENTRIES; pmap_resident_count_inc(pmap, 1); } } mptepa = VM_PAGE_TO_PHYS(mpte); firstl3 = (pt_entry_t *)PHYS_TO_DMAP(mptepa); newl2 = ((mptepa / PAGE_SIZE) << PTE_PPN0_S) | PTE_V; KASSERT((oldl2 & PTE_A) != 0, ("pmap_demote_l2_locked: oldl2 is missing PTE_A")); KASSERT((oldl2 & (PTE_D | PTE_W)) != PTE_W, ("pmap_demote_l2_locked: oldl2 is missing PTE_D")); newl3 = oldl2; /* * If the page table page is not leftover from an earlier promotion, * initialize it. */ if (mpte->valid == 0) { for (i = 0; i < Ln_ENTRIES; i++) pmap_store(firstl3 + i, newl3 + (i << PTE_PPN0_S)); } KASSERT(PTE_TO_PHYS(pmap_load(firstl3)) == PTE_TO_PHYS(newl3), ("pmap_demote_l2_locked: firstl3 and newl3 map different physical " "addresses")); /* * If the mapping has changed attributes, update the page table * entries. */ if ((pmap_load(firstl3) & PTE_PROMOTE) != (newl3 & PTE_PROMOTE)) for (i = 0; i < Ln_ENTRIES; i++) pmap_store(firstl3 + i, newl3 + (i << PTE_PPN0_S)); /* * The spare PV entries must be reserved prior to demoting the * mapping, that is, prior to changing the L2 entry. Otherwise, the * state of the L2 entry and the PV lists will be inconsistent, which * can result in reclaim_pv_chunk() attempting to remove a PV entry from * the wrong PV list and pmap_pv_demote_l2() failing to find the * expected PV entry for the 2MB page mapping that is being demoted. */ if ((oldl2 & PTE_SW_MANAGED) != 0) reserve_pv_entries(pmap, Ln_ENTRIES - 1, lockp); /* * Demote the mapping. */ pmap_store(l2, newl2); /* * Demote the PV entry. */ if ((oldl2 & PTE_SW_MANAGED) != 0) pmap_pv_demote_l2(pmap, va, PTE_TO_PHYS(oldl2), lockp); atomic_add_long(&pmap_l2_demotions, 1); CTR2(KTR_PMAP, "pmap_demote_l2_locked: success for va %#lx in pmap %p", va, pmap); return (true); } #if VM_NRESERVLEVEL > 0 static void pmap_promote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va, struct rwlock **lockp) { pt_entry_t *firstl3, *l3; vm_paddr_t pa; vm_page_t ml3; PMAP_LOCK_ASSERT(pmap, MA_OWNED); va &= ~L2_OFFSET; KASSERT((pmap_load(l2) & PTE_RWX) == 0, ("pmap_promote_l2: invalid l2 entry %p", l2)); firstl3 = (pt_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(pmap_load(l2))); pa = PTE_TO_PHYS(pmap_load(firstl3)); if ((pa & L2_OFFSET) != 0) { CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx pmap %p", va, pmap); atomic_add_long(&pmap_l2_p_failures, 1); return; } pa += PAGE_SIZE; for (l3 = firstl3 + 1; l3 < firstl3 + Ln_ENTRIES; l3++) { if (PTE_TO_PHYS(pmap_load(l3)) != pa) { CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx pmap %p", va, pmap); atomic_add_long(&pmap_l2_p_failures, 1); return; } if ((pmap_load(l3) & PTE_PROMOTE) != (pmap_load(firstl3) & PTE_PROMOTE)) { CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx pmap %p", va, pmap); atomic_add_long(&pmap_l2_p_failures, 1); return; } pa += PAGE_SIZE; } ml3 = PHYS_TO_VM_PAGE(PTE_TO_PHYS(pmap_load(l2))); KASSERT(ml3->pindex == pmap_l2_pindex(va), ("pmap_promote_l2: page table page's pindex is wrong")); if (pmap_insert_pt_page(pmap, ml3, true)) { CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx pmap %p", va, pmap); atomic_add_long(&pmap_l2_p_failures, 1); return; } if ((pmap_load(firstl3) & PTE_SW_MANAGED) != 0) pmap_pv_promote_l2(pmap, va, PTE_TO_PHYS(pmap_load(firstl3)), lockp); pmap_store(l2, pmap_load(firstl3)); atomic_add_long(&pmap_l2_promotions, 1); CTR2(KTR_PMAP, "pmap_promote_l2: success for va %#lx in pmap %p", va, pmap); } #endif /* * Insert the given physical page (p) at * the specified virtual address (v) in the * target physical map with the protection requested. * * If specified, the page will be wired down, meaning * that the related pte can not be reclaimed. * * NB: This is the only routine which MAY NOT lazy-evaluate * or lose information. That is, this routine must actually * insert this page into the given map NOW. */ int pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, u_int flags, int8_t psind) { struct rwlock *lock; pd_entry_t *l1, *l2, l2e; pt_entry_t new_l3, orig_l3; pt_entry_t *l3; pv_entry_t pv; vm_paddr_t opa, pa, l2_pa, l3_pa; vm_page_t mpte, om, l2_m, l3_m; pt_entry_t entry; pn_t l2_pn, l3_pn, pn; int rv; bool nosleep; va = trunc_page(va); if ((m->oflags & VPO_UNMANAGED) == 0) VM_PAGE_OBJECT_BUSY_ASSERT(m); pa = VM_PAGE_TO_PHYS(m); pn = (pa / PAGE_SIZE); new_l3 = PTE_V | PTE_R | PTE_A; if (prot & VM_PROT_EXECUTE) new_l3 |= PTE_X; if (flags & VM_PROT_WRITE) new_l3 |= PTE_D; if (prot & VM_PROT_WRITE) new_l3 |= PTE_W; if (va < VM_MAX_USER_ADDRESS) new_l3 |= PTE_U; new_l3 |= (pn << PTE_PPN0_S); if ((flags & PMAP_ENTER_WIRED) != 0) new_l3 |= PTE_SW_WIRED; /* * Set modified bit gratuitously for writeable mappings if * the page is unmanaged. We do not want to take a fault * to do the dirty bit accounting for these mappings. */ if ((m->oflags & VPO_UNMANAGED) != 0) { if (prot & VM_PROT_WRITE) new_l3 |= PTE_D; } else new_l3 |= PTE_SW_MANAGED; CTR2(KTR_PMAP, "pmap_enter: %.16lx -> %.16lx", va, pa); lock = NULL; mpte = NULL; rw_rlock(&pvh_global_lock); PMAP_LOCK(pmap); if (psind == 1) { /* Assert the required virtual and physical alignment. */ KASSERT((va & L2_OFFSET) == 0, ("pmap_enter: va %#lx unaligned", va)); KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind")); rv = pmap_enter_l2(pmap, va, new_l3, flags, m, &lock); goto out; } l2 = pmap_l2(pmap, va); if (l2 != NULL && ((l2e = pmap_load(l2)) & PTE_V) != 0 && ((l2e & PTE_RWX) == 0 || pmap_demote_l2_locked(pmap, l2, va, &lock))) { l3 = pmap_l2_to_l3(l2, va); if (va < VM_MAXUSER_ADDRESS) { mpte = PHYS_TO_VM_PAGE(PTE_TO_PHYS(pmap_load(l2))); mpte->ref_count++; } } else if (va < VM_MAXUSER_ADDRESS) { nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0; mpte = pmap_alloc_l3(pmap, va, nosleep ? NULL : &lock); if (mpte == NULL && nosleep) { CTR0(KTR_PMAP, "pmap_enter: mpte == NULL"); if (lock != NULL) rw_wunlock(lock); rw_runlock(&pvh_global_lock); PMAP_UNLOCK(pmap); return (KERN_RESOURCE_SHORTAGE); } l3 = pmap_l3(pmap, va); } else { l3 = pmap_l3(pmap, va); /* TODO: This is not optimal, but should mostly work */ if (l3 == NULL) { if (l2 == NULL) { l2_m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); if (l2_m == NULL) panic("pmap_enter: l2 pte_m == NULL"); if ((l2_m->flags & PG_ZERO) == 0) pmap_zero_page(l2_m); l2_pa = VM_PAGE_TO_PHYS(l2_m); l2_pn = (l2_pa / PAGE_SIZE); l1 = pmap_l1(pmap, va); entry = (PTE_V); entry |= (l2_pn << PTE_PPN0_S); pmap_store(l1, entry); pmap_distribute_l1(pmap, pmap_l1_index(va), entry); l2 = pmap_l1_to_l2(l1, va); } l3_m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); if (l3_m == NULL) panic("pmap_enter: l3 pte_m == NULL"); if ((l3_m->flags & PG_ZERO) == 0) pmap_zero_page(l3_m); l3_pa = VM_PAGE_TO_PHYS(l3_m); l3_pn = (l3_pa / PAGE_SIZE); entry = (PTE_V); entry |= (l3_pn << PTE_PPN0_S); pmap_store(l2, entry); l3 = pmap_l2_to_l3(l2, va); } pmap_invalidate_page(pmap, va); } orig_l3 = pmap_load(l3); opa = PTE_TO_PHYS(orig_l3); pv = NULL; /* * Is the specified virtual address already mapped? */ if ((orig_l3 & PTE_V) != 0) { /* * Wiring change, just update stats. We don't worry about * wiring PT pages as they remain resident as long as there * are valid mappings in them. Hence, if a user page is wired, * the PT page will be also. */ if ((flags & PMAP_ENTER_WIRED) != 0 && (orig_l3 & PTE_SW_WIRED) == 0) pmap->pm_stats.wired_count++; else if ((flags & PMAP_ENTER_WIRED) == 0 && (orig_l3 & PTE_SW_WIRED) != 0) pmap->pm_stats.wired_count--; /* * Remove the extra PT page reference. */ if (mpte != NULL) { mpte->ref_count--; KASSERT(mpte->ref_count > 0, ("pmap_enter: missing reference to page table page," " va: 0x%lx", va)); } /* * Has the physical page changed? */ if (opa == pa) { /* * No, might be a protection or wiring change. */ if ((orig_l3 & PTE_SW_MANAGED) != 0 && (new_l3 & PTE_W) != 0) vm_page_aflag_set(m, PGA_WRITEABLE); goto validate; } /* * The physical page has changed. Temporarily invalidate * the mapping. This ensures that all threads sharing the * pmap keep a consistent view of the mapping, which is * necessary for the correct handling of COW faults. It * also permits reuse of the old mapping's PV entry, * avoiding an allocation. * * For consistency, handle unmanaged mappings the same way. */ orig_l3 = pmap_load_clear(l3); KASSERT(PTE_TO_PHYS(orig_l3) == opa, ("pmap_enter: unexpected pa update for %#lx", va)); if ((orig_l3 & PTE_SW_MANAGED) != 0) { om = PHYS_TO_VM_PAGE(opa); /* * The pmap lock is sufficient to synchronize with * concurrent calls to pmap_page_test_mappings() and * pmap_ts_referenced(). */ if ((orig_l3 & PTE_D) != 0) vm_page_dirty(om); if ((orig_l3 & PTE_A) != 0) vm_page_aflag_set(om, PGA_REFERENCED); CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa); pv = pmap_pvh_remove(&om->md, pmap, va); KASSERT(pv != NULL, ("pmap_enter: no PV entry for %#lx", va)); if ((new_l3 & PTE_SW_MANAGED) == 0) free_pv_entry(pmap, pv); if ((om->a.flags & PGA_WRITEABLE) != 0 && TAILQ_EMPTY(&om->md.pv_list) && ((om->flags & PG_FICTITIOUS) != 0 || TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list))) vm_page_aflag_clear(om, PGA_WRITEABLE); } pmap_invalidate_page(pmap, va); orig_l3 = 0; } else { /* * Increment the counters. */ if ((new_l3 & PTE_SW_WIRED) != 0) pmap->pm_stats.wired_count++; pmap_resident_count_inc(pmap, 1); } /* * Enter on the PV list if part of our managed memory. */ if ((new_l3 & PTE_SW_MANAGED) != 0) { if (pv == NULL) { pv = get_pv_entry(pmap, &lock); pv->pv_va = va; } CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; if ((new_l3 & PTE_W) != 0) vm_page_aflag_set(m, PGA_WRITEABLE); } validate: /* * Sync the i-cache on all harts before updating the PTE * if the new PTE is executable. */ if (prot & VM_PROT_EXECUTE) pmap_sync_icache(pmap, va, PAGE_SIZE); /* * Update the L3 entry. */ if (orig_l3 != 0) { orig_l3 = pmap_load_store(l3, new_l3); pmap_invalidate_page(pmap, va); KASSERT(PTE_TO_PHYS(orig_l3) == pa, ("pmap_enter: invalid update")); if ((orig_l3 & (PTE_D | PTE_SW_MANAGED)) == (PTE_D | PTE_SW_MANAGED)) vm_page_dirty(m); } else { pmap_store(l3, new_l3); } #if VM_NRESERVLEVEL > 0 if (mpte != NULL && mpte->ref_count == Ln_ENTRIES && pmap_ps_enabled(pmap) && (m->flags & PG_FICTITIOUS) == 0 && vm_reserv_level_iffullpop(m) == 0) pmap_promote_l2(pmap, l2, va, &lock); #endif rv = KERN_SUCCESS; out: if (lock != NULL) rw_wunlock(lock); rw_runlock(&pvh_global_lock); PMAP_UNLOCK(pmap); return (rv); } /* * Tries to create a read- and/or execute-only 2MB page mapping. Returns true * if successful. Returns false if (1) a page table page cannot be allocated * without sleeping, (2) a mapping already exists at the specified virtual * address, or (3) a PV entry cannot be allocated without reclaiming another * PV entry. */ static bool pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, struct rwlock **lockp) { pd_entry_t new_l2; pn_t pn; PMAP_LOCK_ASSERT(pmap, MA_OWNED); pn = VM_PAGE_TO_PHYS(m) / PAGE_SIZE; new_l2 = (pd_entry_t)((pn << PTE_PPN0_S) | PTE_R | PTE_V); if ((m->oflags & VPO_UNMANAGED) == 0) new_l2 |= PTE_SW_MANAGED; if ((prot & VM_PROT_EXECUTE) != 0) new_l2 |= PTE_X; if (va < VM_MAXUSER_ADDRESS) new_l2 |= PTE_U; return (pmap_enter_l2(pmap, va, new_l2, PMAP_ENTER_NOSLEEP | PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, NULL, lockp) == KERN_SUCCESS); } /* * Tries to create the specified 2MB page mapping. Returns KERN_SUCCESS if * the mapping was created, and either KERN_FAILURE or KERN_RESOURCE_SHORTAGE * otherwise. Returns KERN_FAILURE if PMAP_ENTER_NOREPLACE was specified and * a mapping already exists at the specified virtual address. Returns * KERN_RESOURCE_SHORTAGE if PMAP_ENTER_NOSLEEP was specified and a page table * page allocation failed. Returns KERN_RESOURCE_SHORTAGE if * PMAP_ENTER_NORECLAIM was specified and a PV entry allocation failed. * * The parameter "m" is only used when creating a managed, writeable mapping. */ static int pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2, u_int flags, vm_page_t m, struct rwlock **lockp) { struct spglist free; pd_entry_t *l2, *l3, oldl2; vm_offset_t sva; vm_page_t l2pg, mt; PMAP_LOCK_ASSERT(pmap, MA_OWNED); if ((l2pg = pmap_alloc_l2(pmap, va, (flags & PMAP_ENTER_NOSLEEP) != 0 ? NULL : lockp)) == NULL) { CTR2(KTR_PMAP, "pmap_enter_l2: failure for va %#lx in pmap %p", va, pmap); return (KERN_RESOURCE_SHORTAGE); } l2 = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(l2pg)); l2 = &l2[pmap_l2_index(va)]; if ((oldl2 = pmap_load(l2)) != 0) { KASSERT(l2pg->ref_count > 1, ("pmap_enter_l2: l2pg's ref count is too low")); if ((flags & PMAP_ENTER_NOREPLACE) != 0) { l2pg->ref_count--; CTR2(KTR_PMAP, "pmap_enter_l2: failure for va %#lx in pmap %p", va, pmap); return (KERN_FAILURE); } SLIST_INIT(&free); if ((oldl2 & PTE_RWX) != 0) (void)pmap_remove_l2(pmap, l2, va, pmap_load(pmap_l1(pmap, va)), &free, lockp); else for (sva = va; sva < va + L2_SIZE; sva += PAGE_SIZE) { l3 = pmap_l2_to_l3(l2, sva); if ((pmap_load(l3) & PTE_V) != 0 && pmap_remove_l3(pmap, l3, sva, oldl2, &free, lockp) != 0) break; } vm_page_free_pages_toq(&free, true); if (va >= VM_MAXUSER_ADDRESS) { /* * Both pmap_remove_l2() and pmap_remove_l3() will * leave the kernel page table page zero filled. */ mt = PHYS_TO_VM_PAGE(PTE_TO_PHYS(pmap_load(l2))); if (pmap_insert_pt_page(pmap, mt, false)) panic("pmap_enter_l2: trie insert failed"); } else KASSERT(pmap_load(l2) == 0, ("pmap_enter_l2: non-zero L2 entry %p", l2)); } if ((new_l2 & PTE_SW_MANAGED) != 0) { /* * Abort this mapping if its PV entry could not be created. */ if (!pmap_pv_insert_l2(pmap, va, new_l2, flags, lockp)) { SLIST_INIT(&free); if (pmap_unwire_ptp(pmap, va, l2pg, &free)) { /* * Although "va" is not mapped, paging-structure * caches could nonetheless have entries that * refer to the freed page table pages. * Invalidate those entries. */ pmap_invalidate_page(pmap, va); vm_page_free_pages_toq(&free, true); } CTR2(KTR_PMAP, "pmap_enter_l2: failure for va %#lx in pmap %p", va, pmap); return (KERN_RESOURCE_SHORTAGE); } if ((new_l2 & PTE_W) != 0) for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) vm_page_aflag_set(mt, PGA_WRITEABLE); } /* * Increment counters. */ if ((new_l2 & PTE_SW_WIRED) != 0) pmap->pm_stats.wired_count += L2_SIZE / PAGE_SIZE; pmap->pm_stats.resident_count += L2_SIZE / PAGE_SIZE; /* * Map the superpage. */ pmap_store(l2, new_l2); atomic_add_long(&pmap_l2_mappings, 1); CTR2(KTR_PMAP, "pmap_enter_l2: success for va %#lx in pmap %p", va, pmap); return (KERN_SUCCESS); } /* * Maps a sequence of resident pages belonging to the same object. * The sequence begins with the given page m_start. This page is * mapped at the given virtual address start. Each subsequent page is * mapped at a virtual address that is offset from start by the same * amount as the page is offset from m_start within the object. The * last page in the sequence is the page with the largest offset from * m_start that can be mapped at a virtual address less than the given * virtual address end. Not every virtual page between start and end * is mapped; only those for which a resident page exists with the * corresponding offset from m_start are mapped. */ void pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, vm_page_t m_start, vm_prot_t prot) { struct rwlock *lock; vm_offset_t va; vm_page_t m, mpte; vm_pindex_t diff, psize; VM_OBJECT_ASSERT_LOCKED(m_start->object); psize = atop(end - start); mpte = NULL; m = m_start; lock = NULL; rw_rlock(&pvh_global_lock); PMAP_LOCK(pmap); while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { va = start + ptoa(diff); if ((va & L2_OFFSET) == 0 && va + L2_SIZE <= end && m->psind == 1 && pmap_ps_enabled(pmap) && pmap_enter_2mpage(pmap, va, m, prot, &lock)) m = &m[L2_SIZE / PAGE_SIZE - 1]; else mpte = pmap_enter_quick_locked(pmap, va, m, prot, mpte, &lock); m = TAILQ_NEXT(m, listq); } if (lock != NULL) rw_wunlock(lock); rw_runlock(&pvh_global_lock); PMAP_UNLOCK(pmap); } /* * this code makes some *MAJOR* assumptions: * 1. Current pmap & pmap exists. * 2. Not wired. * 3. Read access. * 4. No page table pages. * but is *MUCH* faster than pmap_enter... */ void pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) { struct rwlock *lock; lock = NULL; rw_rlock(&pvh_global_lock); PMAP_LOCK(pmap); (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock); if (lock != NULL) rw_wunlock(lock); rw_runlock(&pvh_global_lock); PMAP_UNLOCK(pmap); } static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp) { struct spglist free; vm_paddr_t phys; pd_entry_t *l2; pt_entry_t *l3, newl3; KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva || (m->oflags & VPO_UNMANAGED) != 0, ("pmap_enter_quick_locked: managed mapping within the clean submap")); rw_assert(&pvh_global_lock, RA_LOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); CTR2(KTR_PMAP, "pmap_enter_quick_locked: %p %lx", pmap, va); /* * In the case that a page table page is not * resident, we are creating it here. */ if (va < VM_MAXUSER_ADDRESS) { vm_pindex_t l2pindex; /* * Calculate pagetable page index */ l2pindex = pmap_l2_pindex(va); if (mpte && (mpte->pindex == l2pindex)) { mpte->ref_count++; } else { /* * Get the l2 entry */ l2 = pmap_l2(pmap, va); /* * If the page table page is mapped, we just increment * the hold count, and activate it. Otherwise, we * attempt to allocate a page table page. If this * attempt fails, we don't retry. Instead, we give up. */ if (l2 != NULL && pmap_load(l2) != 0) { phys = PTE_TO_PHYS(pmap_load(l2)); mpte = PHYS_TO_VM_PAGE(phys); mpte->ref_count++; } else { /* * Pass NULL instead of the PV list lock * pointer, because we don't intend to sleep. */ mpte = _pmap_alloc_l3(pmap, l2pindex, NULL); if (mpte == NULL) return (mpte); } } l3 = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte)); l3 = &l3[pmap_l3_index(va)]; } else { mpte = NULL; l3 = pmap_l3(kernel_pmap, va); } if (l3 == NULL) panic("pmap_enter_quick_locked: No l3"); if (pmap_load(l3) != 0) { if (mpte != NULL) { mpte->ref_count--; mpte = NULL; } return (mpte); } /* * Enter on the PV list if part of our managed memory. */ if ((m->oflags & VPO_UNMANAGED) == 0 && !pmap_try_insert_pv_entry(pmap, va, m, lockp)) { if (mpte != NULL) { SLIST_INIT(&free); if (pmap_unwire_ptp(pmap, va, mpte, &free)) { pmap_invalidate_page(pmap, va); vm_page_free_pages_toq(&free, false); } mpte = NULL; } return (mpte); } /* * Increment counters */ pmap_resident_count_inc(pmap, 1); newl3 = ((VM_PAGE_TO_PHYS(m) / PAGE_SIZE) << PTE_PPN0_S) | PTE_V | PTE_R; if ((prot & VM_PROT_EXECUTE) != 0) newl3 |= PTE_X; if ((m->oflags & VPO_UNMANAGED) == 0) newl3 |= PTE_SW_MANAGED; if (va < VM_MAX_USER_ADDRESS) newl3 |= PTE_U; /* * Sync the i-cache on all harts before updating the PTE * if the new PTE is executable. */ if (prot & VM_PROT_EXECUTE) pmap_sync_icache(pmap, va, PAGE_SIZE); pmap_store(l3, newl3); pmap_invalidate_page(pmap, va); return (mpte); } /* * This code maps large physical mmap regions into the * processor address space. Note that some shortcuts * are taken, but the code works. */ void pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, vm_pindex_t pindex, vm_size_t size) { VM_OBJECT_ASSERT_WLOCKED(object); KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, ("pmap_object_init_pt: non-device object")); } /* * Clear the wired attribute from the mappings for the specified range of * addresses in the given pmap. Every valid mapping within that range * must have the wired attribute set. In contrast, invalid mappings * cannot have the wired attribute set, so they are ignored. * * The wired attribute of the page table entry is not a hardware feature, * so there is no need to invalidate any TLB entries. */ void pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t va_next; pd_entry_t *l1, *l2, l2e; pt_entry_t *l3, l3e; bool pv_lists_locked; pv_lists_locked = false; retry: PMAP_LOCK(pmap); for (; sva < eva; sva = va_next) { l1 = pmap_l1(pmap, sva); if (pmap_load(l1) == 0) { va_next = (sva + L1_SIZE) & ~L1_OFFSET; if (va_next < sva) va_next = eva; continue; } va_next = (sva + L2_SIZE) & ~L2_OFFSET; if (va_next < sva) va_next = eva; l2 = pmap_l1_to_l2(l1, sva); if ((l2e = pmap_load(l2)) == 0) continue; if ((l2e & PTE_RWX) != 0) { if (sva + L2_SIZE == va_next && eva >= va_next) { if ((l2e & PTE_SW_WIRED) == 0) panic("pmap_unwire: l2 %#jx is missing " "PTE_SW_WIRED", (uintmax_t)l2e); pmap_clear_bits(l2, PTE_SW_WIRED); continue; } else { if (!pv_lists_locked) { pv_lists_locked = true; if (!rw_try_rlock(&pvh_global_lock)) { PMAP_UNLOCK(pmap); rw_rlock(&pvh_global_lock); /* Repeat sva. */ goto retry; } } if (!pmap_demote_l2(pmap, l2, sva)) panic("pmap_unwire: demotion failed"); } } if (va_next > eva) va_next = eva; for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++, sva += L3_SIZE) { if ((l3e = pmap_load(l3)) == 0) continue; if ((l3e & PTE_SW_WIRED) == 0) panic("pmap_unwire: l3 %#jx is missing " "PTE_SW_WIRED", (uintmax_t)l3e); /* * PG_W must be cleared atomically. Although the pmap * lock synchronizes access to PG_W, another processor * could be setting PG_M and/or PG_A concurrently. */ pmap_clear_bits(l3, PTE_SW_WIRED); pmap->pm_stats.wired_count--; } } if (pv_lists_locked) rw_runlock(&pvh_global_lock); PMAP_UNLOCK(pmap); } /* * Copy the range specified by src_addr/len * from the source map to the range dst_addr/len * in the destination map. * * This routine is only advisory and need not do anything. */ void pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, vm_offset_t src_addr) { } /* * pmap_zero_page zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. */ void pmap_zero_page(vm_page_t m) { vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); pagezero((void *)va); } /* * pmap_zero_page_area zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. * * off and size may not cover an area beyond a single hardware page. */ void pmap_zero_page_area(vm_page_t m, int off, int size) { vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); if (off == 0 && size == PAGE_SIZE) pagezero((void *)va); else bzero((char *)va + off, size); } /* * pmap_copy_page copies the specified (machine independent) * page by mapping the page into virtual memory and using * bcopy to copy the page, one machine dependent page at a * time. */ void pmap_copy_page(vm_page_t msrc, vm_page_t mdst) { vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc)); vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst)); pagecopy((void *)src, (void *)dst); } int unmapped_buf_allowed = 1; void pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], vm_offset_t b_offset, int xfersize) { void *a_cp, *b_cp; vm_page_t m_a, m_b; vm_paddr_t p_a, p_b; vm_offset_t a_pg_offset, b_pg_offset; int cnt; while (xfersize > 0) { a_pg_offset = a_offset & PAGE_MASK; m_a = ma[a_offset >> PAGE_SHIFT]; p_a = m_a->phys_addr; b_pg_offset = b_offset & PAGE_MASK; m_b = mb[b_offset >> PAGE_SHIFT]; p_b = m_b->phys_addr; cnt = min(xfersize, PAGE_SIZE - a_pg_offset); cnt = min(cnt, PAGE_SIZE - b_pg_offset); if (__predict_false(!PHYS_IN_DMAP(p_a))) { panic("!DMAP a %lx", p_a); } else { a_cp = (char *)PHYS_TO_DMAP(p_a) + a_pg_offset; } if (__predict_false(!PHYS_IN_DMAP(p_b))) { panic("!DMAP b %lx", p_b); } else { b_cp = (char *)PHYS_TO_DMAP(p_b) + b_pg_offset; } bcopy(a_cp, b_cp, cnt); a_offset += cnt; b_offset += cnt; xfersize -= cnt; } } vm_offset_t pmap_quick_enter_page(vm_page_t m) { return (PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m))); } void pmap_quick_remove_page(vm_offset_t addr) { } /* * Returns true if the pmap's pv is one of the first * 16 pvs linked to from this page. This count may * be changed upwards or downwards in the future; it * is only necessary that true be returned for a small * subset of pmaps for proper page aging. */ boolean_t pmap_page_exists_quick(pmap_t pmap, vm_page_t m) { struct md_page *pvh; struct rwlock *lock; pv_entry_t pv; int loops = 0; boolean_t rv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_page_exists_quick: page %p is not managed", m)); rv = FALSE; rw_rlock(&pvh_global_lock); lock = VM_PAGE_TO_PV_LIST_LOCK(m); rw_rlock(lock); TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { if (PV_PMAP(pv) == pmap) { rv = TRUE; break; } loops++; if (loops >= 16) break; } if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { if (PV_PMAP(pv) == pmap) { rv = TRUE; break; } loops++; if (loops >= 16) break; } } rw_runlock(lock); rw_runlock(&pvh_global_lock); return (rv); } /* * pmap_page_wired_mappings: * * Return the number of managed mappings to the given physical page * that are wired. */ int pmap_page_wired_mappings(vm_page_t m) { struct md_page *pvh; struct rwlock *lock; pmap_t pmap; pd_entry_t *l2; pt_entry_t *l3; pv_entry_t pv; int count, md_gen, pvh_gen; if ((m->oflags & VPO_UNMANAGED) != 0) return (0); rw_rlock(&pvh_global_lock); lock = VM_PAGE_TO_PV_LIST_LOCK(m); rw_rlock(lock); restart: count = 0; TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; rw_runlock(lock); PMAP_LOCK(pmap); rw_rlock(lock); if (md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } l3 = pmap_l3(pmap, pv->pv_va); if ((pmap_load(l3) & PTE_SW_WIRED) != 0) count++; PMAP_UNLOCK(pmap); } if ((m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; pvh_gen = pvh->pv_gen; rw_runlock(lock); PMAP_LOCK(pmap); rw_rlock(lock); if (md_gen != m->md.pv_gen || pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } l2 = pmap_l2(pmap, pv->pv_va); if ((pmap_load(l2) & PTE_SW_WIRED) != 0) count++; PMAP_UNLOCK(pmap); } } rw_runlock(lock); rw_runlock(&pvh_global_lock); return (count); } /* * Returns true if the given page is mapped individually or as part of * a 2mpage. Otherwise, returns false. */ bool pmap_page_is_mapped(vm_page_t m) { struct rwlock *lock; bool rv; if ((m->oflags & VPO_UNMANAGED) != 0) return (false); lock = VM_PAGE_TO_PV_LIST_LOCK(m); rw_rlock(lock); rv = !TAILQ_EMPTY(&m->md.pv_list) || ((m->flags & PG_FICTITIOUS) == 0 && !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list)); rw_runlock(lock); return (rv); } static void pmap_remove_pages_pv(pmap_t pmap, vm_page_t m, pv_entry_t pv, struct spglist *free, bool superpage) { struct md_page *pvh; vm_page_t mpte, mt; if (superpage) { pmap_resident_count_dec(pmap, Ln_ENTRIES); pvh = pa_to_pvh(m->phys_addr); TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); pvh->pv_gen++; if (TAILQ_EMPTY(&pvh->pv_list)) { for (mt = m; mt < &m[Ln_ENTRIES]; mt++) if (TAILQ_EMPTY(&mt->md.pv_list) && (mt->a.flags & PGA_WRITEABLE) != 0) vm_page_aflag_clear(mt, PGA_WRITEABLE); } mpte = pmap_remove_pt_page(pmap, pv->pv_va); if (mpte != NULL) { KASSERT(mpte->valid == VM_PAGE_BITS_ALL, ("pmap_remove_pages: pte page not promoted")); pmap_resident_count_dec(pmap, 1); KASSERT(mpte->ref_count == Ln_ENTRIES, ("pmap_remove_pages: pte page ref count error")); mpte->ref_count = 0; pmap_add_delayed_free_list(mpte, free, FALSE); } } else { pmap_resident_count_dec(pmap, 1); TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; if (TAILQ_EMPTY(&m->md.pv_list) && (m->a.flags & PGA_WRITEABLE) != 0) { pvh = pa_to_pvh(m->phys_addr); if (TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } } } /* * Destroy all managed, non-wired mappings in the given user-space * pmap. This pmap cannot be active on any processor besides the * caller. * * This function cannot be applied to the kernel pmap. Moreover, it * is not intended for general use. It is only to be used during * process termination. Consequently, it can be implemented in ways * that make it faster than pmap_remove(). First, it can more quickly * destroy mappings by iterating over the pmap's collection of PV * entries, rather than searching the page table. Second, it doesn't * have to test and clear the page table entries atomically, because * no processor is currently accessing the user address space. In * particular, a page table entry's dirty bit won't change state once * this function starts. */ void pmap_remove_pages(pmap_t pmap) { struct spglist free; pd_entry_t ptepde; pt_entry_t *pte, tpte; vm_page_t m, mt; pv_entry_t pv; struct pv_chunk *pc, *npc; struct rwlock *lock; int64_t bit; uint64_t inuse, bitmask; int allfree, field, freed, idx; bool superpage; lock = NULL; SLIST_INIT(&free); rw_rlock(&pvh_global_lock); PMAP_LOCK(pmap); TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { allfree = 1; freed = 0; for (field = 0; field < _NPCM; field++) { inuse = ~pc->pc_map[field] & pc_freemask[field]; while (inuse != 0) { bit = ffsl(inuse) - 1; bitmask = 1UL << bit; idx = field * 64 + bit; pv = &pc->pc_pventry[idx]; inuse &= ~bitmask; pte = pmap_l1(pmap, pv->pv_va); ptepde = pmap_load(pte); pte = pmap_l1_to_l2(pte, pv->pv_va); tpte = pmap_load(pte); if ((tpte & PTE_RWX) != 0) { superpage = true; } else { ptepde = tpte; pte = pmap_l2_to_l3(pte, pv->pv_va); tpte = pmap_load(pte); superpage = false; } /* * We cannot remove wired pages from a * process' mapping at this time. */ if (tpte & PTE_SW_WIRED) { allfree = 0; continue; } m = PHYS_TO_VM_PAGE(PTE_TO_PHYS(tpte)); KASSERT((m->flags & PG_FICTITIOUS) != 0 || m < &vm_page_array[vm_page_array_size], ("pmap_remove_pages: bad pte %#jx", (uintmax_t)tpte)); pmap_clear(pte); /* * Update the vm_page_t clean/reference bits. */ if ((tpte & (PTE_D | PTE_W)) == (PTE_D | PTE_W)) { if (superpage) for (mt = m; mt < &m[Ln_ENTRIES]; mt++) vm_page_dirty(mt); else vm_page_dirty(m); } CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m); /* Mark free */ pc->pc_map[field] |= bitmask; pmap_remove_pages_pv(pmap, m, pv, &free, superpage); pmap_unuse_pt(pmap, pv->pv_va, ptepde, &free); freed++; } } PV_STAT(atomic_add_long(&pv_entry_frees, freed)); PV_STAT(atomic_add_int(&pv_entry_spare, freed)); PV_STAT(atomic_subtract_long(&pv_entry_count, freed)); if (allfree) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); free_pv_chunk(pc); } } if (lock != NULL) rw_wunlock(lock); pmap_invalidate_all(pmap); rw_runlock(&pvh_global_lock); PMAP_UNLOCK(pmap); vm_page_free_pages_toq(&free, false); } static bool pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified) { struct md_page *pvh; struct rwlock *lock; pd_entry_t *l2; pt_entry_t *l3, mask; pv_entry_t pv; pmap_t pmap; int md_gen, pvh_gen; bool rv; mask = 0; if (modified) mask |= PTE_D; if (accessed) mask |= PTE_A; rv = FALSE; rw_rlock(&pvh_global_lock); lock = VM_PAGE_TO_PV_LIST_LOCK(m); rw_rlock(lock); restart: TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; rw_runlock(lock); PMAP_LOCK(pmap); rw_rlock(lock); if (md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } l3 = pmap_l3(pmap, pv->pv_va); rv = (pmap_load(l3) & mask) == mask; PMAP_UNLOCK(pmap); if (rv) goto out; } if ((m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; pvh_gen = pvh->pv_gen; rw_runlock(lock); PMAP_LOCK(pmap); rw_rlock(lock); if (md_gen != m->md.pv_gen || pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } l2 = pmap_l2(pmap, pv->pv_va); rv = (pmap_load(l2) & mask) == mask; PMAP_UNLOCK(pmap); if (rv) goto out; } } out: rw_runlock(lock); rw_runlock(&pvh_global_lock); return (rv); } /* * pmap_is_modified: * * Return whether or not the specified physical page was modified * in any physical maps. */ boolean_t pmap_is_modified(vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_is_modified: page %p is not managed", m)); /* * If the page is not busied then this check is racy. */ if (!pmap_page_is_write_mapped(m)) return (FALSE); return (pmap_page_test_mappings(m, FALSE, TRUE)); } /* * pmap_is_prefaultable: * * Return whether or not the specified virtual address is eligible * for prefault. */ boolean_t pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) { pt_entry_t *l3; boolean_t rv; rv = FALSE; PMAP_LOCK(pmap); l3 = pmap_l3(pmap, addr); if (l3 != NULL && pmap_load(l3) != 0) { rv = TRUE; } PMAP_UNLOCK(pmap); return (rv); } /* * pmap_is_referenced: * * Return whether or not the specified physical page was referenced * in any physical maps. */ boolean_t pmap_is_referenced(vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_is_referenced: page %p is not managed", m)); return (pmap_page_test_mappings(m, TRUE, FALSE)); } /* * Clear the write and modified bits in each of the given page's mappings. */ void pmap_remove_write(vm_page_t m) { struct md_page *pvh; struct rwlock *lock; pmap_t pmap; pd_entry_t *l2; pt_entry_t *l3, oldl3, newl3; pv_entry_t next_pv, pv; vm_offset_t va; int md_gen, pvh_gen; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_remove_write: page %p is not managed", m)); vm_page_assert_busied(m); if (!pmap_page_is_write_mapped(m)) return; lock = VM_PAGE_TO_PV_LIST_LOCK(m); pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(VM_PAGE_TO_PHYS(m)); rw_rlock(&pvh_global_lock); retry_pv_loop: rw_wlock(lock); TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); rw_wunlock(lock); goto retry_pv_loop; } } va = pv->pv_va; l2 = pmap_l2(pmap, va); if ((pmap_load(l2) & PTE_W) != 0) (void)pmap_demote_l2_locked(pmap, l2, va, &lock); KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), ("inconsistent pv lock %p %p for page %p", lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); PMAP_UNLOCK(pmap); } TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; md_gen = m->md.pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); rw_wunlock(lock); goto retry_pv_loop; } } l3 = pmap_l3(pmap, pv->pv_va); oldl3 = pmap_load(l3); retry: if ((oldl3 & PTE_W) != 0) { newl3 = oldl3 & ~(PTE_D | PTE_W); if (!atomic_fcmpset_long(l3, &oldl3, newl3)) goto retry; if ((oldl3 & PTE_D) != 0) vm_page_dirty(m); pmap_invalidate_page(pmap, pv->pv_va); } PMAP_UNLOCK(pmap); } rw_wunlock(lock); vm_page_aflag_clear(m, PGA_WRITEABLE); rw_runlock(&pvh_global_lock); } /* * pmap_ts_referenced: * * Return a count of reference bits for a page, clearing those bits. * It is not necessary for every reference bit to be cleared, but it * is necessary that 0 only be returned when there are truly no * reference bits set. * * As an optimization, update the page's dirty field if a modified bit is * found while counting reference bits. This opportunistic update can be * performed at low cost and can eliminate the need for some future calls * to pmap_is_modified(). However, since this function stops after * finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some * dirty pages. Those dirty pages will only be detected by a future call * to pmap_is_modified(). */ int pmap_ts_referenced(vm_page_t m) { struct spglist free; struct md_page *pvh; struct rwlock *lock; pv_entry_t pv, pvf; pmap_t pmap; pd_entry_t *l2, l2e; pt_entry_t *l3, l3e; vm_paddr_t pa; vm_offset_t va; int cleared, md_gen, not_cleared, pvh_gen; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_ts_referenced: page %p is not managed", m)); SLIST_INIT(&free); cleared = 0; pa = VM_PAGE_TO_PHYS(m); pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(pa); lock = PHYS_TO_PV_LIST_LOCK(pa); rw_rlock(&pvh_global_lock); rw_wlock(lock); retry: not_cleared = 0; if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL) goto small_mappings; pv = pvf; do { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); goto retry; } } va = pv->pv_va; l2 = pmap_l2(pmap, va); l2e = pmap_load(l2); if ((l2e & (PTE_W | PTE_D)) == (PTE_W | PTE_D)) { /* * Although l2e is mapping a 2MB page, because * this function is called at a 4KB page granularity, * we only update the 4KB page under test. */ vm_page_dirty(m); } if ((l2e & PTE_A) != 0) { /* * Since this reference bit is shared by 512 4KB * pages, it should not be cleared every time it is * tested. Apply a simple "hash" function on the * physical page number, the virtual superpage number, * and the pmap address to select one 4KB page out of * the 512 on which testing the reference bit will * result in clearing that reference bit. This * function is designed to avoid the selection of the * same 4KB page for every 2MB page mapping. * * On demotion, a mapping that hasn't been referenced * is simply destroyed. To avoid the possibility of a * subsequent page fault on a demoted wired mapping, * always leave its reference bit set. Moreover, * since the superpage is wired, the current state of * its reference bit won't affect page replacement. */ if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> L2_SHIFT) ^ (uintptr_t)pmap) & (Ln_ENTRIES - 1)) == 0 && (l2e & PTE_SW_WIRED) == 0) { pmap_clear_bits(l2, PTE_A); pmap_invalidate_page(pmap, va); cleared++; } else not_cleared++; } PMAP_UNLOCK(pmap); /* Rotate the PV list if it has more than one entry. */ if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) { TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); pvh->pv_gen++; } if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX) goto out; } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf); small_mappings: if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL) goto out; pv = pvf; do { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; md_gen = m->md.pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); goto retry; } } l2 = pmap_l2(pmap, pv->pv_va); KASSERT((pmap_load(l2) & PTE_RX) == 0, ("pmap_ts_referenced: found an invalid l2 table")); l3 = pmap_l2_to_l3(l2, pv->pv_va); l3e = pmap_load(l3); if ((l3e & PTE_D) != 0) vm_page_dirty(m); if ((l3e & PTE_A) != 0) { if ((l3e & PTE_SW_WIRED) == 0) { /* * Wired pages cannot be paged out so * doing accessed bit emulation for * them is wasted effort. We do the * hard work for unwired pages only. */ pmap_clear_bits(l3, PTE_A); pmap_invalidate_page(pmap, pv->pv_va); cleared++; } else not_cleared++; } PMAP_UNLOCK(pmap); /* Rotate the PV list if it has more than one entry. */ if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) { TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; } } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared + not_cleared < PMAP_TS_REFERENCED_MAX); out: rw_wunlock(lock); rw_runlock(&pvh_global_lock); vm_page_free_pages_toq(&free, false); return (cleared + not_cleared); } /* * Apply the given advice to the specified range of addresses within the * given pmap. Depending on the advice, clear the referenced and/or * modified flags in each mapping and set the mapped page's dirty field. */ void pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice) { } /* * Clear the modify bits on the specified physical page. */ void pmap_clear_modify(vm_page_t m) { struct md_page *pvh; struct rwlock *lock; pmap_t pmap; pv_entry_t next_pv, pv; pd_entry_t *l2, oldl2; pt_entry_t *l3; vm_offset_t va; int md_gen, pvh_gen; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_clear_modify: page %p is not managed", m)); vm_page_assert_busied(m); if (!pmap_page_is_write_mapped(m)) return; /* * If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set. * If the object containing the page is locked and the page is not * exclusive busied, then PGA_WRITEABLE cannot be concurrently set. */ if ((m->a.flags & PGA_WRITEABLE) == 0) return; pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(VM_PAGE_TO_PHYS(m)); lock = VM_PAGE_TO_PV_LIST_LOCK(m); rw_rlock(&pvh_global_lock); rw_wlock(lock); restart: TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } va = pv->pv_va; l2 = pmap_l2(pmap, va); oldl2 = pmap_load(l2); /* If oldl2 has PTE_W set, then it also has PTE_D set. */ if ((oldl2 & PTE_W) != 0 && pmap_demote_l2_locked(pmap, l2, va, &lock) && (oldl2 & PTE_SW_WIRED) == 0) { /* * Write protect the mapping to a single page so that * a subsequent write access may repromote. */ va += VM_PAGE_TO_PHYS(m) - PTE_TO_PHYS(oldl2); l3 = pmap_l2_to_l3(l2, va); pmap_clear_bits(l3, PTE_D | PTE_W); vm_page_dirty(m); pmap_invalidate_page(pmap, va); } PMAP_UNLOCK(pmap); } TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; pvh_gen = pvh->pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } l2 = pmap_l2(pmap, pv->pv_va); KASSERT((pmap_load(l2) & PTE_RWX) == 0, ("pmap_clear_modify: found a 2mpage in page %p's pv list", m)); l3 = pmap_l2_to_l3(l2, pv->pv_va); if ((pmap_load(l3) & (PTE_D | PTE_W)) == (PTE_D | PTE_W)) { pmap_clear_bits(l3, PTE_D | PTE_W); pmap_invalidate_page(pmap, pv->pv_va); } PMAP_UNLOCK(pmap); } rw_wunlock(lock); rw_runlock(&pvh_global_lock); } void * pmap_mapbios(vm_paddr_t pa, vm_size_t size) { return ((void *)PHYS_TO_DMAP(pa)); } void pmap_unmapbios(vm_paddr_t pa, vm_size_t size) { } /* * Sets the memory attribute for the specified page. */ void pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) { m->md.pv_memattr = ma; } /* * Perform the pmap work for mincore(2). If the page is not both referenced and * modified by this pmap, returns its physical address so that the caller can * find other mappings. */ int pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *pap) { pt_entry_t *l2, *l3, tpte; vm_paddr_t pa; int val; bool managed; PMAP_LOCK(pmap); l2 = pmap_l2(pmap, addr); if (l2 != NULL && ((tpte = pmap_load(l2)) & PTE_V) != 0) { if ((tpte & PTE_RWX) != 0) { pa = PTE_TO_PHYS(tpte) | (addr & L2_OFFSET); val = MINCORE_INCORE | MINCORE_PSIND(1); } else { l3 = pmap_l2_to_l3(l2, addr); tpte = pmap_load(l3); if ((tpte & PTE_V) == 0) { PMAP_UNLOCK(pmap); return (0); } pa = PTE_TO_PHYS(tpte) | (addr & L3_OFFSET); val = MINCORE_INCORE; } if ((tpte & PTE_D) != 0) val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; if ((tpte & PTE_A) != 0) val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; managed = (tpte & PTE_SW_MANAGED) == PTE_SW_MANAGED; } else { managed = false; val = 0; } if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && managed) { *pap = pa; } PMAP_UNLOCK(pmap); return (val); } void pmap_activate_sw(struct thread *td) { pmap_t oldpmap, pmap; u_int hart; oldpmap = PCPU_GET(curpmap); pmap = vmspace_pmap(td->td_proc->p_vmspace); if (pmap == oldpmap) return; load_satp(pmap->pm_satp); hart = PCPU_GET(hart); #ifdef SMP CPU_SET_ATOMIC(hart, &pmap->pm_active); CPU_CLR_ATOMIC(hart, &oldpmap->pm_active); #else CPU_SET(hart, &pmap->pm_active); CPU_CLR(hart, &oldpmap->pm_active); #endif PCPU_SET(curpmap, pmap); sfence_vma(); } void pmap_activate(struct thread *td) { critical_enter(); pmap_activate_sw(td); critical_exit(); } void pmap_activate_boot(pmap_t pmap) { u_int hart; hart = PCPU_GET(hart); #ifdef SMP CPU_SET_ATOMIC(hart, &pmap->pm_active); #else CPU_SET(hart, &pmap->pm_active); #endif PCPU_SET(curpmap, pmap); } void pmap_sync_icache(pmap_t pmap, vm_offset_t va, vm_size_t sz) { cpuset_t mask; /* * From the RISC-V User-Level ISA V2.2: * * "To make a store to instruction memory visible to all * RISC-V harts, the writing hart has to execute a data FENCE * before requesting that all remote RISC-V harts execute a * FENCE.I." * * However, this is slightly misleading; we still need to * perform a FENCE.I for the local hart, as FENCE does nothing * for its icache. FENCE.I alone is also sufficient for the * local hart. */ sched_pin(); mask = all_harts; CPU_CLR(PCPU_GET(hart), &mask); fence_i(); if (!CPU_EMPTY(&mask) && smp_started) { fence(); sbi_remote_fence_i(mask.__bits); } sched_unpin(); } /* * Increase the starting virtual address of the given mapping if a * different alignment might result in more superpage mappings. */ void pmap_align_superpage(vm_object_t object, vm_ooffset_t offset, vm_offset_t *addr, vm_size_t size) { vm_offset_t superpage_offset; if (size < L2_SIZE) return; if (object != NULL && (object->flags & OBJ_COLORED) != 0) offset += ptoa(object->pg_color); superpage_offset = offset & L2_OFFSET; if (size - ((L2_SIZE - superpage_offset) & L2_OFFSET) < L2_SIZE || (*addr & L2_OFFSET) == superpage_offset) return; if ((*addr & L2_OFFSET) < superpage_offset) *addr = (*addr & ~L2_OFFSET) + superpage_offset; else *addr = ((*addr + L2_OFFSET) & ~L2_OFFSET) + superpage_offset; } /** * Get the kernel virtual address of a set of physical pages. If there are * physical addresses not covered by the DMAP perform a transient mapping * that will be removed when calling pmap_unmap_io_transient. * * \param page The pages the caller wishes to obtain the virtual * address on the kernel memory map. * \param vaddr On return contains the kernel virtual memory address * of the pages passed in the page parameter. * \param count Number of pages passed in. * \param can_fault TRUE if the thread using the mapped pages can take * page faults, FALSE otherwise. * * \returns TRUE if the caller must call pmap_unmap_io_transient when * finished or FALSE otherwise. * */ boolean_t pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, boolean_t can_fault) { vm_paddr_t paddr; boolean_t needs_mapping; int error, i; /* * Allocate any KVA space that we need, this is done in a separate * loop to prevent calling vmem_alloc while pinned. */ needs_mapping = FALSE; for (i = 0; i < count; i++) { paddr = VM_PAGE_TO_PHYS(page[i]); if (__predict_false(paddr >= DMAP_MAX_PHYSADDR)) { error = vmem_alloc(kernel_arena, PAGE_SIZE, M_BESTFIT | M_WAITOK, &vaddr[i]); KASSERT(error == 0, ("vmem_alloc failed: %d", error)); needs_mapping = TRUE; } else { vaddr[i] = PHYS_TO_DMAP(paddr); } } /* Exit early if everything is covered by the DMAP */ if (!needs_mapping) return (FALSE); if (!can_fault) sched_pin(); for (i = 0; i < count; i++) { paddr = VM_PAGE_TO_PHYS(page[i]); if (paddr >= DMAP_MAX_PHYSADDR) { panic( "pmap_map_io_transient: TODO: Map out of DMAP data"); } } return (needs_mapping); } void pmap_unmap_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, boolean_t can_fault) { vm_paddr_t paddr; int i; if (!can_fault) sched_unpin(); for (i = 0; i < count; i++) { paddr = VM_PAGE_TO_PHYS(page[i]); if (paddr >= DMAP_MAX_PHYSADDR) { panic("RISCVTODO: pmap_unmap_io_transient: Unmap data"); } } } boolean_t pmap_is_valid_memattr(pmap_t pmap __unused, vm_memattr_t mode) { return (mode >= VM_MEMATTR_DEVICE && mode <= VM_MEMATTR_WRITE_BACK); } bool pmap_get_tables(pmap_t pmap, vm_offset_t va, pd_entry_t **l1, pd_entry_t **l2, pt_entry_t **l3) { pd_entry_t *l1p, *l2p; /* Get l1 directory entry. */ l1p = pmap_l1(pmap, va); *l1 = l1p; if (l1p == NULL || (pmap_load(l1p) & PTE_V) == 0) return (false); if ((pmap_load(l1p) & PTE_RX) != 0) { *l2 = NULL; *l3 = NULL; return (true); } /* Get l2 directory entry. */ l2p = pmap_l1_to_l2(l1p, va); *l2 = l2p; if (l2p == NULL || (pmap_load(l2p) & PTE_V) == 0) return (false); if ((pmap_load(l2p) & PTE_RX) != 0) { *l3 = NULL; return (true); } /* Get l3 page table entry. */ *l3 = pmap_l2_to_l3(l2p, va); return (true); } /* * Track a range of the kernel's virtual address space that is contiguous * in various mapping attributes. */ struct pmap_kernel_map_range { vm_offset_t sva; pt_entry_t attrs; int l3pages; int l2pages; int l1pages; }; static void sysctl_kmaps_dump(struct sbuf *sb, struct pmap_kernel_map_range *range, vm_offset_t eva) { if (eva <= range->sva) return; sbuf_printf(sb, "0x%016lx-0x%016lx r%c%c%c%c %d %d %d\n", range->sva, eva, (range->attrs & PTE_W) == PTE_W ? 'w' : '-', (range->attrs & PTE_X) == PTE_X ? 'x' : '-', (range->attrs & PTE_U) == PTE_U ? 'u' : 's', (range->attrs & PTE_G) == PTE_G ? 'g' : '-', range->l1pages, range->l2pages, range->l3pages); /* Reset to sentinel value. */ range->sva = 0xfffffffffffffffful; } /* * Determine whether the attributes specified by a page table entry match those * being tracked by the current range. */ static bool sysctl_kmaps_match(struct pmap_kernel_map_range *range, pt_entry_t attrs) { return (range->attrs == attrs); } static void sysctl_kmaps_reinit(struct pmap_kernel_map_range *range, vm_offset_t va, pt_entry_t attrs) { memset(range, 0, sizeof(*range)); range->sva = va; range->attrs = attrs; } /* * Given a leaf PTE, derive the mapping's attributes. If they do not match * those of the current run, dump the address range and its attributes, and * begin a new run. */ static void sysctl_kmaps_check(struct sbuf *sb, struct pmap_kernel_map_range *range, vm_offset_t va, pd_entry_t l1e, pd_entry_t l2e, pt_entry_t l3e) { pt_entry_t attrs; /* The PTE global bit is inherited by lower levels. */ attrs = l1e & PTE_G; if ((l1e & PTE_RWX) != 0) attrs |= l1e & (PTE_RWX | PTE_U); else if (l2e != 0) attrs |= l2e & PTE_G; if ((l2e & PTE_RWX) != 0) attrs |= l2e & (PTE_RWX | PTE_U); else if (l3e != 0) attrs |= l3e & (PTE_RWX | PTE_U | PTE_G); if (range->sva > va || !sysctl_kmaps_match(range, attrs)) { sysctl_kmaps_dump(sb, range, va); sysctl_kmaps_reinit(range, va, attrs); } } static int sysctl_kmaps(SYSCTL_HANDLER_ARGS) { struct pmap_kernel_map_range range; struct sbuf sbuf, *sb; pd_entry_t l1e, *l2, l2e; pt_entry_t *l3, l3e; vm_offset_t sva; vm_paddr_t pa; int error, i, j, k; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); sb = &sbuf; sbuf_new_for_sysctl(sb, NULL, PAGE_SIZE, req); /* Sentinel value. */ range.sva = 0xfffffffffffffffful; /* * Iterate over the kernel page tables without holding the kernel pmap * lock. Kernel page table pages are never freed, so at worst we will * observe inconsistencies in the output. */ sva = VM_MIN_KERNEL_ADDRESS; for (i = pmap_l1_index(sva); i < Ln_ENTRIES; i++) { if (i == pmap_l1_index(DMAP_MIN_ADDRESS)) sbuf_printf(sb, "\nDirect map:\n"); else if (i == pmap_l1_index(VM_MIN_KERNEL_ADDRESS)) sbuf_printf(sb, "\nKernel map:\n"); l1e = kernel_pmap->pm_l1[i]; if ((l1e & PTE_V) == 0) { sysctl_kmaps_dump(sb, &range, sva); sva += L1_SIZE; continue; } if ((l1e & PTE_RWX) != 0) { sysctl_kmaps_check(sb, &range, sva, l1e, 0, 0); range.l1pages++; sva += L1_SIZE; continue; } pa = PTE_TO_PHYS(l1e); l2 = (pd_entry_t *)PHYS_TO_DMAP(pa); for (j = pmap_l2_index(sva); j < Ln_ENTRIES; j++) { l2e = l2[j]; if ((l2e & PTE_V) == 0) { sysctl_kmaps_dump(sb, &range, sva); sva += L2_SIZE; continue; } if ((l2e & PTE_RWX) != 0) { sysctl_kmaps_check(sb, &range, sva, l1e, l2e, 0); range.l2pages++; sva += L2_SIZE; continue; } pa = PTE_TO_PHYS(l2e); l3 = (pd_entry_t *)PHYS_TO_DMAP(pa); for (k = pmap_l3_index(sva); k < Ln_ENTRIES; k++, sva += L3_SIZE) { l3e = l3[k]; if ((l3e & PTE_V) == 0) { sysctl_kmaps_dump(sb, &range, sva); continue; } sysctl_kmaps_check(sb, &range, sva, l1e, l2e, l3e); range.l3pages++; } } } error = sbuf_finish(sb); sbuf_delete(sb); return (error); } SYSCTL_OID(_vm_pmap, OID_AUTO, kernel_maps, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_kmaps, "A", "Dump kernel address layout"); Index: head/sys/riscv/riscv/uma_machdep.c =================================================================== --- head/sys/riscv/riscv/uma_machdep.c (revision 366710) +++ head/sys/riscv/riscv/uma_machdep.c (revision 366711) @@ -1,77 +1,78 @@ /*- * Copyright (c) 2003 Alan L. Cox * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include +#include #include #include +#include #include #include #include -#include void * uma_small_alloc(uma_zone_t zone, vm_size_t bytes, int domain, u_int8_t *flags, int wait) { vm_page_t m; vm_paddr_t pa; void *va; *flags = UMA_SLAB_PRIV; m = vm_page_alloc_domain(NULL, 0, domain, malloc2vm_flags(wait) | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED); if (m == NULL) return (NULL); pa = m->phys_addr; if ((wait & M_NODUMP) == 0) dump_add_page(pa); va = (void *)PHYS_TO_DMAP(pa); if ((wait & M_ZERO) && (m->flags & PG_ZERO) == 0) bzero(va, PAGE_SIZE); return (va); } void uma_small_free(void *mem, vm_size_t size, u_int8_t flags) { vm_page_t m; vm_paddr_t pa; pa = DMAP_TO_PHYS((vm_offset_t)mem); dump_drop_page(pa); m = PHYS_TO_VM_PAGE(pa); vm_page_unwire_noq(m); vm_page_free(m); } Index: head/sys/vm/uma_core.c =================================================================== --- head/sys/vm/uma_core.c (revision 366710) +++ head/sys/vm/uma_core.c (revision 366711) @@ -1,5495 +1,5496 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2002-2019 Jeffrey Roberson * Copyright (c) 2004, 2005 Bosko Milekic * Copyright (c) 2004-2006 Robert N. M. Watson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * uma_core.c Implementation of the Universal Memory allocator * * This allocator is intended to replace the multitude of similar object caches * in the standard FreeBSD kernel. The intent is to be flexible as well as * efficient. A primary design goal is to return unused memory to the rest of * the system. This will make the system as a whole more flexible due to the * ability to move memory to subsystems which most need it instead of leaving * pools of reserved memory unused. * * The basic ideas stem from similar slab/zone based allocators whose algorithms * are well known. * */ /* * TODO: * - Improve memory usage for large allocations * - Investigate cache size adjustments */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_param.h" #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include -#include #include #include #include #include #include +#include #include #include #include #include #ifdef DEBUG_MEMGUARD #include #endif #include #ifdef INVARIANTS #define UMA_ALWAYS_CTORDTOR 1 #else #define UMA_ALWAYS_CTORDTOR 0 #endif /* * This is the zone and keg from which all zones are spawned. */ static uma_zone_t kegs; static uma_zone_t zones; /* * On INVARIANTS builds, the slab contains a second bitset of the same size, * "dbg_bits", which is laid out immediately after us_free. */ #ifdef INVARIANTS #define SLAB_BITSETS 2 #else #define SLAB_BITSETS 1 #endif /* * These are the two zones from which all offpage uma_slab_ts are allocated. * * One zone is for slab headers that can represent a larger number of items, * making the slabs themselves more efficient, and the other zone is for * headers that are smaller and represent fewer items, making the headers more * efficient. */ #define SLABZONE_SIZE(setsize) \ (sizeof(struct uma_hash_slab) + BITSET_SIZE(setsize) * SLAB_BITSETS) #define SLABZONE0_SETSIZE (PAGE_SIZE / 16) #define SLABZONE1_SETSIZE SLAB_MAX_SETSIZE #define SLABZONE0_SIZE SLABZONE_SIZE(SLABZONE0_SETSIZE) #define SLABZONE1_SIZE SLABZONE_SIZE(SLABZONE1_SETSIZE) static uma_zone_t slabzones[2]; /* * The initial hash tables come out of this zone so they can be allocated * prior to malloc coming up. */ static uma_zone_t hashzone; /* The boot-time adjusted value for cache line alignment. */ int uma_align_cache = 64 - 1; static MALLOC_DEFINE(M_UMAHASH, "UMAHash", "UMA Hash Buckets"); static MALLOC_DEFINE(M_UMA, "UMA", "UMA Misc"); /* * Are we allowed to allocate buckets? */ static int bucketdisable = 1; /* Linked list of all kegs in the system */ static LIST_HEAD(,uma_keg) uma_kegs = LIST_HEAD_INITIALIZER(uma_kegs); /* Linked list of all cache-only zones in the system */ static LIST_HEAD(,uma_zone) uma_cachezones = LIST_HEAD_INITIALIZER(uma_cachezones); /* This RW lock protects the keg list */ static struct rwlock_padalign __exclusive_cache_line uma_rwlock; /* * First available virual address for boot time allocations. */ static vm_offset_t bootstart; static vm_offset_t bootmem; static struct sx uma_reclaim_lock; /* * kmem soft limit, initialized by uma_set_limit(). Ensure that early * allocations don't trigger a wakeup of the reclaim thread. */ unsigned long uma_kmem_limit = LONG_MAX; SYSCTL_ULONG(_vm, OID_AUTO, uma_kmem_limit, CTLFLAG_RD, &uma_kmem_limit, 0, "UMA kernel memory soft limit"); unsigned long uma_kmem_total; SYSCTL_ULONG(_vm, OID_AUTO, uma_kmem_total, CTLFLAG_RD, &uma_kmem_total, 0, "UMA kernel memory usage"); /* Is the VM done starting up? */ static enum { BOOT_COLD, BOOT_KVA, BOOT_PCPU, BOOT_RUNNING, BOOT_SHUTDOWN, } booted = BOOT_COLD; /* * This is the handle used to schedule events that need to happen * outside of the allocation fast path. */ static struct callout uma_callout; #define UMA_TIMEOUT 20 /* Seconds for callout interval. */ /* * This structure is passed as the zone ctor arg so that I don't have to create * a special allocation function just for zones. */ struct uma_zctor_args { const char *name; size_t size; uma_ctor ctor; uma_dtor dtor; uma_init uminit; uma_fini fini; uma_import import; uma_release release; void *arg; uma_keg_t keg; int align; uint32_t flags; }; struct uma_kctor_args { uma_zone_t zone; size_t size; uma_init uminit; uma_fini fini; int align; uint32_t flags; }; struct uma_bucket_zone { uma_zone_t ubz_zone; const char *ubz_name; int ubz_entries; /* Number of items it can hold. */ int ubz_maxsize; /* Maximum allocation size per-item. */ }; /* * Compute the actual number of bucket entries to pack them in power * of two sizes for more efficient space utilization. */ #define BUCKET_SIZE(n) \ (((sizeof(void *) * (n)) - sizeof(struct uma_bucket)) / sizeof(void *)) #define BUCKET_MAX BUCKET_SIZE(256) #define BUCKET_MIN 2 struct uma_bucket_zone bucket_zones[] = { /* Literal bucket sizes. */ { NULL, "2 Bucket", 2, 4096 }, { NULL, "4 Bucket", 4, 3072 }, { NULL, "8 Bucket", 8, 2048 }, { NULL, "16 Bucket", 16, 1024 }, /* Rounded down power of 2 sizes for efficiency. */ { NULL, "32 Bucket", BUCKET_SIZE(32), 512 }, { NULL, "64 Bucket", BUCKET_SIZE(64), 256 }, { NULL, "128 Bucket", BUCKET_SIZE(128), 128 }, { NULL, "256 Bucket", BUCKET_SIZE(256), 64 }, { NULL, NULL, 0} }; /* * Flags and enumerations to be passed to internal functions. */ enum zfreeskip { SKIP_NONE = 0, SKIP_CNT = 0x00000001, SKIP_DTOR = 0x00010000, SKIP_FINI = 0x00020000, }; /* Prototypes.. */ void uma_startup1(vm_offset_t); void uma_startup2(void); static void *noobj_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int); static void *page_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int); static void *pcpu_page_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int); static void *startup_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int); static void *contig_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int); static void page_free(void *, vm_size_t, uint8_t); static void pcpu_page_free(void *, vm_size_t, uint8_t); static uma_slab_t keg_alloc_slab(uma_keg_t, uma_zone_t, int, int, int); static void cache_drain(uma_zone_t); static void bucket_drain(uma_zone_t, uma_bucket_t); static void bucket_cache_reclaim(uma_zone_t zone, bool); static int keg_ctor(void *, int, void *, int); static void keg_dtor(void *, int, void *); static int zone_ctor(void *, int, void *, int); static void zone_dtor(void *, int, void *); static inline void item_dtor(uma_zone_t zone, void *item, int size, void *udata, enum zfreeskip skip); static int zero_init(void *, int, int); static void zone_free_bucket(uma_zone_t zone, uma_bucket_t bucket, void *udata, int itemdomain, bool ws); static void zone_foreach(void (*zfunc)(uma_zone_t, void *), void *); static void zone_foreach_unlocked(void (*zfunc)(uma_zone_t, void *), void *); static void zone_timeout(uma_zone_t zone, void *); static int hash_alloc(struct uma_hash *, u_int); static int hash_expand(struct uma_hash *, struct uma_hash *); static void hash_free(struct uma_hash *hash); static void uma_timeout(void *); static void uma_shutdown(void); static void *zone_alloc_item(uma_zone_t, void *, int, int); static void zone_free_item(uma_zone_t, void *, void *, enum zfreeskip); static int zone_alloc_limit(uma_zone_t zone, int count, int flags); static void zone_free_limit(uma_zone_t zone, int count); static void bucket_enable(void); static void bucket_init(void); static uma_bucket_t bucket_alloc(uma_zone_t zone, void *, int); static void bucket_free(uma_zone_t zone, uma_bucket_t, void *); static void bucket_zone_drain(void); static uma_bucket_t zone_alloc_bucket(uma_zone_t, void *, int, int); static void *slab_alloc_item(uma_keg_t keg, uma_slab_t slab); static void slab_free_item(uma_zone_t zone, uma_slab_t slab, void *item); static uma_keg_t uma_kcreate(uma_zone_t zone, size_t size, uma_init uminit, uma_fini fini, int align, uint32_t flags); static int zone_import(void *, void **, int, int, int); static void zone_release(void *, void **, int); static bool cache_alloc(uma_zone_t, uma_cache_t, void *, int); static bool cache_free(uma_zone_t, uma_cache_t, void *, void *, int); static int sysctl_vm_zone_count(SYSCTL_HANDLER_ARGS); static int sysctl_vm_zone_stats(SYSCTL_HANDLER_ARGS); static int sysctl_handle_uma_zone_allocs(SYSCTL_HANDLER_ARGS); static int sysctl_handle_uma_zone_frees(SYSCTL_HANDLER_ARGS); static int sysctl_handle_uma_zone_flags(SYSCTL_HANDLER_ARGS); static int sysctl_handle_uma_slab_efficiency(SYSCTL_HANDLER_ARGS); static int sysctl_handle_uma_zone_items(SYSCTL_HANDLER_ARGS); static uint64_t uma_zone_get_allocs(uma_zone_t zone); static SYSCTL_NODE(_vm, OID_AUTO, debug, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "Memory allocation debugging"); #ifdef INVARIANTS static uint64_t uma_keg_get_allocs(uma_keg_t zone); static inline struct noslabbits *slab_dbg_bits(uma_slab_t slab, uma_keg_t keg); static bool uma_dbg_kskip(uma_keg_t keg, void *mem); static bool uma_dbg_zskip(uma_zone_t zone, void *mem); static void uma_dbg_free(uma_zone_t zone, uma_slab_t slab, void *item); static void uma_dbg_alloc(uma_zone_t zone, uma_slab_t slab, void *item); static u_int dbg_divisor = 1; SYSCTL_UINT(_vm_debug, OID_AUTO, divisor, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &dbg_divisor, 0, "Debug & thrash every this item in memory allocator"); static counter_u64_t uma_dbg_cnt = EARLY_COUNTER; static counter_u64_t uma_skip_cnt = EARLY_COUNTER; SYSCTL_COUNTER_U64(_vm_debug, OID_AUTO, trashed, CTLFLAG_RD, &uma_dbg_cnt, "memory items debugged"); SYSCTL_COUNTER_U64(_vm_debug, OID_AUTO, skipped, CTLFLAG_RD, &uma_skip_cnt, "memory items skipped, not debugged"); #endif SYSCTL_NODE(_vm, OID_AUTO, uma, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Universal Memory Allocator"); SYSCTL_PROC(_vm, OID_AUTO, zone_count, CTLFLAG_RD|CTLFLAG_MPSAFE|CTLTYPE_INT, 0, 0, sysctl_vm_zone_count, "I", "Number of UMA zones"); SYSCTL_PROC(_vm, OID_AUTO, zone_stats, CTLFLAG_RD|CTLFLAG_MPSAFE|CTLTYPE_STRUCT, 0, 0, sysctl_vm_zone_stats, "s,struct uma_type_header", "Zone Stats"); static int zone_warnings = 1; SYSCTL_INT(_vm, OID_AUTO, zone_warnings, CTLFLAG_RWTUN, &zone_warnings, 0, "Warn when UMA zones becomes full"); static int multipage_slabs = 1; TUNABLE_INT("vm.debug.uma_multipage_slabs", &multipage_slabs); SYSCTL_INT(_vm_debug, OID_AUTO, uma_multipage_slabs, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &multipage_slabs, 0, "UMA may choose larger slab sizes for better efficiency"); /* * Select the slab zone for an offpage slab with the given maximum item count. */ static inline uma_zone_t slabzone(int ipers) { return (slabzones[ipers > SLABZONE0_SETSIZE]); } /* * This routine checks to see whether or not it's safe to enable buckets. */ static void bucket_enable(void) { KASSERT(booted >= BOOT_KVA, ("Bucket enable before init")); bucketdisable = vm_page_count_min(); } /* * Initialize bucket_zones, the array of zones of buckets of various sizes. * * For each zone, calculate the memory required for each bucket, consisting * of the header and an array of pointers. */ static void bucket_init(void) { struct uma_bucket_zone *ubz; int size; for (ubz = &bucket_zones[0]; ubz->ubz_entries != 0; ubz++) { size = roundup(sizeof(struct uma_bucket), sizeof(void *)); size += sizeof(void *) * ubz->ubz_entries; ubz->ubz_zone = uma_zcreate(ubz->ubz_name, size, NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_MTXCLASS | UMA_ZFLAG_BUCKET | UMA_ZONE_FIRSTTOUCH); } } /* * Given a desired number of entries for a bucket, return the zone from which * to allocate the bucket. */ static struct uma_bucket_zone * bucket_zone_lookup(int entries) { struct uma_bucket_zone *ubz; for (ubz = &bucket_zones[0]; ubz->ubz_entries != 0; ubz++) if (ubz->ubz_entries >= entries) return (ubz); ubz--; return (ubz); } static struct uma_bucket_zone * bucket_zone_max(uma_zone_t zone, int nitems) { struct uma_bucket_zone *ubz; int bpcpu; bpcpu = 2; if ((zone->uz_flags & UMA_ZONE_FIRSTTOUCH) != 0) /* Count the cross-domain bucket. */ bpcpu++; for (ubz = &bucket_zones[0]; ubz->ubz_entries != 0; ubz++) if (ubz->ubz_entries * bpcpu * mp_ncpus > nitems) break; if (ubz == &bucket_zones[0]) ubz = NULL; else ubz--; return (ubz); } static int bucket_select(int size) { struct uma_bucket_zone *ubz; ubz = &bucket_zones[0]; if (size > ubz->ubz_maxsize) return MAX((ubz->ubz_maxsize * ubz->ubz_entries) / size, 1); for (; ubz->ubz_entries != 0; ubz++) if (ubz->ubz_maxsize < size) break; ubz--; return (ubz->ubz_entries); } static uma_bucket_t bucket_alloc(uma_zone_t zone, void *udata, int flags) { struct uma_bucket_zone *ubz; uma_bucket_t bucket; /* * Don't allocate buckets early in boot. */ if (__predict_false(booted < BOOT_KVA)) return (NULL); /* * To limit bucket recursion we store the original zone flags * in a cookie passed via zalloc_arg/zfree_arg. This allows the * NOVM flag to persist even through deep recursions. We also * store ZFLAG_BUCKET once we have recursed attempting to allocate * a bucket for a bucket zone so we do not allow infinite bucket * recursion. This cookie will even persist to frees of unused * buckets via the allocation path or bucket allocations in the * free path. */ if ((zone->uz_flags & UMA_ZFLAG_BUCKET) == 0) udata = (void *)(uintptr_t)zone->uz_flags; else { if ((uintptr_t)udata & UMA_ZFLAG_BUCKET) return (NULL); udata = (void *)((uintptr_t)udata | UMA_ZFLAG_BUCKET); } if (((uintptr_t)udata & UMA_ZONE_VM) != 0) flags |= M_NOVM; ubz = bucket_zone_lookup(zone->uz_bucket_size); if (ubz->ubz_zone == zone && (ubz + 1)->ubz_entries != 0) ubz++; bucket = uma_zalloc_arg(ubz->ubz_zone, udata, flags); if (bucket) { #ifdef INVARIANTS bzero(bucket->ub_bucket, sizeof(void *) * ubz->ubz_entries); #endif bucket->ub_cnt = 0; bucket->ub_entries = ubz->ubz_entries; bucket->ub_seq = SMR_SEQ_INVALID; CTR3(KTR_UMA, "bucket_alloc: zone %s(%p) allocated bucket %p", zone->uz_name, zone, bucket); } return (bucket); } static void bucket_free(uma_zone_t zone, uma_bucket_t bucket, void *udata) { struct uma_bucket_zone *ubz; if (bucket->ub_cnt != 0) bucket_drain(zone, bucket); KASSERT(bucket->ub_cnt == 0, ("bucket_free: Freeing a non free bucket.")); KASSERT(bucket->ub_seq == SMR_SEQ_INVALID, ("bucket_free: Freeing an SMR bucket.")); if ((zone->uz_flags & UMA_ZFLAG_BUCKET) == 0) udata = (void *)(uintptr_t)zone->uz_flags; ubz = bucket_zone_lookup(bucket->ub_entries); uma_zfree_arg(ubz->ubz_zone, bucket, udata); } static void bucket_zone_drain(void) { struct uma_bucket_zone *ubz; for (ubz = &bucket_zones[0]; ubz->ubz_entries != 0; ubz++) uma_zone_reclaim(ubz->ubz_zone, UMA_RECLAIM_DRAIN); } /* * Acquire the domain lock and record contention. */ static uma_zone_domain_t zone_domain_lock(uma_zone_t zone, int domain) { uma_zone_domain_t zdom; bool lockfail; zdom = ZDOM_GET(zone, domain); lockfail = false; if (ZDOM_OWNED(zdom)) lockfail = true; ZDOM_LOCK(zdom); /* This is unsynchronized. The counter does not need to be precise. */ if (lockfail && zone->uz_bucket_size < zone->uz_bucket_size_max) zone->uz_bucket_size++; return (zdom); } /* * Search for the domain with the least cached items and return it if it * is out of balance with the preferred domain. */ static __noinline int zone_domain_lowest(uma_zone_t zone, int pref) { long least, nitems, prefitems; int domain; int i; prefitems = least = LONG_MAX; domain = 0; for (i = 0; i < vm_ndomains; i++) { nitems = ZDOM_GET(zone, i)->uzd_nitems; if (nitems < least) { domain = i; least = nitems; } if (domain == pref) prefitems = nitems; } if (prefitems < least * 2) return (pref); return (domain); } /* * Search for the domain with the most cached items and return it or the * preferred domain if it has enough to proceed. */ static __noinline int zone_domain_highest(uma_zone_t zone, int pref) { long most, nitems; int domain; int i; if (ZDOM_GET(zone, pref)->uzd_nitems > BUCKET_MAX) return (pref); most = 0; domain = 0; for (i = 0; i < vm_ndomains; i++) { nitems = ZDOM_GET(zone, i)->uzd_nitems; if (nitems > most) { domain = i; most = nitems; } } return (domain); } /* * Safely subtract cnt from imax. */ static void zone_domain_imax_sub(uma_zone_domain_t zdom, int cnt) { long new; long old; old = zdom->uzd_imax; do { if (old <= cnt) new = 0; else new = old - cnt; } while (atomic_fcmpset_long(&zdom->uzd_imax, &old, new) == 0); } /* * Set the maximum imax value. */ static void zone_domain_imax_set(uma_zone_domain_t zdom, int nitems) { long old; old = zdom->uzd_imax; do { if (old >= nitems) break; } while (atomic_fcmpset_long(&zdom->uzd_imax, &old, nitems) == 0); } /* * Attempt to satisfy an allocation by retrieving a full bucket from one of the * zone's caches. If a bucket is found the zone is not locked on return. */ static uma_bucket_t zone_fetch_bucket(uma_zone_t zone, uma_zone_domain_t zdom, bool reclaim) { uma_bucket_t bucket; int i; bool dtor = false; ZDOM_LOCK_ASSERT(zdom); if ((bucket = STAILQ_FIRST(&zdom->uzd_buckets)) == NULL) return (NULL); /* SMR Buckets can not be re-used until readers expire. */ if ((zone->uz_flags & UMA_ZONE_SMR) != 0 && bucket->ub_seq != SMR_SEQ_INVALID) { if (!smr_poll(zone->uz_smr, bucket->ub_seq, false)) return (NULL); bucket->ub_seq = SMR_SEQ_INVALID; dtor = (zone->uz_dtor != NULL) || UMA_ALWAYS_CTORDTOR; if (STAILQ_NEXT(bucket, ub_link) != NULL) zdom->uzd_seq = STAILQ_NEXT(bucket, ub_link)->ub_seq; } STAILQ_REMOVE_HEAD(&zdom->uzd_buckets, ub_link); KASSERT(zdom->uzd_nitems >= bucket->ub_cnt, ("%s: item count underflow (%ld, %d)", __func__, zdom->uzd_nitems, bucket->ub_cnt)); KASSERT(bucket->ub_cnt > 0, ("%s: empty bucket in bucket cache", __func__)); zdom->uzd_nitems -= bucket->ub_cnt; /* * Shift the bounds of the current WSS interval to avoid * perturbing the estimate. */ if (reclaim) { zdom->uzd_imin -= lmin(zdom->uzd_imin, bucket->ub_cnt); zone_domain_imax_sub(zdom, bucket->ub_cnt); } else if (zdom->uzd_imin > zdom->uzd_nitems) zdom->uzd_imin = zdom->uzd_nitems; ZDOM_UNLOCK(zdom); if (dtor) for (i = 0; i < bucket->ub_cnt; i++) item_dtor(zone, bucket->ub_bucket[i], zone->uz_size, NULL, SKIP_NONE); return (bucket); } /* * Insert a full bucket into the specified cache. The "ws" parameter indicates * whether the bucket's contents should be counted as part of the zone's working * set. The bucket may be freed if it exceeds the bucket limit. */ static void zone_put_bucket(uma_zone_t zone, int domain, uma_bucket_t bucket, void *udata, const bool ws) { uma_zone_domain_t zdom; /* We don't cache empty buckets. This can happen after a reclaim. */ if (bucket->ub_cnt == 0) goto out; zdom = zone_domain_lock(zone, domain); /* * Conditionally set the maximum number of items. */ zdom->uzd_nitems += bucket->ub_cnt; if (__predict_true(zdom->uzd_nitems < zone->uz_bucket_max)) { if (ws) zone_domain_imax_set(zdom, zdom->uzd_nitems); if (STAILQ_EMPTY(&zdom->uzd_buckets)) zdom->uzd_seq = bucket->ub_seq; /* * Try to promote reuse of recently used items. For items * protected by SMR, try to defer reuse to minimize polling. */ if (bucket->ub_seq == SMR_SEQ_INVALID) STAILQ_INSERT_HEAD(&zdom->uzd_buckets, bucket, ub_link); else STAILQ_INSERT_TAIL(&zdom->uzd_buckets, bucket, ub_link); ZDOM_UNLOCK(zdom); return; } zdom->uzd_nitems -= bucket->ub_cnt; ZDOM_UNLOCK(zdom); out: bucket_free(zone, bucket, udata); } /* Pops an item out of a per-cpu cache bucket. */ static inline void * cache_bucket_pop(uma_cache_t cache, uma_cache_bucket_t bucket) { void *item; CRITICAL_ASSERT(curthread); bucket->ucb_cnt--; item = bucket->ucb_bucket->ub_bucket[bucket->ucb_cnt]; #ifdef INVARIANTS bucket->ucb_bucket->ub_bucket[bucket->ucb_cnt] = NULL; KASSERT(item != NULL, ("uma_zalloc: Bucket pointer mangled.")); #endif cache->uc_allocs++; return (item); } /* Pushes an item into a per-cpu cache bucket. */ static inline void cache_bucket_push(uma_cache_t cache, uma_cache_bucket_t bucket, void *item) { CRITICAL_ASSERT(curthread); KASSERT(bucket->ucb_bucket->ub_bucket[bucket->ucb_cnt] == NULL, ("uma_zfree: Freeing to non free bucket index.")); bucket->ucb_bucket->ub_bucket[bucket->ucb_cnt] = item; bucket->ucb_cnt++; cache->uc_frees++; } /* * Unload a UMA bucket from a per-cpu cache. */ static inline uma_bucket_t cache_bucket_unload(uma_cache_bucket_t bucket) { uma_bucket_t b; b = bucket->ucb_bucket; if (b != NULL) { MPASS(b->ub_entries == bucket->ucb_entries); b->ub_cnt = bucket->ucb_cnt; bucket->ucb_bucket = NULL; bucket->ucb_entries = bucket->ucb_cnt = 0; } return (b); } static inline uma_bucket_t cache_bucket_unload_alloc(uma_cache_t cache) { return (cache_bucket_unload(&cache->uc_allocbucket)); } static inline uma_bucket_t cache_bucket_unload_free(uma_cache_t cache) { return (cache_bucket_unload(&cache->uc_freebucket)); } static inline uma_bucket_t cache_bucket_unload_cross(uma_cache_t cache) { return (cache_bucket_unload(&cache->uc_crossbucket)); } /* * Load a bucket into a per-cpu cache bucket. */ static inline void cache_bucket_load(uma_cache_bucket_t bucket, uma_bucket_t b) { CRITICAL_ASSERT(curthread); MPASS(bucket->ucb_bucket == NULL); MPASS(b->ub_seq == SMR_SEQ_INVALID); bucket->ucb_bucket = b; bucket->ucb_cnt = b->ub_cnt; bucket->ucb_entries = b->ub_entries; } static inline void cache_bucket_load_alloc(uma_cache_t cache, uma_bucket_t b) { cache_bucket_load(&cache->uc_allocbucket, b); } static inline void cache_bucket_load_free(uma_cache_t cache, uma_bucket_t b) { cache_bucket_load(&cache->uc_freebucket, b); } #ifdef NUMA static inline void cache_bucket_load_cross(uma_cache_t cache, uma_bucket_t b) { cache_bucket_load(&cache->uc_crossbucket, b); } #endif /* * Copy and preserve ucb_spare. */ static inline void cache_bucket_copy(uma_cache_bucket_t b1, uma_cache_bucket_t b2) { b1->ucb_bucket = b2->ucb_bucket; b1->ucb_entries = b2->ucb_entries; b1->ucb_cnt = b2->ucb_cnt; } /* * Swap two cache buckets. */ static inline void cache_bucket_swap(uma_cache_bucket_t b1, uma_cache_bucket_t b2) { struct uma_cache_bucket b3; CRITICAL_ASSERT(curthread); cache_bucket_copy(&b3, b1); cache_bucket_copy(b1, b2); cache_bucket_copy(b2, &b3); } /* * Attempt to fetch a bucket from a zone on behalf of the current cpu cache. */ static uma_bucket_t cache_fetch_bucket(uma_zone_t zone, uma_cache_t cache, int domain) { uma_zone_domain_t zdom; uma_bucket_t bucket; /* * Avoid the lock if possible. */ zdom = ZDOM_GET(zone, domain); if (zdom->uzd_nitems == 0) return (NULL); if ((cache_uz_flags(cache) & UMA_ZONE_SMR) != 0 && !smr_poll(zone->uz_smr, zdom->uzd_seq, false)) return (NULL); /* * Check the zone's cache of buckets. */ zdom = zone_domain_lock(zone, domain); if ((bucket = zone_fetch_bucket(zone, zdom, false)) != NULL) return (bucket); ZDOM_UNLOCK(zdom); return (NULL); } static void zone_log_warning(uma_zone_t zone) { static const struct timeval warninterval = { 300, 0 }; if (!zone_warnings || zone->uz_warning == NULL) return; if (ratecheck(&zone->uz_ratecheck, &warninterval)) printf("[zone: %s] %s\n", zone->uz_name, zone->uz_warning); } static inline void zone_maxaction(uma_zone_t zone) { if (zone->uz_maxaction.ta_func != NULL) taskqueue_enqueue(taskqueue_thread, &zone->uz_maxaction); } /* * Routine called by timeout which is used to fire off some time interval * based calculations. (stats, hash size, etc.) * * Arguments: * arg Unused * * Returns: * Nothing */ static void uma_timeout(void *unused) { bucket_enable(); zone_foreach(zone_timeout, NULL); /* Reschedule this event */ callout_reset(&uma_callout, UMA_TIMEOUT * hz, uma_timeout, NULL); } /* * Update the working set size estimate for the zone's bucket cache. * The constants chosen here are somewhat arbitrary. With an update period of * 20s (UMA_TIMEOUT), this estimate is dominated by zone activity over the * last 100s. */ static void zone_domain_update_wss(uma_zone_domain_t zdom) { long wss; ZDOM_LOCK(zdom); MPASS(zdom->uzd_imax >= zdom->uzd_imin); wss = zdom->uzd_imax - zdom->uzd_imin; zdom->uzd_imax = zdom->uzd_imin = zdom->uzd_nitems; zdom->uzd_wss = (4 * wss + zdom->uzd_wss) / 5; ZDOM_UNLOCK(zdom); } /* * Routine to perform timeout driven calculations. This expands the * hashes and does per cpu statistics aggregation. * * Returns nothing. */ static void zone_timeout(uma_zone_t zone, void *unused) { uma_keg_t keg; u_int slabs, pages; if ((zone->uz_flags & UMA_ZFLAG_HASH) == 0) goto update_wss; keg = zone->uz_keg; /* * Hash zones are non-numa by definition so the first domain * is the only one present. */ KEG_LOCK(keg, 0); pages = keg->uk_domain[0].ud_pages; /* * Expand the keg hash table. * * This is done if the number of slabs is larger than the hash size. * What I'm trying to do here is completely reduce collisions. This * may be a little aggressive. Should I allow for two collisions max? */ if ((slabs = pages / keg->uk_ppera) > keg->uk_hash.uh_hashsize) { struct uma_hash newhash; struct uma_hash oldhash; int ret; /* * This is so involved because allocating and freeing * while the keg lock is held will lead to deadlock. * I have to do everything in stages and check for * races. */ KEG_UNLOCK(keg, 0); ret = hash_alloc(&newhash, 1 << fls(slabs)); KEG_LOCK(keg, 0); if (ret) { if (hash_expand(&keg->uk_hash, &newhash)) { oldhash = keg->uk_hash; keg->uk_hash = newhash; } else oldhash = newhash; KEG_UNLOCK(keg, 0); hash_free(&oldhash); goto update_wss; } } KEG_UNLOCK(keg, 0); update_wss: for (int i = 0; i < vm_ndomains; i++) zone_domain_update_wss(ZDOM_GET(zone, i)); } /* * Allocate and zero fill the next sized hash table from the appropriate * backing store. * * Arguments: * hash A new hash structure with the old hash size in uh_hashsize * * Returns: * 1 on success and 0 on failure. */ static int hash_alloc(struct uma_hash *hash, u_int size) { size_t alloc; KASSERT(powerof2(size), ("hash size must be power of 2")); if (size > UMA_HASH_SIZE_INIT) { hash->uh_hashsize = size; alloc = sizeof(hash->uh_slab_hash[0]) * hash->uh_hashsize; hash->uh_slab_hash = malloc(alloc, M_UMAHASH, M_NOWAIT); } else { alloc = sizeof(hash->uh_slab_hash[0]) * UMA_HASH_SIZE_INIT; hash->uh_slab_hash = zone_alloc_item(hashzone, NULL, UMA_ANYDOMAIN, M_WAITOK); hash->uh_hashsize = UMA_HASH_SIZE_INIT; } if (hash->uh_slab_hash) { bzero(hash->uh_slab_hash, alloc); hash->uh_hashmask = hash->uh_hashsize - 1; return (1); } return (0); } /* * Expands the hash table for HASH zones. This is done from zone_timeout * to reduce collisions. This must not be done in the regular allocation * path, otherwise, we can recurse on the vm while allocating pages. * * Arguments: * oldhash The hash you want to expand * newhash The hash structure for the new table * * Returns: * Nothing * * Discussion: */ static int hash_expand(struct uma_hash *oldhash, struct uma_hash *newhash) { uma_hash_slab_t slab; u_int hval; u_int idx; if (!newhash->uh_slab_hash) return (0); if (oldhash->uh_hashsize >= newhash->uh_hashsize) return (0); /* * I need to investigate hash algorithms for resizing without a * full rehash. */ for (idx = 0; idx < oldhash->uh_hashsize; idx++) while (!LIST_EMPTY(&oldhash->uh_slab_hash[idx])) { slab = LIST_FIRST(&oldhash->uh_slab_hash[idx]); LIST_REMOVE(slab, uhs_hlink); hval = UMA_HASH(newhash, slab->uhs_data); LIST_INSERT_HEAD(&newhash->uh_slab_hash[hval], slab, uhs_hlink); } return (1); } /* * Free the hash bucket to the appropriate backing store. * * Arguments: * slab_hash The hash bucket we're freeing * hashsize The number of entries in that hash bucket * * Returns: * Nothing */ static void hash_free(struct uma_hash *hash) { if (hash->uh_slab_hash == NULL) return; if (hash->uh_hashsize == UMA_HASH_SIZE_INIT) zone_free_item(hashzone, hash->uh_slab_hash, NULL, SKIP_NONE); else free(hash->uh_slab_hash, M_UMAHASH); } /* * Frees all outstanding items in a bucket * * Arguments: * zone The zone to free to, must be unlocked. * bucket The free/alloc bucket with items. * * Returns: * Nothing */ static void bucket_drain(uma_zone_t zone, uma_bucket_t bucket) { int i; if (bucket->ub_cnt == 0) return; if ((zone->uz_flags & UMA_ZONE_SMR) != 0 && bucket->ub_seq != SMR_SEQ_INVALID) { smr_wait(zone->uz_smr, bucket->ub_seq); bucket->ub_seq = SMR_SEQ_INVALID; for (i = 0; i < bucket->ub_cnt; i++) item_dtor(zone, bucket->ub_bucket[i], zone->uz_size, NULL, SKIP_NONE); } if (zone->uz_fini) for (i = 0; i < bucket->ub_cnt; i++) zone->uz_fini(bucket->ub_bucket[i], zone->uz_size); zone->uz_release(zone->uz_arg, bucket->ub_bucket, bucket->ub_cnt); if (zone->uz_max_items > 0) zone_free_limit(zone, bucket->ub_cnt); #ifdef INVARIANTS bzero(bucket->ub_bucket, sizeof(void *) * bucket->ub_cnt); #endif bucket->ub_cnt = 0; } /* * Drains the per cpu caches for a zone. * * NOTE: This may only be called while the zone is being torn down, and not * during normal operation. This is necessary in order that we do not have * to migrate CPUs to drain the per-CPU caches. * * Arguments: * zone The zone to drain, must be unlocked. * * Returns: * Nothing */ static void cache_drain(uma_zone_t zone) { uma_cache_t cache; uma_bucket_t bucket; smr_seq_t seq; int cpu; /* * XXX: It is safe to not lock the per-CPU caches, because we're * tearing down the zone anyway. I.e., there will be no further use * of the caches at this point. * * XXX: It would good to be able to assert that the zone is being * torn down to prevent improper use of cache_drain(). */ seq = SMR_SEQ_INVALID; if ((zone->uz_flags & UMA_ZONE_SMR) != 0) seq = smr_advance(zone->uz_smr); CPU_FOREACH(cpu) { cache = &zone->uz_cpu[cpu]; bucket = cache_bucket_unload_alloc(cache); if (bucket != NULL) bucket_free(zone, bucket, NULL); bucket = cache_bucket_unload_free(cache); if (bucket != NULL) { bucket->ub_seq = seq; bucket_free(zone, bucket, NULL); } bucket = cache_bucket_unload_cross(cache); if (bucket != NULL) { bucket->ub_seq = seq; bucket_free(zone, bucket, NULL); } } bucket_cache_reclaim(zone, true); } static void cache_shrink(uma_zone_t zone, void *unused) { if (zone->uz_flags & UMA_ZFLAG_INTERNAL) return; zone->uz_bucket_size = (zone->uz_bucket_size_min + zone->uz_bucket_size) / 2; } static void cache_drain_safe_cpu(uma_zone_t zone, void *unused) { uma_cache_t cache; uma_bucket_t b1, b2, b3; int domain; if (zone->uz_flags & UMA_ZFLAG_INTERNAL) return; b1 = b2 = b3 = NULL; critical_enter(); cache = &zone->uz_cpu[curcpu]; domain = PCPU_GET(domain); b1 = cache_bucket_unload_alloc(cache); /* * Don't flush SMR zone buckets. This leaves the zone without a * bucket and forces every free to synchronize(). */ if ((zone->uz_flags & UMA_ZONE_SMR) == 0) { b2 = cache_bucket_unload_free(cache); b3 = cache_bucket_unload_cross(cache); } critical_exit(); if (b1 != NULL) zone_free_bucket(zone, b1, NULL, domain, false); if (b2 != NULL) zone_free_bucket(zone, b2, NULL, domain, false); if (b3 != NULL) { /* Adjust the domain so it goes to zone_free_cross. */ domain = (domain + 1) % vm_ndomains; zone_free_bucket(zone, b3, NULL, domain, false); } } /* * Safely drain per-CPU caches of a zone(s) to alloc bucket. * This is an expensive call because it needs to bind to all CPUs * one by one and enter a critical section on each of them in order * to safely access their cache buckets. * Zone lock must not be held on call this function. */ static void pcpu_cache_drain_safe(uma_zone_t zone) { int cpu; /* * Polite bucket sizes shrinking was not enough, shrink aggressively. */ if (zone) cache_shrink(zone, NULL); else zone_foreach(cache_shrink, NULL); CPU_FOREACH(cpu) { thread_lock(curthread); sched_bind(curthread, cpu); thread_unlock(curthread); if (zone) cache_drain_safe_cpu(zone, NULL); else zone_foreach(cache_drain_safe_cpu, NULL); } thread_lock(curthread); sched_unbind(curthread); thread_unlock(curthread); } /* * Reclaim cached buckets from a zone. All buckets are reclaimed if the caller * requested a drain, otherwise the per-domain caches are trimmed to either * estimated working set size. */ static void bucket_cache_reclaim(uma_zone_t zone, bool drain) { uma_zone_domain_t zdom; uma_bucket_t bucket; long target; int i; /* * Shrink the zone bucket size to ensure that the per-CPU caches * don't grow too large. */ if (zone->uz_bucket_size > zone->uz_bucket_size_min) zone->uz_bucket_size--; for (i = 0; i < vm_ndomains; i++) { /* * The cross bucket is partially filled and not part of * the item count. Reclaim it individually here. */ zdom = ZDOM_GET(zone, i); if ((zone->uz_flags & UMA_ZONE_SMR) == 0 || drain) { ZONE_CROSS_LOCK(zone); bucket = zdom->uzd_cross; zdom->uzd_cross = NULL; ZONE_CROSS_UNLOCK(zone); if (bucket != NULL) bucket_free(zone, bucket, NULL); } /* * If we were asked to drain the zone, we are done only once * this bucket cache is empty. Otherwise, we reclaim items in * excess of the zone's estimated working set size. If the * difference nitems - imin is larger than the WSS estimate, * then the estimate will grow at the end of this interval and * we ignore the historical average. */ ZDOM_LOCK(zdom); target = drain ? 0 : lmax(zdom->uzd_wss, zdom->uzd_nitems - zdom->uzd_imin); while (zdom->uzd_nitems > target) { bucket = zone_fetch_bucket(zone, zdom, true); if (bucket == NULL) break; bucket_free(zone, bucket, NULL); ZDOM_LOCK(zdom); } ZDOM_UNLOCK(zdom); } } static void keg_free_slab(uma_keg_t keg, uma_slab_t slab, int start) { uint8_t *mem; int i; uint8_t flags; CTR4(KTR_UMA, "keg_free_slab keg %s(%p) slab %p, returning %d bytes", keg->uk_name, keg, slab, PAGE_SIZE * keg->uk_ppera); mem = slab_data(slab, keg); flags = slab->us_flags; i = start; if (keg->uk_fini != NULL) { for (i--; i > -1; i--) #ifdef INVARIANTS /* * trash_fini implies that dtor was trash_dtor. trash_fini * would check that memory hasn't been modified since free, * which executed trash_dtor. * That's why we need to run uma_dbg_kskip() check here, * albeit we don't make skip check for other init/fini * invocations. */ if (!uma_dbg_kskip(keg, slab_item(slab, keg, i)) || keg->uk_fini != trash_fini) #endif keg->uk_fini(slab_item(slab, keg, i), keg->uk_size); } if (keg->uk_flags & UMA_ZFLAG_OFFPAGE) zone_free_item(slabzone(keg->uk_ipers), slab_tohashslab(slab), NULL, SKIP_NONE); keg->uk_freef(mem, PAGE_SIZE * keg->uk_ppera, flags); uma_total_dec(PAGE_SIZE * keg->uk_ppera); } /* * Frees pages from a keg back to the system. This is done on demand from * the pageout daemon. * * Returns nothing. */ static void keg_drain(uma_keg_t keg) { struct slabhead freeslabs; uma_domain_t dom; uma_slab_t slab, tmp; int i, n; if (keg->uk_flags & UMA_ZONE_NOFREE || keg->uk_freef == NULL) return; for (i = 0; i < vm_ndomains; i++) { CTR4(KTR_UMA, "keg_drain %s(%p) domain %d free items: %u", keg->uk_name, keg, i, dom->ud_free_items); dom = &keg->uk_domain[i]; LIST_INIT(&freeslabs); KEG_LOCK(keg, i); if ((keg->uk_flags & UMA_ZFLAG_HASH) != 0) { LIST_FOREACH(slab, &dom->ud_free_slab, us_link) UMA_HASH_REMOVE(&keg->uk_hash, slab); } n = dom->ud_free_slabs; LIST_SWAP(&freeslabs, &dom->ud_free_slab, uma_slab, us_link); dom->ud_free_slabs = 0; dom->ud_free_items -= n * keg->uk_ipers; dom->ud_pages -= n * keg->uk_ppera; KEG_UNLOCK(keg, i); LIST_FOREACH_SAFE(slab, &freeslabs, us_link, tmp) keg_free_slab(keg, slab, keg->uk_ipers); } } static void zone_reclaim(uma_zone_t zone, int waitok, bool drain) { /* * Set draining to interlock with zone_dtor() so we can release our * locks as we go. Only dtor() should do a WAITOK call since it * is the only call that knows the structure will still be available * when it wakes up. */ ZONE_LOCK(zone); while (zone->uz_flags & UMA_ZFLAG_RECLAIMING) { if (waitok == M_NOWAIT) goto out; msleep(zone, &ZDOM_GET(zone, 0)->uzd_lock, PVM, "zonedrain", 1); } zone->uz_flags |= UMA_ZFLAG_RECLAIMING; ZONE_UNLOCK(zone); bucket_cache_reclaim(zone, drain); /* * The DRAINING flag protects us from being freed while * we're running. Normally the uma_rwlock would protect us but we * must be able to release and acquire the right lock for each keg. */ if ((zone->uz_flags & UMA_ZFLAG_CACHE) == 0) keg_drain(zone->uz_keg); ZONE_LOCK(zone); zone->uz_flags &= ~UMA_ZFLAG_RECLAIMING; wakeup(zone); out: ZONE_UNLOCK(zone); } static void zone_drain(uma_zone_t zone, void *unused) { zone_reclaim(zone, M_NOWAIT, true); } static void zone_trim(uma_zone_t zone, void *unused) { zone_reclaim(zone, M_NOWAIT, false); } /* * Allocate a new slab for a keg and inserts it into the partial slab list. * The keg should be unlocked on entry. If the allocation succeeds it will * be locked on return. * * Arguments: * flags Wait flags for the item initialization routine * aflags Wait flags for the slab allocation * * Returns: * The slab that was allocated or NULL if there is no memory and the * caller specified M_NOWAIT. */ static uma_slab_t keg_alloc_slab(uma_keg_t keg, uma_zone_t zone, int domain, int flags, int aflags) { uma_domain_t dom; uma_alloc allocf; uma_slab_t slab; unsigned long size; uint8_t *mem; uint8_t sflags; int i; KASSERT(domain >= 0 && domain < vm_ndomains, ("keg_alloc_slab: domain %d out of range", domain)); allocf = keg->uk_allocf; slab = NULL; mem = NULL; if (keg->uk_flags & UMA_ZFLAG_OFFPAGE) { uma_hash_slab_t hslab; hslab = zone_alloc_item(slabzone(keg->uk_ipers), NULL, domain, aflags); if (hslab == NULL) goto fail; slab = &hslab->uhs_slab; } /* * This reproduces the old vm_zone behavior of zero filling pages the * first time they are added to a zone. * * Malloced items are zeroed in uma_zalloc. */ if ((keg->uk_flags & UMA_ZONE_MALLOC) == 0) aflags |= M_ZERO; else aflags &= ~M_ZERO; if (keg->uk_flags & UMA_ZONE_NODUMP) aflags |= M_NODUMP; /* zone is passed for legacy reasons. */ size = keg->uk_ppera * PAGE_SIZE; mem = allocf(zone, size, domain, &sflags, aflags); if (mem == NULL) { if (keg->uk_flags & UMA_ZFLAG_OFFPAGE) zone_free_item(slabzone(keg->uk_ipers), slab_tohashslab(slab), NULL, SKIP_NONE); goto fail; } uma_total_inc(size); /* For HASH zones all pages go to the same uma_domain. */ if ((keg->uk_flags & UMA_ZFLAG_HASH) != 0) domain = 0; /* Point the slab into the allocated memory */ if (!(keg->uk_flags & UMA_ZFLAG_OFFPAGE)) slab = (uma_slab_t )(mem + keg->uk_pgoff); else slab_tohashslab(slab)->uhs_data = mem; if (keg->uk_flags & UMA_ZFLAG_VTOSLAB) for (i = 0; i < keg->uk_ppera; i++) vsetzoneslab((vm_offset_t)mem + (i * PAGE_SIZE), zone, slab); slab->us_freecount = keg->uk_ipers; slab->us_flags = sflags; slab->us_domain = domain; BIT_FILL(keg->uk_ipers, &slab->us_free); #ifdef INVARIANTS BIT_ZERO(keg->uk_ipers, slab_dbg_bits(slab, keg)); #endif if (keg->uk_init != NULL) { for (i = 0; i < keg->uk_ipers; i++) if (keg->uk_init(slab_item(slab, keg, i), keg->uk_size, flags) != 0) break; if (i != keg->uk_ipers) { keg_free_slab(keg, slab, i); goto fail; } } KEG_LOCK(keg, domain); CTR3(KTR_UMA, "keg_alloc_slab: allocated slab %p for %s(%p)", slab, keg->uk_name, keg); if (keg->uk_flags & UMA_ZFLAG_HASH) UMA_HASH_INSERT(&keg->uk_hash, slab, mem); /* * If we got a slab here it's safe to mark it partially used * and return. We assume that the caller is going to remove * at least one item. */ dom = &keg->uk_domain[domain]; LIST_INSERT_HEAD(&dom->ud_part_slab, slab, us_link); dom->ud_pages += keg->uk_ppera; dom->ud_free_items += keg->uk_ipers; return (slab); fail: return (NULL); } /* * This function is intended to be used early on in place of page_alloc() so * that we may use the boot time page cache to satisfy allocations before * the VM is ready. */ static void * startup_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag, int wait) { vm_paddr_t pa; vm_page_t m; void *mem; int pages; int i; pages = howmany(bytes, PAGE_SIZE); KASSERT(pages > 0, ("%s can't reserve 0 pages", __func__)); *pflag = UMA_SLAB_BOOT; m = vm_page_alloc_contig_domain(NULL, 0, domain, malloc2vm_flags(wait) | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED, pages, (vm_paddr_t)0, ~(vm_paddr_t)0, 1, 0, VM_MEMATTR_DEFAULT); if (m == NULL) return (NULL); pa = VM_PAGE_TO_PHYS(m); for (i = 0; i < pages; i++, pa += PAGE_SIZE) { #if defined(__aarch64__) || defined(__amd64__) || defined(__mips__) || \ defined(__riscv) || defined(__powerpc64__) if ((wait & M_NODUMP) == 0) dump_add_page(pa); #endif } /* Allocate KVA and indirectly advance bootmem. */ mem = (void *)pmap_map(&bootmem, m->phys_addr, m->phys_addr + (pages * PAGE_SIZE), VM_PROT_READ | VM_PROT_WRITE); if ((wait & M_ZERO) != 0) bzero(mem, pages * PAGE_SIZE); return (mem); } static void startup_free(void *mem, vm_size_t bytes) { vm_offset_t va; vm_page_t m; va = (vm_offset_t)mem; m = PHYS_TO_VM_PAGE(pmap_kextract(va)); pmap_remove(kernel_pmap, va, va + bytes); for (; bytes != 0; bytes -= PAGE_SIZE, m++) { #if defined(__aarch64__) || defined(__amd64__) || defined(__mips__) || \ defined(__riscv) || defined(__powerpc64__) dump_drop_page(VM_PAGE_TO_PHYS(m)); #endif vm_page_unwire_noq(m); vm_page_free(m); } } /* * Allocates a number of pages from the system * * Arguments: * bytes The number of bytes requested * wait Shall we wait? * * Returns: * A pointer to the alloced memory or possibly * NULL if M_NOWAIT is set. */ static void * page_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag, int wait) { void *p; /* Returned page */ *pflag = UMA_SLAB_KERNEL; p = (void *)kmem_malloc_domainset(DOMAINSET_FIXED(domain), bytes, wait); return (p); } static void * pcpu_page_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag, int wait) { struct pglist alloctail; vm_offset_t addr, zkva; int cpu, flags; vm_page_t p, p_next; #ifdef NUMA struct pcpu *pc; #endif MPASS(bytes == (mp_maxid + 1) * PAGE_SIZE); TAILQ_INIT(&alloctail); flags = VM_ALLOC_SYSTEM | VM_ALLOC_WIRED | VM_ALLOC_NOOBJ | malloc2vm_flags(wait); *pflag = UMA_SLAB_KERNEL; for (cpu = 0; cpu <= mp_maxid; cpu++) { if (CPU_ABSENT(cpu)) { p = vm_page_alloc(NULL, 0, flags); } else { #ifndef NUMA p = vm_page_alloc(NULL, 0, flags); #else pc = pcpu_find(cpu); if (__predict_false(VM_DOMAIN_EMPTY(pc->pc_domain))) p = NULL; else p = vm_page_alloc_domain(NULL, 0, pc->pc_domain, flags); if (__predict_false(p == NULL)) p = vm_page_alloc(NULL, 0, flags); #endif } if (__predict_false(p == NULL)) goto fail; TAILQ_INSERT_TAIL(&alloctail, p, listq); } if ((addr = kva_alloc(bytes)) == 0) goto fail; zkva = addr; TAILQ_FOREACH(p, &alloctail, listq) { pmap_qenter(zkva, &p, 1); zkva += PAGE_SIZE; } return ((void*)addr); fail: TAILQ_FOREACH_SAFE(p, &alloctail, listq, p_next) { vm_page_unwire_noq(p); vm_page_free(p); } return (NULL); } /* * Allocates a number of pages from within an object * * Arguments: * bytes The number of bytes requested * wait Shall we wait? * * Returns: * A pointer to the alloced memory or possibly * NULL if M_NOWAIT is set. */ static void * noobj_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *flags, int wait) { TAILQ_HEAD(, vm_page) alloctail; u_long npages; vm_offset_t retkva, zkva; vm_page_t p, p_next; uma_keg_t keg; TAILQ_INIT(&alloctail); keg = zone->uz_keg; npages = howmany(bytes, PAGE_SIZE); while (npages > 0) { p = vm_page_alloc_domain(NULL, 0, domain, VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED | VM_ALLOC_NOOBJ | ((wait & M_WAITOK) != 0 ? VM_ALLOC_WAITOK : VM_ALLOC_NOWAIT)); if (p != NULL) { /* * Since the page does not belong to an object, its * listq is unused. */ TAILQ_INSERT_TAIL(&alloctail, p, listq); npages--; continue; } /* * Page allocation failed, free intermediate pages and * exit. */ TAILQ_FOREACH_SAFE(p, &alloctail, listq, p_next) { vm_page_unwire_noq(p); vm_page_free(p); } return (NULL); } *flags = UMA_SLAB_PRIV; zkva = keg->uk_kva + atomic_fetchadd_long(&keg->uk_offset, round_page(bytes)); retkva = zkva; TAILQ_FOREACH(p, &alloctail, listq) { pmap_qenter(zkva, &p, 1); zkva += PAGE_SIZE; } return ((void *)retkva); } /* * Allocate physically contiguous pages. */ static void * contig_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag, int wait) { *pflag = UMA_SLAB_KERNEL; return ((void *)kmem_alloc_contig_domainset(DOMAINSET_FIXED(domain), bytes, wait, 0, ~(vm_paddr_t)0, 1, 0, VM_MEMATTR_DEFAULT)); } /* * Frees a number of pages to the system * * Arguments: * mem A pointer to the memory to be freed * size The size of the memory being freed * flags The original p->us_flags field * * Returns: * Nothing */ static void page_free(void *mem, vm_size_t size, uint8_t flags) { if ((flags & UMA_SLAB_BOOT) != 0) { startup_free(mem, size); return; } KASSERT((flags & UMA_SLAB_KERNEL) != 0, ("UMA: page_free used with invalid flags %x", flags)); kmem_free((vm_offset_t)mem, size); } /* * Frees pcpu zone allocations * * Arguments: * mem A pointer to the memory to be freed * size The size of the memory being freed * flags The original p->us_flags field * * Returns: * Nothing */ static void pcpu_page_free(void *mem, vm_size_t size, uint8_t flags) { vm_offset_t sva, curva; vm_paddr_t paddr; vm_page_t m; MPASS(size == (mp_maxid+1)*PAGE_SIZE); if ((flags & UMA_SLAB_BOOT) != 0) { startup_free(mem, size); return; } sva = (vm_offset_t)mem; for (curva = sva; curva < sva + size; curva += PAGE_SIZE) { paddr = pmap_kextract(curva); m = PHYS_TO_VM_PAGE(paddr); vm_page_unwire_noq(m); vm_page_free(m); } pmap_qremove(sva, size >> PAGE_SHIFT); kva_free(sva, size); } /* * Zero fill initializer * * Arguments/Returns follow uma_init specifications */ static int zero_init(void *mem, int size, int flags) { bzero(mem, size); return (0); } #ifdef INVARIANTS static struct noslabbits * slab_dbg_bits(uma_slab_t slab, uma_keg_t keg) { return ((void *)((char *)&slab->us_free + BITSET_SIZE(keg->uk_ipers))); } #endif /* * Actual size of embedded struct slab (!OFFPAGE). */ static size_t slab_sizeof(int nitems) { size_t s; s = sizeof(struct uma_slab) + BITSET_SIZE(nitems) * SLAB_BITSETS; return (roundup(s, UMA_ALIGN_PTR + 1)); } #define UMA_FIXPT_SHIFT 31 #define UMA_FRAC_FIXPT(n, d) \ ((uint32_t)(((uint64_t)(n) << UMA_FIXPT_SHIFT) / (d))) #define UMA_FIXPT_PCT(f) \ ((u_int)(((uint64_t)100 * (f)) >> UMA_FIXPT_SHIFT)) #define UMA_PCT_FIXPT(pct) UMA_FRAC_FIXPT((pct), 100) #define UMA_MIN_EFF UMA_PCT_FIXPT(100 - UMA_MAX_WASTE) /* * Compute the number of items that will fit in a slab. If hdr is true, the * item count may be limited to provide space in the slab for an inline slab * header. Otherwise, all slab space will be provided for item storage. */ static u_int slab_ipers_hdr(u_int size, u_int rsize, u_int slabsize, bool hdr) { u_int ipers; u_int padpi; /* The padding between items is not needed after the last item. */ padpi = rsize - size; if (hdr) { /* * Start with the maximum item count and remove items until * the slab header first alongside the allocatable memory. */ for (ipers = MIN(SLAB_MAX_SETSIZE, (slabsize + padpi - slab_sizeof(1)) / rsize); ipers > 0 && ipers * rsize - padpi + slab_sizeof(ipers) > slabsize; ipers--) continue; } else { ipers = MIN((slabsize + padpi) / rsize, SLAB_MAX_SETSIZE); } return (ipers); } struct keg_layout_result { u_int format; u_int slabsize; u_int ipers; u_int eff; }; static void keg_layout_one(uma_keg_t keg, u_int rsize, u_int slabsize, u_int fmt, struct keg_layout_result *kl) { u_int total; kl->format = fmt; kl->slabsize = slabsize; /* Handle INTERNAL as inline with an extra page. */ if ((fmt & UMA_ZFLAG_INTERNAL) != 0) { kl->format &= ~UMA_ZFLAG_INTERNAL; kl->slabsize += PAGE_SIZE; } kl->ipers = slab_ipers_hdr(keg->uk_size, rsize, kl->slabsize, (fmt & UMA_ZFLAG_OFFPAGE) == 0); /* Account for memory used by an offpage slab header. */ total = kl->slabsize; if ((fmt & UMA_ZFLAG_OFFPAGE) != 0) total += slabzone(kl->ipers)->uz_keg->uk_rsize; kl->eff = UMA_FRAC_FIXPT(kl->ipers * rsize, total); } /* * Determine the format of a uma keg. This determines where the slab header * will be placed (inline or offpage) and calculates ipers, rsize, and ppera. * * Arguments * keg The zone we should initialize * * Returns * Nothing */ static void keg_layout(uma_keg_t keg) { struct keg_layout_result kl = {}, kl_tmp; u_int fmts[2]; u_int alignsize; u_int nfmt; u_int pages; u_int rsize; u_int slabsize; u_int i, j; KASSERT((keg->uk_flags & UMA_ZONE_PCPU) == 0 || (keg->uk_size <= UMA_PCPU_ALLOC_SIZE && (keg->uk_flags & UMA_ZONE_CACHESPREAD) == 0), ("%s: cannot configure for PCPU: keg=%s, size=%u, flags=0x%b", __func__, keg->uk_name, keg->uk_size, keg->uk_flags, PRINT_UMA_ZFLAGS)); KASSERT((keg->uk_flags & (UMA_ZFLAG_INTERNAL | UMA_ZONE_VM)) == 0 || (keg->uk_flags & (UMA_ZONE_NOTOUCH | UMA_ZONE_PCPU)) == 0, ("%s: incompatible flags 0x%b", __func__, keg->uk_flags, PRINT_UMA_ZFLAGS)); alignsize = keg->uk_align + 1; /* * Calculate the size of each allocation (rsize) according to * alignment. If the requested size is smaller than we have * allocation bits for we round it up. */ rsize = MAX(keg->uk_size, UMA_SMALLEST_UNIT); rsize = roundup2(rsize, alignsize); if ((keg->uk_flags & UMA_ZONE_CACHESPREAD) != 0) { /* * We want one item to start on every align boundary in a page. * To do this we will span pages. We will also extend the item * by the size of align if it is an even multiple of align. * Otherwise, it would fall on the same boundary every time. */ if ((rsize & alignsize) == 0) rsize += alignsize; slabsize = rsize * (PAGE_SIZE / alignsize); slabsize = MIN(slabsize, rsize * SLAB_MAX_SETSIZE); slabsize = MIN(slabsize, UMA_CACHESPREAD_MAX_SIZE); slabsize = round_page(slabsize); } else { /* * Start with a slab size of as many pages as it takes to * represent a single item. We will try to fit as many * additional items into the slab as possible. */ slabsize = round_page(keg->uk_size); } /* Build a list of all of the available formats for this keg. */ nfmt = 0; /* Evaluate an inline slab layout. */ if ((keg->uk_flags & (UMA_ZONE_NOTOUCH | UMA_ZONE_PCPU)) == 0) fmts[nfmt++] = 0; /* TODO: vm_page-embedded slab. */ /* * We can't do OFFPAGE if we're internal or if we've been * asked to not go to the VM for buckets. If we do this we * may end up going to the VM for slabs which we do not want * to do if we're UMA_ZONE_VM, which clearly forbids it. * In those cases, evaluate a pseudo-format called INTERNAL * which has an inline slab header and one extra page to * guarantee that it fits. * * Otherwise, see if using an OFFPAGE slab will improve our * efficiency. */ if ((keg->uk_flags & (UMA_ZFLAG_INTERNAL | UMA_ZONE_VM)) != 0) fmts[nfmt++] = UMA_ZFLAG_INTERNAL; else fmts[nfmt++] = UMA_ZFLAG_OFFPAGE; /* * Choose a slab size and format which satisfy the minimum efficiency. * Prefer the smallest slab size that meets the constraints. * * Start with a minimum slab size, to accommodate CACHESPREAD. Then, * for small items (up to PAGE_SIZE), the iteration increment is one * page; and for large items, the increment is one item. */ i = (slabsize + rsize - keg->uk_size) / MAX(PAGE_SIZE, rsize); KASSERT(i >= 1, ("keg %s(%p) flags=0x%b slabsize=%u, rsize=%u, i=%u", keg->uk_name, keg, keg->uk_flags, PRINT_UMA_ZFLAGS, slabsize, rsize, i)); for ( ; ; i++) { slabsize = (rsize <= PAGE_SIZE) ? ptoa(i) : round_page(rsize * (i - 1) + keg->uk_size); for (j = 0; j < nfmt; j++) { /* Only if we have no viable format yet. */ if ((fmts[j] & UMA_ZFLAG_INTERNAL) != 0 && kl.ipers > 0) continue; keg_layout_one(keg, rsize, slabsize, fmts[j], &kl_tmp); if (kl_tmp.eff <= kl.eff) continue; kl = kl_tmp; CTR6(KTR_UMA, "keg %s layout: format %#x " "(ipers %u * rsize %u) / slabsize %#x = %u%% eff", keg->uk_name, kl.format, kl.ipers, rsize, kl.slabsize, UMA_FIXPT_PCT(kl.eff)); /* Stop when we reach the minimum efficiency. */ if (kl.eff >= UMA_MIN_EFF) break; } if (kl.eff >= UMA_MIN_EFF || !multipage_slabs || slabsize >= SLAB_MAX_SETSIZE * rsize || (keg->uk_flags & (UMA_ZONE_PCPU | UMA_ZONE_CONTIG)) != 0) break; } pages = atop(kl.slabsize); if ((keg->uk_flags & UMA_ZONE_PCPU) != 0) pages *= mp_maxid + 1; keg->uk_rsize = rsize; keg->uk_ipers = kl.ipers; keg->uk_ppera = pages; keg->uk_flags |= kl.format; /* * How do we find the slab header if it is offpage or if not all item * start addresses are in the same page? We could solve the latter * case with vaddr alignment, but we don't. */ if ((keg->uk_flags & UMA_ZFLAG_OFFPAGE) != 0 || (keg->uk_ipers - 1) * rsize >= PAGE_SIZE) { if ((keg->uk_flags & UMA_ZONE_NOTPAGE) != 0) keg->uk_flags |= UMA_ZFLAG_HASH; else keg->uk_flags |= UMA_ZFLAG_VTOSLAB; } CTR6(KTR_UMA, "%s: keg=%s, flags=%#x, rsize=%u, ipers=%u, ppera=%u", __func__, keg->uk_name, keg->uk_flags, rsize, keg->uk_ipers, pages); KASSERT(keg->uk_ipers > 0 && keg->uk_ipers <= SLAB_MAX_SETSIZE, ("%s: keg=%s, flags=0x%b, rsize=%u, ipers=%u, ppera=%u", __func__, keg->uk_name, keg->uk_flags, PRINT_UMA_ZFLAGS, rsize, keg->uk_ipers, pages)); } /* * Keg header ctor. This initializes all fields, locks, etc. And inserts * the keg onto the global keg list. * * Arguments/Returns follow uma_ctor specifications * udata Actually uma_kctor_args */ static int keg_ctor(void *mem, int size, void *udata, int flags) { struct uma_kctor_args *arg = udata; uma_keg_t keg = mem; uma_zone_t zone; int i; bzero(keg, size); keg->uk_size = arg->size; keg->uk_init = arg->uminit; keg->uk_fini = arg->fini; keg->uk_align = arg->align; keg->uk_reserve = 0; keg->uk_flags = arg->flags; /* * We use a global round-robin policy by default. Zones with * UMA_ZONE_FIRSTTOUCH set will use first-touch instead, in which * case the iterator is never run. */ keg->uk_dr.dr_policy = DOMAINSET_RR(); keg->uk_dr.dr_iter = 0; /* * The primary zone is passed to us at keg-creation time. */ zone = arg->zone; keg->uk_name = zone->uz_name; if (arg->flags & UMA_ZONE_ZINIT) keg->uk_init = zero_init; if (arg->flags & UMA_ZONE_MALLOC) keg->uk_flags |= UMA_ZFLAG_VTOSLAB; #ifndef SMP keg->uk_flags &= ~UMA_ZONE_PCPU; #endif keg_layout(keg); /* * Use a first-touch NUMA policy for kegs that pmap_extract() will * work on. Use round-robin for everything else. * * Zones may override the default by specifying either. */ #ifdef NUMA if ((keg->uk_flags & (UMA_ZONE_ROUNDROBIN | UMA_ZFLAG_CACHE | UMA_ZONE_NOTPAGE)) == 0) keg->uk_flags |= UMA_ZONE_FIRSTTOUCH; else if ((keg->uk_flags & UMA_ZONE_FIRSTTOUCH) == 0) keg->uk_flags |= UMA_ZONE_ROUNDROBIN; #endif /* * If we haven't booted yet we need allocations to go through the * startup cache until the vm is ready. */ #ifdef UMA_MD_SMALL_ALLOC if (keg->uk_ppera == 1) keg->uk_allocf = uma_small_alloc; else #endif if (booted < BOOT_KVA) keg->uk_allocf = startup_alloc; else if (keg->uk_flags & UMA_ZONE_PCPU) keg->uk_allocf = pcpu_page_alloc; else if ((keg->uk_flags & UMA_ZONE_CONTIG) != 0 && keg->uk_ppera > 1) keg->uk_allocf = contig_alloc; else keg->uk_allocf = page_alloc; #ifdef UMA_MD_SMALL_ALLOC if (keg->uk_ppera == 1) keg->uk_freef = uma_small_free; else #endif if (keg->uk_flags & UMA_ZONE_PCPU) keg->uk_freef = pcpu_page_free; else keg->uk_freef = page_free; /* * Initialize keg's locks. */ for (i = 0; i < vm_ndomains; i++) KEG_LOCK_INIT(keg, i, (arg->flags & UMA_ZONE_MTXCLASS)); /* * If we're putting the slab header in the actual page we need to * figure out where in each page it goes. See slab_sizeof * definition. */ if (!(keg->uk_flags & UMA_ZFLAG_OFFPAGE)) { size_t shsize; shsize = slab_sizeof(keg->uk_ipers); keg->uk_pgoff = (PAGE_SIZE * keg->uk_ppera) - shsize; /* * The only way the following is possible is if with our * UMA_ALIGN_PTR adjustments we are now bigger than * UMA_SLAB_SIZE. I haven't checked whether this is * mathematically possible for all cases, so we make * sure here anyway. */ KASSERT(keg->uk_pgoff + shsize <= PAGE_SIZE * keg->uk_ppera, ("zone %s ipers %d rsize %d size %d slab won't fit", zone->uz_name, keg->uk_ipers, keg->uk_rsize, keg->uk_size)); } if (keg->uk_flags & UMA_ZFLAG_HASH) hash_alloc(&keg->uk_hash, 0); CTR3(KTR_UMA, "keg_ctor %p zone %s(%p)", keg, zone->uz_name, zone); LIST_INSERT_HEAD(&keg->uk_zones, zone, uz_link); rw_wlock(&uma_rwlock); LIST_INSERT_HEAD(&uma_kegs, keg, uk_link); rw_wunlock(&uma_rwlock); return (0); } static void zone_kva_available(uma_zone_t zone, void *unused) { uma_keg_t keg; if ((zone->uz_flags & UMA_ZFLAG_CACHE) != 0) return; KEG_GET(zone, keg); if (keg->uk_allocf == startup_alloc) { /* Switch to the real allocator. */ if (keg->uk_flags & UMA_ZONE_PCPU) keg->uk_allocf = pcpu_page_alloc; else if ((keg->uk_flags & UMA_ZONE_CONTIG) != 0 && keg->uk_ppera > 1) keg->uk_allocf = contig_alloc; else keg->uk_allocf = page_alloc; } } static void zone_alloc_counters(uma_zone_t zone, void *unused) { zone->uz_allocs = counter_u64_alloc(M_WAITOK); zone->uz_frees = counter_u64_alloc(M_WAITOK); zone->uz_fails = counter_u64_alloc(M_WAITOK); zone->uz_xdomain = counter_u64_alloc(M_WAITOK); } static void zone_alloc_sysctl(uma_zone_t zone, void *unused) { uma_zone_domain_t zdom; uma_domain_t dom; uma_keg_t keg; struct sysctl_oid *oid, *domainoid; int domains, i, cnt; static const char *nokeg = "cache zone"; char *c; /* * Make a sysctl safe copy of the zone name by removing * any special characters and handling dups by appending * an index. */ if (zone->uz_namecnt != 0) { /* Count the number of decimal digits and '_' separator. */ for (i = 1, cnt = zone->uz_namecnt; cnt != 0; i++) cnt /= 10; zone->uz_ctlname = malloc(strlen(zone->uz_name) + i + 1, M_UMA, M_WAITOK); sprintf(zone->uz_ctlname, "%s_%d", zone->uz_name, zone->uz_namecnt); } else zone->uz_ctlname = strdup(zone->uz_name, M_UMA); for (c = zone->uz_ctlname; *c != '\0'; c++) if (strchr("./\\ -", *c) != NULL) *c = '_'; /* * Basic parameters at the root. */ zone->uz_oid = SYSCTL_ADD_NODE(NULL, SYSCTL_STATIC_CHILDREN(_vm_uma), OID_AUTO, zone->uz_ctlname, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, ""); oid = zone->uz_oid; SYSCTL_ADD_U32(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "size", CTLFLAG_RD, &zone->uz_size, 0, "Allocation size"); SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "flags", CTLFLAG_RD | CTLTYPE_STRING | CTLFLAG_MPSAFE, zone, 0, sysctl_handle_uma_zone_flags, "A", "Allocator configuration flags"); SYSCTL_ADD_U16(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "bucket_size", CTLFLAG_RD, &zone->uz_bucket_size, 0, "Desired per-cpu cache size"); SYSCTL_ADD_U16(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "bucket_size_max", CTLFLAG_RD, &zone->uz_bucket_size_max, 0, "Maximum allowed per-cpu cache size"); /* * keg if present. */ if ((zone->uz_flags & UMA_ZFLAG_HASH) == 0) domains = vm_ndomains; else domains = 1; oid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(zone->uz_oid), OID_AUTO, "keg", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, ""); keg = zone->uz_keg; if ((zone->uz_flags & UMA_ZFLAG_CACHE) == 0) { SYSCTL_ADD_CONST_STRING(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "name", CTLFLAG_RD, keg->uk_name, "Keg name"); SYSCTL_ADD_U32(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "rsize", CTLFLAG_RD, &keg->uk_rsize, 0, "Real object size with alignment"); SYSCTL_ADD_U16(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "ppera", CTLFLAG_RD, &keg->uk_ppera, 0, "pages per-slab allocation"); SYSCTL_ADD_U16(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "ipers", CTLFLAG_RD, &keg->uk_ipers, 0, "items available per-slab"); SYSCTL_ADD_U32(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "align", CTLFLAG_RD, &keg->uk_align, 0, "item alignment mask"); SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "efficiency", CTLFLAG_RD | CTLTYPE_INT | CTLFLAG_MPSAFE, keg, 0, sysctl_handle_uma_slab_efficiency, "I", "Slab utilization (100 - internal fragmentation %)"); domainoid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "domain", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, ""); for (i = 0; i < domains; i++) { dom = &keg->uk_domain[i]; oid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(domainoid), OID_AUTO, VM_DOMAIN(i)->vmd_name, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, ""); SYSCTL_ADD_U32(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "pages", CTLFLAG_RD, &dom->ud_pages, 0, "Total pages currently allocated from VM"); SYSCTL_ADD_U32(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "free_items", CTLFLAG_RD, &dom->ud_free_items, 0, "items free in the slab layer"); } } else SYSCTL_ADD_CONST_STRING(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "name", CTLFLAG_RD, nokeg, "Keg name"); /* * Information about zone limits. */ oid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(zone->uz_oid), OID_AUTO, "limit", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, ""); SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "items", CTLFLAG_RD | CTLTYPE_U64 | CTLFLAG_MPSAFE, zone, 0, sysctl_handle_uma_zone_items, "QU", "current number of allocated items if limit is set"); SYSCTL_ADD_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "max_items", CTLFLAG_RD, &zone->uz_max_items, 0, "Maximum number of cached items"); SYSCTL_ADD_U32(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "sleepers", CTLFLAG_RD, &zone->uz_sleepers, 0, "Number of threads sleeping at limit"); SYSCTL_ADD_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "sleeps", CTLFLAG_RD, &zone->uz_sleeps, 0, "Total zone limit sleeps"); SYSCTL_ADD_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "bucket_max", CTLFLAG_RD, &zone->uz_bucket_max, 0, "Maximum number of items in each domain's bucket cache"); /* * Per-domain zone information. */ domainoid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(zone->uz_oid), OID_AUTO, "domain", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, ""); for (i = 0; i < domains; i++) { zdom = ZDOM_GET(zone, i); oid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(domainoid), OID_AUTO, VM_DOMAIN(i)->vmd_name, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, ""); SYSCTL_ADD_LONG(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "nitems", CTLFLAG_RD, &zdom->uzd_nitems, "number of items in this domain"); SYSCTL_ADD_LONG(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "imax", CTLFLAG_RD, &zdom->uzd_imax, "maximum item count in this period"); SYSCTL_ADD_LONG(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "imin", CTLFLAG_RD, &zdom->uzd_imin, "minimum item count in this period"); SYSCTL_ADD_LONG(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "wss", CTLFLAG_RD, &zdom->uzd_wss, "Working set size"); } /* * General statistics. */ oid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(zone->uz_oid), OID_AUTO, "stats", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, ""); SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "current", CTLFLAG_RD | CTLTYPE_INT | CTLFLAG_MPSAFE, zone, 1, sysctl_handle_uma_zone_cur, "I", "Current number of allocated items"); SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "allocs", CTLFLAG_RD | CTLTYPE_U64 | CTLFLAG_MPSAFE, zone, 0, sysctl_handle_uma_zone_allocs, "QU", "Total allocation calls"); SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "frees", CTLFLAG_RD | CTLTYPE_U64 | CTLFLAG_MPSAFE, zone, 0, sysctl_handle_uma_zone_frees, "QU", "Total free calls"); SYSCTL_ADD_COUNTER_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "fails", CTLFLAG_RD, &zone->uz_fails, "Number of allocation failures"); SYSCTL_ADD_COUNTER_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "xdomain", CTLFLAG_RD, &zone->uz_xdomain, "Free calls from the wrong domain"); } struct uma_zone_count { const char *name; int count; }; static void zone_count(uma_zone_t zone, void *arg) { struct uma_zone_count *cnt; cnt = arg; /* * Some zones are rapidly created with identical names and * destroyed out of order. This can lead to gaps in the count. * Use one greater than the maximum observed for this name. */ if (strcmp(zone->uz_name, cnt->name) == 0) cnt->count = MAX(cnt->count, zone->uz_namecnt + 1); } static void zone_update_caches(uma_zone_t zone) { int i; for (i = 0; i <= mp_maxid; i++) { cache_set_uz_size(&zone->uz_cpu[i], zone->uz_size); cache_set_uz_flags(&zone->uz_cpu[i], zone->uz_flags); } } /* * Zone header ctor. This initializes all fields, locks, etc. * * Arguments/Returns follow uma_ctor specifications * udata Actually uma_zctor_args */ static int zone_ctor(void *mem, int size, void *udata, int flags) { struct uma_zone_count cnt; struct uma_zctor_args *arg = udata; uma_zone_domain_t zdom; uma_zone_t zone = mem; uma_zone_t z; uma_keg_t keg; int i; bzero(zone, size); zone->uz_name = arg->name; zone->uz_ctor = arg->ctor; zone->uz_dtor = arg->dtor; zone->uz_init = NULL; zone->uz_fini = NULL; zone->uz_sleeps = 0; zone->uz_bucket_size = 0; zone->uz_bucket_size_min = 0; zone->uz_bucket_size_max = BUCKET_MAX; zone->uz_flags = (arg->flags & UMA_ZONE_SMR); zone->uz_warning = NULL; /* The domain structures follow the cpu structures. */ zone->uz_bucket_max = ULONG_MAX; timevalclear(&zone->uz_ratecheck); /* Count the number of duplicate names. */ cnt.name = arg->name; cnt.count = 0; zone_foreach(zone_count, &cnt); zone->uz_namecnt = cnt.count; ZONE_CROSS_LOCK_INIT(zone); for (i = 0; i < vm_ndomains; i++) { zdom = ZDOM_GET(zone, i); ZDOM_LOCK_INIT(zone, zdom, (arg->flags & UMA_ZONE_MTXCLASS)); STAILQ_INIT(&zdom->uzd_buckets); } #ifdef INVARIANTS if (arg->uminit == trash_init && arg->fini == trash_fini) zone->uz_flags |= UMA_ZFLAG_TRASH | UMA_ZFLAG_CTORDTOR; #endif /* * This is a pure cache zone, no kegs. */ if (arg->import) { KASSERT((arg->flags & UMA_ZFLAG_CACHE) != 0, ("zone_ctor: Import specified for non-cache zone.")); zone->uz_flags = arg->flags; zone->uz_size = arg->size; zone->uz_import = arg->import; zone->uz_release = arg->release; zone->uz_arg = arg->arg; #ifdef NUMA /* * Cache zones are round-robin unless a policy is * specified because they may have incompatible * constraints. */ if ((zone->uz_flags & UMA_ZONE_FIRSTTOUCH) == 0) zone->uz_flags |= UMA_ZONE_ROUNDROBIN; #endif rw_wlock(&uma_rwlock); LIST_INSERT_HEAD(&uma_cachezones, zone, uz_link); rw_wunlock(&uma_rwlock); goto out; } /* * Use the regular zone/keg/slab allocator. */ zone->uz_import = zone_import; zone->uz_release = zone_release; zone->uz_arg = zone; keg = arg->keg; if (arg->flags & UMA_ZONE_SECONDARY) { KASSERT((zone->uz_flags & UMA_ZONE_SECONDARY) == 0, ("Secondary zone requested UMA_ZFLAG_INTERNAL")); KASSERT(arg->keg != NULL, ("Secondary zone on zero'd keg")); zone->uz_init = arg->uminit; zone->uz_fini = arg->fini; zone->uz_flags |= UMA_ZONE_SECONDARY; rw_wlock(&uma_rwlock); ZONE_LOCK(zone); LIST_FOREACH(z, &keg->uk_zones, uz_link) { if (LIST_NEXT(z, uz_link) == NULL) { LIST_INSERT_AFTER(z, zone, uz_link); break; } } ZONE_UNLOCK(zone); rw_wunlock(&uma_rwlock); } else if (keg == NULL) { if ((keg = uma_kcreate(zone, arg->size, arg->uminit, arg->fini, arg->align, arg->flags)) == NULL) return (ENOMEM); } else { struct uma_kctor_args karg; int error; /* We should only be here from uma_startup() */ karg.size = arg->size; karg.uminit = arg->uminit; karg.fini = arg->fini; karg.align = arg->align; karg.flags = (arg->flags & ~UMA_ZONE_SMR); karg.zone = zone; error = keg_ctor(arg->keg, sizeof(struct uma_keg), &karg, flags); if (error) return (error); } /* Inherit properties from the keg. */ zone->uz_keg = keg; zone->uz_size = keg->uk_size; zone->uz_flags |= (keg->uk_flags & (UMA_ZONE_INHERIT | UMA_ZFLAG_INHERIT)); out: if (booted >= BOOT_PCPU) { zone_alloc_counters(zone, NULL); if (booted >= BOOT_RUNNING) zone_alloc_sysctl(zone, NULL); } else { zone->uz_allocs = EARLY_COUNTER; zone->uz_frees = EARLY_COUNTER; zone->uz_fails = EARLY_COUNTER; } /* Caller requests a private SMR context. */ if ((zone->uz_flags & UMA_ZONE_SMR) != 0) zone->uz_smr = smr_create(zone->uz_name, 0, 0); KASSERT((arg->flags & (UMA_ZONE_MAXBUCKET | UMA_ZONE_NOBUCKET)) != (UMA_ZONE_MAXBUCKET | UMA_ZONE_NOBUCKET), ("Invalid zone flag combination")); if (arg->flags & UMA_ZFLAG_INTERNAL) zone->uz_bucket_size_max = zone->uz_bucket_size = 0; if ((arg->flags & UMA_ZONE_MAXBUCKET) != 0) zone->uz_bucket_size = BUCKET_MAX; else if ((arg->flags & UMA_ZONE_MINBUCKET) != 0) zone->uz_bucket_size_max = zone->uz_bucket_size = BUCKET_MIN; else if ((arg->flags & UMA_ZONE_NOBUCKET) != 0) zone->uz_bucket_size = 0; else zone->uz_bucket_size = bucket_select(zone->uz_size); zone->uz_bucket_size_min = zone->uz_bucket_size; if (zone->uz_dtor != NULL || zone->uz_ctor != NULL) zone->uz_flags |= UMA_ZFLAG_CTORDTOR; zone_update_caches(zone); return (0); } /* * Keg header dtor. This frees all data, destroys locks, frees the hash * table and removes the keg from the global list. * * Arguments/Returns follow uma_dtor specifications * udata unused */ static void keg_dtor(void *arg, int size, void *udata) { uma_keg_t keg; uint32_t free, pages; int i; keg = (uma_keg_t)arg; free = pages = 0; for (i = 0; i < vm_ndomains; i++) { free += keg->uk_domain[i].ud_free_items; pages += keg->uk_domain[i].ud_pages; KEG_LOCK_FINI(keg, i); } if (pages != 0) printf("Freed UMA keg (%s) was not empty (%u items). " " Lost %u pages of memory.\n", keg->uk_name ? keg->uk_name : "", pages / keg->uk_ppera * keg->uk_ipers - free, pages); hash_free(&keg->uk_hash); } /* * Zone header dtor. * * Arguments/Returns follow uma_dtor specifications * udata unused */ static void zone_dtor(void *arg, int size, void *udata) { uma_zone_t zone; uma_keg_t keg; int i; zone = (uma_zone_t)arg; sysctl_remove_oid(zone->uz_oid, 1, 1); if (!(zone->uz_flags & UMA_ZFLAG_INTERNAL)) cache_drain(zone); rw_wlock(&uma_rwlock); LIST_REMOVE(zone, uz_link); rw_wunlock(&uma_rwlock); zone_reclaim(zone, M_WAITOK, true); /* * We only destroy kegs from non secondary/non cache zones. */ if ((zone->uz_flags & (UMA_ZONE_SECONDARY | UMA_ZFLAG_CACHE)) == 0) { keg = zone->uz_keg; rw_wlock(&uma_rwlock); LIST_REMOVE(keg, uk_link); rw_wunlock(&uma_rwlock); zone_free_item(kegs, keg, NULL, SKIP_NONE); } counter_u64_free(zone->uz_allocs); counter_u64_free(zone->uz_frees); counter_u64_free(zone->uz_fails); counter_u64_free(zone->uz_xdomain); free(zone->uz_ctlname, M_UMA); for (i = 0; i < vm_ndomains; i++) ZDOM_LOCK_FINI(ZDOM_GET(zone, i)); ZONE_CROSS_LOCK_FINI(zone); } static void zone_foreach_unlocked(void (*zfunc)(uma_zone_t, void *arg), void *arg) { uma_keg_t keg; uma_zone_t zone; LIST_FOREACH(keg, &uma_kegs, uk_link) { LIST_FOREACH(zone, &keg->uk_zones, uz_link) zfunc(zone, arg); } LIST_FOREACH(zone, &uma_cachezones, uz_link) zfunc(zone, arg); } /* * Traverses every zone in the system and calls a callback * * Arguments: * zfunc A pointer to a function which accepts a zone * as an argument. * * Returns: * Nothing */ static void zone_foreach(void (*zfunc)(uma_zone_t, void *arg), void *arg) { rw_rlock(&uma_rwlock); zone_foreach_unlocked(zfunc, arg); rw_runlock(&uma_rwlock); } /* * Initialize the kernel memory allocator. This is done after pages can be * allocated but before general KVA is available. */ void uma_startup1(vm_offset_t virtual_avail) { struct uma_zctor_args args; size_t ksize, zsize, size; uma_keg_t primarykeg; uintptr_t m; int domain; uint8_t pflag; bootstart = bootmem = virtual_avail; rw_init(&uma_rwlock, "UMA lock"); sx_init(&uma_reclaim_lock, "umareclaim"); ksize = sizeof(struct uma_keg) + (sizeof(struct uma_domain) * vm_ndomains); ksize = roundup(ksize, UMA_SUPER_ALIGN); zsize = sizeof(struct uma_zone) + (sizeof(struct uma_cache) * (mp_maxid + 1)) + (sizeof(struct uma_zone_domain) * vm_ndomains); zsize = roundup(zsize, UMA_SUPER_ALIGN); /* Allocate the zone of zones, zone of kegs, and zone of zones keg. */ size = (zsize * 2) + ksize; for (domain = 0; domain < vm_ndomains; domain++) { m = (uintptr_t)startup_alloc(NULL, size, domain, &pflag, M_NOWAIT | M_ZERO); if (m != 0) break; } zones = (uma_zone_t)m; m += zsize; kegs = (uma_zone_t)m; m += zsize; primarykeg = (uma_keg_t)m; /* "manually" create the initial zone */ memset(&args, 0, sizeof(args)); args.name = "UMA Kegs"; args.size = ksize; args.ctor = keg_ctor; args.dtor = keg_dtor; args.uminit = zero_init; args.fini = NULL; args.keg = primarykeg; args.align = UMA_SUPER_ALIGN - 1; args.flags = UMA_ZFLAG_INTERNAL; zone_ctor(kegs, zsize, &args, M_WAITOK); args.name = "UMA Zones"; args.size = zsize; args.ctor = zone_ctor; args.dtor = zone_dtor; args.uminit = zero_init; args.fini = NULL; args.keg = NULL; args.align = UMA_SUPER_ALIGN - 1; args.flags = UMA_ZFLAG_INTERNAL; zone_ctor(zones, zsize, &args, M_WAITOK); /* Now make zones for slab headers */ slabzones[0] = uma_zcreate("UMA Slabs 0", SLABZONE0_SIZE, NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL); slabzones[1] = uma_zcreate("UMA Slabs 1", SLABZONE1_SIZE, NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL); hashzone = uma_zcreate("UMA Hash", sizeof(struct slabhead *) * UMA_HASH_SIZE_INIT, NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL); bucket_init(); smr_init(); } #ifndef UMA_MD_SMALL_ALLOC extern void vm_radix_reserve_kva(void); #endif /* * Advertise the availability of normal kva allocations and switch to * the default back-end allocator. Marks the KVA we consumed on startup * as used in the map. */ void uma_startup2(void) { if (bootstart != bootmem) { vm_map_lock(kernel_map); (void)vm_map_insert(kernel_map, NULL, 0, bootstart, bootmem, VM_PROT_RW, VM_PROT_RW, MAP_NOFAULT); vm_map_unlock(kernel_map); } #ifndef UMA_MD_SMALL_ALLOC /* Set up radix zone to use noobj_alloc. */ vm_radix_reserve_kva(); #endif booted = BOOT_KVA; zone_foreach_unlocked(zone_kva_available, NULL); bucket_enable(); } /* * Allocate counters as early as possible so that boot-time allocations are * accounted more precisely. */ static void uma_startup_pcpu(void *arg __unused) { zone_foreach_unlocked(zone_alloc_counters, NULL); booted = BOOT_PCPU; } SYSINIT(uma_startup_pcpu, SI_SUB_COUNTER, SI_ORDER_ANY, uma_startup_pcpu, NULL); /* * Finish our initialization steps. */ static void uma_startup3(void *arg __unused) { #ifdef INVARIANTS TUNABLE_INT_FETCH("vm.debug.divisor", &dbg_divisor); uma_dbg_cnt = counter_u64_alloc(M_WAITOK); uma_skip_cnt = counter_u64_alloc(M_WAITOK); #endif zone_foreach_unlocked(zone_alloc_sysctl, NULL); callout_init(&uma_callout, 1); callout_reset(&uma_callout, UMA_TIMEOUT * hz, uma_timeout, NULL); booted = BOOT_RUNNING; EVENTHANDLER_REGISTER(shutdown_post_sync, uma_shutdown, NULL, EVENTHANDLER_PRI_FIRST); } SYSINIT(uma_startup3, SI_SUB_VM_CONF, SI_ORDER_SECOND, uma_startup3, NULL); static void uma_shutdown(void) { booted = BOOT_SHUTDOWN; } static uma_keg_t uma_kcreate(uma_zone_t zone, size_t size, uma_init uminit, uma_fini fini, int align, uint32_t flags) { struct uma_kctor_args args; args.size = size; args.uminit = uminit; args.fini = fini; args.align = (align == UMA_ALIGN_CACHE) ? uma_align_cache : align; args.flags = flags; args.zone = zone; return (zone_alloc_item(kegs, &args, UMA_ANYDOMAIN, M_WAITOK)); } /* Public functions */ /* See uma.h */ void uma_set_align(int align) { if (align != UMA_ALIGN_CACHE) uma_align_cache = align; } /* See uma.h */ uma_zone_t uma_zcreate(const char *name, size_t size, uma_ctor ctor, uma_dtor dtor, uma_init uminit, uma_fini fini, int align, uint32_t flags) { struct uma_zctor_args args; uma_zone_t res; KASSERT(powerof2(align + 1), ("invalid zone alignment %d for \"%s\"", align, name)); /* This stuff is essential for the zone ctor */ memset(&args, 0, sizeof(args)); args.name = name; args.size = size; args.ctor = ctor; args.dtor = dtor; args.uminit = uminit; args.fini = fini; #ifdef INVARIANTS /* * Inject procedures which check for memory use after free if we are * allowed to scramble the memory while it is not allocated. This * requires that: UMA is actually able to access the memory, no init * or fini procedures, no dependency on the initial value of the * memory, and no (legitimate) use of the memory after free. Note, * the ctor and dtor do not need to be empty. */ if ((!(flags & (UMA_ZONE_ZINIT | UMA_ZONE_NOTOUCH | UMA_ZONE_NOFREE))) && uminit == NULL && fini == NULL) { args.uminit = trash_init; args.fini = trash_fini; } #endif args.align = align; args.flags = flags; args.keg = NULL; sx_slock(&uma_reclaim_lock); res = zone_alloc_item(zones, &args, UMA_ANYDOMAIN, M_WAITOK); sx_sunlock(&uma_reclaim_lock); return (res); } /* See uma.h */ uma_zone_t uma_zsecond_create(const char *name, uma_ctor ctor, uma_dtor dtor, uma_init zinit, uma_fini zfini, uma_zone_t primary) { struct uma_zctor_args args; uma_keg_t keg; uma_zone_t res; keg = primary->uz_keg; memset(&args, 0, sizeof(args)); args.name = name; args.size = keg->uk_size; args.ctor = ctor; args.dtor = dtor; args.uminit = zinit; args.fini = zfini; args.align = keg->uk_align; args.flags = keg->uk_flags | UMA_ZONE_SECONDARY; args.keg = keg; sx_slock(&uma_reclaim_lock); res = zone_alloc_item(zones, &args, UMA_ANYDOMAIN, M_WAITOK); sx_sunlock(&uma_reclaim_lock); return (res); } /* See uma.h */ uma_zone_t uma_zcache_create(const char *name, int size, uma_ctor ctor, uma_dtor dtor, uma_init zinit, uma_fini zfini, uma_import zimport, uma_release zrelease, void *arg, int flags) { struct uma_zctor_args args; memset(&args, 0, sizeof(args)); args.name = name; args.size = size; args.ctor = ctor; args.dtor = dtor; args.uminit = zinit; args.fini = zfini; args.import = zimport; args.release = zrelease; args.arg = arg; args.align = 0; args.flags = flags | UMA_ZFLAG_CACHE; return (zone_alloc_item(zones, &args, UMA_ANYDOMAIN, M_WAITOK)); } /* See uma.h */ void uma_zdestroy(uma_zone_t zone) { /* * Large slabs are expensive to reclaim, so don't bother doing * unnecessary work if we're shutting down. */ if (booted == BOOT_SHUTDOWN && zone->uz_fini == NULL && zone->uz_release == zone_release) return; sx_slock(&uma_reclaim_lock); zone_free_item(zones, zone, NULL, SKIP_NONE); sx_sunlock(&uma_reclaim_lock); } void uma_zwait(uma_zone_t zone) { if ((zone->uz_flags & UMA_ZONE_SMR) != 0) uma_zfree_smr(zone, uma_zalloc_smr(zone, M_WAITOK)); else if ((zone->uz_flags & UMA_ZONE_PCPU) != 0) uma_zfree_pcpu(zone, uma_zalloc_pcpu(zone, M_WAITOK)); else uma_zfree(zone, uma_zalloc(zone, M_WAITOK)); } void * uma_zalloc_pcpu_arg(uma_zone_t zone, void *udata, int flags) { void *item, *pcpu_item; #ifdef SMP int i; MPASS(zone->uz_flags & UMA_ZONE_PCPU); #endif item = uma_zalloc_arg(zone, udata, flags & ~M_ZERO); if (item == NULL) return (NULL); pcpu_item = zpcpu_base_to_offset(item); if (flags & M_ZERO) { #ifdef SMP for (i = 0; i <= mp_maxid; i++) bzero(zpcpu_get_cpu(pcpu_item, i), zone->uz_size); #else bzero(item, zone->uz_size); #endif } return (pcpu_item); } /* * A stub while both regular and pcpu cases are identical. */ void uma_zfree_pcpu_arg(uma_zone_t zone, void *pcpu_item, void *udata) { void *item; #ifdef SMP MPASS(zone->uz_flags & UMA_ZONE_PCPU); #endif item = zpcpu_offset_to_base(pcpu_item); uma_zfree_arg(zone, item, udata); } static inline void * item_ctor(uma_zone_t zone, int uz_flags, int size, void *udata, int flags, void *item) { #ifdef INVARIANTS bool skipdbg; skipdbg = uma_dbg_zskip(zone, item); if (!skipdbg && (zone->uz_flags & UMA_ZFLAG_TRASH) != 0 && zone->uz_ctor != trash_ctor) trash_ctor(item, size, udata, flags); #endif /* Check flags before loading ctor pointer. */ if (__predict_false((uz_flags & UMA_ZFLAG_CTORDTOR) != 0) && __predict_false(zone->uz_ctor != NULL) && zone->uz_ctor(item, size, udata, flags) != 0) { counter_u64_add(zone->uz_fails, 1); zone_free_item(zone, item, udata, SKIP_DTOR | SKIP_CNT); return (NULL); } #ifdef INVARIANTS if (!skipdbg) uma_dbg_alloc(zone, NULL, item); #endif if (__predict_false(flags & M_ZERO)) return (memset(item, 0, size)); return (item); } static inline void item_dtor(uma_zone_t zone, void *item, int size, void *udata, enum zfreeskip skip) { #ifdef INVARIANTS bool skipdbg; skipdbg = uma_dbg_zskip(zone, item); if (skip == SKIP_NONE && !skipdbg) { if ((zone->uz_flags & UMA_ZONE_MALLOC) != 0) uma_dbg_free(zone, udata, item); else uma_dbg_free(zone, NULL, item); } #endif if (__predict_true(skip < SKIP_DTOR)) { if (zone->uz_dtor != NULL) zone->uz_dtor(item, size, udata); #ifdef INVARIANTS if (!skipdbg && (zone->uz_flags & UMA_ZFLAG_TRASH) != 0 && zone->uz_dtor != trash_dtor) trash_dtor(item, size, udata); #endif } } #ifdef NUMA static int item_domain(void *item) { int domain; domain = _vm_phys_domain(vtophys(item)); KASSERT(domain >= 0 && domain < vm_ndomains, ("%s: unknown domain for item %p", __func__, item)); return (domain); } #endif #if defined(INVARIANTS) || defined(DEBUG_MEMGUARD) || defined(WITNESS) #define UMA_ZALLOC_DEBUG static int uma_zalloc_debug(uma_zone_t zone, void **itemp, void *udata, int flags) { int error; error = 0; #ifdef WITNESS if (flags & M_WAITOK) { WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "uma_zalloc_debug: zone \"%s\"", zone->uz_name); } #endif #ifdef INVARIANTS KASSERT((flags & M_EXEC) == 0, ("uma_zalloc_debug: called with M_EXEC")); KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(), ("uma_zalloc_debug: called within spinlock or critical section")); KASSERT((zone->uz_flags & UMA_ZONE_PCPU) == 0 || (flags & M_ZERO) == 0, ("uma_zalloc_debug: allocating from a pcpu zone with M_ZERO")); #endif #ifdef DEBUG_MEMGUARD if ((zone->uz_flags & UMA_ZONE_SMR) == 0 && memguard_cmp_zone(zone)) { void *item; item = memguard_alloc(zone->uz_size, flags); if (item != NULL) { error = EJUSTRETURN; if (zone->uz_init != NULL && zone->uz_init(item, zone->uz_size, flags) != 0) { *itemp = NULL; return (error); } if (zone->uz_ctor != NULL && zone->uz_ctor(item, zone->uz_size, udata, flags) != 0) { counter_u64_add(zone->uz_fails, 1); zone->uz_fini(item, zone->uz_size); *itemp = NULL; return (error); } *itemp = item; return (error); } /* This is unfortunate but should not be fatal. */ } #endif return (error); } static int uma_zfree_debug(uma_zone_t zone, void *item, void *udata) { KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(), ("uma_zfree_debug: called with spinlock or critical section held")); #ifdef DEBUG_MEMGUARD if ((zone->uz_flags & UMA_ZONE_SMR) == 0 && is_memguard_addr(item)) { if (zone->uz_dtor != NULL) zone->uz_dtor(item, zone->uz_size, udata); if (zone->uz_fini != NULL) zone->uz_fini(item, zone->uz_size); memguard_free(item); return (EJUSTRETURN); } #endif return (0); } #endif static inline void * cache_alloc_item(uma_zone_t zone, uma_cache_t cache, uma_cache_bucket_t bucket, void *udata, int flags) { void *item; int size, uz_flags; item = cache_bucket_pop(cache, bucket); size = cache_uz_size(cache); uz_flags = cache_uz_flags(cache); critical_exit(); return (item_ctor(zone, uz_flags, size, udata, flags, item)); } static __noinline void * cache_alloc_retry(uma_zone_t zone, uma_cache_t cache, void *udata, int flags) { uma_cache_bucket_t bucket; int domain; while (cache_alloc(zone, cache, udata, flags)) { cache = &zone->uz_cpu[curcpu]; bucket = &cache->uc_allocbucket; if (__predict_false(bucket->ucb_cnt == 0)) continue; return (cache_alloc_item(zone, cache, bucket, udata, flags)); } critical_exit(); /* * We can not get a bucket so try to return a single item. */ if (zone->uz_flags & UMA_ZONE_FIRSTTOUCH) domain = PCPU_GET(domain); else domain = UMA_ANYDOMAIN; return (zone_alloc_item(zone, udata, domain, flags)); } /* See uma.h */ void * uma_zalloc_smr(uma_zone_t zone, int flags) { uma_cache_bucket_t bucket; uma_cache_t cache; #ifdef UMA_ZALLOC_DEBUG void *item; KASSERT((zone->uz_flags & UMA_ZONE_SMR) != 0, ("uma_zalloc_arg: called with non-SMR zone.")); if (uma_zalloc_debug(zone, &item, NULL, flags) == EJUSTRETURN) return (item); #endif critical_enter(); cache = &zone->uz_cpu[curcpu]; bucket = &cache->uc_allocbucket; if (__predict_false(bucket->ucb_cnt == 0)) return (cache_alloc_retry(zone, cache, NULL, flags)); return (cache_alloc_item(zone, cache, bucket, NULL, flags)); } /* See uma.h */ void * uma_zalloc_arg(uma_zone_t zone, void *udata, int flags) { uma_cache_bucket_t bucket; uma_cache_t cache; /* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */ random_harvest_fast_uma(&zone, sizeof(zone), RANDOM_UMA); /* This is the fast path allocation */ CTR3(KTR_UMA, "uma_zalloc_arg zone %s(%p) flags %d", zone->uz_name, zone, flags); #ifdef UMA_ZALLOC_DEBUG void *item; KASSERT((zone->uz_flags & UMA_ZONE_SMR) == 0, ("uma_zalloc_arg: called with SMR zone.")); if (uma_zalloc_debug(zone, &item, udata, flags) == EJUSTRETURN) return (item); #endif /* * If possible, allocate from the per-CPU cache. There are two * requirements for safe access to the per-CPU cache: (1) the thread * accessing the cache must not be preempted or yield during access, * and (2) the thread must not migrate CPUs without switching which * cache it accesses. We rely on a critical section to prevent * preemption and migration. We release the critical section in * order to acquire the zone mutex if we are unable to allocate from * the current cache; when we re-acquire the critical section, we * must detect and handle migration if it has occurred. */ critical_enter(); cache = &zone->uz_cpu[curcpu]; bucket = &cache->uc_allocbucket; if (__predict_false(bucket->ucb_cnt == 0)) return (cache_alloc_retry(zone, cache, udata, flags)); return (cache_alloc_item(zone, cache, bucket, udata, flags)); } /* * Replenish an alloc bucket and possibly restore an old one. Called in * a critical section. Returns in a critical section. * * A false return value indicates an allocation failure. * A true return value indicates success and the caller should retry. */ static __noinline bool cache_alloc(uma_zone_t zone, uma_cache_t cache, void *udata, int flags) { uma_bucket_t bucket; int curdomain, domain; bool new; CRITICAL_ASSERT(curthread); /* * If we have run out of items in our alloc bucket see * if we can switch with the free bucket. * * SMR Zones can't re-use the free bucket until the sequence has * expired. */ if ((cache_uz_flags(cache) & UMA_ZONE_SMR) == 0 && cache->uc_freebucket.ucb_cnt != 0) { cache_bucket_swap(&cache->uc_freebucket, &cache->uc_allocbucket); return (true); } /* * Discard any empty allocation bucket while we hold no locks. */ bucket = cache_bucket_unload_alloc(cache); critical_exit(); if (bucket != NULL) { KASSERT(bucket->ub_cnt == 0, ("cache_alloc: Entered with non-empty alloc bucket.")); bucket_free(zone, bucket, udata); } /* * Attempt to retrieve the item from the per-CPU cache has failed, so * we must go back to the zone. This requires the zdom lock, so we * must drop the critical section, then re-acquire it when we go back * to the cache. Since the critical section is released, we may be * preempted or migrate. As such, make sure not to maintain any * thread-local state specific to the cache from prior to releasing * the critical section. */ domain = PCPU_GET(domain); if ((cache_uz_flags(cache) & UMA_ZONE_ROUNDROBIN) != 0 || VM_DOMAIN_EMPTY(domain)) domain = zone_domain_highest(zone, domain); bucket = cache_fetch_bucket(zone, cache, domain); if (bucket == NULL && zone->uz_bucket_size != 0 && !bucketdisable) { bucket = zone_alloc_bucket(zone, udata, domain, flags); new = true; } else { new = false; } CTR3(KTR_UMA, "uma_zalloc: zone %s(%p) bucket zone returned %p", zone->uz_name, zone, bucket); if (bucket == NULL) { critical_enter(); return (false); } /* * See if we lost the race or were migrated. Cache the * initialized bucket to make this less likely or claim * the memory directly. */ critical_enter(); cache = &zone->uz_cpu[curcpu]; if (cache->uc_allocbucket.ucb_bucket == NULL && ((cache_uz_flags(cache) & UMA_ZONE_FIRSTTOUCH) == 0 || (curdomain = PCPU_GET(domain)) == domain || VM_DOMAIN_EMPTY(curdomain))) { if (new) atomic_add_long(&ZDOM_GET(zone, domain)->uzd_imax, bucket->ub_cnt); cache_bucket_load_alloc(cache, bucket); return (true); } /* * We lost the race, release this bucket and start over. */ critical_exit(); zone_put_bucket(zone, domain, bucket, udata, false); critical_enter(); return (true); } void * uma_zalloc_domain(uma_zone_t zone, void *udata, int domain, int flags) { #ifdef NUMA uma_bucket_t bucket; uma_zone_domain_t zdom; void *item; #endif /* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */ random_harvest_fast_uma(&zone, sizeof(zone), RANDOM_UMA); /* This is the fast path allocation */ CTR4(KTR_UMA, "uma_zalloc_domain zone %s(%p) domain %d flags %d", zone->uz_name, zone, domain, flags); if (flags & M_WAITOK) { WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "uma_zalloc_domain: zone \"%s\"", zone->uz_name); } KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(), ("uma_zalloc_domain: called with spinlock or critical section held")); KASSERT((zone->uz_flags & UMA_ZONE_SMR) == 0, ("uma_zalloc_domain: called with SMR zone.")); #ifdef NUMA KASSERT((zone->uz_flags & UMA_ZONE_FIRSTTOUCH) != 0, ("uma_zalloc_domain: called with non-FIRSTTOUCH zone.")); if (vm_ndomains == 1) return (uma_zalloc_arg(zone, udata, flags)); /* * Try to allocate from the bucket cache before falling back to the keg. * We could try harder and attempt to allocate from per-CPU caches or * the per-domain cross-domain buckets, but the complexity is probably * not worth it. It is more important that frees of previous * cross-domain allocations do not blow up the cache. */ zdom = zone_domain_lock(zone, domain); if ((bucket = zone_fetch_bucket(zone, zdom, false)) != NULL) { item = bucket->ub_bucket[bucket->ub_cnt - 1]; #ifdef INVARIANTS bucket->ub_bucket[bucket->ub_cnt - 1] = NULL; #endif bucket->ub_cnt--; zone_put_bucket(zone, domain, bucket, udata, true); item = item_ctor(zone, zone->uz_flags, zone->uz_size, udata, flags, item); if (item != NULL) { KASSERT(item_domain(item) == domain, ("%s: bucket cache item %p from wrong domain", __func__, item)); counter_u64_add(zone->uz_allocs, 1); } return (item); } ZDOM_UNLOCK(zdom); return (zone_alloc_item(zone, udata, domain, flags)); #else return (uma_zalloc_arg(zone, udata, flags)); #endif } /* * Find a slab with some space. Prefer slabs that are partially used over those * that are totally full. This helps to reduce fragmentation. * * If 'rr' is 1, search all domains starting from 'domain'. Otherwise check * only 'domain'. */ static uma_slab_t keg_first_slab(uma_keg_t keg, int domain, bool rr) { uma_domain_t dom; uma_slab_t slab; int start; KASSERT(domain >= 0 && domain < vm_ndomains, ("keg_first_slab: domain %d out of range", domain)); KEG_LOCK_ASSERT(keg, domain); slab = NULL; start = domain; do { dom = &keg->uk_domain[domain]; if ((slab = LIST_FIRST(&dom->ud_part_slab)) != NULL) return (slab); if ((slab = LIST_FIRST(&dom->ud_free_slab)) != NULL) { LIST_REMOVE(slab, us_link); dom->ud_free_slabs--; LIST_INSERT_HEAD(&dom->ud_part_slab, slab, us_link); return (slab); } if (rr) domain = (domain + 1) % vm_ndomains; } while (domain != start); return (NULL); } /* * Fetch an existing slab from a free or partial list. Returns with the * keg domain lock held if a slab was found or unlocked if not. */ static uma_slab_t keg_fetch_free_slab(uma_keg_t keg, int domain, bool rr, int flags) { uma_slab_t slab; uint32_t reserve; /* HASH has a single free list. */ if ((keg->uk_flags & UMA_ZFLAG_HASH) != 0) domain = 0; KEG_LOCK(keg, domain); reserve = (flags & M_USE_RESERVE) != 0 ? 0 : keg->uk_reserve; if (keg->uk_domain[domain].ud_free_items <= reserve || (slab = keg_first_slab(keg, domain, rr)) == NULL) { KEG_UNLOCK(keg, domain); return (NULL); } return (slab); } static uma_slab_t keg_fetch_slab(uma_keg_t keg, uma_zone_t zone, int rdomain, const int flags) { struct vm_domainset_iter di; uma_slab_t slab; int aflags, domain; bool rr; restart: /* * Use the keg's policy if upper layers haven't already specified a * domain (as happens with first-touch zones). * * To avoid races we run the iterator with the keg lock held, but that * means that we cannot allow the vm_domainset layer to sleep. Thus, * clear M_WAITOK and handle low memory conditions locally. */ rr = rdomain == UMA_ANYDOMAIN; if (rr) { aflags = (flags & ~M_WAITOK) | M_NOWAIT; vm_domainset_iter_policy_ref_init(&di, &keg->uk_dr, &domain, &aflags); } else { aflags = flags; domain = rdomain; } for (;;) { slab = keg_fetch_free_slab(keg, domain, rr, flags); if (slab != NULL) return (slab); /* * M_NOVM means don't ask at all! */ if (flags & M_NOVM) break; slab = keg_alloc_slab(keg, zone, domain, flags, aflags); if (slab != NULL) return (slab); if (!rr && (flags & M_WAITOK) == 0) break; if (rr && vm_domainset_iter_policy(&di, &domain) != 0) { if ((flags & M_WAITOK) != 0) { vm_wait_doms(&keg->uk_dr.dr_policy->ds_mask, 0); goto restart; } break; } } /* * We might not have been able to get a slab but another cpu * could have while we were unlocked. Check again before we * fail. */ if ((slab = keg_fetch_free_slab(keg, domain, rr, flags)) != NULL) return (slab); return (NULL); } static void * slab_alloc_item(uma_keg_t keg, uma_slab_t slab) { uma_domain_t dom; void *item; int freei; KEG_LOCK_ASSERT(keg, slab->us_domain); dom = &keg->uk_domain[slab->us_domain]; freei = BIT_FFS(keg->uk_ipers, &slab->us_free) - 1; BIT_CLR(keg->uk_ipers, freei, &slab->us_free); item = slab_item(slab, keg, freei); slab->us_freecount--; dom->ud_free_items--; /* * Move this slab to the full list. It must be on the partial list, so * we do not need to update the free slab count. In particular, * keg_fetch_slab() always returns slabs on the partial list. */ if (slab->us_freecount == 0) { LIST_REMOVE(slab, us_link); LIST_INSERT_HEAD(&dom->ud_full_slab, slab, us_link); } return (item); } static int zone_import(void *arg, void **bucket, int max, int domain, int flags) { uma_domain_t dom; uma_zone_t zone; uma_slab_t slab; uma_keg_t keg; #ifdef NUMA int stripe; #endif int i; zone = arg; slab = NULL; keg = zone->uz_keg; /* Try to keep the buckets totally full */ for (i = 0; i < max; ) { if ((slab = keg_fetch_slab(keg, zone, domain, flags)) == NULL) break; #ifdef NUMA stripe = howmany(max, vm_ndomains); #endif dom = &keg->uk_domain[slab->us_domain]; while (slab->us_freecount && i < max) { bucket[i++] = slab_alloc_item(keg, slab); if (dom->ud_free_items <= keg->uk_reserve) break; #ifdef NUMA /* * If the zone is striped we pick a new slab for every * N allocations. Eliminating this conditional will * instead pick a new domain for each bucket rather * than stripe within each bucket. The current option * produces more fragmentation and requires more cpu * time but yields better distribution. */ if ((zone->uz_flags & UMA_ZONE_ROUNDROBIN) != 0 && vm_ndomains > 1 && --stripe == 0) break; #endif } KEG_UNLOCK(keg, slab->us_domain); /* Don't block if we allocated any successfully. */ flags &= ~M_WAITOK; flags |= M_NOWAIT; } return i; } static int zone_alloc_limit_hard(uma_zone_t zone, int count, int flags) { uint64_t old, new, total, max; /* * The hard case. We're going to sleep because there were existing * sleepers or because we ran out of items. This routine enforces * fairness by keeping fifo order. * * First release our ill gotten gains and make some noise. */ for (;;) { zone_free_limit(zone, count); zone_log_warning(zone); zone_maxaction(zone); if (flags & M_NOWAIT) return (0); /* * We need to allocate an item or set ourself as a sleeper * while the sleepq lock is held to avoid wakeup races. This * is essentially a home rolled semaphore. */ sleepq_lock(&zone->uz_max_items); old = zone->uz_items; do { MPASS(UZ_ITEMS_SLEEPERS(old) < UZ_ITEMS_SLEEPERS_MAX); /* Cache the max since we will evaluate twice. */ max = zone->uz_max_items; if (UZ_ITEMS_SLEEPERS(old) != 0 || UZ_ITEMS_COUNT(old) >= max) new = old + UZ_ITEMS_SLEEPER; else new = old + MIN(count, max - old); } while (atomic_fcmpset_64(&zone->uz_items, &old, new) == 0); /* We may have successfully allocated under the sleepq lock. */ if (UZ_ITEMS_SLEEPERS(new) == 0) { sleepq_release(&zone->uz_max_items); return (new - old); } /* * This is in a different cacheline from uz_items so that we * don't constantly invalidate the fastpath cacheline when we * adjust item counts. This could be limited to toggling on * transitions. */ atomic_add_32(&zone->uz_sleepers, 1); atomic_add_64(&zone->uz_sleeps, 1); /* * We have added ourselves as a sleeper. The sleepq lock * protects us from wakeup races. Sleep now and then retry. */ sleepq_add(&zone->uz_max_items, NULL, "zonelimit", 0, 0); sleepq_wait(&zone->uz_max_items, PVM); /* * After wakeup, remove ourselves as a sleeper and try * again. We no longer have the sleepq lock for protection. * * Subract ourselves as a sleeper while attempting to add * our count. */ atomic_subtract_32(&zone->uz_sleepers, 1); old = atomic_fetchadd_64(&zone->uz_items, -(UZ_ITEMS_SLEEPER - count)); /* We're no longer a sleeper. */ old -= UZ_ITEMS_SLEEPER; /* * If we're still at the limit, restart. Notably do not * block on other sleepers. Cache the max value to protect * against changes via sysctl. */ total = UZ_ITEMS_COUNT(old); max = zone->uz_max_items; if (total >= max) continue; /* Truncate if necessary, otherwise wake other sleepers. */ if (total + count > max) { zone_free_limit(zone, total + count - max); count = max - total; } else if (total + count < max && UZ_ITEMS_SLEEPERS(old) != 0) wakeup_one(&zone->uz_max_items); return (count); } } /* * Allocate 'count' items from our max_items limit. Returns the number * available. If M_NOWAIT is not specified it will sleep until at least * one item can be allocated. */ static int zone_alloc_limit(uma_zone_t zone, int count, int flags) { uint64_t old; uint64_t max; max = zone->uz_max_items; MPASS(max > 0); /* * We expect normal allocations to succeed with a simple * fetchadd. */ old = atomic_fetchadd_64(&zone->uz_items, count); if (__predict_true(old + count <= max)) return (count); /* * If we had some items and no sleepers just return the * truncated value. We have to release the excess space * though because that may wake sleepers who weren't woken * because we were temporarily over the limit. */ if (old < max) { zone_free_limit(zone, (old + count) - max); return (max - old); } return (zone_alloc_limit_hard(zone, count, flags)); } /* * Free a number of items back to the limit. */ static void zone_free_limit(uma_zone_t zone, int count) { uint64_t old; MPASS(count > 0); /* * In the common case we either have no sleepers or * are still over the limit and can just return. */ old = atomic_fetchadd_64(&zone->uz_items, -count); if (__predict_true(UZ_ITEMS_SLEEPERS(old) == 0 || UZ_ITEMS_COUNT(old) - count >= zone->uz_max_items)) return; /* * Moderate the rate of wakeups. Sleepers will continue * to generate wakeups if necessary. */ wakeup_one(&zone->uz_max_items); } static uma_bucket_t zone_alloc_bucket(uma_zone_t zone, void *udata, int domain, int flags) { uma_bucket_t bucket; int maxbucket, cnt; CTR3(KTR_UMA, "zone_alloc_bucket zone %s(%p) domain %d", zone->uz_name, zone, domain); /* Avoid allocs targeting empty domains. */ if (domain != UMA_ANYDOMAIN && VM_DOMAIN_EMPTY(domain)) domain = UMA_ANYDOMAIN; else if ((zone->uz_flags & UMA_ZONE_ROUNDROBIN) != 0) domain = UMA_ANYDOMAIN; if (zone->uz_max_items > 0) maxbucket = zone_alloc_limit(zone, zone->uz_bucket_size, M_NOWAIT); else maxbucket = zone->uz_bucket_size; if (maxbucket == 0) return (false); /* Don't wait for buckets, preserve caller's NOVM setting. */ bucket = bucket_alloc(zone, udata, M_NOWAIT | (flags & M_NOVM)); if (bucket == NULL) { cnt = 0; goto out; } bucket->ub_cnt = zone->uz_import(zone->uz_arg, bucket->ub_bucket, MIN(maxbucket, bucket->ub_entries), domain, flags); /* * Initialize the memory if necessary. */ if (bucket->ub_cnt != 0 && zone->uz_init != NULL) { int i; for (i = 0; i < bucket->ub_cnt; i++) if (zone->uz_init(bucket->ub_bucket[i], zone->uz_size, flags) != 0) break; /* * If we couldn't initialize the whole bucket, put the * rest back onto the freelist. */ if (i != bucket->ub_cnt) { zone->uz_release(zone->uz_arg, &bucket->ub_bucket[i], bucket->ub_cnt - i); #ifdef INVARIANTS bzero(&bucket->ub_bucket[i], sizeof(void *) * (bucket->ub_cnt - i)); #endif bucket->ub_cnt = i; } } cnt = bucket->ub_cnt; if (bucket->ub_cnt == 0) { bucket_free(zone, bucket, udata); counter_u64_add(zone->uz_fails, 1); bucket = NULL; } out: if (zone->uz_max_items > 0 && cnt < maxbucket) zone_free_limit(zone, maxbucket - cnt); return (bucket); } /* * Allocates a single item from a zone. * * Arguments * zone The zone to alloc for. * udata The data to be passed to the constructor. * domain The domain to allocate from or UMA_ANYDOMAIN. * flags M_WAITOK, M_NOWAIT, M_ZERO. * * Returns * NULL if there is no memory and M_NOWAIT is set * An item if successful */ static void * zone_alloc_item(uma_zone_t zone, void *udata, int domain, int flags) { void *item; if (zone->uz_max_items > 0 && zone_alloc_limit(zone, 1, flags) == 0) { counter_u64_add(zone->uz_fails, 1); return (NULL); } /* Avoid allocs targeting empty domains. */ if (domain != UMA_ANYDOMAIN && VM_DOMAIN_EMPTY(domain)) domain = UMA_ANYDOMAIN; if (zone->uz_import(zone->uz_arg, &item, 1, domain, flags) != 1) goto fail_cnt; /* * We have to call both the zone's init (not the keg's init) * and the zone's ctor. This is because the item is going from * a keg slab directly to the user, and the user is expecting it * to be both zone-init'd as well as zone-ctor'd. */ if (zone->uz_init != NULL) { if (zone->uz_init(item, zone->uz_size, flags) != 0) { zone_free_item(zone, item, udata, SKIP_FINI | SKIP_CNT); goto fail_cnt; } } item = item_ctor(zone, zone->uz_flags, zone->uz_size, udata, flags, item); if (item == NULL) goto fail; counter_u64_add(zone->uz_allocs, 1); CTR3(KTR_UMA, "zone_alloc_item item %p from %s(%p)", item, zone->uz_name, zone); return (item); fail_cnt: counter_u64_add(zone->uz_fails, 1); fail: if (zone->uz_max_items > 0) zone_free_limit(zone, 1); CTR2(KTR_UMA, "zone_alloc_item failed from %s(%p)", zone->uz_name, zone); return (NULL); } /* See uma.h */ void uma_zfree_smr(uma_zone_t zone, void *item) { uma_cache_t cache; uma_cache_bucket_t bucket; int itemdomain, uz_flags; #ifdef UMA_ZALLOC_DEBUG KASSERT((zone->uz_flags & UMA_ZONE_SMR) != 0, ("uma_zfree_smr: called with non-SMR zone.")); KASSERT(item != NULL, ("uma_zfree_smr: Called with NULL pointer.")); SMR_ASSERT_NOT_ENTERED(zone->uz_smr); if (uma_zfree_debug(zone, item, NULL) == EJUSTRETURN) return; #endif cache = &zone->uz_cpu[curcpu]; uz_flags = cache_uz_flags(cache); itemdomain = 0; #ifdef NUMA if ((uz_flags & UMA_ZONE_FIRSTTOUCH) != 0) itemdomain = item_domain(item); #endif critical_enter(); do { cache = &zone->uz_cpu[curcpu]; /* SMR Zones must free to the free bucket. */ bucket = &cache->uc_freebucket; #ifdef NUMA if ((uz_flags & UMA_ZONE_FIRSTTOUCH) != 0 && PCPU_GET(domain) != itemdomain) { bucket = &cache->uc_crossbucket; } #endif if (__predict_true(bucket->ucb_cnt < bucket->ucb_entries)) { cache_bucket_push(cache, bucket, item); critical_exit(); return; } } while (cache_free(zone, cache, NULL, item, itemdomain)); critical_exit(); /* * If nothing else caught this, we'll just do an internal free. */ zone_free_item(zone, item, NULL, SKIP_NONE); } /* See uma.h */ void uma_zfree_arg(uma_zone_t zone, void *item, void *udata) { uma_cache_t cache; uma_cache_bucket_t bucket; int itemdomain, uz_flags; /* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */ random_harvest_fast_uma(&zone, sizeof(zone), RANDOM_UMA); CTR2(KTR_UMA, "uma_zfree_arg zone %s(%p)", zone->uz_name, zone); #ifdef UMA_ZALLOC_DEBUG KASSERT((zone->uz_flags & UMA_ZONE_SMR) == 0, ("uma_zfree_arg: called with SMR zone.")); if (uma_zfree_debug(zone, item, udata) == EJUSTRETURN) return; #endif /* uma_zfree(..., NULL) does nothing, to match free(9). */ if (item == NULL) return; /* * We are accessing the per-cpu cache without a critical section to * fetch size and flags. This is acceptable, if we are preempted we * will simply read another cpu's line. */ cache = &zone->uz_cpu[curcpu]; uz_flags = cache_uz_flags(cache); if (UMA_ALWAYS_CTORDTOR || __predict_false((uz_flags & UMA_ZFLAG_CTORDTOR) != 0)) item_dtor(zone, item, cache_uz_size(cache), udata, SKIP_NONE); /* * The race here is acceptable. If we miss it we'll just have to wait * a little longer for the limits to be reset. */ if (__predict_false(uz_flags & UMA_ZFLAG_LIMIT)) { if (zone->uz_sleepers > 0) goto zfree_item; } /* * If possible, free to the per-CPU cache. There are two * requirements for safe access to the per-CPU cache: (1) the thread * accessing the cache must not be preempted or yield during access, * and (2) the thread must not migrate CPUs without switching which * cache it accesses. We rely on a critical section to prevent * preemption and migration. We release the critical section in * order to acquire the zone mutex if we are unable to free to the * current cache; when we re-acquire the critical section, we must * detect and handle migration if it has occurred. */ itemdomain = 0; #ifdef NUMA if ((uz_flags & UMA_ZONE_FIRSTTOUCH) != 0) itemdomain = item_domain(item); #endif critical_enter(); do { cache = &zone->uz_cpu[curcpu]; /* * Try to free into the allocbucket first to give LIFO * ordering for cache-hot datastructures. Spill over * into the freebucket if necessary. Alloc will swap * them if one runs dry. */ bucket = &cache->uc_allocbucket; #ifdef NUMA if ((uz_flags & UMA_ZONE_FIRSTTOUCH) != 0 && PCPU_GET(domain) != itemdomain) { bucket = &cache->uc_crossbucket; } else #endif if (bucket->ucb_cnt == bucket->ucb_entries && cache->uc_freebucket.ucb_cnt < cache->uc_freebucket.ucb_entries) cache_bucket_swap(&cache->uc_freebucket, &cache->uc_allocbucket); if (__predict_true(bucket->ucb_cnt < bucket->ucb_entries)) { cache_bucket_push(cache, bucket, item); critical_exit(); return; } } while (cache_free(zone, cache, udata, item, itemdomain)); critical_exit(); /* * If nothing else caught this, we'll just do an internal free. */ zfree_item: zone_free_item(zone, item, udata, SKIP_DTOR); } #ifdef NUMA /* * sort crossdomain free buckets to domain correct buckets and cache * them. */ static void zone_free_cross(uma_zone_t zone, uma_bucket_t bucket, void *udata) { struct uma_bucketlist fullbuckets; uma_zone_domain_t zdom; uma_bucket_t b; smr_seq_t seq; void *item; int domain; CTR3(KTR_UMA, "uma_zfree: zone %s(%p) draining cross bucket %p", zone->uz_name, zone, bucket); /* * It is possible for buckets to arrive here out of order so we fetch * the current smr seq rather than accepting the bucket's. */ seq = SMR_SEQ_INVALID; if ((zone->uz_flags & UMA_ZONE_SMR) != 0) seq = smr_advance(zone->uz_smr); /* * To avoid having ndomain * ndomain buckets for sorting we have a * lock on the current crossfree bucket. A full matrix with * per-domain locking could be used if necessary. */ STAILQ_INIT(&fullbuckets); ZONE_CROSS_LOCK(zone); while (bucket->ub_cnt > 0) { item = bucket->ub_bucket[bucket->ub_cnt - 1]; domain = item_domain(item); zdom = ZDOM_GET(zone, domain); if (zdom->uzd_cross == NULL) { zdom->uzd_cross = bucket_alloc(zone, udata, M_NOWAIT); if (zdom->uzd_cross == NULL) break; } b = zdom->uzd_cross; b->ub_bucket[b->ub_cnt++] = item; b->ub_seq = seq; if (b->ub_cnt == b->ub_entries) { STAILQ_INSERT_HEAD(&fullbuckets, b, ub_link); zdom->uzd_cross = NULL; } bucket->ub_cnt--; } ZONE_CROSS_UNLOCK(zone); if (bucket->ub_cnt == 0) bucket->ub_seq = SMR_SEQ_INVALID; bucket_free(zone, bucket, udata); while ((b = STAILQ_FIRST(&fullbuckets)) != NULL) { STAILQ_REMOVE_HEAD(&fullbuckets, ub_link); domain = item_domain(b->ub_bucket[0]); zone_put_bucket(zone, domain, b, udata, true); } } #endif static void zone_free_bucket(uma_zone_t zone, uma_bucket_t bucket, void *udata, int itemdomain, bool ws) { #ifdef NUMA /* * Buckets coming from the wrong domain will be entirely for the * only other domain on two domain systems. In this case we can * simply cache them. Otherwise we need to sort them back to * correct domains. */ if ((zone->uz_flags & UMA_ZONE_FIRSTTOUCH) != 0 && vm_ndomains > 2 && PCPU_GET(domain) != itemdomain) { zone_free_cross(zone, bucket, udata); return; } #endif /* * Attempt to save the bucket in the zone's domain bucket cache. */ CTR3(KTR_UMA, "uma_zfree: zone %s(%p) putting bucket %p on free list", zone->uz_name, zone, bucket); /* ub_cnt is pointing to the last free item */ if ((zone->uz_flags & UMA_ZONE_ROUNDROBIN) != 0) itemdomain = zone_domain_lowest(zone, itemdomain); zone_put_bucket(zone, itemdomain, bucket, udata, ws); } /* * Populate a free or cross bucket for the current cpu cache. Free any * existing full bucket either to the zone cache or back to the slab layer. * * Enters and returns in a critical section. false return indicates that * we can not satisfy this free in the cache layer. true indicates that * the caller should retry. */ static __noinline bool cache_free(uma_zone_t zone, uma_cache_t cache, void *udata, void *item, int itemdomain) { uma_cache_bucket_t cbucket; uma_bucket_t newbucket, bucket; CRITICAL_ASSERT(curthread); if (zone->uz_bucket_size == 0) return false; cache = &zone->uz_cpu[curcpu]; newbucket = NULL; /* * FIRSTTOUCH domains need to free to the correct zdom. When * enabled this is the zdom of the item. The bucket is the * cross bucket if the current domain and itemdomain do not match. */ cbucket = &cache->uc_freebucket; #ifdef NUMA if ((cache_uz_flags(cache) & UMA_ZONE_FIRSTTOUCH) != 0) { if (PCPU_GET(domain) != itemdomain) { cbucket = &cache->uc_crossbucket; if (cbucket->ucb_cnt != 0) counter_u64_add(zone->uz_xdomain, cbucket->ucb_cnt); } } #endif bucket = cache_bucket_unload(cbucket); KASSERT(bucket == NULL || bucket->ub_cnt == bucket->ub_entries, ("cache_free: Entered with non-full free bucket.")); /* We are no longer associated with this CPU. */ critical_exit(); /* * Don't let SMR zones operate without a free bucket. Force * a synchronize and re-use this one. We will only degrade * to a synchronize every bucket_size items rather than every * item if we fail to allocate a bucket. */ if ((zone->uz_flags & UMA_ZONE_SMR) != 0) { if (bucket != NULL) bucket->ub_seq = smr_advance(zone->uz_smr); newbucket = bucket_alloc(zone, udata, M_NOWAIT); if (newbucket == NULL && bucket != NULL) { bucket_drain(zone, bucket); newbucket = bucket; bucket = NULL; } } else if (!bucketdisable) newbucket = bucket_alloc(zone, udata, M_NOWAIT); if (bucket != NULL) zone_free_bucket(zone, bucket, udata, itemdomain, true); critical_enter(); if ((bucket = newbucket) == NULL) return (false); cache = &zone->uz_cpu[curcpu]; #ifdef NUMA /* * Check to see if we should be populating the cross bucket. If it * is already populated we will fall through and attempt to populate * the free bucket. */ if ((cache_uz_flags(cache) & UMA_ZONE_FIRSTTOUCH) != 0) { if (PCPU_GET(domain) != itemdomain && cache->uc_crossbucket.ucb_bucket == NULL) { cache_bucket_load_cross(cache, bucket); return (true); } } #endif /* * We may have lost the race to fill the bucket or switched CPUs. */ if (cache->uc_freebucket.ucb_bucket != NULL) { critical_exit(); bucket_free(zone, bucket, udata); critical_enter(); } else cache_bucket_load_free(cache, bucket); return (true); } static void slab_free_item(uma_zone_t zone, uma_slab_t slab, void *item) { uma_keg_t keg; uma_domain_t dom; int freei; keg = zone->uz_keg; KEG_LOCK_ASSERT(keg, slab->us_domain); /* Do we need to remove from any lists? */ dom = &keg->uk_domain[slab->us_domain]; if (slab->us_freecount + 1 == keg->uk_ipers) { LIST_REMOVE(slab, us_link); LIST_INSERT_HEAD(&dom->ud_free_slab, slab, us_link); dom->ud_free_slabs++; } else if (slab->us_freecount == 0) { LIST_REMOVE(slab, us_link); LIST_INSERT_HEAD(&dom->ud_part_slab, slab, us_link); } /* Slab management. */ freei = slab_item_index(slab, keg, item); BIT_SET(keg->uk_ipers, freei, &slab->us_free); slab->us_freecount++; /* Keg statistics. */ dom->ud_free_items++; } static void zone_release(void *arg, void **bucket, int cnt) { struct mtx *lock; uma_zone_t zone; uma_slab_t slab; uma_keg_t keg; uint8_t *mem; void *item; int i; zone = arg; keg = zone->uz_keg; lock = NULL; if (__predict_false((zone->uz_flags & UMA_ZFLAG_HASH) != 0)) lock = KEG_LOCK(keg, 0); for (i = 0; i < cnt; i++) { item = bucket[i]; if (__predict_true((zone->uz_flags & UMA_ZFLAG_VTOSLAB) != 0)) { slab = vtoslab((vm_offset_t)item); } else { mem = (uint8_t *)((uintptr_t)item & (~UMA_SLAB_MASK)); if ((zone->uz_flags & UMA_ZFLAG_HASH) != 0) slab = hash_sfind(&keg->uk_hash, mem); else slab = (uma_slab_t)(mem + keg->uk_pgoff); } if (lock != KEG_LOCKPTR(keg, slab->us_domain)) { if (lock != NULL) mtx_unlock(lock); lock = KEG_LOCK(keg, slab->us_domain); } slab_free_item(zone, slab, item); } if (lock != NULL) mtx_unlock(lock); } /* * Frees a single item to any zone. * * Arguments: * zone The zone to free to * item The item we're freeing * udata User supplied data for the dtor * skip Skip dtors and finis */ static __noinline void zone_free_item(uma_zone_t zone, void *item, void *udata, enum zfreeskip skip) { /* * If a free is sent directly to an SMR zone we have to * synchronize immediately because the item can instantly * be reallocated. This should only happen in degenerate * cases when no memory is available for per-cpu caches. */ if ((zone->uz_flags & UMA_ZONE_SMR) != 0 && skip == SKIP_NONE) smr_synchronize(zone->uz_smr); item_dtor(zone, item, zone->uz_size, udata, skip); if (skip < SKIP_FINI && zone->uz_fini) zone->uz_fini(item, zone->uz_size); zone->uz_release(zone->uz_arg, &item, 1); if (skip & SKIP_CNT) return; counter_u64_add(zone->uz_frees, 1); if (zone->uz_max_items > 0) zone_free_limit(zone, 1); } /* See uma.h */ int uma_zone_set_max(uma_zone_t zone, int nitems) { struct uma_bucket_zone *ubz; int count; /* * XXX This can misbehave if the zone has any allocations with * no limit and a limit is imposed. There is currently no * way to clear a limit. */ ZONE_LOCK(zone); ubz = bucket_zone_max(zone, nitems); count = ubz != NULL ? ubz->ubz_entries : 0; zone->uz_bucket_size_max = zone->uz_bucket_size = count; if (zone->uz_bucket_size_min > zone->uz_bucket_size_max) zone->uz_bucket_size_min = zone->uz_bucket_size_max; zone->uz_max_items = nitems; zone->uz_flags |= UMA_ZFLAG_LIMIT; zone_update_caches(zone); /* We may need to wake waiters. */ wakeup(&zone->uz_max_items); ZONE_UNLOCK(zone); return (nitems); } /* See uma.h */ void uma_zone_set_maxcache(uma_zone_t zone, int nitems) { struct uma_bucket_zone *ubz; int bpcpu; ZONE_LOCK(zone); ubz = bucket_zone_max(zone, nitems); if (ubz != NULL) { bpcpu = 2; if ((zone->uz_flags & UMA_ZONE_FIRSTTOUCH) != 0) /* Count the cross-domain bucket. */ bpcpu++; nitems -= ubz->ubz_entries * bpcpu * mp_ncpus; zone->uz_bucket_size_max = ubz->ubz_entries; } else { zone->uz_bucket_size_max = zone->uz_bucket_size = 0; } if (zone->uz_bucket_size_min > zone->uz_bucket_size_max) zone->uz_bucket_size_min = zone->uz_bucket_size_max; zone->uz_bucket_max = nitems / vm_ndomains; ZONE_UNLOCK(zone); } /* See uma.h */ int uma_zone_get_max(uma_zone_t zone) { int nitems; nitems = atomic_load_64(&zone->uz_max_items); return (nitems); } /* See uma.h */ void uma_zone_set_warning(uma_zone_t zone, const char *warning) { ZONE_ASSERT_COLD(zone); zone->uz_warning = warning; } /* See uma.h */ void uma_zone_set_maxaction(uma_zone_t zone, uma_maxaction_t maxaction) { ZONE_ASSERT_COLD(zone); TASK_INIT(&zone->uz_maxaction, 0, (task_fn_t *)maxaction, zone); } /* See uma.h */ int uma_zone_get_cur(uma_zone_t zone) { int64_t nitems; u_int i; nitems = 0; if (zone->uz_allocs != EARLY_COUNTER && zone->uz_frees != EARLY_COUNTER) nitems = counter_u64_fetch(zone->uz_allocs) - counter_u64_fetch(zone->uz_frees); CPU_FOREACH(i) nitems += atomic_load_64(&zone->uz_cpu[i].uc_allocs) - atomic_load_64(&zone->uz_cpu[i].uc_frees); return (nitems < 0 ? 0 : nitems); } static uint64_t uma_zone_get_allocs(uma_zone_t zone) { uint64_t nitems; u_int i; nitems = 0; if (zone->uz_allocs != EARLY_COUNTER) nitems = counter_u64_fetch(zone->uz_allocs); CPU_FOREACH(i) nitems += atomic_load_64(&zone->uz_cpu[i].uc_allocs); return (nitems); } static uint64_t uma_zone_get_frees(uma_zone_t zone) { uint64_t nitems; u_int i; nitems = 0; if (zone->uz_frees != EARLY_COUNTER) nitems = counter_u64_fetch(zone->uz_frees); CPU_FOREACH(i) nitems += atomic_load_64(&zone->uz_cpu[i].uc_frees); return (nitems); } #ifdef INVARIANTS /* Used only for KEG_ASSERT_COLD(). */ static uint64_t uma_keg_get_allocs(uma_keg_t keg) { uma_zone_t z; uint64_t nitems; nitems = 0; LIST_FOREACH(z, &keg->uk_zones, uz_link) nitems += uma_zone_get_allocs(z); return (nitems); } #endif /* See uma.h */ void uma_zone_set_init(uma_zone_t zone, uma_init uminit) { uma_keg_t keg; KEG_GET(zone, keg); KEG_ASSERT_COLD(keg); keg->uk_init = uminit; } /* See uma.h */ void uma_zone_set_fini(uma_zone_t zone, uma_fini fini) { uma_keg_t keg; KEG_GET(zone, keg); KEG_ASSERT_COLD(keg); keg->uk_fini = fini; } /* See uma.h */ void uma_zone_set_zinit(uma_zone_t zone, uma_init zinit) { ZONE_ASSERT_COLD(zone); zone->uz_init = zinit; } /* See uma.h */ void uma_zone_set_zfini(uma_zone_t zone, uma_fini zfini) { ZONE_ASSERT_COLD(zone); zone->uz_fini = zfini; } /* See uma.h */ void uma_zone_set_freef(uma_zone_t zone, uma_free freef) { uma_keg_t keg; KEG_GET(zone, keg); KEG_ASSERT_COLD(keg); keg->uk_freef = freef; } /* See uma.h */ void uma_zone_set_allocf(uma_zone_t zone, uma_alloc allocf) { uma_keg_t keg; KEG_GET(zone, keg); KEG_ASSERT_COLD(keg); keg->uk_allocf = allocf; } /* See uma.h */ void uma_zone_set_smr(uma_zone_t zone, smr_t smr) { ZONE_ASSERT_COLD(zone); KASSERT(smr != NULL, ("Got NULL smr")); KASSERT((zone->uz_flags & UMA_ZONE_SMR) == 0, ("zone %p (%s) already uses SMR", zone, zone->uz_name)); zone->uz_flags |= UMA_ZONE_SMR; zone->uz_smr = smr; zone_update_caches(zone); } smr_t uma_zone_get_smr(uma_zone_t zone) { return (zone->uz_smr); } /* See uma.h */ void uma_zone_reserve(uma_zone_t zone, int items) { uma_keg_t keg; KEG_GET(zone, keg); KEG_ASSERT_COLD(keg); keg->uk_reserve = items; } /* See uma.h */ int uma_zone_reserve_kva(uma_zone_t zone, int count) { uma_keg_t keg; vm_offset_t kva; u_int pages; KEG_GET(zone, keg); KEG_ASSERT_COLD(keg); ZONE_ASSERT_COLD(zone); pages = howmany(count, keg->uk_ipers) * keg->uk_ppera; #ifdef UMA_MD_SMALL_ALLOC if (keg->uk_ppera > 1) { #else if (1) { #endif kva = kva_alloc((vm_size_t)pages * PAGE_SIZE); if (kva == 0) return (0); } else kva = 0; MPASS(keg->uk_kva == 0); keg->uk_kva = kva; keg->uk_offset = 0; zone->uz_max_items = pages * keg->uk_ipers; #ifdef UMA_MD_SMALL_ALLOC keg->uk_allocf = (keg->uk_ppera > 1) ? noobj_alloc : uma_small_alloc; #else keg->uk_allocf = noobj_alloc; #endif keg->uk_flags |= UMA_ZFLAG_LIMIT | UMA_ZONE_NOFREE; zone->uz_flags |= UMA_ZFLAG_LIMIT | UMA_ZONE_NOFREE; zone_update_caches(zone); return (1); } /* See uma.h */ void uma_prealloc(uma_zone_t zone, int items) { struct vm_domainset_iter di; uma_domain_t dom; uma_slab_t slab; uma_keg_t keg; int aflags, domain, slabs; KEG_GET(zone, keg); slabs = howmany(items, keg->uk_ipers); while (slabs-- > 0) { aflags = M_NOWAIT; vm_domainset_iter_policy_ref_init(&di, &keg->uk_dr, &domain, &aflags); for (;;) { slab = keg_alloc_slab(keg, zone, domain, M_WAITOK, aflags); if (slab != NULL) { dom = &keg->uk_domain[slab->us_domain]; /* * keg_alloc_slab() always returns a slab on the * partial list. */ LIST_REMOVE(slab, us_link); LIST_INSERT_HEAD(&dom->ud_free_slab, slab, us_link); dom->ud_free_slabs++; KEG_UNLOCK(keg, slab->us_domain); break; } if (vm_domainset_iter_policy(&di, &domain) != 0) vm_wait_doms(&keg->uk_dr.dr_policy->ds_mask, 0); } } } /* * Returns a snapshot of memory consumption in bytes. */ size_t uma_zone_memory(uma_zone_t zone) { size_t sz; int i; sz = 0; if (zone->uz_flags & UMA_ZFLAG_CACHE) { for (i = 0; i < vm_ndomains; i++) sz += ZDOM_GET(zone, i)->uzd_nitems; return (sz * zone->uz_size); } for (i = 0; i < vm_ndomains; i++) sz += zone->uz_keg->uk_domain[i].ud_pages; return (sz * PAGE_SIZE); } /* See uma.h */ void uma_reclaim(int req) { CTR0(KTR_UMA, "UMA: vm asked us to release pages!"); sx_xlock(&uma_reclaim_lock); bucket_enable(); switch (req) { case UMA_RECLAIM_TRIM: zone_foreach(zone_trim, NULL); break; case UMA_RECLAIM_DRAIN: case UMA_RECLAIM_DRAIN_CPU: zone_foreach(zone_drain, NULL); if (req == UMA_RECLAIM_DRAIN_CPU) { pcpu_cache_drain_safe(NULL); zone_foreach(zone_drain, NULL); } break; default: panic("unhandled reclamation request %d", req); } /* * Some slabs may have been freed but this zone will be visited early * we visit again so that we can free pages that are empty once other * zones are drained. We have to do the same for buckets. */ zone_drain(slabzones[0], NULL); zone_drain(slabzones[1], NULL); bucket_zone_drain(); sx_xunlock(&uma_reclaim_lock); } static volatile int uma_reclaim_needed; void uma_reclaim_wakeup(void) { if (atomic_fetchadd_int(&uma_reclaim_needed, 1) == 0) wakeup(uma_reclaim); } void uma_reclaim_worker(void *arg __unused) { for (;;) { sx_xlock(&uma_reclaim_lock); while (atomic_load_int(&uma_reclaim_needed) == 0) sx_sleep(uma_reclaim, &uma_reclaim_lock, PVM, "umarcl", hz); sx_xunlock(&uma_reclaim_lock); EVENTHANDLER_INVOKE(vm_lowmem, VM_LOW_KMEM); uma_reclaim(UMA_RECLAIM_DRAIN_CPU); atomic_store_int(&uma_reclaim_needed, 0); /* Don't fire more than once per-second. */ pause("umarclslp", hz); } } /* See uma.h */ void uma_zone_reclaim(uma_zone_t zone, int req) { switch (req) { case UMA_RECLAIM_TRIM: zone_trim(zone, NULL); break; case UMA_RECLAIM_DRAIN: zone_drain(zone, NULL); break; case UMA_RECLAIM_DRAIN_CPU: pcpu_cache_drain_safe(zone); zone_drain(zone, NULL); break; default: panic("unhandled reclamation request %d", req); } } /* See uma.h */ int uma_zone_exhausted(uma_zone_t zone) { return (atomic_load_32(&zone->uz_sleepers) > 0); } unsigned long uma_limit(void) { return (uma_kmem_limit); } void uma_set_limit(unsigned long limit) { uma_kmem_limit = limit; } unsigned long uma_size(void) { return (atomic_load_long(&uma_kmem_total)); } long uma_avail(void) { return (uma_kmem_limit - uma_size()); } #ifdef DDB /* * Generate statistics across both the zone and its per-cpu cache's. Return * desired statistics if the pointer is non-NULL for that statistic. * * Note: does not update the zone statistics, as it can't safely clear the * per-CPU cache statistic. * */ static void uma_zone_sumstat(uma_zone_t z, long *cachefreep, uint64_t *allocsp, uint64_t *freesp, uint64_t *sleepsp, uint64_t *xdomainp) { uma_cache_t cache; uint64_t allocs, frees, sleeps, xdomain; int cachefree, cpu; allocs = frees = sleeps = xdomain = 0; cachefree = 0; CPU_FOREACH(cpu) { cache = &z->uz_cpu[cpu]; cachefree += cache->uc_allocbucket.ucb_cnt; cachefree += cache->uc_freebucket.ucb_cnt; xdomain += cache->uc_crossbucket.ucb_cnt; cachefree += cache->uc_crossbucket.ucb_cnt; allocs += cache->uc_allocs; frees += cache->uc_frees; } allocs += counter_u64_fetch(z->uz_allocs); frees += counter_u64_fetch(z->uz_frees); xdomain += counter_u64_fetch(z->uz_xdomain); sleeps += z->uz_sleeps; if (cachefreep != NULL) *cachefreep = cachefree; if (allocsp != NULL) *allocsp = allocs; if (freesp != NULL) *freesp = frees; if (sleepsp != NULL) *sleepsp = sleeps; if (xdomainp != NULL) *xdomainp = xdomain; } #endif /* DDB */ static int sysctl_vm_zone_count(SYSCTL_HANDLER_ARGS) { uma_keg_t kz; uma_zone_t z; int count; count = 0; rw_rlock(&uma_rwlock); LIST_FOREACH(kz, &uma_kegs, uk_link) { LIST_FOREACH(z, &kz->uk_zones, uz_link) count++; } LIST_FOREACH(z, &uma_cachezones, uz_link) count++; rw_runlock(&uma_rwlock); return (sysctl_handle_int(oidp, &count, 0, req)); } static void uma_vm_zone_stats(struct uma_type_header *uth, uma_zone_t z, struct sbuf *sbuf, struct uma_percpu_stat *ups, bool internal) { uma_zone_domain_t zdom; uma_cache_t cache; int i; for (i = 0; i < vm_ndomains; i++) { zdom = ZDOM_GET(z, i); uth->uth_zone_free += zdom->uzd_nitems; } uth->uth_allocs = counter_u64_fetch(z->uz_allocs); uth->uth_frees = counter_u64_fetch(z->uz_frees); uth->uth_fails = counter_u64_fetch(z->uz_fails); uth->uth_xdomain = counter_u64_fetch(z->uz_xdomain); uth->uth_sleeps = z->uz_sleeps; for (i = 0; i < mp_maxid + 1; i++) { bzero(&ups[i], sizeof(*ups)); if (internal || CPU_ABSENT(i)) continue; cache = &z->uz_cpu[i]; ups[i].ups_cache_free += cache->uc_allocbucket.ucb_cnt; ups[i].ups_cache_free += cache->uc_freebucket.ucb_cnt; ups[i].ups_cache_free += cache->uc_crossbucket.ucb_cnt; ups[i].ups_allocs = cache->uc_allocs; ups[i].ups_frees = cache->uc_frees; } } static int sysctl_vm_zone_stats(SYSCTL_HANDLER_ARGS) { struct uma_stream_header ush; struct uma_type_header uth; struct uma_percpu_stat *ups; struct sbuf sbuf; uma_keg_t kz; uma_zone_t z; uint64_t items; uint32_t kfree, pages; int count, error, i; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); sbuf_new_for_sysctl(&sbuf, NULL, 128, req); sbuf_clear_flags(&sbuf, SBUF_INCLUDENUL); ups = malloc((mp_maxid + 1) * sizeof(*ups), M_TEMP, M_WAITOK); count = 0; rw_rlock(&uma_rwlock); LIST_FOREACH(kz, &uma_kegs, uk_link) { LIST_FOREACH(z, &kz->uk_zones, uz_link) count++; } LIST_FOREACH(z, &uma_cachezones, uz_link) count++; /* * Insert stream header. */ bzero(&ush, sizeof(ush)); ush.ush_version = UMA_STREAM_VERSION; ush.ush_maxcpus = (mp_maxid + 1); ush.ush_count = count; (void)sbuf_bcat(&sbuf, &ush, sizeof(ush)); LIST_FOREACH(kz, &uma_kegs, uk_link) { kfree = pages = 0; for (i = 0; i < vm_ndomains; i++) { kfree += kz->uk_domain[i].ud_free_items; pages += kz->uk_domain[i].ud_pages; } LIST_FOREACH(z, &kz->uk_zones, uz_link) { bzero(&uth, sizeof(uth)); strlcpy(uth.uth_name, z->uz_name, UTH_MAX_NAME); uth.uth_align = kz->uk_align; uth.uth_size = kz->uk_size; uth.uth_rsize = kz->uk_rsize; if (z->uz_max_items > 0) { items = UZ_ITEMS_COUNT(z->uz_items); uth.uth_pages = (items / kz->uk_ipers) * kz->uk_ppera; } else uth.uth_pages = pages; uth.uth_maxpages = (z->uz_max_items / kz->uk_ipers) * kz->uk_ppera; uth.uth_limit = z->uz_max_items; uth.uth_keg_free = kfree; /* * A zone is secondary is it is not the first entry * on the keg's zone list. */ if ((z->uz_flags & UMA_ZONE_SECONDARY) && (LIST_FIRST(&kz->uk_zones) != z)) uth.uth_zone_flags = UTH_ZONE_SECONDARY; uma_vm_zone_stats(&uth, z, &sbuf, ups, kz->uk_flags & UMA_ZFLAG_INTERNAL); (void)sbuf_bcat(&sbuf, &uth, sizeof(uth)); for (i = 0; i < mp_maxid + 1; i++) (void)sbuf_bcat(&sbuf, &ups[i], sizeof(ups[i])); } } LIST_FOREACH(z, &uma_cachezones, uz_link) { bzero(&uth, sizeof(uth)); strlcpy(uth.uth_name, z->uz_name, UTH_MAX_NAME); uth.uth_size = z->uz_size; uma_vm_zone_stats(&uth, z, &sbuf, ups, false); (void)sbuf_bcat(&sbuf, &uth, sizeof(uth)); for (i = 0; i < mp_maxid + 1; i++) (void)sbuf_bcat(&sbuf, &ups[i], sizeof(ups[i])); } rw_runlock(&uma_rwlock); error = sbuf_finish(&sbuf); sbuf_delete(&sbuf); free(ups, M_TEMP); return (error); } int sysctl_handle_uma_zone_max(SYSCTL_HANDLER_ARGS) { uma_zone_t zone = *(uma_zone_t *)arg1; int error, max; max = uma_zone_get_max(zone); error = sysctl_handle_int(oidp, &max, 0, req); if (error || !req->newptr) return (error); uma_zone_set_max(zone, max); return (0); } int sysctl_handle_uma_zone_cur(SYSCTL_HANDLER_ARGS) { uma_zone_t zone; int cur; /* * Some callers want to add sysctls for global zones that * may not yet exist so they pass a pointer to a pointer. */ if (arg2 == 0) zone = *(uma_zone_t *)arg1; else zone = arg1; cur = uma_zone_get_cur(zone); return (sysctl_handle_int(oidp, &cur, 0, req)); } static int sysctl_handle_uma_zone_allocs(SYSCTL_HANDLER_ARGS) { uma_zone_t zone = arg1; uint64_t cur; cur = uma_zone_get_allocs(zone); return (sysctl_handle_64(oidp, &cur, 0, req)); } static int sysctl_handle_uma_zone_frees(SYSCTL_HANDLER_ARGS) { uma_zone_t zone = arg1; uint64_t cur; cur = uma_zone_get_frees(zone); return (sysctl_handle_64(oidp, &cur, 0, req)); } static int sysctl_handle_uma_zone_flags(SYSCTL_HANDLER_ARGS) { struct sbuf sbuf; uma_zone_t zone = arg1; int error; sbuf_new_for_sysctl(&sbuf, NULL, 0, req); if (zone->uz_flags != 0) sbuf_printf(&sbuf, "0x%b", zone->uz_flags, PRINT_UMA_ZFLAGS); else sbuf_printf(&sbuf, "0"); error = sbuf_finish(&sbuf); sbuf_delete(&sbuf); return (error); } static int sysctl_handle_uma_slab_efficiency(SYSCTL_HANDLER_ARGS) { uma_keg_t keg = arg1; int avail, effpct, total; total = keg->uk_ppera * PAGE_SIZE; if ((keg->uk_flags & UMA_ZFLAG_OFFPAGE) != 0) total += slabzone(keg->uk_ipers)->uz_keg->uk_rsize; /* * We consider the client's requested size and alignment here, not the * real size determination uk_rsize, because we also adjust the real * size for internal implementation reasons (max bitset size). */ avail = keg->uk_ipers * roundup2(keg->uk_size, keg->uk_align + 1); if ((keg->uk_flags & UMA_ZONE_PCPU) != 0) avail *= mp_maxid + 1; effpct = 100 * avail / total; return (sysctl_handle_int(oidp, &effpct, 0, req)); } static int sysctl_handle_uma_zone_items(SYSCTL_HANDLER_ARGS) { uma_zone_t zone = arg1; uint64_t cur; cur = UZ_ITEMS_COUNT(atomic_load_64(&zone->uz_items)); return (sysctl_handle_64(oidp, &cur, 0, req)); } #ifdef INVARIANTS static uma_slab_t uma_dbg_getslab(uma_zone_t zone, void *item) { uma_slab_t slab; uma_keg_t keg; uint8_t *mem; /* * It is safe to return the slab here even though the * zone is unlocked because the item's allocation state * essentially holds a reference. */ mem = (uint8_t *)((uintptr_t)item & (~UMA_SLAB_MASK)); if ((zone->uz_flags & UMA_ZFLAG_CACHE) != 0) return (NULL); if (zone->uz_flags & UMA_ZFLAG_VTOSLAB) return (vtoslab((vm_offset_t)mem)); keg = zone->uz_keg; if ((keg->uk_flags & UMA_ZFLAG_HASH) == 0) return ((uma_slab_t)(mem + keg->uk_pgoff)); KEG_LOCK(keg, 0); slab = hash_sfind(&keg->uk_hash, mem); KEG_UNLOCK(keg, 0); return (slab); } static bool uma_dbg_zskip(uma_zone_t zone, void *mem) { if ((zone->uz_flags & UMA_ZFLAG_CACHE) != 0) return (true); return (uma_dbg_kskip(zone->uz_keg, mem)); } static bool uma_dbg_kskip(uma_keg_t keg, void *mem) { uintptr_t idx; if (dbg_divisor == 0) return (true); if (dbg_divisor == 1) return (false); idx = (uintptr_t)mem >> PAGE_SHIFT; if (keg->uk_ipers > 1) { idx *= keg->uk_ipers; idx += ((uintptr_t)mem & PAGE_MASK) / keg->uk_rsize; } if ((idx / dbg_divisor) * dbg_divisor != idx) { counter_u64_add(uma_skip_cnt, 1); return (true); } counter_u64_add(uma_dbg_cnt, 1); return (false); } /* * Set up the slab's freei data such that uma_dbg_free can function. * */ static void uma_dbg_alloc(uma_zone_t zone, uma_slab_t slab, void *item) { uma_keg_t keg; int freei; if (slab == NULL) { slab = uma_dbg_getslab(zone, item); if (slab == NULL) panic("uma: item %p did not belong to zone %s", item, zone->uz_name); } keg = zone->uz_keg; freei = slab_item_index(slab, keg, item); if (BIT_ISSET(keg->uk_ipers, freei, slab_dbg_bits(slab, keg))) panic("Duplicate alloc of %p from zone %p(%s) slab %p(%d)", item, zone, zone->uz_name, slab, freei); BIT_SET_ATOMIC(keg->uk_ipers, freei, slab_dbg_bits(slab, keg)); } /* * Verifies freed addresses. Checks for alignment, valid slab membership * and duplicate frees. * */ static void uma_dbg_free(uma_zone_t zone, uma_slab_t slab, void *item) { uma_keg_t keg; int freei; if (slab == NULL) { slab = uma_dbg_getslab(zone, item); if (slab == NULL) panic("uma: Freed item %p did not belong to zone %s", item, zone->uz_name); } keg = zone->uz_keg; freei = slab_item_index(slab, keg, item); if (freei >= keg->uk_ipers) panic("Invalid free of %p from zone %p(%s) slab %p(%d)", item, zone, zone->uz_name, slab, freei); if (slab_item(slab, keg, freei) != item) panic("Unaligned free of %p from zone %p(%s) slab %p(%d)", item, zone, zone->uz_name, slab, freei); if (!BIT_ISSET(keg->uk_ipers, freei, slab_dbg_bits(slab, keg))) panic("Duplicate free of %p from zone %p(%s) slab %p(%d)", item, zone, zone->uz_name, slab, freei); BIT_CLR_ATOMIC(keg->uk_ipers, freei, slab_dbg_bits(slab, keg)); } #endif /* INVARIANTS */ #ifdef DDB static int64_t get_uma_stats(uma_keg_t kz, uma_zone_t z, uint64_t *allocs, uint64_t *used, uint64_t *sleeps, long *cachefree, uint64_t *xdomain) { uint64_t frees; int i; if (kz->uk_flags & UMA_ZFLAG_INTERNAL) { *allocs = counter_u64_fetch(z->uz_allocs); frees = counter_u64_fetch(z->uz_frees); *sleeps = z->uz_sleeps; *cachefree = 0; *xdomain = 0; } else uma_zone_sumstat(z, cachefree, allocs, &frees, sleeps, xdomain); for (i = 0; i < vm_ndomains; i++) { *cachefree += ZDOM_GET(z, i)->uzd_nitems; if (!((z->uz_flags & UMA_ZONE_SECONDARY) && (LIST_FIRST(&kz->uk_zones) != z))) *cachefree += kz->uk_domain[i].ud_free_items; } *used = *allocs - frees; return (((int64_t)*used + *cachefree) * kz->uk_size); } DB_SHOW_COMMAND(uma, db_show_uma) { const char *fmt_hdr, *fmt_entry; uma_keg_t kz; uma_zone_t z; uint64_t allocs, used, sleeps, xdomain; long cachefree; /* variables for sorting */ uma_keg_t cur_keg; uma_zone_t cur_zone, last_zone; int64_t cur_size, last_size, size; int ties; /* /i option produces machine-parseable CSV output */ if (modif[0] == 'i') { fmt_hdr = "%s,%s,%s,%s,%s,%s,%s,%s,%s\n"; fmt_entry = "\"%s\",%ju,%jd,%ld,%ju,%ju,%u,%jd,%ju\n"; } else { fmt_hdr = "%18s %6s %7s %7s %11s %7s %7s %10s %8s\n"; fmt_entry = "%18s %6ju %7jd %7ld %11ju %7ju %7u %10jd %8ju\n"; } db_printf(fmt_hdr, "Zone", "Size", "Used", "Free", "Requests", "Sleeps", "Bucket", "Total Mem", "XFree"); /* Sort the zones with largest size first. */ last_zone = NULL; last_size = INT64_MAX; for (;;) { cur_zone = NULL; cur_size = -1; ties = 0; LIST_FOREACH(kz, &uma_kegs, uk_link) { LIST_FOREACH(z, &kz->uk_zones, uz_link) { /* * In the case of size ties, print out zones * in the order they are encountered. That is, * when we encounter the most recently output * zone, we have already printed all preceding * ties, and we must print all following ties. */ if (z == last_zone) { ties = 1; continue; } size = get_uma_stats(kz, z, &allocs, &used, &sleeps, &cachefree, &xdomain); if (size > cur_size && size < last_size + ties) { cur_size = size; cur_zone = z; cur_keg = kz; } } } if (cur_zone == NULL) break; size = get_uma_stats(cur_keg, cur_zone, &allocs, &used, &sleeps, &cachefree, &xdomain); db_printf(fmt_entry, cur_zone->uz_name, (uintmax_t)cur_keg->uk_size, (intmax_t)used, cachefree, (uintmax_t)allocs, (uintmax_t)sleeps, (unsigned)cur_zone->uz_bucket_size, (intmax_t)size, xdomain); if (db_pager_quit) return; last_zone = cur_zone; last_size = cur_size; } } DB_SHOW_COMMAND(umacache, db_show_umacache) { uma_zone_t z; uint64_t allocs, frees; long cachefree; int i; db_printf("%18s %8s %8s %8s %12s %8s\n", "Zone", "Size", "Used", "Free", "Requests", "Bucket"); LIST_FOREACH(z, &uma_cachezones, uz_link) { uma_zone_sumstat(z, &cachefree, &allocs, &frees, NULL, NULL); for (i = 0; i < vm_ndomains; i++) cachefree += ZDOM_GET(z, i)->uzd_nitems; db_printf("%18s %8ju %8jd %8ld %12ju %8u\n", z->uz_name, (uintmax_t)z->uz_size, (intmax_t)(allocs - frees), cachefree, (uintmax_t)allocs, z->uz_bucket_size); if (db_pager_quit) return; } } #endif /* DDB */ Index: head/sys/vm/vm_dumpset.h =================================================================== --- head/sys/vm/vm_dumpset.h (nonexistent) +++ head/sys/vm/vm_dumpset.h (revision 366711) @@ -0,0 +1,99 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2020, Scott Phillips + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _SYS_DUMPSET_H_ +#define _SYS_DUMPSET_H_ + +#include +#include + +extern struct bitset *vm_page_dump; +extern long vm_page_dump_pages; +extern vm_paddr_t dump_avail[PHYS_AVAIL_COUNT]; + +static inline void +dump_add_page(vm_paddr_t pa) +{ + vm_pindex_t adj; + int i; + + adj = 0; + for (i = 0; dump_avail[i + 1] != 0; i += 2) { + if (pa >= dump_avail[i] && pa < dump_avail[i + 1]) { + BIT_SET_ATOMIC(vm_page_dump_pages, + (pa >> PAGE_SHIFT) - (dump_avail[i] >> PAGE_SHIFT) + + adj, vm_page_dump); + return; + } + adj += howmany(dump_avail[i + 1], PAGE_SIZE) - + dump_avail[i] / PAGE_SIZE; + } +} + +static inline void +dump_drop_page(vm_paddr_t pa) +{ + vm_pindex_t adj; + int i; + + adj = 0; + for (i = 0; dump_avail[i + 1] != 0; i += 2) { + if (pa >= dump_avail[i] && pa < dump_avail[i + 1]) { + BIT_CLR_ATOMIC(vm_page_dump_pages, + (pa >> PAGE_SHIFT) - (dump_avail[i] >> PAGE_SHIFT) + + adj, vm_page_dump); + return; + } + adj += howmany(dump_avail[i + 1], PAGE_SIZE) - + dump_avail[i] / PAGE_SIZE; + } +} + +static inline vm_paddr_t +vm_page_dump_index_to_pa(int bit) +{ + int i, tot; + + for (i = 0; dump_avail[i + 1] != 0; i += 2) { + tot = howmany(dump_avail[i + 1], PAGE_SIZE) - + dump_avail[i] / PAGE_SIZE; + if (bit < tot) + return ((vm_paddr_t)bit * PAGE_SIZE + + (dump_avail[i] & ~PAGE_MASK)); + bit -= tot; + } + return ((vm_paddr_t)NULL); +} + +#define VM_PAGE_DUMP_FOREACH(pa) \ + for (vm_pindex_t __b = BIT_FFS(vm_page_dump_pages, vm_page_dump); \ + (pa) = vm_page_dump_index_to_pa(__b - 1), __b != 0; \ + __b = BIT_FFS_AT(vm_page_dump_pages, vm_page_dump, __b)) + +#endif /* _SYS_DUMPSET_H_ */ Property changes on: head/sys/vm/vm_dumpset.h ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Index: head/sys/vm/vm_page.c =================================================================== --- head/sys/vm/vm_page.c (revision 366710) +++ head/sys/vm/vm_page.c (revision 366711) @@ -1,5529 +1,5530 @@ /*- * SPDX-License-Identifier: (BSD-3-Clause AND MIT-CMU) * * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * Copyright (c) 1998 Matthew Dillon. All Rights Reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_page.c 7.4 (Berkeley) 5/7/91 */ /*- * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ /* * Resident memory management module. */ #include __FBSDID("$FreeBSD$"); #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include struct vm_domain vm_dom[MAXMEMDOM]; DPCPU_DEFINE_STATIC(struct vm_batchqueue, pqbatch[MAXMEMDOM][PQ_COUNT]); struct mtx_padalign __exclusive_cache_line pa_lock[PA_LOCK_COUNT]; struct mtx_padalign __exclusive_cache_line vm_domainset_lock; /* The following fields are protected by the domainset lock. */ domainset_t __exclusive_cache_line vm_min_domains; domainset_t __exclusive_cache_line vm_severe_domains; static int vm_min_waiters; static int vm_severe_waiters; static int vm_pageproc_waiters; static SYSCTL_NODE(_vm_stats, OID_AUTO, page, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "VM page statistics"); static COUNTER_U64_DEFINE_EARLY(pqstate_commit_retries); SYSCTL_COUNTER_U64(_vm_stats_page, OID_AUTO, pqstate_commit_retries, CTLFLAG_RD, &pqstate_commit_retries, "Number of failed per-page atomic queue state updates"); static COUNTER_U64_DEFINE_EARLY(queue_ops); SYSCTL_COUNTER_U64(_vm_stats_page, OID_AUTO, queue_ops, CTLFLAG_RD, &queue_ops, "Number of batched queue operations"); static COUNTER_U64_DEFINE_EARLY(queue_nops); SYSCTL_COUNTER_U64(_vm_stats_page, OID_AUTO, queue_nops, CTLFLAG_RD, &queue_nops, "Number of batched queue operations with no effects"); /* * bogus page -- for I/O to/from partially complete buffers, * or for paging into sparsely invalid regions. */ vm_page_t bogus_page; vm_page_t vm_page_array; long vm_page_array_size; long first_page; struct bitset *vm_page_dump; long vm_page_dump_pages; static TAILQ_HEAD(, vm_page) blacklist_head; static int sysctl_vm_page_blacklist(SYSCTL_HANDLER_ARGS); SYSCTL_PROC(_vm, OID_AUTO, page_blacklist, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_vm_page_blacklist, "A", "Blacklist pages"); static uma_zone_t fakepg_zone; static void vm_page_alloc_check(vm_page_t m); static bool _vm_page_busy_sleep(vm_object_t obj, vm_page_t m, vm_pindex_t pindex, const char *wmesg, int allocflags, bool locked); static void vm_page_clear_dirty_mask(vm_page_t m, vm_page_bits_t pagebits); static void vm_page_enqueue(vm_page_t m, uint8_t queue); static bool vm_page_free_prep(vm_page_t m); static void vm_page_free_toq(vm_page_t m); static void vm_page_init(void *dummy); static int vm_page_insert_after(vm_page_t m, vm_object_t object, vm_pindex_t pindex, vm_page_t mpred); static void vm_page_insert_radixdone(vm_page_t m, vm_object_t object, vm_page_t mpred); static void vm_page_mvqueue(vm_page_t m, const uint8_t queue, const uint16_t nflag); static int vm_page_reclaim_run(int req_class, int domain, u_long npages, vm_page_t m_run, vm_paddr_t high); static void vm_page_release_toq(vm_page_t m, uint8_t nqueue, bool noreuse); static int vm_domain_alloc_fail(struct vm_domain *vmd, vm_object_t object, int req); static int vm_page_zone_import(void *arg, void **store, int cnt, int domain, int flags); static void vm_page_zone_release(void *arg, void **store, int cnt); SYSINIT(vm_page, SI_SUB_VM, SI_ORDER_SECOND, vm_page_init, NULL); static void vm_page_init(void *dummy) { fakepg_zone = uma_zcreate("fakepg", sizeof(struct vm_page), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); bogus_page = vm_page_alloc(NULL, 0, VM_ALLOC_NOOBJ | VM_ALLOC_NORMAL | VM_ALLOC_WIRED); } /* * The cache page zone is initialized later since we need to be able to allocate * pages before UMA is fully initialized. */ static void vm_page_init_cache_zones(void *dummy __unused) { struct vm_domain *vmd; struct vm_pgcache *pgcache; int cache, domain, maxcache, pool; maxcache = 0; TUNABLE_INT_FETCH("vm.pgcache_zone_max_pcpu", &maxcache); maxcache *= mp_ncpus; for (domain = 0; domain < vm_ndomains; domain++) { vmd = VM_DOMAIN(domain); for (pool = 0; pool < VM_NFREEPOOL; pool++) { pgcache = &vmd->vmd_pgcache[pool]; pgcache->domain = domain; pgcache->pool = pool; pgcache->zone = uma_zcache_create("vm pgcache", PAGE_SIZE, NULL, NULL, NULL, NULL, vm_page_zone_import, vm_page_zone_release, pgcache, UMA_ZONE_VM); /* * Limit each pool's zone to 0.1% of the pages in the * domain. */ cache = maxcache != 0 ? maxcache : vmd->vmd_page_count / 1000; uma_zone_set_maxcache(pgcache->zone, cache); } } } SYSINIT(vm_page2, SI_SUB_VM_CONF, SI_ORDER_ANY, vm_page_init_cache_zones, NULL); /* Make sure that u_long is at least 64 bits when PAGE_SIZE is 32K. */ #if PAGE_SIZE == 32768 #ifdef CTASSERT CTASSERT(sizeof(u_long) >= 8); #endif #endif /* * vm_set_page_size: * * Sets the page size, perhaps based upon the memory * size. Must be called before any use of page-size * dependent functions. */ void vm_set_page_size(void) { if (vm_cnt.v_page_size == 0) vm_cnt.v_page_size = PAGE_SIZE; if (((vm_cnt.v_page_size - 1) & vm_cnt.v_page_size) != 0) panic("vm_set_page_size: page size not a power of two"); } /* * vm_page_blacklist_next: * * Find the next entry in the provided string of blacklist * addresses. Entries are separated by space, comma, or newline. * If an invalid integer is encountered then the rest of the * string is skipped. Updates the list pointer to the next * character, or NULL if the string is exhausted or invalid. */ static vm_paddr_t vm_page_blacklist_next(char **list, char *end) { vm_paddr_t bad; char *cp, *pos; if (list == NULL || *list == NULL) return (0); if (**list =='\0') { *list = NULL; return (0); } /* * If there's no end pointer then the buffer is coming from * the kenv and we know it's null-terminated. */ if (end == NULL) end = *list + strlen(*list); /* Ensure that strtoq() won't walk off the end */ if (*end != '\0') { if (*end == '\n' || *end == ' ' || *end == ',') *end = '\0'; else { printf("Blacklist not terminated, skipping\n"); *list = NULL; return (0); } } for (pos = *list; *pos != '\0'; pos = cp) { bad = strtoq(pos, &cp, 0); if (*cp == '\0' || *cp == ' ' || *cp == ',' || *cp == '\n') { if (bad == 0) { if (++cp < end) continue; else break; } } else break; if (*cp == '\0' || ++cp >= end) *list = NULL; else *list = cp; return (trunc_page(bad)); } printf("Garbage in RAM blacklist, skipping\n"); *list = NULL; return (0); } bool vm_page_blacklist_add(vm_paddr_t pa, bool verbose) { struct vm_domain *vmd; vm_page_t m; int ret; m = vm_phys_paddr_to_vm_page(pa); if (m == NULL) return (true); /* page does not exist, no failure */ vmd = vm_pagequeue_domain(m); vm_domain_free_lock(vmd); ret = vm_phys_unfree_page(m); vm_domain_free_unlock(vmd); if (ret != 0) { vm_domain_freecnt_inc(vmd, -1); TAILQ_INSERT_TAIL(&blacklist_head, m, listq); if (verbose) printf("Skipping page with pa 0x%jx\n", (uintmax_t)pa); } return (ret); } /* * vm_page_blacklist_check: * * Iterate through the provided string of blacklist addresses, pulling * each entry out of the physical allocator free list and putting it * onto a list for reporting via the vm.page_blacklist sysctl. */ static void vm_page_blacklist_check(char *list, char *end) { vm_paddr_t pa; char *next; next = list; while (next != NULL) { if ((pa = vm_page_blacklist_next(&next, end)) == 0) continue; vm_page_blacklist_add(pa, bootverbose); } } /* * vm_page_blacklist_load: * * Search for a special module named "ram_blacklist". It'll be a * plain text file provided by the user via the loader directive * of the same name. */ static void vm_page_blacklist_load(char **list, char **end) { void *mod; u_char *ptr; u_int len; mod = NULL; ptr = NULL; mod = preload_search_by_type("ram_blacklist"); if (mod != NULL) { ptr = preload_fetch_addr(mod); len = preload_fetch_size(mod); } *list = ptr; if (ptr != NULL) *end = ptr + len; else *end = NULL; return; } static int sysctl_vm_page_blacklist(SYSCTL_HANDLER_ARGS) { vm_page_t m; struct sbuf sbuf; int error, first; first = 1; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); sbuf_new_for_sysctl(&sbuf, NULL, 128, req); TAILQ_FOREACH(m, &blacklist_head, listq) { sbuf_printf(&sbuf, "%s%#jx", first ? "" : ",", (uintmax_t)m->phys_addr); first = 0; } error = sbuf_finish(&sbuf); sbuf_delete(&sbuf); return (error); } /* * Initialize a dummy page for use in scans of the specified paging queue. * In principle, this function only needs to set the flag PG_MARKER. * Nonetheless, it write busies the page as a safety precaution. */ void vm_page_init_marker(vm_page_t marker, int queue, uint16_t aflags) { bzero(marker, sizeof(*marker)); marker->flags = PG_MARKER; marker->a.flags = aflags; marker->busy_lock = VPB_CURTHREAD_EXCLUSIVE; marker->a.queue = queue; } static void vm_page_domain_init(int domain) { struct vm_domain *vmd; struct vm_pagequeue *pq; int i; vmd = VM_DOMAIN(domain); bzero(vmd, sizeof(*vmd)); *__DECONST(const char **, &vmd->vmd_pagequeues[PQ_INACTIVE].pq_name) = "vm inactive pagequeue"; *__DECONST(const char **, &vmd->vmd_pagequeues[PQ_ACTIVE].pq_name) = "vm active pagequeue"; *__DECONST(const char **, &vmd->vmd_pagequeues[PQ_LAUNDRY].pq_name) = "vm laundry pagequeue"; *__DECONST(const char **, &vmd->vmd_pagequeues[PQ_UNSWAPPABLE].pq_name) = "vm unswappable pagequeue"; vmd->vmd_domain = domain; vmd->vmd_page_count = 0; vmd->vmd_free_count = 0; vmd->vmd_segs = 0; vmd->vmd_oom = FALSE; for (i = 0; i < PQ_COUNT; i++) { pq = &vmd->vmd_pagequeues[i]; TAILQ_INIT(&pq->pq_pl); mtx_init(&pq->pq_mutex, pq->pq_name, "vm pagequeue", MTX_DEF | MTX_DUPOK); pq->pq_pdpages = 0; vm_page_init_marker(&vmd->vmd_markers[i], i, 0); } mtx_init(&vmd->vmd_free_mtx, "vm page free queue", NULL, MTX_DEF); mtx_init(&vmd->vmd_pageout_mtx, "vm pageout lock", NULL, MTX_DEF); snprintf(vmd->vmd_name, sizeof(vmd->vmd_name), "%d", domain); /* * inacthead is used to provide FIFO ordering for LRU-bypassing * insertions. */ vm_page_init_marker(&vmd->vmd_inacthead, PQ_INACTIVE, PGA_ENQUEUED); TAILQ_INSERT_HEAD(&vmd->vmd_pagequeues[PQ_INACTIVE].pq_pl, &vmd->vmd_inacthead, plinks.q); /* * The clock pages are used to implement active queue scanning without * requeues. Scans start at clock[0], which is advanced after the scan * ends. When the two clock hands meet, they are reset and scanning * resumes from the head of the queue. */ vm_page_init_marker(&vmd->vmd_clock[0], PQ_ACTIVE, PGA_ENQUEUED); vm_page_init_marker(&vmd->vmd_clock[1], PQ_ACTIVE, PGA_ENQUEUED); TAILQ_INSERT_HEAD(&vmd->vmd_pagequeues[PQ_ACTIVE].pq_pl, &vmd->vmd_clock[0], plinks.q); TAILQ_INSERT_TAIL(&vmd->vmd_pagequeues[PQ_ACTIVE].pq_pl, &vmd->vmd_clock[1], plinks.q); } /* * Initialize a physical page in preparation for adding it to the free * lists. */ static void vm_page_init_page(vm_page_t m, vm_paddr_t pa, int segind) { m->object = NULL; m->ref_count = 0; m->busy_lock = VPB_FREED; m->flags = m->a.flags = 0; m->phys_addr = pa; m->a.queue = PQ_NONE; m->psind = 0; m->segind = segind; m->order = VM_NFREEORDER; m->pool = VM_FREEPOOL_DEFAULT; m->valid = m->dirty = 0; pmap_page_init(m); } #ifndef PMAP_HAS_PAGE_ARRAY static vm_paddr_t vm_page_array_alloc(vm_offset_t *vaddr, vm_paddr_t end, vm_paddr_t page_range) { vm_paddr_t new_end; /* * Reserve an unmapped guard page to trap access to vm_page_array[-1]. * However, because this page is allocated from KVM, out-of-bounds * accesses using the direct map will not be trapped. */ *vaddr += PAGE_SIZE; /* * Allocate physical memory for the page structures, and map it. */ new_end = trunc_page(end - page_range * sizeof(struct vm_page)); vm_page_array = (vm_page_t)pmap_map(vaddr, new_end, end, VM_PROT_READ | VM_PROT_WRITE); vm_page_array_size = page_range; return (new_end); } #endif /* * vm_page_startup: * * Initializes the resident memory module. Allocates physical memory for * bootstrapping UMA and some data structures that are used to manage * physical pages. Initializes these structures, and populates the free * page queues. */ vm_offset_t vm_page_startup(vm_offset_t vaddr) { struct vm_phys_seg *seg; vm_page_t m; char *list, *listend; vm_paddr_t end, high_avail, low_avail, new_end, size; vm_paddr_t page_range __unused; vm_paddr_t last_pa, pa; u_long pagecount; #if MINIDUMP_PAGE_TRACKING u_long vm_page_dump_size; #endif int biggestone, i, segind; #ifdef WITNESS vm_offset_t mapped; int witness_size; #endif #if defined(__i386__) && defined(VM_PHYSSEG_DENSE) long ii; #endif vaddr = round_page(vaddr); vm_phys_early_startup(); biggestone = vm_phys_avail_largest(); end = phys_avail[biggestone+1]; /* * Initialize the page and queue locks. */ mtx_init(&vm_domainset_lock, "vm domainset lock", NULL, MTX_DEF); for (i = 0; i < PA_LOCK_COUNT; i++) mtx_init(&pa_lock[i], "vm page", NULL, MTX_DEF); for (i = 0; i < vm_ndomains; i++) vm_page_domain_init(i); new_end = end; #ifdef WITNESS witness_size = round_page(witness_startup_count()); new_end -= witness_size; mapped = pmap_map(&vaddr, new_end, new_end + witness_size, VM_PROT_READ | VM_PROT_WRITE); bzero((void *)mapped, witness_size); witness_startup((void *)mapped); #endif #if MINIDUMP_PAGE_TRACKING /* * Allocate a bitmap to indicate that a random physical page * needs to be included in a minidump. * * The amd64 port needs this to indicate which direct map pages * need to be dumped, via calls to dump_add_page()/dump_drop_page(). * * However, i386 still needs this workspace internally within the * minidump code. In theory, they are not needed on i386, but are * included should the sf_buf code decide to use them. */ last_pa = 0; vm_page_dump_pages = 0; for (i = 0; dump_avail[i + 1] != 0; i += 2) { vm_page_dump_pages += howmany(dump_avail[i + 1], PAGE_SIZE) - dump_avail[i] / PAGE_SIZE; if (dump_avail[i + 1] > last_pa) last_pa = dump_avail[i + 1]; } vm_page_dump_size = round_page(BITSET_SIZE(vm_page_dump_pages)); new_end -= vm_page_dump_size; vm_page_dump = (void *)(uintptr_t)pmap_map(&vaddr, new_end, new_end + vm_page_dump_size, VM_PROT_READ | VM_PROT_WRITE); bzero((void *)vm_page_dump, vm_page_dump_size); #else (void)last_pa; #endif #if defined(__aarch64__) || defined(__amd64__) || defined(__mips__) || \ defined(__riscv) || defined(__powerpc64__) /* * Include the UMA bootstrap pages, witness pages and vm_page_dump * in a crash dump. When pmap_map() uses the direct map, they are * not automatically included. */ for (pa = new_end; pa < end; pa += PAGE_SIZE) dump_add_page(pa); #endif phys_avail[biggestone + 1] = new_end; #ifdef __amd64__ /* * Request that the physical pages underlying the message buffer be * included in a crash dump. Since the message buffer is accessed * through the direct map, they are not automatically included. */ pa = DMAP_TO_PHYS((vm_offset_t)msgbufp->msg_ptr); last_pa = pa + round_page(msgbufsize); while (pa < last_pa) { dump_add_page(pa); pa += PAGE_SIZE; } #endif /* * Compute the number of pages of memory that will be available for * use, taking into account the overhead of a page structure per page. * In other words, solve * "available physical memory" - round_page(page_range * * sizeof(struct vm_page)) = page_range * PAGE_SIZE * for page_range. */ low_avail = phys_avail[0]; high_avail = phys_avail[1]; for (i = 0; i < vm_phys_nsegs; i++) { if (vm_phys_segs[i].start < low_avail) low_avail = vm_phys_segs[i].start; if (vm_phys_segs[i].end > high_avail) high_avail = vm_phys_segs[i].end; } /* Skip the first chunk. It is already accounted for. */ for (i = 2; phys_avail[i + 1] != 0; i += 2) { if (phys_avail[i] < low_avail) low_avail = phys_avail[i]; if (phys_avail[i + 1] > high_avail) high_avail = phys_avail[i + 1]; } first_page = low_avail / PAGE_SIZE; #ifdef VM_PHYSSEG_SPARSE size = 0; for (i = 0; i < vm_phys_nsegs; i++) size += vm_phys_segs[i].end - vm_phys_segs[i].start; for (i = 0; phys_avail[i + 1] != 0; i += 2) size += phys_avail[i + 1] - phys_avail[i]; #elif defined(VM_PHYSSEG_DENSE) size = high_avail - low_avail; #else #error "Either VM_PHYSSEG_DENSE or VM_PHYSSEG_SPARSE must be defined." #endif #ifdef PMAP_HAS_PAGE_ARRAY pmap_page_array_startup(size / PAGE_SIZE); biggestone = vm_phys_avail_largest(); end = new_end = phys_avail[biggestone + 1]; #else #ifdef VM_PHYSSEG_DENSE /* * In the VM_PHYSSEG_DENSE case, the number of pages can account for * the overhead of a page structure per page only if vm_page_array is * allocated from the last physical memory chunk. Otherwise, we must * allocate page structures representing the physical memory * underlying vm_page_array, even though they will not be used. */ if (new_end != high_avail) page_range = size / PAGE_SIZE; else #endif { page_range = size / (PAGE_SIZE + sizeof(struct vm_page)); /* * If the partial bytes remaining are large enough for * a page (PAGE_SIZE) without a corresponding * 'struct vm_page', then new_end will contain an * extra page after subtracting the length of the VM * page array. Compensate by subtracting an extra * page from new_end. */ if (size % (PAGE_SIZE + sizeof(struct vm_page)) >= PAGE_SIZE) { if (new_end == high_avail) high_avail -= PAGE_SIZE; new_end -= PAGE_SIZE; } } end = new_end; new_end = vm_page_array_alloc(&vaddr, end, page_range); #endif #if VM_NRESERVLEVEL > 0 /* * Allocate physical memory for the reservation management system's * data structures, and map it. */ new_end = vm_reserv_startup(&vaddr, new_end); #endif #if defined(__aarch64__) || defined(__amd64__) || defined(__mips__) || \ defined(__riscv) || defined(__powerpc64__) /* * Include vm_page_array and vm_reserv_array in a crash dump. */ for (pa = new_end; pa < end; pa += PAGE_SIZE) dump_add_page(pa); #endif phys_avail[biggestone + 1] = new_end; /* * Add physical memory segments corresponding to the available * physical pages. */ for (i = 0; phys_avail[i + 1] != 0; i += 2) if (vm_phys_avail_size(i) != 0) vm_phys_add_seg(phys_avail[i], phys_avail[i + 1]); /* * Initialize the physical memory allocator. */ vm_phys_init(); /* * Initialize the page structures and add every available page to the * physical memory allocator's free lists. */ #if defined(__i386__) && defined(VM_PHYSSEG_DENSE) for (ii = 0; ii < vm_page_array_size; ii++) { m = &vm_page_array[ii]; vm_page_init_page(m, (first_page + ii) << PAGE_SHIFT, 0); m->flags = PG_FICTITIOUS; } #endif vm_cnt.v_page_count = 0; for (segind = 0; segind < vm_phys_nsegs; segind++) { seg = &vm_phys_segs[segind]; for (m = seg->first_page, pa = seg->start; pa < seg->end; m++, pa += PAGE_SIZE) vm_page_init_page(m, pa, segind); /* * Add the segment to the free lists only if it is covered by * one of the ranges in phys_avail. Because we've added the * ranges to the vm_phys_segs array, we can assume that each * segment is either entirely contained in one of the ranges, * or doesn't overlap any of them. */ for (i = 0; phys_avail[i + 1] != 0; i += 2) { struct vm_domain *vmd; if (seg->start < phys_avail[i] || seg->end > phys_avail[i + 1]) continue; m = seg->first_page; pagecount = (u_long)atop(seg->end - seg->start); vmd = VM_DOMAIN(seg->domain); vm_domain_free_lock(vmd); vm_phys_enqueue_contig(m, pagecount); vm_domain_free_unlock(vmd); vm_domain_freecnt_inc(vmd, pagecount); vm_cnt.v_page_count += (u_int)pagecount; vmd = VM_DOMAIN(seg->domain); vmd->vmd_page_count += (u_int)pagecount; vmd->vmd_segs |= 1UL << m->segind; break; } } /* * Remove blacklisted pages from the physical memory allocator. */ TAILQ_INIT(&blacklist_head); vm_page_blacklist_load(&list, &listend); vm_page_blacklist_check(list, listend); list = kern_getenv("vm.blacklist"); vm_page_blacklist_check(list, NULL); freeenv(list); #if VM_NRESERVLEVEL > 0 /* * Initialize the reservation management system. */ vm_reserv_init(); #endif return (vaddr); } void vm_page_reference(vm_page_t m) { vm_page_aflag_set(m, PGA_REFERENCED); } /* * vm_page_trybusy * * Helper routine for grab functions to trylock busy. * * Returns true on success and false on failure. */ static bool vm_page_trybusy(vm_page_t m, int allocflags) { if ((allocflags & (VM_ALLOC_SBUSY | VM_ALLOC_IGN_SBUSY)) != 0) return (vm_page_trysbusy(m)); else return (vm_page_tryxbusy(m)); } /* * vm_page_tryacquire * * Helper routine for grab functions to trylock busy and wire. * * Returns true on success and false on failure. */ static inline bool vm_page_tryacquire(vm_page_t m, int allocflags) { bool locked; locked = vm_page_trybusy(m, allocflags); if (locked && (allocflags & VM_ALLOC_WIRED) != 0) vm_page_wire(m); return (locked); } /* * vm_page_busy_acquire: * * Acquire the busy lock as described by VM_ALLOC_* flags. Will loop * and drop the object lock if necessary. */ bool vm_page_busy_acquire(vm_page_t m, int allocflags) { vm_object_t obj; bool locked; /* * The page-specific object must be cached because page * identity can change during the sleep, causing the * re-lock of a different object. * It is assumed that a reference to the object is already * held by the callers. */ obj = m->object; for (;;) { if (vm_page_tryacquire(m, allocflags)) return (true); if ((allocflags & VM_ALLOC_NOWAIT) != 0) return (false); if (obj != NULL) locked = VM_OBJECT_WOWNED(obj); else locked = false; MPASS(locked || vm_page_wired(m)); if (_vm_page_busy_sleep(obj, m, m->pindex, "vmpba", allocflags, locked) && locked) VM_OBJECT_WLOCK(obj); if ((allocflags & VM_ALLOC_WAITFAIL) != 0) return (false); KASSERT(m->object == obj || m->object == NULL, ("vm_page_busy_acquire: page %p does not belong to %p", m, obj)); } } /* * vm_page_busy_downgrade: * * Downgrade an exclusive busy page into a single shared busy page. */ void vm_page_busy_downgrade(vm_page_t m) { u_int x; vm_page_assert_xbusied(m); x = vm_page_busy_fetch(m); for (;;) { if (atomic_fcmpset_rel_int(&m->busy_lock, &x, VPB_SHARERS_WORD(1))) break; } if ((x & VPB_BIT_WAITERS) != 0) wakeup(m); } /* * * vm_page_busy_tryupgrade: * * Attempt to upgrade a single shared busy into an exclusive busy. */ int vm_page_busy_tryupgrade(vm_page_t m) { u_int ce, x; vm_page_assert_sbusied(m); x = vm_page_busy_fetch(m); ce = VPB_CURTHREAD_EXCLUSIVE; for (;;) { if (VPB_SHARERS(x) > 1) return (0); KASSERT((x & ~VPB_BIT_WAITERS) == VPB_SHARERS_WORD(1), ("vm_page_busy_tryupgrade: invalid lock state")); if (!atomic_fcmpset_acq_int(&m->busy_lock, &x, ce | (x & VPB_BIT_WAITERS))) continue; return (1); } } /* * vm_page_sbusied: * * Return a positive value if the page is shared busied, 0 otherwise. */ int vm_page_sbusied(vm_page_t m) { u_int x; x = vm_page_busy_fetch(m); return ((x & VPB_BIT_SHARED) != 0 && x != VPB_UNBUSIED); } /* * vm_page_sunbusy: * * Shared unbusy a page. */ void vm_page_sunbusy(vm_page_t m) { u_int x; vm_page_assert_sbusied(m); x = vm_page_busy_fetch(m); for (;;) { KASSERT(x != VPB_FREED, ("vm_page_sunbusy: Unlocking freed page.")); if (VPB_SHARERS(x) > 1) { if (atomic_fcmpset_int(&m->busy_lock, &x, x - VPB_ONE_SHARER)) break; continue; } KASSERT((x & ~VPB_BIT_WAITERS) == VPB_SHARERS_WORD(1), ("vm_page_sunbusy: invalid lock state")); if (!atomic_fcmpset_rel_int(&m->busy_lock, &x, VPB_UNBUSIED)) continue; if ((x & VPB_BIT_WAITERS) == 0) break; wakeup(m); break; } } /* * vm_page_busy_sleep: * * Sleep if the page is busy, using the page pointer as wchan. * This is used to implement the hard-path of busying mechanism. * * If nonshared is true, sleep only if the page is xbusy. * * The object lock must be held on entry and will be released on exit. */ void vm_page_busy_sleep(vm_page_t m, const char *wmesg, bool nonshared) { vm_object_t obj; obj = m->object; VM_OBJECT_ASSERT_LOCKED(obj); vm_page_lock_assert(m, MA_NOTOWNED); if (!_vm_page_busy_sleep(obj, m, m->pindex, wmesg, nonshared ? VM_ALLOC_SBUSY : 0 , true)) VM_OBJECT_DROP(obj); } /* * vm_page_busy_sleep_unlocked: * * Sleep if the page is busy, using the page pointer as wchan. * This is used to implement the hard-path of busying mechanism. * * If nonshared is true, sleep only if the page is xbusy. * * The object lock must not be held on entry. The operation will * return if the page changes identity. */ void vm_page_busy_sleep_unlocked(vm_object_t obj, vm_page_t m, vm_pindex_t pindex, const char *wmesg, bool nonshared) { VM_OBJECT_ASSERT_UNLOCKED(obj); vm_page_lock_assert(m, MA_NOTOWNED); _vm_page_busy_sleep(obj, m, pindex, wmesg, nonshared ? VM_ALLOC_SBUSY : 0, false); } /* * _vm_page_busy_sleep: * * Internal busy sleep function. Verifies the page identity and * lockstate against parameters. Returns true if it sleeps and * false otherwise. * * If locked is true the lock will be dropped for any true returns * and held for any false returns. */ static bool _vm_page_busy_sleep(vm_object_t obj, vm_page_t m, vm_pindex_t pindex, const char *wmesg, int allocflags, bool locked) { bool xsleep; u_int x; /* * If the object is busy we must wait for that to drain to zero * before trying the page again. */ if (obj != NULL && vm_object_busied(obj)) { if (locked) VM_OBJECT_DROP(obj); vm_object_busy_wait(obj, wmesg); return (true); } if (!vm_page_busied(m)) return (false); xsleep = (allocflags & (VM_ALLOC_SBUSY | VM_ALLOC_IGN_SBUSY)) != 0; sleepq_lock(m); x = vm_page_busy_fetch(m); do { /* * If the page changes objects or becomes unlocked we can * simply return. */ if (x == VPB_UNBUSIED || (xsleep && (x & VPB_BIT_SHARED) != 0) || m->object != obj || m->pindex != pindex) { sleepq_release(m); return (false); } if ((x & VPB_BIT_WAITERS) != 0) break; } while (!atomic_fcmpset_int(&m->busy_lock, &x, x | VPB_BIT_WAITERS)); if (locked) VM_OBJECT_DROP(obj); DROP_GIANT(); sleepq_add(m, NULL, wmesg, 0, 0); sleepq_wait(m, PVM); PICKUP_GIANT(); return (true); } /* * vm_page_trysbusy: * * Try to shared busy a page. * If the operation succeeds 1 is returned otherwise 0. * The operation never sleeps. */ int vm_page_trysbusy(vm_page_t m) { vm_object_t obj; u_int x; obj = m->object; x = vm_page_busy_fetch(m); for (;;) { if ((x & VPB_BIT_SHARED) == 0) return (0); /* * Reduce the window for transient busies that will trigger * false negatives in vm_page_ps_test(). */ if (obj != NULL && vm_object_busied(obj)) return (0); if (atomic_fcmpset_acq_int(&m->busy_lock, &x, x + VPB_ONE_SHARER)) break; } /* Refetch the object now that we're guaranteed that it is stable. */ obj = m->object; if (obj != NULL && vm_object_busied(obj)) { vm_page_sunbusy(m); return (0); } return (1); } /* * vm_page_tryxbusy: * * Try to exclusive busy a page. * If the operation succeeds 1 is returned otherwise 0. * The operation never sleeps. */ int vm_page_tryxbusy(vm_page_t m) { vm_object_t obj; if (atomic_cmpset_acq_int(&m->busy_lock, VPB_UNBUSIED, VPB_CURTHREAD_EXCLUSIVE) == 0) return (0); obj = m->object; if (obj != NULL && vm_object_busied(obj)) { vm_page_xunbusy(m); return (0); } return (1); } static void vm_page_xunbusy_hard_tail(vm_page_t m) { atomic_store_rel_int(&m->busy_lock, VPB_UNBUSIED); /* Wake the waiter. */ wakeup(m); } /* * vm_page_xunbusy_hard: * * Called when unbusy has failed because there is a waiter. */ void vm_page_xunbusy_hard(vm_page_t m) { vm_page_assert_xbusied(m); vm_page_xunbusy_hard_tail(m); } void vm_page_xunbusy_hard_unchecked(vm_page_t m) { vm_page_assert_xbusied_unchecked(m); vm_page_xunbusy_hard_tail(m); } static void vm_page_busy_free(vm_page_t m) { u_int x; atomic_thread_fence_rel(); x = atomic_swap_int(&m->busy_lock, VPB_FREED); if ((x & VPB_BIT_WAITERS) != 0) wakeup(m); } /* * vm_page_unhold_pages: * * Unhold each of the pages that is referenced by the given array. */ void vm_page_unhold_pages(vm_page_t *ma, int count) { for (; count != 0; count--) { vm_page_unwire(*ma, PQ_ACTIVE); ma++; } } vm_page_t PHYS_TO_VM_PAGE(vm_paddr_t pa) { vm_page_t m; #ifdef VM_PHYSSEG_SPARSE m = vm_phys_paddr_to_vm_page(pa); if (m == NULL) m = vm_phys_fictitious_to_vm_page(pa); return (m); #elif defined(VM_PHYSSEG_DENSE) long pi; pi = atop(pa); if (pi >= first_page && (pi - first_page) < vm_page_array_size) { m = &vm_page_array[pi - first_page]; return (m); } return (vm_phys_fictitious_to_vm_page(pa)); #else #error "Either VM_PHYSSEG_DENSE or VM_PHYSSEG_SPARSE must be defined." #endif } /* * vm_page_getfake: * * Create a fictitious page with the specified physical address and * memory attribute. The memory attribute is the only the machine- * dependent aspect of a fictitious page that must be initialized. */ vm_page_t vm_page_getfake(vm_paddr_t paddr, vm_memattr_t memattr) { vm_page_t m; m = uma_zalloc(fakepg_zone, M_WAITOK | M_ZERO); vm_page_initfake(m, paddr, memattr); return (m); } void vm_page_initfake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr) { if ((m->flags & PG_FICTITIOUS) != 0) { /* * The page's memattr might have changed since the * previous initialization. Update the pmap to the * new memattr. */ goto memattr; } m->phys_addr = paddr; m->a.queue = PQ_NONE; /* Fictitious pages don't use "segind". */ m->flags = PG_FICTITIOUS; /* Fictitious pages don't use "order" or "pool". */ m->oflags = VPO_UNMANAGED; m->busy_lock = VPB_CURTHREAD_EXCLUSIVE; /* Fictitious pages are unevictable. */ m->ref_count = 1; pmap_page_init(m); memattr: pmap_page_set_memattr(m, memattr); } /* * vm_page_putfake: * * Release a fictitious page. */ void vm_page_putfake(vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) != 0, ("managed %p", m)); KASSERT((m->flags & PG_FICTITIOUS) != 0, ("vm_page_putfake: bad page %p", m)); vm_page_assert_xbusied(m); vm_page_busy_free(m); uma_zfree(fakepg_zone, m); } /* * vm_page_updatefake: * * Update the given fictitious page to the specified physical address and * memory attribute. */ void vm_page_updatefake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr) { KASSERT((m->flags & PG_FICTITIOUS) != 0, ("vm_page_updatefake: bad page %p", m)); m->phys_addr = paddr; pmap_page_set_memattr(m, memattr); } /* * vm_page_free: * * Free a page. */ void vm_page_free(vm_page_t m) { m->flags &= ~PG_ZERO; vm_page_free_toq(m); } /* * vm_page_free_zero: * * Free a page to the zerod-pages queue */ void vm_page_free_zero(vm_page_t m) { m->flags |= PG_ZERO; vm_page_free_toq(m); } /* * Unbusy and handle the page queueing for a page from a getpages request that * was optionally read ahead or behind. */ void vm_page_readahead_finish(vm_page_t m) { /* We shouldn't put invalid pages on queues. */ KASSERT(!vm_page_none_valid(m), ("%s: %p is invalid", __func__, m)); /* * Since the page is not the actually needed one, whether it should * be activated or deactivated is not obvious. Empirical results * have shown that deactivating the page is usually the best choice, * unless the page is wanted by another thread. */ if ((vm_page_busy_fetch(m) & VPB_BIT_WAITERS) != 0) vm_page_activate(m); else vm_page_deactivate(m); vm_page_xunbusy_unchecked(m); } /* * Destroy the identity of an invalid page and free it if possible. * This is intended to be used when reading a page from backing store fails. */ void vm_page_free_invalid(vm_page_t m) { KASSERT(vm_page_none_valid(m), ("page %p is valid", m)); KASSERT(!pmap_page_is_mapped(m), ("page %p is mapped", m)); KASSERT(m->object != NULL, ("page %p has no object", m)); VM_OBJECT_ASSERT_WLOCKED(m->object); /* * We may be attempting to free the page as part of the handling for an * I/O error, in which case the page was xbusied by a different thread. */ vm_page_xbusy_claim(m); /* * If someone has wired this page while the object lock * was not held, then the thread that unwires is responsible * for freeing the page. Otherwise just free the page now. * The wire count of this unmapped page cannot change while * we have the page xbusy and the page's object wlocked. */ if (vm_page_remove(m)) vm_page_free(m); } /* * vm_page_sleep_if_busy: * * Sleep and release the object lock if the page is busied. * Returns TRUE if the thread slept. * * The given page must be unlocked and object containing it must * be locked. */ int vm_page_sleep_if_busy(vm_page_t m, const char *wmesg) { vm_object_t obj; vm_page_lock_assert(m, MA_NOTOWNED); VM_OBJECT_ASSERT_WLOCKED(m->object); /* * The page-specific object must be cached because page * identity can change during the sleep, causing the * re-lock of a different object. * It is assumed that a reference to the object is already * held by the callers. */ obj = m->object; if (_vm_page_busy_sleep(obj, m, m->pindex, wmesg, 0, true)) { VM_OBJECT_WLOCK(obj); return (TRUE); } return (FALSE); } /* * vm_page_sleep_if_xbusy: * * Sleep and release the object lock if the page is xbusied. * Returns TRUE if the thread slept. * * The given page must be unlocked and object containing it must * be locked. */ int vm_page_sleep_if_xbusy(vm_page_t m, const char *wmesg) { vm_object_t obj; vm_page_lock_assert(m, MA_NOTOWNED); VM_OBJECT_ASSERT_WLOCKED(m->object); /* * The page-specific object must be cached because page * identity can change during the sleep, causing the * re-lock of a different object. * It is assumed that a reference to the object is already * held by the callers. */ obj = m->object; if (_vm_page_busy_sleep(obj, m, m->pindex, wmesg, VM_ALLOC_SBUSY, true)) { VM_OBJECT_WLOCK(obj); return (TRUE); } return (FALSE); } /* * vm_page_dirty_KBI: [ internal use only ] * * Set all bits in the page's dirty field. * * The object containing the specified page must be locked if the * call is made from the machine-independent layer. * * See vm_page_clear_dirty_mask(). * * This function should only be called by vm_page_dirty(). */ void vm_page_dirty_KBI(vm_page_t m) { /* Refer to this operation by its public name. */ KASSERT(vm_page_all_valid(m), ("vm_page_dirty: page is invalid!")); m->dirty = VM_PAGE_BITS_ALL; } /* * vm_page_insert: [ internal use only ] * * Inserts the given mem entry into the object and object list. * * The object must be locked. */ int vm_page_insert(vm_page_t m, vm_object_t object, vm_pindex_t pindex) { vm_page_t mpred; VM_OBJECT_ASSERT_WLOCKED(object); mpred = vm_radix_lookup_le(&object->rtree, pindex); return (vm_page_insert_after(m, object, pindex, mpred)); } /* * vm_page_insert_after: * * Inserts the page "m" into the specified object at offset "pindex". * * The page "mpred" must immediately precede the offset "pindex" within * the specified object. * * The object must be locked. */ static int vm_page_insert_after(vm_page_t m, vm_object_t object, vm_pindex_t pindex, vm_page_t mpred) { vm_page_t msucc; VM_OBJECT_ASSERT_WLOCKED(object); KASSERT(m->object == NULL, ("vm_page_insert_after: page already inserted")); if (mpred != NULL) { KASSERT(mpred->object == object, ("vm_page_insert_after: object doesn't contain mpred")); KASSERT(mpred->pindex < pindex, ("vm_page_insert_after: mpred doesn't precede pindex")); msucc = TAILQ_NEXT(mpred, listq); } else msucc = TAILQ_FIRST(&object->memq); if (msucc != NULL) KASSERT(msucc->pindex > pindex, ("vm_page_insert_after: msucc doesn't succeed pindex")); /* * Record the object/offset pair in this page. */ m->object = object; m->pindex = pindex; m->ref_count |= VPRC_OBJREF; /* * Now link into the object's ordered list of backed pages. */ if (vm_radix_insert(&object->rtree, m)) { m->object = NULL; m->pindex = 0; m->ref_count &= ~VPRC_OBJREF; return (1); } vm_page_insert_radixdone(m, object, mpred); return (0); } /* * vm_page_insert_radixdone: * * Complete page "m" insertion into the specified object after the * radix trie hooking. * * The page "mpred" must precede the offset "m->pindex" within the * specified object. * * The object must be locked. */ static void vm_page_insert_radixdone(vm_page_t m, vm_object_t object, vm_page_t mpred) { VM_OBJECT_ASSERT_WLOCKED(object); KASSERT(object != NULL && m->object == object, ("vm_page_insert_radixdone: page %p has inconsistent object", m)); KASSERT((m->ref_count & VPRC_OBJREF) != 0, ("vm_page_insert_radixdone: page %p is missing object ref", m)); if (mpred != NULL) { KASSERT(mpred->object == object, ("vm_page_insert_radixdone: object doesn't contain mpred")); KASSERT(mpred->pindex < m->pindex, ("vm_page_insert_radixdone: mpred doesn't precede pindex")); } if (mpred != NULL) TAILQ_INSERT_AFTER(&object->memq, mpred, m, listq); else TAILQ_INSERT_HEAD(&object->memq, m, listq); /* * Show that the object has one more resident page. */ object->resident_page_count++; /* * Hold the vnode until the last page is released. */ if (object->resident_page_count == 1 && object->type == OBJT_VNODE) vhold(object->handle); /* * Since we are inserting a new and possibly dirty page, * update the object's generation count. */ if (pmap_page_is_write_mapped(m)) vm_object_set_writeable_dirty(object); } /* * Do the work to remove a page from its object. The caller is responsible for * updating the page's fields to reflect this removal. */ static void vm_page_object_remove(vm_page_t m) { vm_object_t object; vm_page_t mrem; vm_page_assert_xbusied(m); object = m->object; VM_OBJECT_ASSERT_WLOCKED(object); KASSERT((m->ref_count & VPRC_OBJREF) != 0, ("page %p is missing its object ref", m)); /* Deferred free of swap space. */ if ((m->a.flags & PGA_SWAP_FREE) != 0) vm_pager_page_unswapped(m); m->object = NULL; mrem = vm_radix_remove(&object->rtree, m->pindex); KASSERT(mrem == m, ("removed page %p, expected page %p", mrem, m)); /* * Now remove from the object's list of backed pages. */ TAILQ_REMOVE(&object->memq, m, listq); /* * And show that the object has one fewer resident page. */ object->resident_page_count--; /* * The vnode may now be recycled. */ if (object->resident_page_count == 0 && object->type == OBJT_VNODE) vdrop(object->handle); } /* * vm_page_remove: * * Removes the specified page from its containing object, but does not * invalidate any backing storage. Returns true if the object's reference * was the last reference to the page, and false otherwise. * * The object must be locked and the page must be exclusively busied. * The exclusive busy will be released on return. If this is not the * final ref and the caller does not hold a wire reference it may not * continue to access the page. */ bool vm_page_remove(vm_page_t m) { bool dropped; dropped = vm_page_remove_xbusy(m); vm_page_xunbusy(m); return (dropped); } /* * vm_page_remove_xbusy * * Removes the page but leaves the xbusy held. Returns true if this * removed the final ref and false otherwise. */ bool vm_page_remove_xbusy(vm_page_t m) { vm_page_object_remove(m); return (vm_page_drop(m, VPRC_OBJREF) == VPRC_OBJREF); } /* * vm_page_lookup: * * Returns the page associated with the object/offset * pair specified; if none is found, NULL is returned. * * The object must be locked. */ vm_page_t vm_page_lookup(vm_object_t object, vm_pindex_t pindex) { VM_OBJECT_ASSERT_LOCKED(object); return (vm_radix_lookup(&object->rtree, pindex)); } /* * vm_page_lookup_unlocked: * * Returns the page associated with the object/offset pair specified; * if none is found, NULL is returned. The page may be no longer be * present in the object at the time that this function returns. Only * useful for opportunistic checks such as inmem(). */ vm_page_t vm_page_lookup_unlocked(vm_object_t object, vm_pindex_t pindex) { return (vm_radix_lookup_unlocked(&object->rtree, pindex)); } /* * vm_page_relookup: * * Returns a page that must already have been busied by * the caller. Used for bogus page replacement. */ vm_page_t vm_page_relookup(vm_object_t object, vm_pindex_t pindex) { vm_page_t m; m = vm_radix_lookup_unlocked(&object->rtree, pindex); KASSERT(m != NULL && (vm_page_busied(m) || vm_page_wired(m)) && m->object == object && m->pindex == pindex, ("vm_page_relookup: Invalid page %p", m)); return (m); } /* * This should only be used by lockless functions for releasing transient * incorrect acquires. The page may have been freed after we acquired a * busy lock. In this case busy_lock == VPB_FREED and we have nothing * further to do. */ static void vm_page_busy_release(vm_page_t m) { u_int x; x = vm_page_busy_fetch(m); for (;;) { if (x == VPB_FREED) break; if ((x & VPB_BIT_SHARED) != 0 && VPB_SHARERS(x) > 1) { if (atomic_fcmpset_int(&m->busy_lock, &x, x - VPB_ONE_SHARER)) break; continue; } KASSERT((x & VPB_BIT_SHARED) != 0 || (x & ~VPB_BIT_WAITERS) == VPB_CURTHREAD_EXCLUSIVE, ("vm_page_busy_release: %p xbusy not owned.", m)); if (!atomic_fcmpset_rel_int(&m->busy_lock, &x, VPB_UNBUSIED)) continue; if ((x & VPB_BIT_WAITERS) != 0) wakeup(m); break; } } /* * vm_page_find_least: * * Returns the page associated with the object with least pindex * greater than or equal to the parameter pindex, or NULL. * * The object must be locked. */ vm_page_t vm_page_find_least(vm_object_t object, vm_pindex_t pindex) { vm_page_t m; VM_OBJECT_ASSERT_LOCKED(object); if ((m = TAILQ_FIRST(&object->memq)) != NULL && m->pindex < pindex) m = vm_radix_lookup_ge(&object->rtree, pindex); return (m); } /* * Returns the given page's successor (by pindex) within the object if it is * resident; if none is found, NULL is returned. * * The object must be locked. */ vm_page_t vm_page_next(vm_page_t m) { vm_page_t next; VM_OBJECT_ASSERT_LOCKED(m->object); if ((next = TAILQ_NEXT(m, listq)) != NULL) { MPASS(next->object == m->object); if (next->pindex != m->pindex + 1) next = NULL; } return (next); } /* * Returns the given page's predecessor (by pindex) within the object if it is * resident; if none is found, NULL is returned. * * The object must be locked. */ vm_page_t vm_page_prev(vm_page_t m) { vm_page_t prev; VM_OBJECT_ASSERT_LOCKED(m->object); if ((prev = TAILQ_PREV(m, pglist, listq)) != NULL) { MPASS(prev->object == m->object); if (prev->pindex != m->pindex - 1) prev = NULL; } return (prev); } /* * Uses the page mnew as a replacement for an existing page at index * pindex which must be already present in the object. * * Both pages must be exclusively busied on enter. The old page is * unbusied on exit. * * A return value of true means mold is now free. If this is not the * final ref and the caller does not hold a wire reference it may not * continue to access the page. */ static bool vm_page_replace_hold(vm_page_t mnew, vm_object_t object, vm_pindex_t pindex, vm_page_t mold) { vm_page_t mret; bool dropped; VM_OBJECT_ASSERT_WLOCKED(object); vm_page_assert_xbusied(mold); KASSERT(mnew->object == NULL && (mnew->ref_count & VPRC_OBJREF) == 0, ("vm_page_replace: page %p already in object", mnew)); /* * This function mostly follows vm_page_insert() and * vm_page_remove() without the radix, object count and vnode * dance. Double check such functions for more comments. */ mnew->object = object; mnew->pindex = pindex; atomic_set_int(&mnew->ref_count, VPRC_OBJREF); mret = vm_radix_replace(&object->rtree, mnew); KASSERT(mret == mold, ("invalid page replacement, mold=%p, mret=%p", mold, mret)); KASSERT((mold->oflags & VPO_UNMANAGED) == (mnew->oflags & VPO_UNMANAGED), ("vm_page_replace: mismatched VPO_UNMANAGED")); /* Keep the resident page list in sorted order. */ TAILQ_INSERT_AFTER(&object->memq, mold, mnew, listq); TAILQ_REMOVE(&object->memq, mold, listq); mold->object = NULL; /* * The object's resident_page_count does not change because we have * swapped one page for another, but the generation count should * change if the page is dirty. */ if (pmap_page_is_write_mapped(mnew)) vm_object_set_writeable_dirty(object); dropped = vm_page_drop(mold, VPRC_OBJREF) == VPRC_OBJREF; vm_page_xunbusy(mold); return (dropped); } void vm_page_replace(vm_page_t mnew, vm_object_t object, vm_pindex_t pindex, vm_page_t mold) { vm_page_assert_xbusied(mnew); if (vm_page_replace_hold(mnew, object, pindex, mold)) vm_page_free(mold); } /* * vm_page_rename: * * Move the given memory entry from its * current object to the specified target object/offset. * * Note: swap associated with the page must be invalidated by the move. We * have to do this for several reasons: (1) we aren't freeing the * page, (2) we are dirtying the page, (3) the VM system is probably * moving the page from object A to B, and will then later move * the backing store from A to B and we can't have a conflict. * * Note: we *always* dirty the page. It is necessary both for the * fact that we moved it, and because we may be invalidating * swap. * * The objects must be locked. */ int vm_page_rename(vm_page_t m, vm_object_t new_object, vm_pindex_t new_pindex) { vm_page_t mpred; vm_pindex_t opidx; VM_OBJECT_ASSERT_WLOCKED(new_object); KASSERT(m->ref_count != 0, ("vm_page_rename: page %p has no refs", m)); mpred = vm_radix_lookup_le(&new_object->rtree, new_pindex); KASSERT(mpred == NULL || mpred->pindex != new_pindex, ("vm_page_rename: pindex already renamed")); /* * Create a custom version of vm_page_insert() which does not depend * by m_prev and can cheat on the implementation aspects of the * function. */ opidx = m->pindex; m->pindex = new_pindex; if (vm_radix_insert(&new_object->rtree, m)) { m->pindex = opidx; return (1); } /* * The operation cannot fail anymore. The removal must happen before * the listq iterator is tainted. */ m->pindex = opidx; vm_page_object_remove(m); /* Return back to the new pindex to complete vm_page_insert(). */ m->pindex = new_pindex; m->object = new_object; vm_page_insert_radixdone(m, new_object, mpred); vm_page_dirty(m); return (0); } /* * vm_page_alloc: * * Allocate and return a page that is associated with the specified * object and offset pair. By default, this page is exclusive busied. * * The caller must always specify an allocation class. * * allocation classes: * VM_ALLOC_NORMAL normal process request * VM_ALLOC_SYSTEM system *really* needs a page * VM_ALLOC_INTERRUPT interrupt time request * * optional allocation flags: * VM_ALLOC_COUNT(number) the number of additional pages that the caller * intends to allocate * VM_ALLOC_NOBUSY do not exclusive busy the page * VM_ALLOC_NODUMP do not include the page in a kernel core dump * VM_ALLOC_NOOBJ page is not associated with an object and * should not be exclusive busy * VM_ALLOC_SBUSY shared busy the allocated page * VM_ALLOC_WIRED wire the allocated page * VM_ALLOC_ZERO prefer a zeroed page */ vm_page_t vm_page_alloc(vm_object_t object, vm_pindex_t pindex, int req) { return (vm_page_alloc_after(object, pindex, req, object != NULL ? vm_radix_lookup_le(&object->rtree, pindex) : NULL)); } vm_page_t vm_page_alloc_domain(vm_object_t object, vm_pindex_t pindex, int domain, int req) { return (vm_page_alloc_domain_after(object, pindex, domain, req, object != NULL ? vm_radix_lookup_le(&object->rtree, pindex) : NULL)); } /* * Allocate a page in the specified object with the given page index. To * optimize insertion of the page into the object, the caller must also specifiy * the resident page in the object with largest index smaller than the given * page index, or NULL if no such page exists. */ vm_page_t vm_page_alloc_after(vm_object_t object, vm_pindex_t pindex, int req, vm_page_t mpred) { struct vm_domainset_iter di; vm_page_t m; int domain; vm_domainset_iter_page_init(&di, object, pindex, &domain, &req); do { m = vm_page_alloc_domain_after(object, pindex, domain, req, mpred); if (m != NULL) break; } while (vm_domainset_iter_page(&di, object, &domain) == 0); return (m); } /* * Returns true if the number of free pages exceeds the minimum * for the request class and false otherwise. */ static int _vm_domain_allocate(struct vm_domain *vmd, int req_class, int npages) { u_int limit, old, new; if (req_class == VM_ALLOC_INTERRUPT) limit = 0; else if (req_class == VM_ALLOC_SYSTEM) limit = vmd->vmd_interrupt_free_min; else limit = vmd->vmd_free_reserved; /* * Attempt to reserve the pages. Fail if we're below the limit. */ limit += npages; old = vmd->vmd_free_count; do { if (old < limit) return (0); new = old - npages; } while (atomic_fcmpset_int(&vmd->vmd_free_count, &old, new) == 0); /* Wake the page daemon if we've crossed the threshold. */ if (vm_paging_needed(vmd, new) && !vm_paging_needed(vmd, old)) pagedaemon_wakeup(vmd->vmd_domain); /* Only update bitsets on transitions. */ if ((old >= vmd->vmd_free_min && new < vmd->vmd_free_min) || (old >= vmd->vmd_free_severe && new < vmd->vmd_free_severe)) vm_domain_set(vmd); return (1); } int vm_domain_allocate(struct vm_domain *vmd, int req, int npages) { int req_class; /* * The page daemon is allowed to dig deeper into the free page list. */ req_class = req & VM_ALLOC_CLASS_MASK; if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT) req_class = VM_ALLOC_SYSTEM; return (_vm_domain_allocate(vmd, req_class, npages)); } vm_page_t vm_page_alloc_domain_after(vm_object_t object, vm_pindex_t pindex, int domain, int req, vm_page_t mpred) { struct vm_domain *vmd; vm_page_t m; int flags, pool; KASSERT((object != NULL) == ((req & VM_ALLOC_NOOBJ) == 0) && (object != NULL || (req & VM_ALLOC_SBUSY) == 0) && ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) != (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)), ("inconsistent object(%p)/req(%x)", object, req)); KASSERT(object == NULL || (req & VM_ALLOC_WAITOK) == 0, ("Can't sleep and retry object insertion.")); KASSERT(mpred == NULL || mpred->pindex < pindex, ("mpred %p doesn't precede pindex 0x%jx", mpred, (uintmax_t)pindex)); if (object != NULL) VM_OBJECT_ASSERT_WLOCKED(object); flags = 0; m = NULL; pool = object != NULL ? VM_FREEPOOL_DEFAULT : VM_FREEPOOL_DIRECT; again: #if VM_NRESERVLEVEL > 0 /* * Can we allocate the page from a reservation? */ if (vm_object_reserv(object) && (m = vm_reserv_alloc_page(object, pindex, domain, req, mpred)) != NULL) { goto found; } #endif vmd = VM_DOMAIN(domain); if (vmd->vmd_pgcache[pool].zone != NULL) { m = uma_zalloc(vmd->vmd_pgcache[pool].zone, M_NOWAIT | M_NOVM); if (m != NULL) { flags |= PG_PCPU_CACHE; goto found; } } if (vm_domain_allocate(vmd, req, 1)) { /* * If not, allocate it from the free page queues. */ vm_domain_free_lock(vmd); m = vm_phys_alloc_pages(domain, pool, 0); vm_domain_free_unlock(vmd); if (m == NULL) { vm_domain_freecnt_inc(vmd, 1); #if VM_NRESERVLEVEL > 0 if (vm_reserv_reclaim_inactive(domain)) goto again; #endif } } if (m == NULL) { /* * Not allocatable, give up. */ if (vm_domain_alloc_fail(vmd, object, req)) goto again; return (NULL); } /* * At this point we had better have found a good page. */ found: vm_page_dequeue(m); vm_page_alloc_check(m); /* * Initialize the page. Only the PG_ZERO flag is inherited. */ if ((req & VM_ALLOC_ZERO) != 0) flags |= (m->flags & PG_ZERO); if ((req & VM_ALLOC_NODUMP) != 0) flags |= PG_NODUMP; m->flags = flags; m->a.flags = 0; m->oflags = object == NULL || (object->flags & OBJ_UNMANAGED) != 0 ? VPO_UNMANAGED : 0; if ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_NOOBJ | VM_ALLOC_SBUSY)) == 0) m->busy_lock = VPB_CURTHREAD_EXCLUSIVE; else if ((req & VM_ALLOC_SBUSY) != 0) m->busy_lock = VPB_SHARERS_WORD(1); else m->busy_lock = VPB_UNBUSIED; if (req & VM_ALLOC_WIRED) { vm_wire_add(1); m->ref_count = 1; } m->a.act_count = 0; if (object != NULL) { if (vm_page_insert_after(m, object, pindex, mpred)) { if (req & VM_ALLOC_WIRED) { vm_wire_sub(1); m->ref_count = 0; } KASSERT(m->object == NULL, ("page %p has object", m)); m->oflags = VPO_UNMANAGED; m->busy_lock = VPB_UNBUSIED; /* Don't change PG_ZERO. */ vm_page_free_toq(m); if (req & VM_ALLOC_WAITFAIL) { VM_OBJECT_WUNLOCK(object); vm_radix_wait(); VM_OBJECT_WLOCK(object); } return (NULL); } /* Ignore device objects; the pager sets "memattr" for them. */ if (object->memattr != VM_MEMATTR_DEFAULT && (object->flags & OBJ_FICTITIOUS) == 0) pmap_page_set_memattr(m, object->memattr); } else m->pindex = pindex; return (m); } /* * vm_page_alloc_contig: * * Allocate a contiguous set of physical pages of the given size "npages" * from the free lists. All of the physical pages must be at or above * the given physical address "low" and below the given physical address * "high". The given value "alignment" determines the alignment of the * first physical page in the set. If the given value "boundary" is * non-zero, then the set of physical pages cannot cross any physical * address boundary that is a multiple of that value. Both "alignment" * and "boundary" must be a power of two. * * If the specified memory attribute, "memattr", is VM_MEMATTR_DEFAULT, * then the memory attribute setting for the physical pages is configured * to the object's memory attribute setting. Otherwise, the memory * attribute setting for the physical pages is configured to "memattr", * overriding the object's memory attribute setting. However, if the * object's memory attribute setting is not VM_MEMATTR_DEFAULT, then the * memory attribute setting for the physical pages cannot be configured * to VM_MEMATTR_DEFAULT. * * The specified object may not contain fictitious pages. * * The caller must always specify an allocation class. * * allocation classes: * VM_ALLOC_NORMAL normal process request * VM_ALLOC_SYSTEM system *really* needs a page * VM_ALLOC_INTERRUPT interrupt time request * * optional allocation flags: * VM_ALLOC_NOBUSY do not exclusive busy the page * VM_ALLOC_NODUMP do not include the page in a kernel core dump * VM_ALLOC_NOOBJ page is not associated with an object and * should not be exclusive busy * VM_ALLOC_SBUSY shared busy the allocated page * VM_ALLOC_WIRED wire the allocated page * VM_ALLOC_ZERO prefer a zeroed page */ vm_page_t vm_page_alloc_contig(vm_object_t object, vm_pindex_t pindex, int req, u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary, vm_memattr_t memattr) { struct vm_domainset_iter di; vm_page_t m; int domain; vm_domainset_iter_page_init(&di, object, pindex, &domain, &req); do { m = vm_page_alloc_contig_domain(object, pindex, domain, req, npages, low, high, alignment, boundary, memattr); if (m != NULL) break; } while (vm_domainset_iter_page(&di, object, &domain) == 0); return (m); } vm_page_t vm_page_alloc_contig_domain(vm_object_t object, vm_pindex_t pindex, int domain, int req, u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary, vm_memattr_t memattr) { struct vm_domain *vmd; vm_page_t m, m_ret, mpred; u_int busy_lock, flags, oflags; mpred = NULL; /* XXX: pacify gcc */ KASSERT((object != NULL) == ((req & VM_ALLOC_NOOBJ) == 0) && (object != NULL || (req & VM_ALLOC_SBUSY) == 0) && ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) != (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)), ("vm_page_alloc_contig: inconsistent object(%p)/req(%x)", object, req)); KASSERT(object == NULL || (req & VM_ALLOC_WAITOK) == 0, ("Can't sleep and retry object insertion.")); if (object != NULL) { VM_OBJECT_ASSERT_WLOCKED(object); KASSERT((object->flags & OBJ_FICTITIOUS) == 0, ("vm_page_alloc_contig: object %p has fictitious pages", object)); } KASSERT(npages > 0, ("vm_page_alloc_contig: npages is zero")); if (object != NULL) { mpred = vm_radix_lookup_le(&object->rtree, pindex); KASSERT(mpred == NULL || mpred->pindex != pindex, ("vm_page_alloc_contig: pindex already allocated")); } /* * Can we allocate the pages without the number of free pages falling * below the lower bound for the allocation class? */ m_ret = NULL; again: #if VM_NRESERVLEVEL > 0 /* * Can we allocate the pages from a reservation? */ if (vm_object_reserv(object) && (m_ret = vm_reserv_alloc_contig(object, pindex, domain, req, mpred, npages, low, high, alignment, boundary)) != NULL) { goto found; } #endif vmd = VM_DOMAIN(domain); if (vm_domain_allocate(vmd, req, npages)) { /* * allocate them from the free page queues. */ vm_domain_free_lock(vmd); m_ret = vm_phys_alloc_contig(domain, npages, low, high, alignment, boundary); vm_domain_free_unlock(vmd); if (m_ret == NULL) { vm_domain_freecnt_inc(vmd, npages); #if VM_NRESERVLEVEL > 0 if (vm_reserv_reclaim_contig(domain, npages, low, high, alignment, boundary)) goto again; #endif } } if (m_ret == NULL) { if (vm_domain_alloc_fail(vmd, object, req)) goto again; return (NULL); } #if VM_NRESERVLEVEL > 0 found: #endif for (m = m_ret; m < &m_ret[npages]; m++) { vm_page_dequeue(m); vm_page_alloc_check(m); } /* * Initialize the pages. Only the PG_ZERO flag is inherited. */ flags = 0; if ((req & VM_ALLOC_ZERO) != 0) flags = PG_ZERO; if ((req & VM_ALLOC_NODUMP) != 0) flags |= PG_NODUMP; oflags = object == NULL || (object->flags & OBJ_UNMANAGED) != 0 ? VPO_UNMANAGED : 0; if ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_NOOBJ | VM_ALLOC_SBUSY)) == 0) busy_lock = VPB_CURTHREAD_EXCLUSIVE; else if ((req & VM_ALLOC_SBUSY) != 0) busy_lock = VPB_SHARERS_WORD(1); else busy_lock = VPB_UNBUSIED; if ((req & VM_ALLOC_WIRED) != 0) vm_wire_add(npages); if (object != NULL) { if (object->memattr != VM_MEMATTR_DEFAULT && memattr == VM_MEMATTR_DEFAULT) memattr = object->memattr; } for (m = m_ret; m < &m_ret[npages]; m++) { m->a.flags = 0; m->flags = (m->flags | PG_NODUMP) & flags; m->busy_lock = busy_lock; if ((req & VM_ALLOC_WIRED) != 0) m->ref_count = 1; m->a.act_count = 0; m->oflags = oflags; if (object != NULL) { if (vm_page_insert_after(m, object, pindex, mpred)) { if ((req & VM_ALLOC_WIRED) != 0) vm_wire_sub(npages); KASSERT(m->object == NULL, ("page %p has object", m)); mpred = m; for (m = m_ret; m < &m_ret[npages]; m++) { if (m <= mpred && (req & VM_ALLOC_WIRED) != 0) m->ref_count = 0; m->oflags = VPO_UNMANAGED; m->busy_lock = VPB_UNBUSIED; /* Don't change PG_ZERO. */ vm_page_free_toq(m); } if (req & VM_ALLOC_WAITFAIL) { VM_OBJECT_WUNLOCK(object); vm_radix_wait(); VM_OBJECT_WLOCK(object); } return (NULL); } mpred = m; } else m->pindex = pindex; if (memattr != VM_MEMATTR_DEFAULT) pmap_page_set_memattr(m, memattr); pindex++; } return (m_ret); } /* * Check a page that has been freshly dequeued from a freelist. */ static void vm_page_alloc_check(vm_page_t m) { KASSERT(m->object == NULL, ("page %p has object", m)); KASSERT(m->a.queue == PQ_NONE && (m->a.flags & PGA_QUEUE_STATE_MASK) == 0, ("page %p has unexpected queue %d, flags %#x", m, m->a.queue, (m->a.flags & PGA_QUEUE_STATE_MASK))); KASSERT(m->ref_count == 0, ("page %p has references", m)); KASSERT(vm_page_busy_freed(m), ("page %p is not freed", m)); KASSERT(m->dirty == 0, ("page %p is dirty", m)); KASSERT(pmap_page_get_memattr(m) == VM_MEMATTR_DEFAULT, ("page %p has unexpected memattr %d", m, pmap_page_get_memattr(m))); KASSERT(m->valid == 0, ("free page %p is valid", m)); } /* * vm_page_alloc_freelist: * * Allocate a physical page from the specified free page list. * * The caller must always specify an allocation class. * * allocation classes: * VM_ALLOC_NORMAL normal process request * VM_ALLOC_SYSTEM system *really* needs a page * VM_ALLOC_INTERRUPT interrupt time request * * optional allocation flags: * VM_ALLOC_COUNT(number) the number of additional pages that the caller * intends to allocate * VM_ALLOC_WIRED wire the allocated page * VM_ALLOC_ZERO prefer a zeroed page */ vm_page_t vm_page_alloc_freelist(int freelist, int req) { struct vm_domainset_iter di; vm_page_t m; int domain; vm_domainset_iter_page_init(&di, NULL, 0, &domain, &req); do { m = vm_page_alloc_freelist_domain(domain, freelist, req); if (m != NULL) break; } while (vm_domainset_iter_page(&di, NULL, &domain) == 0); return (m); } vm_page_t vm_page_alloc_freelist_domain(int domain, int freelist, int req) { struct vm_domain *vmd; vm_page_t m; u_int flags; m = NULL; vmd = VM_DOMAIN(domain); again: if (vm_domain_allocate(vmd, req, 1)) { vm_domain_free_lock(vmd); m = vm_phys_alloc_freelist_pages(domain, freelist, VM_FREEPOOL_DIRECT, 0); vm_domain_free_unlock(vmd); if (m == NULL) vm_domain_freecnt_inc(vmd, 1); } if (m == NULL) { if (vm_domain_alloc_fail(vmd, NULL, req)) goto again; return (NULL); } vm_page_dequeue(m); vm_page_alloc_check(m); /* * Initialize the page. Only the PG_ZERO flag is inherited. */ m->a.flags = 0; flags = 0; if ((req & VM_ALLOC_ZERO) != 0) flags = PG_ZERO; m->flags &= flags; if ((req & VM_ALLOC_WIRED) != 0) { vm_wire_add(1); m->ref_count = 1; } /* Unmanaged pages don't use "act_count". */ m->oflags = VPO_UNMANAGED; return (m); } static int vm_page_zone_import(void *arg, void **store, int cnt, int domain, int flags) { struct vm_domain *vmd; struct vm_pgcache *pgcache; int i; pgcache = arg; vmd = VM_DOMAIN(pgcache->domain); /* * The page daemon should avoid creating extra memory pressure since its * main purpose is to replenish the store of free pages. */ if (vmd->vmd_severeset || curproc == pageproc || !_vm_domain_allocate(vmd, VM_ALLOC_NORMAL, cnt)) return (0); domain = vmd->vmd_domain; vm_domain_free_lock(vmd); i = vm_phys_alloc_npages(domain, pgcache->pool, cnt, (vm_page_t *)store); vm_domain_free_unlock(vmd); if (cnt != i) vm_domain_freecnt_inc(vmd, cnt - i); return (i); } static void vm_page_zone_release(void *arg, void **store, int cnt) { struct vm_domain *vmd; struct vm_pgcache *pgcache; vm_page_t m; int i; pgcache = arg; vmd = VM_DOMAIN(pgcache->domain); vm_domain_free_lock(vmd); for (i = 0; i < cnt; i++) { m = (vm_page_t)store[i]; vm_phys_free_pages(m, 0); } vm_domain_free_unlock(vmd); vm_domain_freecnt_inc(vmd, cnt); } #define VPSC_ANY 0 /* No restrictions. */ #define VPSC_NORESERV 1 /* Skip reservations; implies VPSC_NOSUPER. */ #define VPSC_NOSUPER 2 /* Skip superpages. */ /* * vm_page_scan_contig: * * Scan vm_page_array[] between the specified entries "m_start" and * "m_end" for a run of contiguous physical pages that satisfy the * specified conditions, and return the lowest page in the run. The * specified "alignment" determines the alignment of the lowest physical * page in the run. If the specified "boundary" is non-zero, then the * run of physical pages cannot span a physical address that is a * multiple of "boundary". * * "m_end" is never dereferenced, so it need not point to a vm_page * structure within vm_page_array[]. * * "npages" must be greater than zero. "m_start" and "m_end" must not * span a hole (or discontiguity) in the physical address space. Both * "alignment" and "boundary" must be a power of two. */ vm_page_t vm_page_scan_contig(u_long npages, vm_page_t m_start, vm_page_t m_end, u_long alignment, vm_paddr_t boundary, int options) { vm_object_t object; vm_paddr_t pa; vm_page_t m, m_run; #if VM_NRESERVLEVEL > 0 int level; #endif int m_inc, order, run_ext, run_len; KASSERT(npages > 0, ("npages is 0")); KASSERT(powerof2(alignment), ("alignment is not a power of 2")); KASSERT(powerof2(boundary), ("boundary is not a power of 2")); m_run = NULL; run_len = 0; for (m = m_start; m < m_end && run_len < npages; m += m_inc) { KASSERT((m->flags & PG_MARKER) == 0, ("page %p is PG_MARKER", m)); KASSERT((m->flags & PG_FICTITIOUS) == 0 || m->ref_count >= 1, ("fictitious page %p has invalid ref count", m)); /* * If the current page would be the start of a run, check its * physical address against the end, alignment, and boundary * conditions. If it doesn't satisfy these conditions, either * terminate the scan or advance to the next page that * satisfies the failed condition. */ if (run_len == 0) { KASSERT(m_run == NULL, ("m_run != NULL")); if (m + npages > m_end) break; pa = VM_PAGE_TO_PHYS(m); if ((pa & (alignment - 1)) != 0) { m_inc = atop(roundup2(pa, alignment) - pa); continue; } if (rounddown2(pa ^ (pa + ptoa(npages) - 1), boundary) != 0) { m_inc = atop(roundup2(pa, boundary) - pa); continue; } } else KASSERT(m_run != NULL, ("m_run == NULL")); retry: m_inc = 1; if (vm_page_wired(m)) run_ext = 0; #if VM_NRESERVLEVEL > 0 else if ((level = vm_reserv_level(m)) >= 0 && (options & VPSC_NORESERV) != 0) { run_ext = 0; /* Advance to the end of the reservation. */ pa = VM_PAGE_TO_PHYS(m); m_inc = atop(roundup2(pa + 1, vm_reserv_size(level)) - pa); } #endif else if ((object = atomic_load_ptr(&m->object)) != NULL) { /* * The page is considered eligible for relocation if * and only if it could be laundered or reclaimed by * the page daemon. */ VM_OBJECT_RLOCK(object); if (object != m->object) { VM_OBJECT_RUNLOCK(object); goto retry; } /* Don't care: PG_NODUMP, PG_ZERO. */ if (object->type != OBJT_DEFAULT && object->type != OBJT_SWAP && object->type != OBJT_VNODE) { run_ext = 0; #if VM_NRESERVLEVEL > 0 } else if ((options & VPSC_NOSUPER) != 0 && (level = vm_reserv_level_iffullpop(m)) >= 0) { run_ext = 0; /* Advance to the end of the superpage. */ pa = VM_PAGE_TO_PHYS(m); m_inc = atop(roundup2(pa + 1, vm_reserv_size(level)) - pa); #endif } else if (object->memattr == VM_MEMATTR_DEFAULT && vm_page_queue(m) != PQ_NONE && !vm_page_busied(m)) { /* * The page is allocated but eligible for * relocation. Extend the current run by one * page. */ KASSERT(pmap_page_get_memattr(m) == VM_MEMATTR_DEFAULT, ("page %p has an unexpected memattr", m)); KASSERT((m->oflags & (VPO_SWAPINPROG | VPO_SWAPSLEEP | VPO_UNMANAGED)) == 0, ("page %p has unexpected oflags", m)); /* Don't care: PGA_NOSYNC. */ run_ext = 1; } else run_ext = 0; VM_OBJECT_RUNLOCK(object); #if VM_NRESERVLEVEL > 0 } else if (level >= 0) { /* * The page is reserved but not yet allocated. In * other words, it is still free. Extend the current * run by one page. */ run_ext = 1; #endif } else if ((order = m->order) < VM_NFREEORDER) { /* * The page is enqueued in the physical memory * allocator's free page queues. Moreover, it is the * first page in a power-of-two-sized run of * contiguous free pages. Add these pages to the end * of the current run, and jump ahead. */ run_ext = 1 << order; m_inc = 1 << order; } else { /* * Skip the page for one of the following reasons: (1) * It is enqueued in the physical memory allocator's * free page queues. However, it is not the first * page in a run of contiguous free pages. (This case * rarely occurs because the scan is performed in * ascending order.) (2) It is not reserved, and it is * transitioning from free to allocated. (Conversely, * the transition from allocated to free for managed * pages is blocked by the page busy lock.) (3) It is * allocated but not contained by an object and not * wired, e.g., allocated by Xen's balloon driver. */ run_ext = 0; } /* * Extend or reset the current run of pages. */ if (run_ext > 0) { if (run_len == 0) m_run = m; run_len += run_ext; } else { if (run_len > 0) { m_run = NULL; run_len = 0; } } } if (run_len >= npages) return (m_run); return (NULL); } /* * vm_page_reclaim_run: * * Try to relocate each of the allocated virtual pages within the * specified run of physical pages to a new physical address. Free the * physical pages underlying the relocated virtual pages. A virtual page * is relocatable if and only if it could be laundered or reclaimed by * the page daemon. Whenever possible, a virtual page is relocated to a * physical address above "high". * * Returns 0 if every physical page within the run was already free or * just freed by a successful relocation. Otherwise, returns a non-zero * value indicating why the last attempt to relocate a virtual page was * unsuccessful. * * "req_class" must be an allocation class. */ static int vm_page_reclaim_run(int req_class, int domain, u_long npages, vm_page_t m_run, vm_paddr_t high) { struct vm_domain *vmd; struct spglist free; vm_object_t object; vm_paddr_t pa; vm_page_t m, m_end, m_new; int error, order, req; KASSERT((req_class & VM_ALLOC_CLASS_MASK) == req_class, ("req_class is not an allocation class")); SLIST_INIT(&free); error = 0; m = m_run; m_end = m_run + npages; for (; error == 0 && m < m_end; m++) { KASSERT((m->flags & (PG_FICTITIOUS | PG_MARKER)) == 0, ("page %p is PG_FICTITIOUS or PG_MARKER", m)); /* * Racily check for wirings. Races are handled once the object * lock is held and the page is unmapped. */ if (vm_page_wired(m)) error = EBUSY; else if ((object = atomic_load_ptr(&m->object)) != NULL) { /* * The page is relocated if and only if it could be * laundered or reclaimed by the page daemon. */ VM_OBJECT_WLOCK(object); /* Don't care: PG_NODUMP, PG_ZERO. */ if (m->object != object || (object->type != OBJT_DEFAULT && object->type != OBJT_SWAP && object->type != OBJT_VNODE)) error = EINVAL; else if (object->memattr != VM_MEMATTR_DEFAULT) error = EINVAL; else if (vm_page_queue(m) != PQ_NONE && vm_page_tryxbusy(m) != 0) { if (vm_page_wired(m)) { vm_page_xunbusy(m); error = EBUSY; goto unlock; } KASSERT(pmap_page_get_memattr(m) == VM_MEMATTR_DEFAULT, ("page %p has an unexpected memattr", m)); KASSERT(m->oflags == 0, ("page %p has unexpected oflags", m)); /* Don't care: PGA_NOSYNC. */ if (!vm_page_none_valid(m)) { /* * First, try to allocate a new page * that is above "high". Failing * that, try to allocate a new page * that is below "m_run". Allocate * the new page between the end of * "m_run" and "high" only as a last * resort. */ req = req_class | VM_ALLOC_NOOBJ; if ((m->flags & PG_NODUMP) != 0) req |= VM_ALLOC_NODUMP; if (trunc_page(high) != ~(vm_paddr_t)PAGE_MASK) { m_new = vm_page_alloc_contig( NULL, 0, req, 1, round_page(high), ~(vm_paddr_t)0, PAGE_SIZE, 0, VM_MEMATTR_DEFAULT); } else m_new = NULL; if (m_new == NULL) { pa = VM_PAGE_TO_PHYS(m_run); m_new = vm_page_alloc_contig( NULL, 0, req, 1, 0, pa - 1, PAGE_SIZE, 0, VM_MEMATTR_DEFAULT); } if (m_new == NULL) { pa += ptoa(npages); m_new = vm_page_alloc_contig( NULL, 0, req, 1, pa, high, PAGE_SIZE, 0, VM_MEMATTR_DEFAULT); } if (m_new == NULL) { vm_page_xunbusy(m); error = ENOMEM; goto unlock; } /* * Unmap the page and check for new * wirings that may have been acquired * through a pmap lookup. */ if (object->ref_count != 0 && !vm_page_try_remove_all(m)) { vm_page_xunbusy(m); vm_page_free(m_new); error = EBUSY; goto unlock; } /* * Replace "m" with the new page. For * vm_page_replace(), "m" must be busy * and dequeued. Finally, change "m" * as if vm_page_free() was called. */ m_new->a.flags = m->a.flags & ~PGA_QUEUE_STATE_MASK; KASSERT(m_new->oflags == VPO_UNMANAGED, ("page %p is managed", m_new)); m_new->oflags = 0; pmap_copy_page(m, m_new); m_new->valid = m->valid; m_new->dirty = m->dirty; m->flags &= ~PG_ZERO; vm_page_dequeue(m); if (vm_page_replace_hold(m_new, object, m->pindex, m) && vm_page_free_prep(m)) SLIST_INSERT_HEAD(&free, m, plinks.s.ss); /* * The new page must be deactivated * before the object is unlocked. */ vm_page_deactivate(m_new); } else { m->flags &= ~PG_ZERO; vm_page_dequeue(m); if (vm_page_free_prep(m)) SLIST_INSERT_HEAD(&free, m, plinks.s.ss); KASSERT(m->dirty == 0, ("page %p is dirty", m)); } } else error = EBUSY; unlock: VM_OBJECT_WUNLOCK(object); } else { MPASS(vm_phys_domain(m) == domain); vmd = VM_DOMAIN(domain); vm_domain_free_lock(vmd); order = m->order; if (order < VM_NFREEORDER) { /* * The page is enqueued in the physical memory * allocator's free page queues. Moreover, it * is the first page in a power-of-two-sized * run of contiguous free pages. Jump ahead * to the last page within that run, and * continue from there. */ m += (1 << order) - 1; } #if VM_NRESERVLEVEL > 0 else if (vm_reserv_is_page_free(m)) order = 0; #endif vm_domain_free_unlock(vmd); if (order == VM_NFREEORDER) error = EINVAL; } } if ((m = SLIST_FIRST(&free)) != NULL) { int cnt; vmd = VM_DOMAIN(domain); cnt = 0; vm_domain_free_lock(vmd); do { MPASS(vm_phys_domain(m) == domain); SLIST_REMOVE_HEAD(&free, plinks.s.ss); vm_phys_free_pages(m, 0); cnt++; } while ((m = SLIST_FIRST(&free)) != NULL); vm_domain_free_unlock(vmd); vm_domain_freecnt_inc(vmd, cnt); } return (error); } #define NRUNS 16 CTASSERT(powerof2(NRUNS)); #define RUN_INDEX(count) ((count) & (NRUNS - 1)) #define MIN_RECLAIM 8 /* * vm_page_reclaim_contig: * * Reclaim allocated, contiguous physical memory satisfying the specified * conditions by relocating the virtual pages using that physical memory. * Returns true if reclamation is successful and false otherwise. Since * relocation requires the allocation of physical pages, reclamation may * fail due to a shortage of free pages. When reclamation fails, callers * are expected to perform vm_wait() before retrying a failed allocation * operation, e.g., vm_page_alloc_contig(). * * The caller must always specify an allocation class through "req". * * allocation classes: * VM_ALLOC_NORMAL normal process request * VM_ALLOC_SYSTEM system *really* needs a page * VM_ALLOC_INTERRUPT interrupt time request * * The optional allocation flags are ignored. * * "npages" must be greater than zero. Both "alignment" and "boundary" * must be a power of two. */ bool vm_page_reclaim_contig_domain(int domain, int req, u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary) { struct vm_domain *vmd; vm_paddr_t curr_low; vm_page_t m_run, m_runs[NRUNS]; u_long count, reclaimed; int error, i, options, req_class; KASSERT(npages > 0, ("npages is 0")); KASSERT(powerof2(alignment), ("alignment is not a power of 2")); KASSERT(powerof2(boundary), ("boundary is not a power of 2")); req_class = req & VM_ALLOC_CLASS_MASK; /* * The page daemon is allowed to dig deeper into the free page list. */ if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT) req_class = VM_ALLOC_SYSTEM; /* * Return if the number of free pages cannot satisfy the requested * allocation. */ vmd = VM_DOMAIN(domain); count = vmd->vmd_free_count; if (count < npages + vmd->vmd_free_reserved || (count < npages + vmd->vmd_interrupt_free_min && req_class == VM_ALLOC_SYSTEM) || (count < npages && req_class == VM_ALLOC_INTERRUPT)) return (false); /* * Scan up to three times, relaxing the restrictions ("options") on * the reclamation of reservations and superpages each time. */ for (options = VPSC_NORESERV;;) { /* * Find the highest runs that satisfy the given constraints * and restrictions, and record them in "m_runs". */ curr_low = low; count = 0; for (;;) { m_run = vm_phys_scan_contig(domain, npages, curr_low, high, alignment, boundary, options); if (m_run == NULL) break; curr_low = VM_PAGE_TO_PHYS(m_run) + ptoa(npages); m_runs[RUN_INDEX(count)] = m_run; count++; } /* * Reclaim the highest runs in LIFO (descending) order until * the number of reclaimed pages, "reclaimed", is at least * MIN_RECLAIM. Reset "reclaimed" each time because each * reclamation is idempotent, and runs will (likely) recur * from one scan to the next as restrictions are relaxed. */ reclaimed = 0; for (i = 0; count > 0 && i < NRUNS; i++) { count--; m_run = m_runs[RUN_INDEX(count)]; error = vm_page_reclaim_run(req_class, domain, npages, m_run, high); if (error == 0) { reclaimed += npages; if (reclaimed >= MIN_RECLAIM) return (true); } } /* * Either relax the restrictions on the next scan or return if * the last scan had no restrictions. */ if (options == VPSC_NORESERV) options = VPSC_NOSUPER; else if (options == VPSC_NOSUPER) options = VPSC_ANY; else if (options == VPSC_ANY) return (reclaimed != 0); } } bool vm_page_reclaim_contig(int req, u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary) { struct vm_domainset_iter di; int domain; bool ret; vm_domainset_iter_page_init(&di, NULL, 0, &domain, &req); do { ret = vm_page_reclaim_contig_domain(domain, req, npages, low, high, alignment, boundary); if (ret) break; } while (vm_domainset_iter_page(&di, NULL, &domain) == 0); return (ret); } /* * Set the domain in the appropriate page level domainset. */ void vm_domain_set(struct vm_domain *vmd) { mtx_lock(&vm_domainset_lock); if (!vmd->vmd_minset && vm_paging_min(vmd)) { vmd->vmd_minset = 1; DOMAINSET_SET(vmd->vmd_domain, &vm_min_domains); } if (!vmd->vmd_severeset && vm_paging_severe(vmd)) { vmd->vmd_severeset = 1; DOMAINSET_SET(vmd->vmd_domain, &vm_severe_domains); } mtx_unlock(&vm_domainset_lock); } /* * Clear the domain from the appropriate page level domainset. */ void vm_domain_clear(struct vm_domain *vmd) { mtx_lock(&vm_domainset_lock); if (vmd->vmd_minset && !vm_paging_min(vmd)) { vmd->vmd_minset = 0; DOMAINSET_CLR(vmd->vmd_domain, &vm_min_domains); if (vm_min_waiters != 0) { vm_min_waiters = 0; wakeup(&vm_min_domains); } } if (vmd->vmd_severeset && !vm_paging_severe(vmd)) { vmd->vmd_severeset = 0; DOMAINSET_CLR(vmd->vmd_domain, &vm_severe_domains); if (vm_severe_waiters != 0) { vm_severe_waiters = 0; wakeup(&vm_severe_domains); } } /* * If pageout daemon needs pages, then tell it that there are * some free. */ if (vmd->vmd_pageout_pages_needed && vmd->vmd_free_count >= vmd->vmd_pageout_free_min) { wakeup(&vmd->vmd_pageout_pages_needed); vmd->vmd_pageout_pages_needed = 0; } /* See comments in vm_wait_doms(). */ if (vm_pageproc_waiters) { vm_pageproc_waiters = 0; wakeup(&vm_pageproc_waiters); } mtx_unlock(&vm_domainset_lock); } /* * Wait for free pages to exceed the min threshold globally. */ void vm_wait_min(void) { mtx_lock(&vm_domainset_lock); while (vm_page_count_min()) { vm_min_waiters++; msleep(&vm_min_domains, &vm_domainset_lock, PVM, "vmwait", 0); } mtx_unlock(&vm_domainset_lock); } /* * Wait for free pages to exceed the severe threshold globally. */ void vm_wait_severe(void) { mtx_lock(&vm_domainset_lock); while (vm_page_count_severe()) { vm_severe_waiters++; msleep(&vm_severe_domains, &vm_domainset_lock, PVM, "vmwait", 0); } mtx_unlock(&vm_domainset_lock); } u_int vm_wait_count(void) { return (vm_severe_waiters + vm_min_waiters + vm_pageproc_waiters); } int vm_wait_doms(const domainset_t *wdoms, int mflags) { int error; error = 0; /* * We use racey wakeup synchronization to avoid expensive global * locking for the pageproc when sleeping with a non-specific vm_wait. * To handle this, we only sleep for one tick in this instance. It * is expected that most allocations for the pageproc will come from * kmem or vm_page_grab* which will use the more specific and * race-free vm_wait_domain(). */ if (curproc == pageproc) { mtx_lock(&vm_domainset_lock); vm_pageproc_waiters++; error = msleep(&vm_pageproc_waiters, &vm_domainset_lock, PVM | PDROP | mflags, "pageprocwait", 1); } else { /* * XXX Ideally we would wait only until the allocation could * be satisfied. This condition can cause new allocators to * consume all freed pages while old allocators wait. */ mtx_lock(&vm_domainset_lock); if (vm_page_count_min_set(wdoms)) { vm_min_waiters++; error = msleep(&vm_min_domains, &vm_domainset_lock, PVM | PDROP | mflags, "vmwait", 0); } else mtx_unlock(&vm_domainset_lock); } return (error); } /* * vm_wait_domain: * * Sleep until free pages are available for allocation. * - Called in various places after failed memory allocations. */ void vm_wait_domain(int domain) { struct vm_domain *vmd; domainset_t wdom; vmd = VM_DOMAIN(domain); vm_domain_free_assert_unlocked(vmd); if (curproc == pageproc) { mtx_lock(&vm_domainset_lock); if (vmd->vmd_free_count < vmd->vmd_pageout_free_min) { vmd->vmd_pageout_pages_needed = 1; msleep(&vmd->vmd_pageout_pages_needed, &vm_domainset_lock, PDROP | PSWP, "VMWait", 0); } else mtx_unlock(&vm_domainset_lock); } else { if (pageproc == NULL) panic("vm_wait in early boot"); DOMAINSET_ZERO(&wdom); DOMAINSET_SET(vmd->vmd_domain, &wdom); vm_wait_doms(&wdom, 0); } } static int vm_wait_flags(vm_object_t obj, int mflags) { struct domainset *d; d = NULL; /* * Carefully fetch pointers only once: the struct domainset * itself is ummutable but the pointer might change. */ if (obj != NULL) d = obj->domain.dr_policy; if (d == NULL) d = curthread->td_domain.dr_policy; return (vm_wait_doms(&d->ds_mask, mflags)); } /* * vm_wait: * * Sleep until free pages are available for allocation in the * affinity domains of the obj. If obj is NULL, the domain set * for the calling thread is used. * Called in various places after failed memory allocations. */ void vm_wait(vm_object_t obj) { (void)vm_wait_flags(obj, 0); } int vm_wait_intr(vm_object_t obj) { return (vm_wait_flags(obj, PCATCH)); } /* * vm_domain_alloc_fail: * * Called when a page allocation function fails. Informs the * pagedaemon and performs the requested wait. Requires the * domain_free and object lock on entry. Returns with the * object lock held and free lock released. Returns an error when * retry is necessary. * */ static int vm_domain_alloc_fail(struct vm_domain *vmd, vm_object_t object, int req) { vm_domain_free_assert_unlocked(vmd); atomic_add_int(&vmd->vmd_pageout_deficit, max((u_int)req >> VM_ALLOC_COUNT_SHIFT, 1)); if (req & (VM_ALLOC_WAITOK | VM_ALLOC_WAITFAIL)) { if (object != NULL) VM_OBJECT_WUNLOCK(object); vm_wait_domain(vmd->vmd_domain); if (object != NULL) VM_OBJECT_WLOCK(object); if (req & VM_ALLOC_WAITOK) return (EAGAIN); } return (0); } /* * vm_waitpfault: * * Sleep until free pages are available for allocation. * - Called only in vm_fault so that processes page faulting * can be easily tracked. * - Sleeps at a lower priority than vm_wait() so that vm_wait()ing * processes will be able to grab memory first. Do not change * this balance without careful testing first. */ void vm_waitpfault(struct domainset *dset, int timo) { /* * XXX Ideally we would wait only until the allocation could * be satisfied. This condition can cause new allocators to * consume all freed pages while old allocators wait. */ mtx_lock(&vm_domainset_lock); if (vm_page_count_min_set(&dset->ds_mask)) { vm_min_waiters++; msleep(&vm_min_domains, &vm_domainset_lock, PUSER | PDROP, "pfault", timo); } else mtx_unlock(&vm_domainset_lock); } static struct vm_pagequeue * _vm_page_pagequeue(vm_page_t m, uint8_t queue) { return (&vm_pagequeue_domain(m)->vmd_pagequeues[queue]); } #ifdef INVARIANTS static struct vm_pagequeue * vm_page_pagequeue(vm_page_t m) { return (_vm_page_pagequeue(m, vm_page_astate_load(m).queue)); } #endif static __always_inline bool vm_page_pqstate_fcmpset(vm_page_t m, vm_page_astate_t *old, vm_page_astate_t new) { vm_page_astate_t tmp; tmp = *old; do { if (__predict_true(vm_page_astate_fcmpset(m, old, new))) return (true); counter_u64_add(pqstate_commit_retries, 1); } while (old->_bits == tmp._bits); return (false); } /* * Do the work of committing a queue state update that moves the page out of * its current queue. */ static bool _vm_page_pqstate_commit_dequeue(struct vm_pagequeue *pq, vm_page_t m, vm_page_astate_t *old, vm_page_astate_t new) { vm_page_t next; vm_pagequeue_assert_locked(pq); KASSERT(vm_page_pagequeue(m) == pq, ("%s: queue %p does not match page %p", __func__, pq, m)); KASSERT(old->queue != PQ_NONE && new.queue != old->queue, ("%s: invalid queue indices %d %d", __func__, old->queue, new.queue)); /* * Once the queue index of the page changes there is nothing * synchronizing with further updates to the page's physical * queue state. Therefore we must speculatively remove the page * from the queue now and be prepared to roll back if the queue * state update fails. If the page is not physically enqueued then * we just update its queue index. */ if ((old->flags & PGA_ENQUEUED) != 0) { new.flags &= ~PGA_ENQUEUED; next = TAILQ_NEXT(m, plinks.q); TAILQ_REMOVE(&pq->pq_pl, m, plinks.q); vm_pagequeue_cnt_dec(pq); if (!vm_page_pqstate_fcmpset(m, old, new)) { if (next == NULL) TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q); else TAILQ_INSERT_BEFORE(next, m, plinks.q); vm_pagequeue_cnt_inc(pq); return (false); } else { return (true); } } else { return (vm_page_pqstate_fcmpset(m, old, new)); } } static bool vm_page_pqstate_commit_dequeue(vm_page_t m, vm_page_astate_t *old, vm_page_astate_t new) { struct vm_pagequeue *pq; vm_page_astate_t as; bool ret; pq = _vm_page_pagequeue(m, old->queue); /* * The queue field and PGA_ENQUEUED flag are stable only so long as the * corresponding page queue lock is held. */ vm_pagequeue_lock(pq); as = vm_page_astate_load(m); if (__predict_false(as._bits != old->_bits)) { *old = as; ret = false; } else { ret = _vm_page_pqstate_commit_dequeue(pq, m, old, new); } vm_pagequeue_unlock(pq); return (ret); } /* * Commit a queue state update that enqueues or requeues a page. */ static bool _vm_page_pqstate_commit_requeue(struct vm_pagequeue *pq, vm_page_t m, vm_page_astate_t *old, vm_page_astate_t new) { struct vm_domain *vmd; vm_pagequeue_assert_locked(pq); KASSERT(old->queue != PQ_NONE && new.queue == old->queue, ("%s: invalid queue indices %d %d", __func__, old->queue, new.queue)); new.flags |= PGA_ENQUEUED; if (!vm_page_pqstate_fcmpset(m, old, new)) return (false); if ((old->flags & PGA_ENQUEUED) != 0) TAILQ_REMOVE(&pq->pq_pl, m, plinks.q); else vm_pagequeue_cnt_inc(pq); /* * Give PGA_REQUEUE_HEAD precedence over PGA_REQUEUE. In particular, if * both flags are set in close succession, only PGA_REQUEUE_HEAD will be * applied, even if it was set first. */ if ((old->flags & PGA_REQUEUE_HEAD) != 0) { vmd = vm_pagequeue_domain(m); KASSERT(pq == &vmd->vmd_pagequeues[PQ_INACTIVE], ("%s: invalid page queue for page %p", __func__, m)); TAILQ_INSERT_BEFORE(&vmd->vmd_inacthead, m, plinks.q); } else { TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q); } return (true); } /* * Commit a queue state update that encodes a request for a deferred queue * operation. */ static bool vm_page_pqstate_commit_request(vm_page_t m, vm_page_astate_t *old, vm_page_astate_t new) { KASSERT(old->queue == new.queue || new.queue != PQ_NONE, ("%s: invalid state, queue %d flags %x", __func__, new.queue, new.flags)); if (old->_bits != new._bits && !vm_page_pqstate_fcmpset(m, old, new)) return (false); vm_page_pqbatch_submit(m, new.queue); return (true); } /* * A generic queue state update function. This handles more cases than the * specialized functions above. */ bool vm_page_pqstate_commit(vm_page_t m, vm_page_astate_t *old, vm_page_astate_t new) { if (old->_bits == new._bits) return (true); if (old->queue != PQ_NONE && new.queue != old->queue) { if (!vm_page_pqstate_commit_dequeue(m, old, new)) return (false); if (new.queue != PQ_NONE) vm_page_pqbatch_submit(m, new.queue); } else { if (!vm_page_pqstate_fcmpset(m, old, new)) return (false); if (new.queue != PQ_NONE && ((new.flags & ~old->flags) & PGA_QUEUE_OP_MASK) != 0) vm_page_pqbatch_submit(m, new.queue); } return (true); } /* * Apply deferred queue state updates to a page. */ static inline void vm_pqbatch_process_page(struct vm_pagequeue *pq, vm_page_t m, uint8_t queue) { vm_page_astate_t new, old; CRITICAL_ASSERT(curthread); vm_pagequeue_assert_locked(pq); KASSERT(queue < PQ_COUNT, ("%s: invalid queue index %d", __func__, queue)); KASSERT(pq == _vm_page_pagequeue(m, queue), ("%s: page %p does not belong to queue %p", __func__, m, pq)); for (old = vm_page_astate_load(m);;) { if (__predict_false(old.queue != queue || (old.flags & PGA_QUEUE_OP_MASK) == 0)) { counter_u64_add(queue_nops, 1); break; } KASSERT(old.queue != PQ_NONE || (old.flags & PGA_QUEUE_STATE_MASK) == 0, ("%s: page %p has unexpected queue state", __func__, m)); new = old; if ((old.flags & PGA_DEQUEUE) != 0) { new.flags &= ~PGA_QUEUE_OP_MASK; new.queue = PQ_NONE; if (__predict_true(_vm_page_pqstate_commit_dequeue(pq, m, &old, new))) { counter_u64_add(queue_ops, 1); break; } } else { new.flags &= ~(PGA_REQUEUE | PGA_REQUEUE_HEAD); if (__predict_true(_vm_page_pqstate_commit_requeue(pq, m, &old, new))) { counter_u64_add(queue_ops, 1); break; } } } } static void vm_pqbatch_process(struct vm_pagequeue *pq, struct vm_batchqueue *bq, uint8_t queue) { int i; for (i = 0; i < bq->bq_cnt; i++) vm_pqbatch_process_page(pq, bq->bq_pa[i], queue); vm_batchqueue_init(bq); } /* * vm_page_pqbatch_submit: [ internal use only ] * * Enqueue a page in the specified page queue's batched work queue. * The caller must have encoded the requested operation in the page * structure's a.flags field. */ void vm_page_pqbatch_submit(vm_page_t m, uint8_t queue) { struct vm_batchqueue *bq; struct vm_pagequeue *pq; int domain; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("page %p is unmanaged", m)); KASSERT(queue < PQ_COUNT, ("invalid queue %d", queue)); domain = vm_phys_domain(m); pq = &vm_pagequeue_domain(m)->vmd_pagequeues[queue]; critical_enter(); bq = DPCPU_PTR(pqbatch[domain][queue]); if (vm_batchqueue_insert(bq, m)) { critical_exit(); return; } critical_exit(); vm_pagequeue_lock(pq); critical_enter(); bq = DPCPU_PTR(pqbatch[domain][queue]); vm_pqbatch_process(pq, bq, queue); vm_pqbatch_process_page(pq, m, queue); vm_pagequeue_unlock(pq); critical_exit(); } /* * vm_page_pqbatch_drain: [ internal use only ] * * Force all per-CPU page queue batch queues to be drained. This is * intended for use in severe memory shortages, to ensure that pages * do not remain stuck in the batch queues. */ void vm_page_pqbatch_drain(void) { struct thread *td; struct vm_domain *vmd; struct vm_pagequeue *pq; int cpu, domain, queue; td = curthread; CPU_FOREACH(cpu) { thread_lock(td); sched_bind(td, cpu); thread_unlock(td); for (domain = 0; domain < vm_ndomains; domain++) { vmd = VM_DOMAIN(domain); for (queue = 0; queue < PQ_COUNT; queue++) { pq = &vmd->vmd_pagequeues[queue]; vm_pagequeue_lock(pq); critical_enter(); vm_pqbatch_process(pq, DPCPU_PTR(pqbatch[domain][queue]), queue); critical_exit(); vm_pagequeue_unlock(pq); } } } thread_lock(td); sched_unbind(td); thread_unlock(td); } /* * vm_page_dequeue_deferred: [ internal use only ] * * Request removal of the given page from its current page * queue. Physical removal from the queue may be deferred * indefinitely. */ void vm_page_dequeue_deferred(vm_page_t m) { vm_page_astate_t new, old; old = vm_page_astate_load(m); do { if (old.queue == PQ_NONE) { KASSERT((old.flags & PGA_QUEUE_STATE_MASK) == 0, ("%s: page %p has unexpected queue state", __func__, m)); break; } new = old; new.flags |= PGA_DEQUEUE; } while (!vm_page_pqstate_commit_request(m, &old, new)); } /* * vm_page_dequeue: * * Remove the page from whichever page queue it's in, if any, before * returning. */ void vm_page_dequeue(vm_page_t m) { vm_page_astate_t new, old; old = vm_page_astate_load(m); do { if (old.queue == PQ_NONE) { KASSERT((old.flags & PGA_QUEUE_STATE_MASK) == 0, ("%s: page %p has unexpected queue state", __func__, m)); break; } new = old; new.flags &= ~PGA_QUEUE_OP_MASK; new.queue = PQ_NONE; } while (!vm_page_pqstate_commit_dequeue(m, &old, new)); } /* * Schedule the given page for insertion into the specified page queue. * Physical insertion of the page may be deferred indefinitely. */ static void vm_page_enqueue(vm_page_t m, uint8_t queue) { KASSERT(m->a.queue == PQ_NONE && (m->a.flags & PGA_QUEUE_STATE_MASK) == 0, ("%s: page %p is already enqueued", __func__, m)); KASSERT(m->ref_count > 0, ("%s: page %p does not carry any references", __func__, m)); m->a.queue = queue; if ((m->a.flags & PGA_REQUEUE) == 0) vm_page_aflag_set(m, PGA_REQUEUE); vm_page_pqbatch_submit(m, queue); } /* * vm_page_free_prep: * * Prepares the given page to be put on the free list, * disassociating it from any VM object. The caller may return * the page to the free list only if this function returns true. * * The object, if it exists, must be locked, and then the page must * be xbusy. Otherwise the page must be not busied. A managed * page must be unmapped. */ static bool vm_page_free_prep(vm_page_t m) { /* * Synchronize with threads that have dropped a reference to this * page. */ atomic_thread_fence_acq(); #if defined(DIAGNOSTIC) && defined(PHYS_TO_DMAP) if (PMAP_HAS_DMAP && (m->flags & PG_ZERO) != 0) { uint64_t *p; int i; p = (uint64_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); for (i = 0; i < PAGE_SIZE / sizeof(uint64_t); i++, p++) KASSERT(*p == 0, ("vm_page_free_prep %p PG_ZERO %d %jx", m, i, (uintmax_t)*p)); } #endif if ((m->oflags & VPO_UNMANAGED) == 0) { KASSERT(!pmap_page_is_mapped(m), ("vm_page_free_prep: freeing mapped page %p", m)); KASSERT((m->a.flags & (PGA_EXECUTABLE | PGA_WRITEABLE)) == 0, ("vm_page_free_prep: mapping flags set in page %p", m)); } else { KASSERT(m->a.queue == PQ_NONE, ("vm_page_free_prep: unmanaged page %p is queued", m)); } VM_CNT_INC(v_tfree); if (m->object != NULL) { KASSERT(((m->oflags & VPO_UNMANAGED) != 0) == ((m->object->flags & OBJ_UNMANAGED) != 0), ("vm_page_free_prep: managed flag mismatch for page %p", m)); vm_page_assert_xbusied(m); /* * The object reference can be released without an atomic * operation. */ KASSERT((m->flags & PG_FICTITIOUS) != 0 || m->ref_count == VPRC_OBJREF, ("vm_page_free_prep: page %p has unexpected ref_count %u", m, m->ref_count)); vm_page_object_remove(m); m->ref_count -= VPRC_OBJREF; } else vm_page_assert_unbusied(m); vm_page_busy_free(m); /* * If fictitious remove object association and * return. */ if ((m->flags & PG_FICTITIOUS) != 0) { KASSERT(m->ref_count == 1, ("fictitious page %p is referenced", m)); KASSERT(m->a.queue == PQ_NONE, ("fictitious page %p is queued", m)); return (false); } /* * Pages need not be dequeued before they are returned to the physical * memory allocator, but they must at least be marked for a deferred * dequeue. */ if ((m->oflags & VPO_UNMANAGED) == 0) vm_page_dequeue_deferred(m); m->valid = 0; vm_page_undirty(m); if (m->ref_count != 0) panic("vm_page_free_prep: page %p has references", m); /* * Restore the default memory attribute to the page. */ if (pmap_page_get_memattr(m) != VM_MEMATTR_DEFAULT) pmap_page_set_memattr(m, VM_MEMATTR_DEFAULT); #if VM_NRESERVLEVEL > 0 /* * Determine whether the page belongs to a reservation. If the page was * allocated from a per-CPU cache, it cannot belong to a reservation, so * as an optimization, we avoid the check in that case. */ if ((m->flags & PG_PCPU_CACHE) == 0 && vm_reserv_free_page(m)) return (false); #endif return (true); } /* * vm_page_free_toq: * * Returns the given page to the free list, disassociating it * from any VM object. * * The object must be locked. The page must be exclusively busied if it * belongs to an object. */ static void vm_page_free_toq(vm_page_t m) { struct vm_domain *vmd; uma_zone_t zone; if (!vm_page_free_prep(m)) return; vmd = vm_pagequeue_domain(m); zone = vmd->vmd_pgcache[m->pool].zone; if ((m->flags & PG_PCPU_CACHE) != 0 && zone != NULL) { uma_zfree(zone, m); return; } vm_domain_free_lock(vmd); vm_phys_free_pages(m, 0); vm_domain_free_unlock(vmd); vm_domain_freecnt_inc(vmd, 1); } /* * vm_page_free_pages_toq: * * Returns a list of pages to the free list, disassociating it * from any VM object. In other words, this is equivalent to * calling vm_page_free_toq() for each page of a list of VM objects. */ void vm_page_free_pages_toq(struct spglist *free, bool update_wire_count) { vm_page_t m; int count; if (SLIST_EMPTY(free)) return; count = 0; while ((m = SLIST_FIRST(free)) != NULL) { count++; SLIST_REMOVE_HEAD(free, plinks.s.ss); vm_page_free_toq(m); } if (update_wire_count) vm_wire_sub(count); } /* * Mark this page as wired down. For managed pages, this prevents reclamation * by the page daemon, or when the containing object, if any, is destroyed. */ void vm_page_wire(vm_page_t m) { u_int old; #ifdef INVARIANTS if (m->object != NULL && !vm_page_busied(m) && !vm_object_busied(m->object)) VM_OBJECT_ASSERT_LOCKED(m->object); #endif KASSERT((m->flags & PG_FICTITIOUS) == 0 || VPRC_WIRE_COUNT(m->ref_count) >= 1, ("vm_page_wire: fictitious page %p has zero wirings", m)); old = atomic_fetchadd_int(&m->ref_count, 1); KASSERT(VPRC_WIRE_COUNT(old) != VPRC_WIRE_COUNT_MAX, ("vm_page_wire: counter overflow for page %p", m)); if (VPRC_WIRE_COUNT(old) == 0) { if ((m->oflags & VPO_UNMANAGED) == 0) vm_page_aflag_set(m, PGA_DEQUEUE); vm_wire_add(1); } } /* * Attempt to wire a mapped page following a pmap lookup of that page. * This may fail if a thread is concurrently tearing down mappings of the page. * The transient failure is acceptable because it translates to the * failure of the caller pmap_extract_and_hold(), which should be then * followed by the vm_fault() fallback, see e.g. vm_fault_quick_hold_pages(). */ bool vm_page_wire_mapped(vm_page_t m) { u_int old; old = m->ref_count; do { KASSERT(old > 0, ("vm_page_wire_mapped: wiring unreferenced page %p", m)); if ((old & VPRC_BLOCKED) != 0) return (false); } while (!atomic_fcmpset_int(&m->ref_count, &old, old + 1)); if (VPRC_WIRE_COUNT(old) == 0) { if ((m->oflags & VPO_UNMANAGED) == 0) vm_page_aflag_set(m, PGA_DEQUEUE); vm_wire_add(1); } return (true); } /* * Release a wiring reference to a managed page. If the page still belongs to * an object, update its position in the page queues to reflect the reference. * If the wiring was the last reference to the page, free the page. */ static void vm_page_unwire_managed(vm_page_t m, uint8_t nqueue, bool noreuse) { u_int old; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("%s: page %p is unmanaged", __func__, m)); /* * Update LRU state before releasing the wiring reference. * Use a release store when updating the reference count to * synchronize with vm_page_free_prep(). */ old = m->ref_count; do { KASSERT(VPRC_WIRE_COUNT(old) > 0, ("vm_page_unwire: wire count underflow for page %p", m)); if (old > VPRC_OBJREF + 1) { /* * The page has at least one other wiring reference. An * earlier iteration of this loop may have called * vm_page_release_toq() and cleared PGA_DEQUEUE, so * re-set it if necessary. */ if ((vm_page_astate_load(m).flags & PGA_DEQUEUE) == 0) vm_page_aflag_set(m, PGA_DEQUEUE); } else if (old == VPRC_OBJREF + 1) { /* * This is the last wiring. Clear PGA_DEQUEUE and * update the page's queue state to reflect the * reference. If the page does not belong to an object * (i.e., the VPRC_OBJREF bit is clear), we only need to * clear leftover queue state. */ vm_page_release_toq(m, nqueue, false); } else if (old == 1) { vm_page_aflag_clear(m, PGA_DEQUEUE); } } while (!atomic_fcmpset_rel_int(&m->ref_count, &old, old - 1)); if (VPRC_WIRE_COUNT(old) == 1) { vm_wire_sub(1); if (old == 1) vm_page_free(m); } } /* * Release one wiring of the specified page, potentially allowing it to be * paged out. * * Only managed pages belonging to an object can be paged out. If the number * of wirings transitions to zero and the page is eligible for page out, then * the page is added to the specified paging queue. If the released wiring * represented the last reference to the page, the page is freed. */ void vm_page_unwire(vm_page_t m, uint8_t nqueue) { KASSERT(nqueue < PQ_COUNT, ("vm_page_unwire: invalid queue %u request for page %p", nqueue, m)); if ((m->oflags & VPO_UNMANAGED) != 0) { if (vm_page_unwire_noq(m) && m->ref_count == 0) vm_page_free(m); return; } vm_page_unwire_managed(m, nqueue, false); } /* * Unwire a page without (re-)inserting it into a page queue. It is up * to the caller to enqueue, requeue, or free the page as appropriate. * In most cases involving managed pages, vm_page_unwire() should be used * instead. */ bool vm_page_unwire_noq(vm_page_t m) { u_int old; old = vm_page_drop(m, 1); KASSERT(VPRC_WIRE_COUNT(old) != 0, ("vm_page_unref: counter underflow for page %p", m)); KASSERT((m->flags & PG_FICTITIOUS) == 0 || VPRC_WIRE_COUNT(old) > 1, ("vm_page_unref: missing ref on fictitious page %p", m)); if (VPRC_WIRE_COUNT(old) > 1) return (false); if ((m->oflags & VPO_UNMANAGED) == 0) vm_page_aflag_clear(m, PGA_DEQUEUE); vm_wire_sub(1); return (true); } /* * Ensure that the page ends up in the specified page queue. If the page is * active or being moved to the active queue, ensure that its act_count is * at least ACT_INIT but do not otherwise mess with it. */ static __always_inline void vm_page_mvqueue(vm_page_t m, const uint8_t nqueue, const uint16_t nflag) { vm_page_astate_t old, new; KASSERT(m->ref_count > 0, ("%s: page %p does not carry any references", __func__, m)); KASSERT(nflag == PGA_REQUEUE || nflag == PGA_REQUEUE_HEAD, ("%s: invalid flags %x", __func__, nflag)); if ((m->oflags & VPO_UNMANAGED) != 0 || vm_page_wired(m)) return; old = vm_page_astate_load(m); do { if ((old.flags & PGA_DEQUEUE) != 0) break; new = old; new.flags &= ~PGA_QUEUE_OP_MASK; if (nqueue == PQ_ACTIVE) new.act_count = max(old.act_count, ACT_INIT); if (old.queue == nqueue) { if (nqueue != PQ_ACTIVE) new.flags |= nflag; } else { new.flags |= nflag; new.queue = nqueue; } } while (!vm_page_pqstate_commit(m, &old, new)); } /* * Put the specified page on the active list (if appropriate). */ void vm_page_activate(vm_page_t m) { vm_page_mvqueue(m, PQ_ACTIVE, PGA_REQUEUE); } /* * Move the specified page to the tail of the inactive queue, or requeue * the page if it is already in the inactive queue. */ void vm_page_deactivate(vm_page_t m) { vm_page_mvqueue(m, PQ_INACTIVE, PGA_REQUEUE); } void vm_page_deactivate_noreuse(vm_page_t m) { vm_page_mvqueue(m, PQ_INACTIVE, PGA_REQUEUE_HEAD); } /* * Put a page in the laundry, or requeue it if it is already there. */ void vm_page_launder(vm_page_t m) { vm_page_mvqueue(m, PQ_LAUNDRY, PGA_REQUEUE); } /* * Put a page in the PQ_UNSWAPPABLE holding queue. */ void vm_page_unswappable(vm_page_t m) { KASSERT(!vm_page_wired(m) && (m->oflags & VPO_UNMANAGED) == 0, ("page %p already unswappable", m)); vm_page_dequeue(m); vm_page_enqueue(m, PQ_UNSWAPPABLE); } /* * Release a page back to the page queues in preparation for unwiring. */ static void vm_page_release_toq(vm_page_t m, uint8_t nqueue, const bool noreuse) { vm_page_astate_t old, new; uint16_t nflag; /* * Use a check of the valid bits to determine whether we should * accelerate reclamation of the page. The object lock might not be * held here, in which case the check is racy. At worst we will either * accelerate reclamation of a valid page and violate LRU, or * unnecessarily defer reclamation of an invalid page. * * If we were asked to not cache the page, place it near the head of the * inactive queue so that is reclaimed sooner. */ if (noreuse || m->valid == 0) { nqueue = PQ_INACTIVE; nflag = PGA_REQUEUE_HEAD; } else { nflag = PGA_REQUEUE; } old = vm_page_astate_load(m); do { new = old; /* * If the page is already in the active queue and we are not * trying to accelerate reclamation, simply mark it as * referenced and avoid any queue operations. */ new.flags &= ~PGA_QUEUE_OP_MASK; if (nflag != PGA_REQUEUE_HEAD && old.queue == PQ_ACTIVE) new.flags |= PGA_REFERENCED; else { new.flags |= nflag; new.queue = nqueue; } } while (!vm_page_pqstate_commit(m, &old, new)); } /* * Unwire a page and either attempt to free it or re-add it to the page queues. */ void vm_page_release(vm_page_t m, int flags) { vm_object_t object; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("vm_page_release: page %p is unmanaged", m)); if ((flags & VPR_TRYFREE) != 0) { for (;;) { object = atomic_load_ptr(&m->object); if (object == NULL) break; /* Depends on type-stability. */ if (vm_page_busied(m) || !VM_OBJECT_TRYWLOCK(object)) break; if (object == m->object) { vm_page_release_locked(m, flags); VM_OBJECT_WUNLOCK(object); return; } VM_OBJECT_WUNLOCK(object); } } vm_page_unwire_managed(m, PQ_INACTIVE, flags != 0); } /* See vm_page_release(). */ void vm_page_release_locked(vm_page_t m, int flags) { VM_OBJECT_ASSERT_WLOCKED(m->object); KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("vm_page_release_locked: page %p is unmanaged", m)); if (vm_page_unwire_noq(m)) { if ((flags & VPR_TRYFREE) != 0 && (m->object->ref_count == 0 || !pmap_page_is_mapped(m)) && m->dirty == 0 && vm_page_tryxbusy(m)) { /* * An unlocked lookup may have wired the page before the * busy lock was acquired, in which case the page must * not be freed. */ if (__predict_true(!vm_page_wired(m))) { vm_page_free(m); return; } vm_page_xunbusy(m); } else { vm_page_release_toq(m, PQ_INACTIVE, flags != 0); } } } static bool vm_page_try_blocked_op(vm_page_t m, void (*op)(vm_page_t)) { u_int old; KASSERT(m->object != NULL && (m->oflags & VPO_UNMANAGED) == 0, ("vm_page_try_blocked_op: page %p has no object", m)); KASSERT(vm_page_busied(m), ("vm_page_try_blocked_op: page %p is not busy", m)); VM_OBJECT_ASSERT_LOCKED(m->object); old = m->ref_count; do { KASSERT(old != 0, ("vm_page_try_blocked_op: page %p has no references", m)); if (VPRC_WIRE_COUNT(old) != 0) return (false); } while (!atomic_fcmpset_int(&m->ref_count, &old, old | VPRC_BLOCKED)); (op)(m); /* * If the object is read-locked, new wirings may be created via an * object lookup. */ old = vm_page_drop(m, VPRC_BLOCKED); KASSERT(!VM_OBJECT_WOWNED(m->object) || old == (VPRC_BLOCKED | VPRC_OBJREF), ("vm_page_try_blocked_op: unexpected refcount value %u for %p", old, m)); return (true); } /* * Atomically check for wirings and remove all mappings of the page. */ bool vm_page_try_remove_all(vm_page_t m) { return (vm_page_try_blocked_op(m, pmap_remove_all)); } /* * Atomically check for wirings and remove all writeable mappings of the page. */ bool vm_page_try_remove_write(vm_page_t m) { return (vm_page_try_blocked_op(m, pmap_remove_write)); } /* * vm_page_advise * * Apply the specified advice to the given page. */ void vm_page_advise(vm_page_t m, int advice) { VM_OBJECT_ASSERT_WLOCKED(m->object); vm_page_assert_xbusied(m); if (advice == MADV_FREE) /* * Mark the page clean. This will allow the page to be freed * without first paging it out. MADV_FREE pages are often * quickly reused by malloc(3), so we do not do anything that * would result in a page fault on a later access. */ vm_page_undirty(m); else if (advice != MADV_DONTNEED) { if (advice == MADV_WILLNEED) vm_page_activate(m); return; } if (advice != MADV_FREE && m->dirty == 0 && pmap_is_modified(m)) vm_page_dirty(m); /* * Clear any references to the page. Otherwise, the page daemon will * immediately reactivate the page. */ vm_page_aflag_clear(m, PGA_REFERENCED); /* * Place clean pages near the head of the inactive queue rather than * the tail, thus defeating the queue's LRU operation and ensuring that * the page will be reused quickly. Dirty pages not already in the * laundry are moved there. */ if (m->dirty == 0) vm_page_deactivate_noreuse(m); else if (!vm_page_in_laundry(m)) vm_page_launder(m); } /* * vm_page_grab_release * * Helper routine for grab functions to release busy on return. */ static inline void vm_page_grab_release(vm_page_t m, int allocflags) { if ((allocflags & VM_ALLOC_NOBUSY) != 0) { if ((allocflags & VM_ALLOC_IGN_SBUSY) != 0) vm_page_sunbusy(m); else vm_page_xunbusy(m); } } /* * vm_page_grab_sleep * * Sleep for busy according to VM_ALLOC_ parameters. Returns true * if the caller should retry and false otherwise. * * If the object is locked on entry the object will be unlocked with * false returns and still locked but possibly having been dropped * with true returns. */ static bool vm_page_grab_sleep(vm_object_t object, vm_page_t m, vm_pindex_t pindex, const char *wmesg, int allocflags, bool locked) { if ((allocflags & VM_ALLOC_NOWAIT) != 0) return (false); /* * Reference the page before unlocking and sleeping so that * the page daemon is less likely to reclaim it. */ if (locked && (allocflags & VM_ALLOC_NOCREAT) == 0) vm_page_reference(m); if (_vm_page_busy_sleep(object, m, m->pindex, wmesg, allocflags, locked) && locked) VM_OBJECT_WLOCK(object); if ((allocflags & VM_ALLOC_WAITFAIL) != 0) return (false); return (true); } /* * Assert that the grab flags are valid. */ static inline void vm_page_grab_check(int allocflags) { KASSERT((allocflags & VM_ALLOC_NOBUSY) == 0 || (allocflags & VM_ALLOC_WIRED) != 0, ("vm_page_grab*: the pages must be busied or wired")); KASSERT((allocflags & VM_ALLOC_SBUSY) == 0 || (allocflags & VM_ALLOC_IGN_SBUSY) != 0, ("vm_page_grab*: VM_ALLOC_SBUSY/VM_ALLOC_IGN_SBUSY mismatch")); } /* * Calculate the page allocation flags for grab. */ static inline int vm_page_grab_pflags(int allocflags) { int pflags; pflags = allocflags & ~(VM_ALLOC_NOWAIT | VM_ALLOC_WAITOK | VM_ALLOC_WAITFAIL | VM_ALLOC_NOBUSY); if ((allocflags & VM_ALLOC_NOWAIT) == 0) pflags |= VM_ALLOC_WAITFAIL; if ((allocflags & VM_ALLOC_IGN_SBUSY) != 0) pflags |= VM_ALLOC_SBUSY; return (pflags); } /* * Grab a page, waiting until we are waken up due to the page * changing state. We keep on waiting, if the page continues * to be in the object. If the page doesn't exist, first allocate it * and then conditionally zero it. * * This routine may sleep. * * The object must be locked on entry. The lock will, however, be released * and reacquired if the routine sleeps. */ vm_page_t vm_page_grab(vm_object_t object, vm_pindex_t pindex, int allocflags) { vm_page_t m; VM_OBJECT_ASSERT_WLOCKED(object); vm_page_grab_check(allocflags); retrylookup: if ((m = vm_page_lookup(object, pindex)) != NULL) { if (!vm_page_tryacquire(m, allocflags)) { if (vm_page_grab_sleep(object, m, pindex, "pgrbwt", allocflags, true)) goto retrylookup; return (NULL); } goto out; } if ((allocflags & VM_ALLOC_NOCREAT) != 0) return (NULL); m = vm_page_alloc(object, pindex, vm_page_grab_pflags(allocflags)); if (m == NULL) { if ((allocflags & (VM_ALLOC_NOWAIT | VM_ALLOC_WAITFAIL)) != 0) return (NULL); goto retrylookup; } if (allocflags & VM_ALLOC_ZERO && (m->flags & PG_ZERO) == 0) pmap_zero_page(m); out: vm_page_grab_release(m, allocflags); return (m); } /* * Locklessly attempt to acquire a page given a (object, pindex) tuple * and an optional previous page to avoid the radix lookup. The resulting * page will be validated against the identity tuple and busied or wired * as requested. A NULL *mp return guarantees that the page was not in * radix at the time of the call but callers must perform higher level * synchronization or retry the operation under a lock if they require * an atomic answer. This is the only lock free validation routine, * other routines can depend on the resulting page state. * * The return value indicates whether the operation failed due to caller * flags. The return is tri-state with mp: * * (true, *mp != NULL) - The operation was successful. * (true, *mp == NULL) - The page was not found in tree. * (false, *mp == NULL) - WAITFAIL or NOWAIT prevented acquisition. */ static bool vm_page_acquire_unlocked(vm_object_t object, vm_pindex_t pindex, vm_page_t prev, vm_page_t *mp, int allocflags) { vm_page_t m; vm_page_grab_check(allocflags); MPASS(prev == NULL || vm_page_busied(prev) || vm_page_wired(prev)); *mp = NULL; for (;;) { /* * We may see a false NULL here because the previous page * has been removed or just inserted and the list is loaded * without barriers. Switch to radix to verify. */ if (prev == NULL || (m = TAILQ_NEXT(prev, listq)) == NULL || QMD_IS_TRASHED(m) || m->pindex != pindex || atomic_load_ptr(&m->object) != object) { prev = NULL; /* * This guarantees the result is instantaneously * correct. */ m = vm_radix_lookup_unlocked(&object->rtree, pindex); } if (m == NULL) return (true); if (vm_page_trybusy(m, allocflags)) { if (m->object == object && m->pindex == pindex) break; /* relookup. */ vm_page_busy_release(m); cpu_spinwait(); continue; } if (!vm_page_grab_sleep(object, m, pindex, "pgnslp", allocflags, false)) return (false); } if ((allocflags & VM_ALLOC_WIRED) != 0) vm_page_wire(m); vm_page_grab_release(m, allocflags); *mp = m; return (true); } /* * Try to locklessly grab a page and fall back to the object lock if NOCREAT * is not set. */ vm_page_t vm_page_grab_unlocked(vm_object_t object, vm_pindex_t pindex, int allocflags) { vm_page_t m; vm_page_grab_check(allocflags); if (!vm_page_acquire_unlocked(object, pindex, NULL, &m, allocflags)) return (NULL); if (m != NULL) return (m); /* * The radix lockless lookup should never return a false negative * errors. If the user specifies NOCREAT they are guaranteed there * was no page present at the instant of the call. A NOCREAT caller * must handle create races gracefully. */ if ((allocflags & VM_ALLOC_NOCREAT) != 0) return (NULL); VM_OBJECT_WLOCK(object); m = vm_page_grab(object, pindex, allocflags); VM_OBJECT_WUNLOCK(object); return (m); } /* * Grab a page and make it valid, paging in if necessary. Pages missing from * their pager are zero filled and validated. If a VM_ALLOC_COUNT is supplied * and the page is not valid as many as VM_INITIAL_PAGEIN pages can be brought * in simultaneously. Additional pages will be left on a paging queue but * will neither be wired nor busy regardless of allocflags. */ int vm_page_grab_valid(vm_page_t *mp, vm_object_t object, vm_pindex_t pindex, int allocflags) { vm_page_t m; vm_page_t ma[VM_INITIAL_PAGEIN]; int after, i, pflags, rv; KASSERT((allocflags & VM_ALLOC_SBUSY) == 0 || (allocflags & VM_ALLOC_IGN_SBUSY) != 0, ("vm_page_grab_valid: VM_ALLOC_SBUSY/VM_ALLOC_IGN_SBUSY mismatch")); KASSERT((allocflags & (VM_ALLOC_NOWAIT | VM_ALLOC_WAITFAIL | VM_ALLOC_ZERO)) == 0, ("vm_page_grab_valid: Invalid flags 0x%X", allocflags)); VM_OBJECT_ASSERT_WLOCKED(object); pflags = allocflags & ~(VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY | VM_ALLOC_WIRED); pflags |= VM_ALLOC_WAITFAIL; retrylookup: if ((m = vm_page_lookup(object, pindex)) != NULL) { /* * If the page is fully valid it can only become invalid * with the object lock held. If it is not valid it can * become valid with the busy lock held. Therefore, we * may unnecessarily lock the exclusive busy here if we * race with I/O completion not using the object lock. * However, we will not end up with an invalid page and a * shared lock. */ if (!vm_page_trybusy(m, vm_page_all_valid(m) ? allocflags : 0)) { (void)vm_page_grab_sleep(object, m, pindex, "pgrbwt", allocflags, true); goto retrylookup; } if (vm_page_all_valid(m)) goto out; if ((allocflags & VM_ALLOC_NOCREAT) != 0) { vm_page_busy_release(m); *mp = NULL; return (VM_PAGER_FAIL); } } else if ((allocflags & VM_ALLOC_NOCREAT) != 0) { *mp = NULL; return (VM_PAGER_FAIL); } else if ((m = vm_page_alloc(object, pindex, pflags)) == NULL) { goto retrylookup; } vm_page_assert_xbusied(m); if (vm_pager_has_page(object, pindex, NULL, &after)) { after = MIN(after, VM_INITIAL_PAGEIN); after = MIN(after, allocflags >> VM_ALLOC_COUNT_SHIFT); after = MAX(after, 1); ma[0] = m; for (i = 1; i < after; i++) { if ((ma[i] = vm_page_next(ma[i - 1])) != NULL) { if (ma[i]->valid || !vm_page_tryxbusy(ma[i])) break; } else { ma[i] = vm_page_alloc(object, m->pindex + i, VM_ALLOC_NORMAL); if (ma[i] == NULL) break; } } after = i; vm_object_pip_add(object, after); VM_OBJECT_WUNLOCK(object); rv = vm_pager_get_pages(object, ma, after, NULL, NULL); VM_OBJECT_WLOCK(object); vm_object_pip_wakeupn(object, after); /* Pager may have replaced a page. */ m = ma[0]; if (rv != VM_PAGER_OK) { for (i = 0; i < after; i++) { if (!vm_page_wired(ma[i])) vm_page_free(ma[i]); else vm_page_xunbusy(ma[i]); } *mp = NULL; return (rv); } for (i = 1; i < after; i++) vm_page_readahead_finish(ma[i]); MPASS(vm_page_all_valid(m)); } else { vm_page_zero_invalid(m, TRUE); } out: if ((allocflags & VM_ALLOC_WIRED) != 0) vm_page_wire(m); if ((allocflags & VM_ALLOC_SBUSY) != 0 && vm_page_xbusied(m)) vm_page_busy_downgrade(m); else if ((allocflags & VM_ALLOC_NOBUSY) != 0) vm_page_busy_release(m); *mp = m; return (VM_PAGER_OK); } /* * Locklessly grab a valid page. If the page is not valid or not yet * allocated this will fall back to the object lock method. */ int vm_page_grab_valid_unlocked(vm_page_t *mp, vm_object_t object, vm_pindex_t pindex, int allocflags) { vm_page_t m; int flags; int error; KASSERT((allocflags & VM_ALLOC_SBUSY) == 0 || (allocflags & VM_ALLOC_IGN_SBUSY) != 0, ("vm_page_grab_valid_unlocked: VM_ALLOC_SBUSY/VM_ALLOC_IGN_SBUSY " "mismatch")); KASSERT((allocflags & (VM_ALLOC_NOWAIT | VM_ALLOC_WAITFAIL | VM_ALLOC_ZERO)) == 0, ("vm_page_grab_valid_unlocked: Invalid flags 0x%X", allocflags)); /* * Attempt a lockless lookup and busy. We need at least an sbusy * before we can inspect the valid field and return a wired page. */ flags = allocflags & ~(VM_ALLOC_NOBUSY | VM_ALLOC_WIRED); if (!vm_page_acquire_unlocked(object, pindex, NULL, mp, flags)) return (VM_PAGER_FAIL); if ((m = *mp) != NULL) { if (vm_page_all_valid(m)) { if ((allocflags & VM_ALLOC_WIRED) != 0) vm_page_wire(m); vm_page_grab_release(m, allocflags); return (VM_PAGER_OK); } vm_page_busy_release(m); } if ((allocflags & VM_ALLOC_NOCREAT) != 0) { *mp = NULL; return (VM_PAGER_FAIL); } VM_OBJECT_WLOCK(object); error = vm_page_grab_valid(mp, object, pindex, allocflags); VM_OBJECT_WUNLOCK(object); return (error); } /* * Return the specified range of pages from the given object. For each * page offset within the range, if a page already exists within the object * at that offset and it is busy, then wait for it to change state. If, * instead, the page doesn't exist, then allocate it. * * The caller must always specify an allocation class. * * allocation classes: * VM_ALLOC_NORMAL normal process request * VM_ALLOC_SYSTEM system *really* needs the pages * * The caller must always specify that the pages are to be busied and/or * wired. * * optional allocation flags: * VM_ALLOC_IGN_SBUSY do not sleep on soft busy pages * VM_ALLOC_NOBUSY do not exclusive busy the page * VM_ALLOC_NOWAIT do not sleep * VM_ALLOC_SBUSY set page to sbusy state * VM_ALLOC_WIRED wire the pages * VM_ALLOC_ZERO zero and validate any invalid pages * * If VM_ALLOC_NOWAIT is not specified, this routine may sleep. Otherwise, it * may return a partial prefix of the requested range. */ int vm_page_grab_pages(vm_object_t object, vm_pindex_t pindex, int allocflags, vm_page_t *ma, int count) { vm_page_t m, mpred; int pflags; int i; VM_OBJECT_ASSERT_WLOCKED(object); KASSERT(((u_int)allocflags >> VM_ALLOC_COUNT_SHIFT) == 0, ("vm_page_grap_pages: VM_ALLOC_COUNT() is not allowed")); KASSERT(count > 0, ("vm_page_grab_pages: invalid page count %d", count)); vm_page_grab_check(allocflags); pflags = vm_page_grab_pflags(allocflags); i = 0; retrylookup: m = vm_radix_lookup_le(&object->rtree, pindex + i); if (m == NULL || m->pindex != pindex + i) { mpred = m; m = NULL; } else mpred = TAILQ_PREV(m, pglist, listq); for (; i < count; i++) { if (m != NULL) { if (!vm_page_tryacquire(m, allocflags)) { if (vm_page_grab_sleep(object, m, pindex, "grbmaw", allocflags, true)) goto retrylookup; break; } } else { if ((allocflags & VM_ALLOC_NOCREAT) != 0) break; m = vm_page_alloc_after(object, pindex + i, pflags | VM_ALLOC_COUNT(count - i), mpred); if (m == NULL) { if ((allocflags & (VM_ALLOC_NOWAIT | VM_ALLOC_WAITFAIL)) != 0) break; goto retrylookup; } } if (vm_page_none_valid(m) && (allocflags & VM_ALLOC_ZERO) != 0) { if ((m->flags & PG_ZERO) == 0) pmap_zero_page(m); vm_page_valid(m); } vm_page_grab_release(m, allocflags); ma[i] = mpred = m; m = vm_page_next(m); } return (i); } /* * Unlocked variant of vm_page_grab_pages(). This accepts the same flags * and will fall back to the locked variant to handle allocation. */ int vm_page_grab_pages_unlocked(vm_object_t object, vm_pindex_t pindex, int allocflags, vm_page_t *ma, int count) { vm_page_t m, pred; int flags; int i; KASSERT(count > 0, ("vm_page_grab_pages_unlocked: invalid page count %d", count)); vm_page_grab_check(allocflags); /* * Modify flags for lockless acquire to hold the page until we * set it valid if necessary. */ flags = allocflags & ~VM_ALLOC_NOBUSY; pred = NULL; for (i = 0; i < count; i++, pindex++) { if (!vm_page_acquire_unlocked(object, pindex, pred, &m, flags)) return (i); if (m == NULL) break; if ((flags & VM_ALLOC_ZERO) != 0 && vm_page_none_valid(m)) { if ((m->flags & PG_ZERO) == 0) pmap_zero_page(m); vm_page_valid(m); } /* m will still be wired or busy according to flags. */ vm_page_grab_release(m, allocflags); pred = ma[i] = m; } if (i == count || (allocflags & VM_ALLOC_NOCREAT) != 0) return (i); count -= i; VM_OBJECT_WLOCK(object); i += vm_page_grab_pages(object, pindex, allocflags, &ma[i], count); VM_OBJECT_WUNLOCK(object); return (i); } /* * Mapping function for valid or dirty bits in a page. * * Inputs are required to range within a page. */ vm_page_bits_t vm_page_bits(int base, int size) { int first_bit; int last_bit; KASSERT( base + size <= PAGE_SIZE, ("vm_page_bits: illegal base/size %d/%d", base, size) ); if (size == 0) /* handle degenerate case */ return (0); first_bit = base >> DEV_BSHIFT; last_bit = (base + size - 1) >> DEV_BSHIFT; return (((vm_page_bits_t)2 << last_bit) - ((vm_page_bits_t)1 << first_bit)); } void vm_page_bits_set(vm_page_t m, vm_page_bits_t *bits, vm_page_bits_t set) { #if PAGE_SIZE == 32768 atomic_set_64((uint64_t *)bits, set); #elif PAGE_SIZE == 16384 atomic_set_32((uint32_t *)bits, set); #elif (PAGE_SIZE == 8192) && defined(atomic_set_16) atomic_set_16((uint16_t *)bits, set); #elif (PAGE_SIZE == 4096) && defined(atomic_set_8) atomic_set_8((uint8_t *)bits, set); #else /* PAGE_SIZE <= 8192 */ uintptr_t addr; int shift; addr = (uintptr_t)bits; /* * Use a trick to perform a 32-bit atomic on the * containing aligned word, to not depend on the existence * of atomic_{set, clear}_{8, 16}. */ shift = addr & (sizeof(uint32_t) - 1); #if BYTE_ORDER == BIG_ENDIAN shift = (sizeof(uint32_t) - sizeof(vm_page_bits_t) - shift) * NBBY; #else shift *= NBBY; #endif addr &= ~(sizeof(uint32_t) - 1); atomic_set_32((uint32_t *)addr, set << shift); #endif /* PAGE_SIZE */ } static inline void vm_page_bits_clear(vm_page_t m, vm_page_bits_t *bits, vm_page_bits_t clear) { #if PAGE_SIZE == 32768 atomic_clear_64((uint64_t *)bits, clear); #elif PAGE_SIZE == 16384 atomic_clear_32((uint32_t *)bits, clear); #elif (PAGE_SIZE == 8192) && defined(atomic_clear_16) atomic_clear_16((uint16_t *)bits, clear); #elif (PAGE_SIZE == 4096) && defined(atomic_clear_8) atomic_clear_8((uint8_t *)bits, clear); #else /* PAGE_SIZE <= 8192 */ uintptr_t addr; int shift; addr = (uintptr_t)bits; /* * Use a trick to perform a 32-bit atomic on the * containing aligned word, to not depend on the existence * of atomic_{set, clear}_{8, 16}. */ shift = addr & (sizeof(uint32_t) - 1); #if BYTE_ORDER == BIG_ENDIAN shift = (sizeof(uint32_t) - sizeof(vm_page_bits_t) - shift) * NBBY; #else shift *= NBBY; #endif addr &= ~(sizeof(uint32_t) - 1); atomic_clear_32((uint32_t *)addr, clear << shift); #endif /* PAGE_SIZE */ } static inline vm_page_bits_t vm_page_bits_swap(vm_page_t m, vm_page_bits_t *bits, vm_page_bits_t newbits) { #if PAGE_SIZE == 32768 uint64_t old; old = *bits; while (atomic_fcmpset_64(bits, &old, newbits) == 0); return (old); #elif PAGE_SIZE == 16384 uint32_t old; old = *bits; while (atomic_fcmpset_32(bits, &old, newbits) == 0); return (old); #elif (PAGE_SIZE == 8192) && defined(atomic_fcmpset_16) uint16_t old; old = *bits; while (atomic_fcmpset_16(bits, &old, newbits) == 0); return (old); #elif (PAGE_SIZE == 4096) && defined(atomic_fcmpset_8) uint8_t old; old = *bits; while (atomic_fcmpset_8(bits, &old, newbits) == 0); return (old); #else /* PAGE_SIZE <= 4096*/ uintptr_t addr; uint32_t old, new, mask; int shift; addr = (uintptr_t)bits; /* * Use a trick to perform a 32-bit atomic on the * containing aligned word, to not depend on the existence * of atomic_{set, swap, clear}_{8, 16}. */ shift = addr & (sizeof(uint32_t) - 1); #if BYTE_ORDER == BIG_ENDIAN shift = (sizeof(uint32_t) - sizeof(vm_page_bits_t) - shift) * NBBY; #else shift *= NBBY; #endif addr &= ~(sizeof(uint32_t) - 1); mask = VM_PAGE_BITS_ALL << shift; old = *bits; do { new = old & ~mask; new |= newbits << shift; } while (atomic_fcmpset_32((uint32_t *)addr, &old, new) == 0); return (old >> shift); #endif /* PAGE_SIZE */ } /* * vm_page_set_valid_range: * * Sets portions of a page valid. The arguments are expected * to be DEV_BSIZE aligned but if they aren't the bitmap is inclusive * of any partial chunks touched by the range. The invalid portion of * such chunks will be zeroed. * * (base + size) must be less then or equal to PAGE_SIZE. */ void vm_page_set_valid_range(vm_page_t m, int base, int size) { int endoff, frag; vm_page_bits_t pagebits; vm_page_assert_busied(m); if (size == 0) /* handle degenerate case */ return; /* * If the base is not DEV_BSIZE aligned and the valid * bit is clear, we have to zero out a portion of the * first block. */ if ((frag = rounddown2(base, DEV_BSIZE)) != base && (m->valid & (1 << (base >> DEV_BSHIFT))) == 0) pmap_zero_page_area(m, frag, base - frag); /* * If the ending offset is not DEV_BSIZE aligned and the * valid bit is clear, we have to zero out a portion of * the last block. */ endoff = base + size; if ((frag = rounddown2(endoff, DEV_BSIZE)) != endoff && (m->valid & (1 << (endoff >> DEV_BSHIFT))) == 0) pmap_zero_page_area(m, endoff, DEV_BSIZE - (endoff & (DEV_BSIZE - 1))); /* * Assert that no previously invalid block that is now being validated * is already dirty. */ KASSERT((~m->valid & vm_page_bits(base, size) & m->dirty) == 0, ("vm_page_set_valid_range: page %p is dirty", m)); /* * Set valid bits inclusive of any overlap. */ pagebits = vm_page_bits(base, size); if (vm_page_xbusied(m)) m->valid |= pagebits; else vm_page_bits_set(m, &m->valid, pagebits); } /* * Set the page dirty bits and free the invalid swap space if * present. Returns the previous dirty bits. */ vm_page_bits_t vm_page_set_dirty(vm_page_t m) { vm_page_bits_t old; VM_PAGE_OBJECT_BUSY_ASSERT(m); if (vm_page_xbusied(m) && !pmap_page_is_write_mapped(m)) { old = m->dirty; m->dirty = VM_PAGE_BITS_ALL; } else old = vm_page_bits_swap(m, &m->dirty, VM_PAGE_BITS_ALL); if (old == 0 && (m->a.flags & PGA_SWAP_SPACE) != 0) vm_pager_page_unswapped(m); return (old); } /* * Clear the given bits from the specified page's dirty field. */ static __inline void vm_page_clear_dirty_mask(vm_page_t m, vm_page_bits_t pagebits) { vm_page_assert_busied(m); /* * If the page is xbusied and not write mapped we are the * only thread that can modify dirty bits. Otherwise, The pmap * layer can call vm_page_dirty() without holding a distinguished * lock. The combination of page busy and atomic operations * suffice to guarantee consistency of the page dirty field. */ if (vm_page_xbusied(m) && !pmap_page_is_write_mapped(m)) m->dirty &= ~pagebits; else vm_page_bits_clear(m, &m->dirty, pagebits); } /* * vm_page_set_validclean: * * Sets portions of a page valid and clean. The arguments are expected * to be DEV_BSIZE aligned but if they aren't the bitmap is inclusive * of any partial chunks touched by the range. The invalid portion of * such chunks will be zero'd. * * (base + size) must be less then or equal to PAGE_SIZE. */ void vm_page_set_validclean(vm_page_t m, int base, int size) { vm_page_bits_t oldvalid, pagebits; int endoff, frag; vm_page_assert_busied(m); if (size == 0) /* handle degenerate case */ return; /* * If the base is not DEV_BSIZE aligned and the valid * bit is clear, we have to zero out a portion of the * first block. */ if ((frag = rounddown2(base, DEV_BSIZE)) != base && (m->valid & ((vm_page_bits_t)1 << (base >> DEV_BSHIFT))) == 0) pmap_zero_page_area(m, frag, base - frag); /* * If the ending offset is not DEV_BSIZE aligned and the * valid bit is clear, we have to zero out a portion of * the last block. */ endoff = base + size; if ((frag = rounddown2(endoff, DEV_BSIZE)) != endoff && (m->valid & ((vm_page_bits_t)1 << (endoff >> DEV_BSHIFT))) == 0) pmap_zero_page_area(m, endoff, DEV_BSIZE - (endoff & (DEV_BSIZE - 1))); /* * Set valid, clear dirty bits. If validating the entire * page we can safely clear the pmap modify bit. We also * use this opportunity to clear the PGA_NOSYNC flag. If a process * takes a write fault on a MAP_NOSYNC memory area the flag will * be set again. * * We set valid bits inclusive of any overlap, but we can only * clear dirty bits for DEV_BSIZE chunks that are fully within * the range. */ oldvalid = m->valid; pagebits = vm_page_bits(base, size); if (vm_page_xbusied(m)) m->valid |= pagebits; else vm_page_bits_set(m, &m->valid, pagebits); #if 0 /* NOT YET */ if ((frag = base & (DEV_BSIZE - 1)) != 0) { frag = DEV_BSIZE - frag; base += frag; size -= frag; if (size < 0) size = 0; } pagebits = vm_page_bits(base, size & (DEV_BSIZE - 1)); #endif if (base == 0 && size == PAGE_SIZE) { /* * The page can only be modified within the pmap if it is * mapped, and it can only be mapped if it was previously * fully valid. */ if (oldvalid == VM_PAGE_BITS_ALL) /* * Perform the pmap_clear_modify() first. Otherwise, * a concurrent pmap operation, such as * pmap_protect(), could clear a modification in the * pmap and set the dirty field on the page before * pmap_clear_modify() had begun and after the dirty * field was cleared here. */ pmap_clear_modify(m); m->dirty = 0; vm_page_aflag_clear(m, PGA_NOSYNC); } else if (oldvalid != VM_PAGE_BITS_ALL && vm_page_xbusied(m)) m->dirty &= ~pagebits; else vm_page_clear_dirty_mask(m, pagebits); } void vm_page_clear_dirty(vm_page_t m, int base, int size) { vm_page_clear_dirty_mask(m, vm_page_bits(base, size)); } /* * vm_page_set_invalid: * * Invalidates DEV_BSIZE'd chunks within a page. Both the * valid and dirty bits for the effected areas are cleared. */ void vm_page_set_invalid(vm_page_t m, int base, int size) { vm_page_bits_t bits; vm_object_t object; /* * The object lock is required so that pages can't be mapped * read-only while we're in the process of invalidating them. */ object = m->object; VM_OBJECT_ASSERT_WLOCKED(object); vm_page_assert_busied(m); if (object->type == OBJT_VNODE && base == 0 && IDX_TO_OFF(m->pindex) + size >= object->un_pager.vnp.vnp_size) bits = VM_PAGE_BITS_ALL; else bits = vm_page_bits(base, size); if (object->ref_count != 0 && vm_page_all_valid(m) && bits != 0) pmap_remove_all(m); KASSERT((bits == 0 && vm_page_all_valid(m)) || !pmap_page_is_mapped(m), ("vm_page_set_invalid: page %p is mapped", m)); if (vm_page_xbusied(m)) { m->valid &= ~bits; m->dirty &= ~bits; } else { vm_page_bits_clear(m, &m->valid, bits); vm_page_bits_clear(m, &m->dirty, bits); } } /* * vm_page_invalid: * * Invalidates the entire page. The page must be busy, unmapped, and * the enclosing object must be locked. The object locks protects * against concurrent read-only pmap enter which is done without * busy. */ void vm_page_invalid(vm_page_t m) { vm_page_assert_busied(m); VM_OBJECT_ASSERT_LOCKED(m->object); MPASS(!pmap_page_is_mapped(m)); if (vm_page_xbusied(m)) m->valid = 0; else vm_page_bits_clear(m, &m->valid, VM_PAGE_BITS_ALL); } /* * vm_page_zero_invalid() * * The kernel assumes that the invalid portions of a page contain * garbage, but such pages can be mapped into memory by user code. * When this occurs, we must zero out the non-valid portions of the * page so user code sees what it expects. * * Pages are most often semi-valid when the end of a file is mapped * into memory and the file's size is not page aligned. */ void vm_page_zero_invalid(vm_page_t m, boolean_t setvalid) { int b; int i; /* * Scan the valid bits looking for invalid sections that * must be zeroed. Invalid sub-DEV_BSIZE'd areas ( where the * valid bit may be set ) have already been zeroed by * vm_page_set_validclean(). */ for (b = i = 0; i <= PAGE_SIZE / DEV_BSIZE; ++i) { if (i == (PAGE_SIZE / DEV_BSIZE) || (m->valid & ((vm_page_bits_t)1 << i))) { if (i > b) { pmap_zero_page_area(m, b << DEV_BSHIFT, (i - b) << DEV_BSHIFT); } b = i + 1; } } /* * setvalid is TRUE when we can safely set the zero'd areas * as being valid. We can do this if there are no cache consistancy * issues. e.g. it is ok to do with UFS, but not ok to do with NFS. */ if (setvalid) vm_page_valid(m); } /* * vm_page_is_valid: * * Is (partial) page valid? Note that the case where size == 0 * will return FALSE in the degenerate case where the page is * entirely invalid, and TRUE otherwise. * * Some callers envoke this routine without the busy lock held and * handle races via higher level locks. Typical callers should * hold a busy lock to prevent invalidation. */ int vm_page_is_valid(vm_page_t m, int base, int size) { vm_page_bits_t bits; bits = vm_page_bits(base, size); return (m->valid != 0 && (m->valid & bits) == bits); } /* * Returns true if all of the specified predicates are true for the entire * (super)page and false otherwise. */ bool vm_page_ps_test(vm_page_t m, int flags, vm_page_t skip_m) { vm_object_t object; int i, npages; object = m->object; if (skip_m != NULL && skip_m->object != object) return (false); VM_OBJECT_ASSERT_LOCKED(object); npages = atop(pagesizes[m->psind]); /* * The physically contiguous pages that make up a superpage, i.e., a * page with a page size index ("psind") greater than zero, will * occupy adjacent entries in vm_page_array[]. */ for (i = 0; i < npages; i++) { /* Always test object consistency, including "skip_m". */ if (m[i].object != object) return (false); if (&m[i] == skip_m) continue; if ((flags & PS_NONE_BUSY) != 0 && vm_page_busied(&m[i])) return (false); if ((flags & PS_ALL_DIRTY) != 0) { /* * Calling vm_page_test_dirty() or pmap_is_modified() * might stop this case from spuriously returning * "false". However, that would require a write lock * on the object containing "m[i]". */ if (m[i].dirty != VM_PAGE_BITS_ALL) return (false); } if ((flags & PS_ALL_VALID) != 0 && m[i].valid != VM_PAGE_BITS_ALL) return (false); } return (true); } /* * Set the page's dirty bits if the page is modified. */ void vm_page_test_dirty(vm_page_t m) { vm_page_assert_busied(m); if (m->dirty != VM_PAGE_BITS_ALL && pmap_is_modified(m)) vm_page_dirty(m); } void vm_page_valid(vm_page_t m) { vm_page_assert_busied(m); if (vm_page_xbusied(m)) m->valid = VM_PAGE_BITS_ALL; else vm_page_bits_set(m, &m->valid, VM_PAGE_BITS_ALL); } void vm_page_lock_KBI(vm_page_t m, const char *file, int line) { mtx_lock_flags_(vm_page_lockptr(m), 0, file, line); } void vm_page_unlock_KBI(vm_page_t m, const char *file, int line) { mtx_unlock_flags_(vm_page_lockptr(m), 0, file, line); } int vm_page_trylock_KBI(vm_page_t m, const char *file, int line) { return (mtx_trylock_flags_(vm_page_lockptr(m), 0, file, line)); } #if defined(INVARIANTS) || defined(INVARIANT_SUPPORT) void vm_page_assert_locked_KBI(vm_page_t m, const char *file, int line) { vm_page_lock_assert_KBI(m, MA_OWNED, file, line); } void vm_page_lock_assert_KBI(vm_page_t m, int a, const char *file, int line) { mtx_assert_(vm_page_lockptr(m), a, file, line); } #endif #ifdef INVARIANTS void vm_page_object_busy_assert(vm_page_t m) { /* * Certain of the page's fields may only be modified by the * holder of a page or object busy. */ if (m->object != NULL && !vm_page_busied(m)) VM_OBJECT_ASSERT_BUSY(m->object); } void vm_page_assert_pga_writeable(vm_page_t m, uint16_t bits) { if ((bits & PGA_WRITEABLE) == 0) return; /* * The PGA_WRITEABLE flag can only be set if the page is * managed, is exclusively busied or the object is locked. * Currently, this flag is only set by pmap_enter(). */ KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("PGA_WRITEABLE on unmanaged page")); if (!vm_page_xbusied(m)) VM_OBJECT_ASSERT_BUSY(m->object); } #endif #include "opt_ddb.h" #ifdef DDB #include #include DB_SHOW_COMMAND(page, vm_page_print_page_info) { db_printf("vm_cnt.v_free_count: %d\n", vm_free_count()); db_printf("vm_cnt.v_inactive_count: %d\n", vm_inactive_count()); db_printf("vm_cnt.v_active_count: %d\n", vm_active_count()); db_printf("vm_cnt.v_laundry_count: %d\n", vm_laundry_count()); db_printf("vm_cnt.v_wire_count: %d\n", vm_wire_count()); db_printf("vm_cnt.v_free_reserved: %d\n", vm_cnt.v_free_reserved); db_printf("vm_cnt.v_free_min: %d\n", vm_cnt.v_free_min); db_printf("vm_cnt.v_free_target: %d\n", vm_cnt.v_free_target); db_printf("vm_cnt.v_inactive_target: %d\n", vm_cnt.v_inactive_target); } DB_SHOW_COMMAND(pageq, vm_page_print_pageq_info) { int dom; db_printf("pq_free %d\n", vm_free_count()); for (dom = 0; dom < vm_ndomains; dom++) { db_printf( "dom %d page_cnt %d free %d pq_act %d pq_inact %d pq_laund %d pq_unsw %d\n", dom, vm_dom[dom].vmd_page_count, vm_dom[dom].vmd_free_count, vm_dom[dom].vmd_pagequeues[PQ_ACTIVE].pq_cnt, vm_dom[dom].vmd_pagequeues[PQ_INACTIVE].pq_cnt, vm_dom[dom].vmd_pagequeues[PQ_LAUNDRY].pq_cnt, vm_dom[dom].vmd_pagequeues[PQ_UNSWAPPABLE].pq_cnt); } } DB_SHOW_COMMAND(pginfo, vm_page_print_pginfo) { vm_page_t m; boolean_t phys, virt; if (!have_addr) { db_printf("show pginfo addr\n"); return; } phys = strchr(modif, 'p') != NULL; virt = strchr(modif, 'v') != NULL; if (virt) m = PHYS_TO_VM_PAGE(pmap_kextract(addr)); else if (phys) m = PHYS_TO_VM_PAGE(addr); else m = (vm_page_t)addr; db_printf( "page %p obj %p pidx 0x%jx phys 0x%jx q %d ref 0x%x\n" " af 0x%x of 0x%x f 0x%x act %d busy %x valid 0x%x dirty 0x%x\n", m, m->object, (uintmax_t)m->pindex, (uintmax_t)m->phys_addr, m->a.queue, m->ref_count, m->a.flags, m->oflags, m->flags, m->a.act_count, m->busy_lock, m->valid, m->dirty); } #endif /* DDB */ Index: head/sys/vm/vm_page.h =================================================================== --- head/sys/vm/vm_page.h (revision 366710) +++ head/sys/vm/vm_page.h (revision 366711) @@ -1,1064 +1,999 @@ /*- * SPDX-License-Identifier: (BSD-3-Clause AND MIT-CMU) * * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_page.h 8.2 (Berkeley) 12/13/93 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. * * $FreeBSD$ */ /* * Resident memory system definitions. */ #ifndef _VM_PAGE_ #define _VM_PAGE_ -#include -#include #include /* * Management of resident (logical) pages. * * A small structure is kept for each resident * page, indexed by page number. Each structure * is an element of several collections: * * A radix tree used to quickly * perform object/offset lookups * * A list of all pages for a given object, * so they can be quickly deactivated at * time of deallocation. * * An ordered list of pages due for pageout. * * In addition, the structure contains the object * and offset to which this page belongs (for pageout), * and sundry status bits. * * In general, operations on this structure's mutable fields are * synchronized using either one of or a combination of locks. If a * field is annotated with two of these locks then holding either is * sufficient for read access but both are required for write access. * The queue lock for a page depends on the value of its queue field and is * described in detail below. * * The following annotations are possible: * (A) the field must be accessed using atomic(9) and may require * additional synchronization. * (B) the page busy lock. * (C) the field is immutable. * (F) the per-domain lock for the free queues. * (M) Machine dependent, defined by pmap layer. * (O) the object that the page belongs to. * (Q) the page's queue lock. * * The busy lock is an embedded reader-writer lock that protects the * page's contents and identity (i.e., its tuple) as * well as certain valid/dirty modifications. To avoid bloating the * the page structure, the busy lock lacks some of the features available * the kernel's general-purpose synchronization primitives. As a result, * busy lock ordering rules are not verified, lock recursion is not * detected, and an attempt to xbusy a busy page or sbusy an xbusy page * results will trigger a panic rather than causing the thread to block. * vm_page_sleep_if_busy() can be used to sleep until the page's busy * state changes, after which the caller must re-lookup the page and * re-evaluate its state. vm_page_busy_acquire() will block until * the lock is acquired. * * The valid field is protected by the page busy lock (B) and object * lock (O). Transitions from invalid to valid are generally done * via I/O or zero filling and do not require the object lock. * These must be protected with the busy lock to prevent page-in or * creation races. Page invalidation generally happens as a result * of truncate or msync. When invalidated, pages must not be present * in pmap and must hold the object lock to prevent concurrent * speculative read-only mappings that do not require busy. I/O * routines may check for validity without a lock if they are prepared * to handle invalidation races with higher level locks (vnode) or are * unconcerned with races so long as they hold a reference to prevent * recycling. When a valid bit is set while holding a shared busy * lock (A) atomic operations are used to protect against concurrent * modification. * * In contrast, the synchronization of accesses to the page's * dirty field is a mix of machine dependent (M) and busy (B). In * the machine-independent layer, the page busy must be held to * operate on the field. However, the pmap layer is permitted to * set all bits within the field without holding that lock. If the * underlying architecture does not support atomic read-modify-write * operations on the field's type, then the machine-independent * layer uses a 32-bit atomic on the aligned 32-bit word that * contains the dirty field. In the machine-independent layer, * the implementation of read-modify-write operations on the * field is encapsulated in vm_page_clear_dirty_mask(). An * exclusive busy lock combined with pmap_remove_{write/all}() is the * only way to ensure a page can not become dirty. I/O generally * removes the page from pmap to ensure exclusive access and atomic * writes. * * The ref_count field tracks references to the page. References that * prevent the page from being reclaimable are called wirings and are * counted in the low bits of ref_count. The containing object's * reference, if one exists, is counted using the VPRC_OBJREF bit in the * ref_count field. Additionally, the VPRC_BLOCKED bit is used to * atomically check for wirings and prevent new wirings via * pmap_extract_and_hold(). When a page belongs to an object, it may be * wired only when the object is locked, or the page is busy, or by * pmap_extract_and_hold(). As a result, if the object is locked and the * page is not busy (or is exclusively busied by the current thread), and * the page is unmapped, its wire count will not increase. The ref_count * field is updated using atomic operations in most cases, except when it * is known that no other references to the page exist, such as in the page * allocator. A page may be present in the page queues, or even actively * scanned by the page daemon, without an explicitly counted referenced. * The page daemon must therefore handle the possibility of a concurrent * free of the page. * * The queue state of a page consists of the queue and act_count fields of * its atomically updated state, and the subset of atomic flags specified * by PGA_QUEUE_STATE_MASK. The queue field contains the page's page queue * index, or PQ_NONE if it does not belong to a page queue. To modify the * queue field, the page queue lock corresponding to the old value must be * held, unless that value is PQ_NONE, in which case the queue index must * be updated using an atomic RMW operation. There is one exception to * this rule: the page daemon may transition the queue field from * PQ_INACTIVE to PQ_NONE immediately prior to freeing the page during an * inactive queue scan. At that point the page is already dequeued and no * other references to that vm_page structure can exist. The PGA_ENQUEUED * flag, when set, indicates that the page structure is physically inserted * into the queue corresponding to the page's queue index, and may only be * set or cleared with the corresponding page queue lock held. * * To avoid contention on page queue locks, page queue operations (enqueue, * dequeue, requeue) are batched using fixed-size per-CPU queues. A * deferred operation is requested by setting one of the flags in * PGA_QUEUE_OP_MASK and inserting an entry into a batch queue. When a * queue is full, an attempt to insert a new entry will lock the page * queues and trigger processing of the pending entries. The * type-stability of vm_page structures is crucial to this scheme since the * processing of entries in a given batch queue may be deferred * indefinitely. In particular, a page may be freed with pending batch * queue entries. The page queue operation flags must be set using atomic * RWM operations. */ #if PAGE_SIZE == 4096 #define VM_PAGE_BITS_ALL 0xffu typedef uint8_t vm_page_bits_t; #elif PAGE_SIZE == 8192 #define VM_PAGE_BITS_ALL 0xffffu typedef uint16_t vm_page_bits_t; #elif PAGE_SIZE == 16384 #define VM_PAGE_BITS_ALL 0xffffffffu typedef uint32_t vm_page_bits_t; #elif PAGE_SIZE == 32768 #define VM_PAGE_BITS_ALL 0xfffffffffffffffflu typedef uint64_t vm_page_bits_t; #endif typedef union vm_page_astate { struct { uint16_t flags; uint8_t queue; uint8_t act_count; }; uint32_t _bits; } vm_page_astate_t; struct vm_page { union { TAILQ_ENTRY(vm_page) q; /* page queue or free list (Q) */ struct { SLIST_ENTRY(vm_page) ss; /* private slists */ } s; struct { u_long p; u_long v; } memguard; struct { void *slab; void *zone; } uma; } plinks; TAILQ_ENTRY(vm_page) listq; /* pages in same object (O) */ vm_object_t object; /* which object am I in (O) */ vm_pindex_t pindex; /* offset into object (O,P) */ vm_paddr_t phys_addr; /* physical address of page (C) */ struct md_page md; /* machine dependent stuff */ u_int ref_count; /* page references (A) */ u_int busy_lock; /* busy owners lock (A) */ union vm_page_astate a; /* state accessed atomically (A) */ uint8_t order; /* index of the buddy queue (F) */ uint8_t pool; /* vm_phys freepool index (F) */ uint8_t flags; /* page PG_* flags (P) */ uint8_t oflags; /* page VPO_* flags (O) */ int8_t psind; /* pagesizes[] index (O) */ int8_t segind; /* vm_phys segment index (C) */ /* NOTE that these must support one bit per DEV_BSIZE in a page */ /* so, on normal X86 kernels, they must be at least 8 bits wide */ vm_page_bits_t valid; /* valid DEV_BSIZE chunk map (O,B) */ vm_page_bits_t dirty; /* dirty DEV_BSIZE chunk map (M,B) */ }; /* * Special bits used in the ref_count field. * * ref_count is normally used to count wirings that prevent the page from being * reclaimed, but also supports several special types of references that do not * prevent reclamation. Accesses to the ref_count field must be atomic unless * the page is unallocated. * * VPRC_OBJREF is the reference held by the containing object. It can set or * cleared only when the corresponding object's write lock is held. * * VPRC_BLOCKED is used to atomically block wirings via pmap lookups while * attempting to tear down all mappings of a given page. The page busy lock and * object write lock must both be held in order to set or clear this bit. */ #define VPRC_BLOCKED 0x40000000u /* mappings are being removed */ #define VPRC_OBJREF 0x80000000u /* object reference, cleared with (O) */ #define VPRC_WIRE_COUNT(c) ((c) & ~(VPRC_BLOCKED | VPRC_OBJREF)) #define VPRC_WIRE_COUNT_MAX (~(VPRC_BLOCKED | VPRC_OBJREF)) /* * Page flags stored in oflags: * * Access to these page flags is synchronized by the lock on the object * containing the page (O). * * Note: VPO_UNMANAGED (used by OBJT_DEVICE, OBJT_PHYS and OBJT_SG) * indicates that the page is not under PV management but * otherwise should be treated as a normal page. Pages not * under PV management cannot be paged out via the * object/vm_page_t because there is no knowledge of their pte * mappings, and such pages are also not on any PQ queue. * */ #define VPO_KMEM_EXEC 0x01 /* kmem mapping allows execution */ #define VPO_SWAPSLEEP 0x02 /* waiting for swap to finish */ #define VPO_UNMANAGED 0x04 /* no PV management for page */ #define VPO_SWAPINPROG 0x08 /* swap I/O in progress on page */ /* * Busy page implementation details. * The algorithm is taken mostly by rwlock(9) and sx(9) locks implementation, * even if the support for owner identity is removed because of size * constraints. Checks on lock recursion are then not possible, while the * lock assertions effectiveness is someway reduced. */ #define VPB_BIT_SHARED 0x01 #define VPB_BIT_EXCLUSIVE 0x02 #define VPB_BIT_WAITERS 0x04 #define VPB_BIT_FLAGMASK \ (VPB_BIT_SHARED | VPB_BIT_EXCLUSIVE | VPB_BIT_WAITERS) #define VPB_SHARERS_SHIFT 3 #define VPB_SHARERS(x) \ (((x) & ~VPB_BIT_FLAGMASK) >> VPB_SHARERS_SHIFT) #define VPB_SHARERS_WORD(x) ((x) << VPB_SHARERS_SHIFT | VPB_BIT_SHARED) #define VPB_ONE_SHARER (1 << VPB_SHARERS_SHIFT) #define VPB_SINGLE_EXCLUSIVE VPB_BIT_EXCLUSIVE #ifdef INVARIANTS #define VPB_CURTHREAD_EXCLUSIVE \ (VPB_BIT_EXCLUSIVE | ((u_int)(uintptr_t)curthread & ~VPB_BIT_FLAGMASK)) #else #define VPB_CURTHREAD_EXCLUSIVE VPB_SINGLE_EXCLUSIVE #endif #define VPB_UNBUSIED VPB_SHARERS_WORD(0) /* Freed lock blocks both shared and exclusive. */ #define VPB_FREED (0xffffffff - VPB_BIT_SHARED) #define PQ_NONE 255 #define PQ_INACTIVE 0 #define PQ_ACTIVE 1 #define PQ_LAUNDRY 2 #define PQ_UNSWAPPABLE 3 #define PQ_COUNT 4 #ifndef VM_PAGE_HAVE_PGLIST TAILQ_HEAD(pglist, vm_page); #define VM_PAGE_HAVE_PGLIST #endif SLIST_HEAD(spglist, vm_page); #ifdef _KERNEL extern vm_page_t bogus_page; #endif /* _KERNEL */ extern struct mtx_padalign pa_lock[]; #if defined(__arm__) #define PDRSHIFT PDR_SHIFT #elif !defined(PDRSHIFT) #define PDRSHIFT 21 #endif #define pa_index(pa) ((pa) >> PDRSHIFT) #define PA_LOCKPTR(pa) ((struct mtx *)(&pa_lock[pa_index(pa) % PA_LOCK_COUNT])) #define PA_LOCKOBJPTR(pa) ((struct lock_object *)PA_LOCKPTR((pa))) #define PA_LOCK(pa) mtx_lock(PA_LOCKPTR(pa)) #define PA_TRYLOCK(pa) mtx_trylock(PA_LOCKPTR(pa)) #define PA_UNLOCK(pa) mtx_unlock(PA_LOCKPTR(pa)) #define PA_UNLOCK_COND(pa) \ do { \ if ((pa) != 0) { \ PA_UNLOCK((pa)); \ (pa) = 0; \ } \ } while (0) #define PA_LOCK_ASSERT(pa, a) mtx_assert(PA_LOCKPTR(pa), (a)) #if defined(KLD_MODULE) && !defined(KLD_TIED) #define vm_page_lock(m) vm_page_lock_KBI((m), LOCK_FILE, LOCK_LINE) #define vm_page_unlock(m) vm_page_unlock_KBI((m), LOCK_FILE, LOCK_LINE) #define vm_page_trylock(m) vm_page_trylock_KBI((m), LOCK_FILE, LOCK_LINE) #else /* !KLD_MODULE */ #define vm_page_lockptr(m) (PA_LOCKPTR(VM_PAGE_TO_PHYS((m)))) #define vm_page_lock(m) mtx_lock(vm_page_lockptr((m))) #define vm_page_unlock(m) mtx_unlock(vm_page_lockptr((m))) #define vm_page_trylock(m) mtx_trylock(vm_page_lockptr((m))) #endif #if defined(INVARIANTS) #define vm_page_assert_locked(m) \ vm_page_assert_locked_KBI((m), __FILE__, __LINE__) #define vm_page_lock_assert(m, a) \ vm_page_lock_assert_KBI((m), (a), __FILE__, __LINE__) #else #define vm_page_assert_locked(m) #define vm_page_lock_assert(m, a) #endif /* * The vm_page's aflags are updated using atomic operations. To set or clear * these flags, the functions vm_page_aflag_set() and vm_page_aflag_clear() * must be used. Neither these flags nor these functions are part of the KBI. * * PGA_REFERENCED may be cleared only if the page is locked. It is set by * both the MI and MD VM layers. However, kernel loadable modules should not * directly set this flag. They should call vm_page_reference() instead. * * PGA_WRITEABLE is set exclusively on managed pages by pmap_enter(). * When it does so, the object must be locked, or the page must be * exclusive busied. The MI VM layer must never access this flag * directly. Instead, it should call pmap_page_is_write_mapped(). * * PGA_EXECUTABLE may be set by pmap routines, and indicates that a page has * at least one executable mapping. It is not consumed by the MI VM layer. * * PGA_NOSYNC must be set and cleared with the page busy lock held. * * PGA_ENQUEUED is set and cleared when a page is inserted into or removed * from a page queue, respectively. It determines whether the plinks.q field * of the page is valid. To set or clear this flag, page's "queue" field must * be a valid queue index, and the corresponding page queue lock must be held. * * PGA_DEQUEUE is set when the page is scheduled to be dequeued from a page * queue, and cleared when the dequeue request is processed. A page may * have PGA_DEQUEUE set and PGA_ENQUEUED cleared, for instance if a dequeue * is requested after the page is scheduled to be enqueued but before it is * actually inserted into the page queue. * * PGA_REQUEUE is set when the page is scheduled to be enqueued or requeued * in its page queue. * * PGA_REQUEUE_HEAD is a special flag for enqueuing pages near the head of * the inactive queue, thus bypassing LRU. * * The PGA_DEQUEUE, PGA_REQUEUE and PGA_REQUEUE_HEAD flags must be set using an * atomic RMW operation to ensure that the "queue" field is a valid queue index, * and the corresponding page queue lock must be held when clearing any of the * flags. * * PGA_SWAP_FREE is used to defer freeing swap space to the pageout daemon * when the context that dirties the page does not have the object write lock * held. */ #define PGA_WRITEABLE 0x0001 /* page may be mapped writeable */ #define PGA_REFERENCED 0x0002 /* page has been referenced */ #define PGA_EXECUTABLE 0x0004 /* page may be mapped executable */ #define PGA_ENQUEUED 0x0008 /* page is enqueued in a page queue */ #define PGA_DEQUEUE 0x0010 /* page is due to be dequeued */ #define PGA_REQUEUE 0x0020 /* page is due to be requeued */ #define PGA_REQUEUE_HEAD 0x0040 /* page requeue should bypass LRU */ #define PGA_NOSYNC 0x0080 /* do not collect for syncer */ #define PGA_SWAP_FREE 0x0100 /* page with swap space was dirtied */ #define PGA_SWAP_SPACE 0x0200 /* page has allocated swap space */ #define PGA_QUEUE_OP_MASK (PGA_DEQUEUE | PGA_REQUEUE | PGA_REQUEUE_HEAD) #define PGA_QUEUE_STATE_MASK (PGA_ENQUEUED | PGA_QUEUE_OP_MASK) /* * Page flags. Updates to these flags are not synchronized, and thus they must * be set during page allocation or free to avoid races. * * The PG_PCPU_CACHE flag is set at allocation time if the page was * allocated from a per-CPU cache. It is cleared the next time that the * page is allocated from the physical memory allocator. */ #define PG_PCPU_CACHE 0x01 /* was allocated from per-CPU caches */ #define PG_FICTITIOUS 0x02 /* physical page doesn't exist */ #define PG_ZERO 0x04 /* page is zeroed */ #define PG_MARKER 0x08 /* special queue marker page */ #define PG_NODUMP 0x10 /* don't include this page in a dump */ /* * Misc constants. */ #define ACT_DECLINE 1 #define ACT_ADVANCE 3 #define ACT_INIT 5 #define ACT_MAX 64 #ifdef _KERNEL #include #include /* * Each pageable resident page falls into one of five lists: * * free * Available for allocation now. * * inactive * Low activity, candidates for reclamation. * This list is approximately LRU ordered. * * laundry * This is the list of pages that should be * paged out next. * * unswappable * Dirty anonymous pages that cannot be paged * out because no swap device is configured. * * active * Pages that are "active", i.e., they have been * recently referenced. * */ extern vm_page_t vm_page_array; /* First resident page in table */ extern long vm_page_array_size; /* number of vm_page_t's */ extern long first_page; /* first physical page number */ #define VM_PAGE_TO_PHYS(entry) ((entry)->phys_addr) /* * PHYS_TO_VM_PAGE() returns the vm_page_t object that represents a memory * page to which the given physical address belongs. The correct vm_page_t * object is returned for addresses that are not page-aligned. */ vm_page_t PHYS_TO_VM_PAGE(vm_paddr_t pa); /* * Page allocation parameters for vm_page for the functions * vm_page_alloc(), vm_page_grab(), vm_page_alloc_contig() and * vm_page_alloc_freelist(). Some functions support only a subset * of the flags, and ignore others, see the flags legend. * * The meaning of VM_ALLOC_ZERO differs slightly between the vm_page_alloc*() * and the vm_page_grab*() functions. See these functions for details. * * Bits 0 - 1 define class. * Bits 2 - 15 dedicated for flags. * Legend: * (a) - vm_page_alloc() supports the flag. * (c) - vm_page_alloc_contig() supports the flag. * (f) - vm_page_alloc_freelist() supports the flag. * (g) - vm_page_grab() supports the flag. * (p) - vm_page_grab_pages() supports the flag. * Bits above 15 define the count of additional pages that the caller * intends to allocate. */ #define VM_ALLOC_NORMAL 0 #define VM_ALLOC_INTERRUPT 1 #define VM_ALLOC_SYSTEM 2 #define VM_ALLOC_CLASS_MASK 3 #define VM_ALLOC_WAITOK 0x0008 /* (acf) Sleep and retry */ #define VM_ALLOC_WAITFAIL 0x0010 /* (acf) Sleep and return error */ #define VM_ALLOC_WIRED 0x0020 /* (acfgp) Allocate a wired page */ #define VM_ALLOC_ZERO 0x0040 /* (acfgp) Allocate a prezeroed page */ #define VM_ALLOC_NOOBJ 0x0100 /* (acg) No associated object */ #define VM_ALLOC_NOBUSY 0x0200 /* (acgp) Do not excl busy the page */ #define VM_ALLOC_NOCREAT 0x0400 /* (gp) Don't create a page */ #define VM_ALLOC_IGN_SBUSY 0x1000 /* (gp) Ignore shared busy flag */ #define VM_ALLOC_NODUMP 0x2000 /* (ag) don't include in dump */ #define VM_ALLOC_SBUSY 0x4000 /* (acgp) Shared busy the page */ #define VM_ALLOC_NOWAIT 0x8000 /* (acfgp) Do not sleep */ #define VM_ALLOC_COUNT_SHIFT 16 #define VM_ALLOC_COUNT(count) ((count) << VM_ALLOC_COUNT_SHIFT) #ifdef M_NOWAIT static inline int malloc2vm_flags(int malloc_flags) { int pflags; KASSERT((malloc_flags & M_USE_RESERVE) == 0 || (malloc_flags & M_NOWAIT) != 0, ("M_USE_RESERVE requires M_NOWAIT")); pflags = (malloc_flags & M_USE_RESERVE) != 0 ? VM_ALLOC_INTERRUPT : VM_ALLOC_SYSTEM; if ((malloc_flags & M_ZERO) != 0) pflags |= VM_ALLOC_ZERO; if ((malloc_flags & M_NODUMP) != 0) pflags |= VM_ALLOC_NODUMP; if ((malloc_flags & M_NOWAIT)) pflags |= VM_ALLOC_NOWAIT; if ((malloc_flags & M_WAITOK)) pflags |= VM_ALLOC_WAITOK; return (pflags); } #endif /* * Predicates supported by vm_page_ps_test(): * * PS_ALL_DIRTY is true only if the entire (super)page is dirty. * However, it can be spuriously false when the (super)page has become * dirty in the pmap but that information has not been propagated to the * machine-independent layer. */ #define PS_ALL_DIRTY 0x1 #define PS_ALL_VALID 0x2 #define PS_NONE_BUSY 0x4 - -extern struct bitset *vm_page_dump; -extern long vm_page_dump_pages; -extern vm_paddr_t dump_avail[]; - -static inline void -dump_add_page(vm_paddr_t pa) -{ - vm_pindex_t adj; - int i; - - adj = 0; - for (i = 0; dump_avail[i + 1] != 0; i += 2) { - if (pa >= dump_avail[i] && pa < dump_avail[i + 1]) { - BIT_SET_ATOMIC(vm_page_dump_pages, - (pa >> PAGE_SHIFT) - (dump_avail[i] >> PAGE_SHIFT) + - adj, vm_page_dump); - return; - } - adj += howmany(dump_avail[i + 1], PAGE_SIZE) - - dump_avail[i] / PAGE_SIZE; - } -} - -static inline void -dump_drop_page(vm_paddr_t pa) -{ - vm_pindex_t adj; - int i; - - adj = 0; - for (i = 0; dump_avail[i + 1] != 0; i += 2) { - if (pa >= dump_avail[i] && pa < dump_avail[i + 1]) { - BIT_CLR_ATOMIC(vm_page_dump_pages, - (pa >> PAGE_SHIFT) - (dump_avail[i] >> PAGE_SHIFT) + - adj, vm_page_dump); - return; - } - adj += howmany(dump_avail[i + 1], PAGE_SIZE) - - dump_avail[i] / PAGE_SIZE; - } -} - -static inline vm_paddr_t -vm_page_dump_index_to_pa(int bit) -{ - int i, tot; - - for (i = 0; dump_avail[i + 1] != 0; i += 2) { - tot = howmany(dump_avail[i + 1], PAGE_SIZE) - - dump_avail[i] / PAGE_SIZE; - if (bit < tot) - return ((vm_paddr_t)bit * PAGE_SIZE + - (dump_avail[i] & ~PAGE_MASK)); - bit -= tot; - } - return ((vm_paddr_t)NULL); -} - -#define VM_PAGE_DUMP_FOREACH(pa) \ - for (vm_pindex_t __b = BIT_FFS(vm_page_dump_pages, vm_page_dump); \ - (pa) = vm_page_dump_index_to_pa(__b - 1), __b != 0; \ - __b = BIT_FFS_AT(vm_page_dump_pages, vm_page_dump, __b)) bool vm_page_busy_acquire(vm_page_t m, int allocflags); void vm_page_busy_downgrade(vm_page_t m); int vm_page_busy_tryupgrade(vm_page_t m); void vm_page_busy_sleep(vm_page_t m, const char *msg, bool nonshared); void vm_page_busy_sleep_unlocked(vm_object_t obj, vm_page_t m, vm_pindex_t pindex, const char *wmesg, bool nonshared); void vm_page_free(vm_page_t m); void vm_page_free_zero(vm_page_t m); void vm_page_activate (vm_page_t); void vm_page_advise(vm_page_t m, int advice); vm_page_t vm_page_alloc(vm_object_t, vm_pindex_t, int); vm_page_t vm_page_alloc_domain(vm_object_t, vm_pindex_t, int, int); vm_page_t vm_page_alloc_after(vm_object_t, vm_pindex_t, int, vm_page_t); vm_page_t vm_page_alloc_domain_after(vm_object_t, vm_pindex_t, int, int, vm_page_t); vm_page_t vm_page_alloc_contig(vm_object_t object, vm_pindex_t pindex, int req, u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary, vm_memattr_t memattr); vm_page_t vm_page_alloc_contig_domain(vm_object_t object, vm_pindex_t pindex, int domain, int req, u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary, vm_memattr_t memattr); vm_page_t vm_page_alloc_freelist(int, int); vm_page_t vm_page_alloc_freelist_domain(int, int, int); void vm_page_bits_set(vm_page_t m, vm_page_bits_t *bits, vm_page_bits_t set); bool vm_page_blacklist_add(vm_paddr_t pa, bool verbose); vm_page_t vm_page_grab(vm_object_t, vm_pindex_t, int); vm_page_t vm_page_grab_unlocked(vm_object_t, vm_pindex_t, int); int vm_page_grab_pages(vm_object_t object, vm_pindex_t pindex, int allocflags, vm_page_t *ma, int count); int vm_page_grab_pages_unlocked(vm_object_t object, vm_pindex_t pindex, int allocflags, vm_page_t *ma, int count); int vm_page_grab_valid(vm_page_t *mp, vm_object_t object, vm_pindex_t pindex, int allocflags); int vm_page_grab_valid_unlocked(vm_page_t *mp, vm_object_t object, vm_pindex_t pindex, int allocflags); void vm_page_deactivate(vm_page_t); void vm_page_deactivate_noreuse(vm_page_t); void vm_page_dequeue(vm_page_t m); void vm_page_dequeue_deferred(vm_page_t m); vm_page_t vm_page_find_least(vm_object_t, vm_pindex_t); void vm_page_free_invalid(vm_page_t); vm_page_t vm_page_getfake(vm_paddr_t paddr, vm_memattr_t memattr); void vm_page_initfake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr); void vm_page_init_marker(vm_page_t marker, int queue, uint16_t aflags); int vm_page_insert (vm_page_t, vm_object_t, vm_pindex_t); void vm_page_invalid(vm_page_t m); void vm_page_launder(vm_page_t m); vm_page_t vm_page_lookup(vm_object_t, vm_pindex_t); vm_page_t vm_page_lookup_unlocked(vm_object_t, vm_pindex_t); vm_page_t vm_page_next(vm_page_t m); void vm_page_pqbatch_drain(void); void vm_page_pqbatch_submit(vm_page_t m, uint8_t queue); bool vm_page_pqstate_commit(vm_page_t m, vm_page_astate_t *old, vm_page_astate_t new); vm_page_t vm_page_prev(vm_page_t m); bool vm_page_ps_test(vm_page_t m, int flags, vm_page_t skip_m); void vm_page_putfake(vm_page_t m); void vm_page_readahead_finish(vm_page_t m); bool vm_page_reclaim_contig(int req, u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary); bool vm_page_reclaim_contig_domain(int domain, int req, u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary); void vm_page_reference(vm_page_t m); #define VPR_TRYFREE 0x01 #define VPR_NOREUSE 0x02 void vm_page_release(vm_page_t m, int flags); void vm_page_release_locked(vm_page_t m, int flags); vm_page_t vm_page_relookup(vm_object_t, vm_pindex_t); bool vm_page_remove(vm_page_t); bool vm_page_remove_xbusy(vm_page_t); int vm_page_rename(vm_page_t, vm_object_t, vm_pindex_t); void vm_page_replace(vm_page_t mnew, vm_object_t object, vm_pindex_t pindex, vm_page_t mold); int vm_page_sbusied(vm_page_t m); vm_page_t vm_page_scan_contig(u_long npages, vm_page_t m_start, vm_page_t m_end, u_long alignment, vm_paddr_t boundary, int options); vm_page_bits_t vm_page_set_dirty(vm_page_t m); void vm_page_set_valid_range(vm_page_t m, int base, int size); int vm_page_sleep_if_busy(vm_page_t m, const char *msg); int vm_page_sleep_if_xbusy(vm_page_t m, const char *msg); vm_offset_t vm_page_startup(vm_offset_t vaddr); void vm_page_sunbusy(vm_page_t m); bool vm_page_try_remove_all(vm_page_t m); bool vm_page_try_remove_write(vm_page_t m); int vm_page_trysbusy(vm_page_t m); int vm_page_tryxbusy(vm_page_t m); void vm_page_unhold_pages(vm_page_t *ma, int count); void vm_page_unswappable(vm_page_t m); void vm_page_unwire(vm_page_t m, uint8_t queue); bool vm_page_unwire_noq(vm_page_t m); void vm_page_updatefake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr); void vm_page_wire(vm_page_t); bool vm_page_wire_mapped(vm_page_t m); void vm_page_xunbusy_hard(vm_page_t m); void vm_page_xunbusy_hard_unchecked(vm_page_t m); void vm_page_set_validclean (vm_page_t, int, int); void vm_page_clear_dirty(vm_page_t, int, int); void vm_page_set_invalid(vm_page_t, int, int); void vm_page_valid(vm_page_t m); int vm_page_is_valid(vm_page_t, int, int); void vm_page_test_dirty(vm_page_t); vm_page_bits_t vm_page_bits(int base, int size); void vm_page_zero_invalid(vm_page_t m, boolean_t setvalid); void vm_page_free_pages_toq(struct spglist *free, bool update_wire_count); void vm_page_dirty_KBI(vm_page_t m); void vm_page_lock_KBI(vm_page_t m, const char *file, int line); void vm_page_unlock_KBI(vm_page_t m, const char *file, int line); int vm_page_trylock_KBI(vm_page_t m, const char *file, int line); #if defined(INVARIANTS) || defined(INVARIANT_SUPPORT) void vm_page_assert_locked_KBI(vm_page_t m, const char *file, int line); void vm_page_lock_assert_KBI(vm_page_t m, int a, const char *file, int line); #endif #define vm_page_busy_fetch(m) atomic_load_int(&(m)->busy_lock) #define vm_page_assert_busied(m) \ KASSERT(vm_page_busied(m), \ ("vm_page_assert_busied: page %p not busy @ %s:%d", \ (m), __FILE__, __LINE__)) #define vm_page_assert_sbusied(m) \ KASSERT(vm_page_sbusied(m), \ ("vm_page_assert_sbusied: page %p not shared busy @ %s:%d", \ (m), __FILE__, __LINE__)) #define vm_page_assert_unbusied(m) \ KASSERT((vm_page_busy_fetch(m) & ~VPB_BIT_WAITERS) != \ VPB_CURTHREAD_EXCLUSIVE, \ ("vm_page_assert_xbusied: page %p busy_lock %#x owned" \ " by me @ %s:%d", \ (m), (m)->busy_lock, __FILE__, __LINE__)); \ #define vm_page_assert_xbusied_unchecked(m) do { \ KASSERT(vm_page_xbusied(m), \ ("vm_page_assert_xbusied: page %p not exclusive busy @ %s:%d", \ (m), __FILE__, __LINE__)); \ } while (0) #define vm_page_assert_xbusied(m) do { \ vm_page_assert_xbusied_unchecked(m); \ KASSERT((vm_page_busy_fetch(m) & ~VPB_BIT_WAITERS) == \ VPB_CURTHREAD_EXCLUSIVE, \ ("vm_page_assert_xbusied: page %p busy_lock %#x not owned" \ " by me @ %s:%d", \ (m), (m)->busy_lock, __FILE__, __LINE__)); \ } while (0) #define vm_page_busied(m) \ (vm_page_busy_fetch(m) != VPB_UNBUSIED) #define vm_page_sbusy(m) do { \ if (!vm_page_trysbusy(m)) \ panic("%s: page %p failed shared busying", __func__, \ (m)); \ } while (0) #define vm_page_xbusied(m) \ ((vm_page_busy_fetch(m) & VPB_SINGLE_EXCLUSIVE) != 0) #define vm_page_busy_freed(m) \ (vm_page_busy_fetch(m) == VPB_FREED) #define vm_page_xbusy(m) do { \ if (!vm_page_tryxbusy(m)) \ panic("%s: page %p failed exclusive busying", __func__, \ (m)); \ } while (0) /* Note: page m's lock must not be owned by the caller. */ #define vm_page_xunbusy(m) do { \ if (!atomic_cmpset_rel_int(&(m)->busy_lock, \ VPB_CURTHREAD_EXCLUSIVE, VPB_UNBUSIED)) \ vm_page_xunbusy_hard(m); \ } while (0) #define vm_page_xunbusy_unchecked(m) do { \ if (!atomic_cmpset_rel_int(&(m)->busy_lock, \ VPB_CURTHREAD_EXCLUSIVE, VPB_UNBUSIED)) \ vm_page_xunbusy_hard_unchecked(m); \ } while (0) #ifdef INVARIANTS void vm_page_object_busy_assert(vm_page_t m); #define VM_PAGE_OBJECT_BUSY_ASSERT(m) vm_page_object_busy_assert(m) void vm_page_assert_pga_writeable(vm_page_t m, uint16_t bits); #define VM_PAGE_ASSERT_PGA_WRITEABLE(m, bits) \ vm_page_assert_pga_writeable(m, bits) /* * Claim ownership of a page's xbusy state. In non-INVARIANTS kernels this * operation is a no-op since ownership is not tracked. In particular * this macro does not provide any synchronization with the previous owner. */ #define vm_page_xbusy_claim(m) do { \ u_int _busy_lock; \ \ vm_page_assert_xbusied_unchecked((m)); \ do { \ _busy_lock = vm_page_busy_fetch(m); \ } while (!atomic_cmpset_int(&(m)->busy_lock, _busy_lock, \ (_busy_lock & VPB_BIT_FLAGMASK) | VPB_CURTHREAD_EXCLUSIVE)); \ } while (0) #else #define VM_PAGE_OBJECT_BUSY_ASSERT(m) (void)0 #define VM_PAGE_ASSERT_PGA_WRITEABLE(m, bits) (void)0 #define vm_page_xbusy_claim(m) #endif #if BYTE_ORDER == BIG_ENDIAN #define VM_PAGE_AFLAG_SHIFT 16 #else #define VM_PAGE_AFLAG_SHIFT 0 #endif /* * Load a snapshot of a page's 32-bit atomic state. */ static inline vm_page_astate_t vm_page_astate_load(vm_page_t m) { vm_page_astate_t a; a._bits = atomic_load_32(&m->a._bits); return (a); } /* * Atomically compare and set a page's atomic state. */ static inline bool vm_page_astate_fcmpset(vm_page_t m, vm_page_astate_t *old, vm_page_astate_t new) { KASSERT(new.queue == PQ_INACTIVE || (new.flags & PGA_REQUEUE_HEAD) == 0, ("%s: invalid head requeue request for page %p", __func__, m)); KASSERT((new.flags & PGA_ENQUEUED) == 0 || new.queue != PQ_NONE, ("%s: setting PGA_ENQUEUED with PQ_NONE in page %p", __func__, m)); KASSERT(new._bits != old->_bits, ("%s: bits are unchanged", __func__)); return (atomic_fcmpset_32(&m->a._bits, &old->_bits, new._bits) != 0); } /* * Clear the given bits in the specified page. */ static inline void vm_page_aflag_clear(vm_page_t m, uint16_t bits) { uint32_t *addr, val; /* * Access the whole 32-bit word containing the aflags field with an * atomic update. Parallel non-atomic updates to the other fields * within this word are handled properly by the atomic update. */ addr = (void *)&m->a; val = bits << VM_PAGE_AFLAG_SHIFT; atomic_clear_32(addr, val); } /* * Set the given bits in the specified page. */ static inline void vm_page_aflag_set(vm_page_t m, uint16_t bits) { uint32_t *addr, val; VM_PAGE_ASSERT_PGA_WRITEABLE(m, bits); /* * Access the whole 32-bit word containing the aflags field with an * atomic update. Parallel non-atomic updates to the other fields * within this word are handled properly by the atomic update. */ addr = (void *)&m->a; val = bits << VM_PAGE_AFLAG_SHIFT; atomic_set_32(addr, val); } /* * vm_page_dirty: * * Set all bits in the page's dirty field. * * The object containing the specified page must be locked if the * call is made from the machine-independent layer. * * See vm_page_clear_dirty_mask(). */ static __inline void vm_page_dirty(vm_page_t m) { /* Use vm_page_dirty_KBI() under INVARIANTS to save memory. */ #if (defined(KLD_MODULE) && !defined(KLD_TIED)) || defined(INVARIANTS) vm_page_dirty_KBI(m); #else m->dirty = VM_PAGE_BITS_ALL; #endif } /* * vm_page_undirty: * * Set page to not be dirty. Note: does not clear pmap modify bits */ static __inline void vm_page_undirty(vm_page_t m) { VM_PAGE_OBJECT_BUSY_ASSERT(m); m->dirty = 0; } static inline uint8_t _vm_page_queue(vm_page_astate_t as) { if ((as.flags & PGA_DEQUEUE) != 0) return (PQ_NONE); return (as.queue); } /* * vm_page_queue: * * Return the index of the queue containing m. */ static inline uint8_t vm_page_queue(vm_page_t m) { return (_vm_page_queue(vm_page_astate_load(m))); } static inline bool vm_page_active(vm_page_t m) { return (vm_page_queue(m) == PQ_ACTIVE); } static inline bool vm_page_inactive(vm_page_t m) { return (vm_page_queue(m) == PQ_INACTIVE); } static inline bool vm_page_in_laundry(vm_page_t m) { uint8_t queue; queue = vm_page_queue(m); return (queue == PQ_LAUNDRY || queue == PQ_UNSWAPPABLE); } /* * vm_page_drop: * * Release a reference to a page and return the old reference count. */ static inline u_int vm_page_drop(vm_page_t m, u_int val) { u_int old; /* * Synchronize with vm_page_free_prep(): ensure that all updates to the * page structure are visible before it is freed. */ atomic_thread_fence_rel(); old = atomic_fetchadd_int(&m->ref_count, -val); KASSERT(old != VPRC_BLOCKED, ("vm_page_drop: page %p has an invalid refcount value", m)); return (old); } /* * vm_page_wired: * * Perform a racy check to determine whether a reference prevents the page * from being reclaimable. If the page's object is locked, and the page is * unmapped and exclusively busied by the current thread, no new wirings * may be created. */ static inline bool vm_page_wired(vm_page_t m) { return (VPRC_WIRE_COUNT(m->ref_count) > 0); } static inline bool vm_page_all_valid(vm_page_t m) { return (m->valid == VM_PAGE_BITS_ALL); } static inline bool vm_page_none_valid(vm_page_t m) { return (m->valid == 0); } #endif /* _KERNEL */ #endif /* !_VM_PAGE_ */ Index: head/sys/vm/vm_phys.h =================================================================== --- head/sys/vm/vm_phys.h (revision 366710) +++ head/sys/vm/vm_phys.h (revision 366711) @@ -1,141 +1,140 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2002-2006 Rice University * Copyright (c) 2007 Alan L. Cox * All rights reserved. * * This software was developed for the FreeBSD Project by Alan L. Cox, * Olivier Crameri, Peter Druschel, Sitaram Iyer, and Juan Navarro. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ /* * Physical memory system definitions */ #ifndef _VM_PHYS_H_ #define _VM_PHYS_H_ #ifdef _KERNEL #ifndef VM_NFREEORDER_MAX #define VM_NFREEORDER_MAX VM_NFREEORDER #endif extern vm_paddr_t phys_avail[PHYS_AVAIL_COUNT]; -extern vm_paddr_t dump_avail[PHYS_AVAIL_COUNT]; /* Domains must be dense (non-sparse) and zero-based. */ struct mem_affinity { vm_paddr_t start; vm_paddr_t end; int domain; }; #ifdef NUMA extern struct mem_affinity *mem_affinity; extern int *mem_locality; #endif struct vm_freelist { struct pglist pl; int lcnt; }; struct vm_phys_seg { vm_paddr_t start; vm_paddr_t end; vm_page_t first_page; #if VM_NRESERVLEVEL > 0 vm_reserv_t first_reserv; #endif void *md_first; int domain; struct vm_freelist (*free_queues)[VM_NFREEPOOL][VM_NFREEORDER_MAX]; }; extern struct vm_phys_seg vm_phys_segs[]; extern int vm_phys_nsegs; /* * The following functions are only to be used by the virtual memory system. */ void vm_phys_add_seg(vm_paddr_t start, vm_paddr_t end); vm_page_t vm_phys_alloc_contig(int domain, u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary); vm_page_t vm_phys_alloc_freelist_pages(int domain, int freelist, int pool, int order); int vm_phys_alloc_npages(int domain, int pool, int npages, vm_page_t ma[]); vm_page_t vm_phys_alloc_pages(int domain, int pool, int order); int vm_phys_domain_match(int prefer, vm_paddr_t low, vm_paddr_t high); void vm_phys_enqueue_contig(vm_page_t m, u_long npages); int vm_phys_fictitious_reg_range(vm_paddr_t start, vm_paddr_t end, vm_memattr_t memattr); void vm_phys_fictitious_unreg_range(vm_paddr_t start, vm_paddr_t end); vm_page_t vm_phys_fictitious_to_vm_page(vm_paddr_t pa); void vm_phys_free_contig(vm_page_t m, u_long npages); void vm_phys_free_pages(vm_page_t m, int order); void vm_phys_init(void); vm_page_t vm_phys_paddr_to_vm_page(vm_paddr_t pa); void vm_phys_register_domains(int ndomains, struct mem_affinity *affinity, int *locality); vm_page_t vm_phys_scan_contig(int domain, u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary, int options); void vm_phys_set_pool(int pool, vm_page_t m, int order); boolean_t vm_phys_unfree_page(vm_page_t m); int vm_phys_mem_affinity(int f, int t); void vm_phys_early_add_seg(vm_paddr_t start, vm_paddr_t end); vm_paddr_t vm_phys_early_alloc(int domain, size_t alloc_size); void vm_phys_early_startup(void); int vm_phys_avail_largest(void); vm_paddr_t vm_phys_avail_size(int i); /* * * vm_phys_domain: * * Return the index of the domain the page belongs to. */ static inline int vm_phys_domain(vm_page_t m) { #ifdef NUMA int domn, segind; /* XXXKIB try to assert that the page is managed */ segind = m->segind; KASSERT(segind < vm_phys_nsegs, ("segind %d m %p", segind, m)); domn = vm_phys_segs[segind].domain; KASSERT(domn < vm_ndomains, ("domain %d m %p", domn, m)); return (domn); #else return (0); #endif } int _vm_phys_domain(vm_paddr_t pa); #endif /* _KERNEL */ #endif /* !_VM_PHYS_H_ */ Index: head/sys/x86/x86/nexus.c =================================================================== --- head/sys/x86/x86/nexus.c (revision 366710) +++ head/sys/x86/x86/nexus.c (revision 366711) @@ -1,922 +1,923 @@ /*- * Copyright 1998 Massachusetts Institute of Technology * * Permission to use, copy, modify, and distribute this software and * its documentation for any purpose and without fee is hereby * granted, provided that both the above copyright notice and this * permission notice appear in all copies, that both the above * copyright notice and this permission notice appear in all * supporting documentation, and that the name of M.I.T. not be used * in advertising or publicity pertaining to distribution of the * software without specific, written prior permission. M.I.T. makes * no representations about the suitability of this software for any * purpose. It is provided "as is" without express or implied * warranty. * * THIS SOFTWARE IS PROVIDED BY M.I.T. ``AS IS''. M.I.T. DISCLAIMS * ALL EXPRESS OR IMPLIED WARRANTIES WITH REGARD TO THIS SOFTWARE, * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. IN NO EVENT * SHALL M.I.T. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * This code implements a `root nexus' for Intel Architecture * machines. The function of the root nexus is to serve as an * attachment point for both processors and buses, and to manage * resources which are common to all of them. In particular, * this code implements the core resource managers for interrupt * requests, DMA requests (which rightfully should be a part of the * ISA code but it's easier to do it here for now), I/O port addresses, * and I/O memory address space. */ #ifdef __amd64__ #define DEV_APIC #else #include "opt_apic.h" #endif #include "opt_isa.h" #include "opt_pci.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #include #ifdef DEV_APIC #include "pcib_if.h" #endif #ifdef DEV_ISA #include #include #endif #include #define ELF_KERN_STR ("elf"__XSTRING(__ELF_WORD_SIZE)" kernel") static MALLOC_DEFINE(M_NEXUSDEV, "nexusdev", "Nexus device"); #define DEVTONX(dev) ((struct nexus_device *)device_get_ivars(dev)) struct rman irq_rman, drq_rman, port_rman, mem_rman; static int nexus_probe(device_t); static int nexus_attach(device_t); static int nexus_print_all_resources(device_t dev); static int nexus_print_child(device_t, device_t); static device_t nexus_add_child(device_t bus, u_int order, const char *name, int unit); static struct resource *nexus_alloc_resource(device_t, device_t, int, int *, rman_res_t, rman_res_t, rman_res_t, u_int); static int nexus_adjust_resource(device_t, device_t, int, struct resource *, rman_res_t, rman_res_t); #ifdef SMP static int nexus_bind_intr(device_t, device_t, struct resource *, int); #endif static int nexus_config_intr(device_t, int, enum intr_trigger, enum intr_polarity); static int nexus_describe_intr(device_t dev, device_t child, struct resource *irq, void *cookie, const char *descr); static int nexus_activate_resource(device_t, device_t, int, int, struct resource *); static int nexus_deactivate_resource(device_t, device_t, int, int, struct resource *); static int nexus_map_resource(device_t bus, device_t child, int type, struct resource *r, struct resource_map_request *argsp, struct resource_map *map); static int nexus_unmap_resource(device_t bus, device_t child, int type, struct resource *r, struct resource_map *map); static int nexus_release_resource(device_t, device_t, int, int, struct resource *); static int nexus_setup_intr(device_t, device_t, struct resource *, int flags, driver_filter_t filter, void (*)(void *), void *, void **); static int nexus_teardown_intr(device_t, device_t, struct resource *, void *); static int nexus_suspend_intr(device_t, device_t, struct resource *); static int nexus_resume_intr(device_t, device_t, struct resource *); static struct resource_list *nexus_get_reslist(device_t dev, device_t child); static int nexus_set_resource(device_t, device_t, int, int, rman_res_t, rman_res_t); static int nexus_get_resource(device_t, device_t, int, int, rman_res_t *, rman_res_t *); static void nexus_delete_resource(device_t, device_t, int, int); static int nexus_get_cpus(device_t, device_t, enum cpu_sets, size_t, cpuset_t *); #if defined(DEV_APIC) && defined(DEV_PCI) static int nexus_alloc_msi(device_t pcib, device_t dev, int count, int maxcount, int *irqs); static int nexus_release_msi(device_t pcib, device_t dev, int count, int *irqs); static int nexus_alloc_msix(device_t pcib, device_t dev, int *irq); static int nexus_release_msix(device_t pcib, device_t dev, int irq); static int nexus_map_msi(device_t pcib, device_t dev, int irq, uint64_t *addr, uint32_t *data); #endif static device_method_t nexus_methods[] = { /* Device interface */ DEVMETHOD(device_probe, nexus_probe), DEVMETHOD(device_attach, nexus_attach), DEVMETHOD(device_detach, bus_generic_detach), DEVMETHOD(device_shutdown, bus_generic_shutdown), DEVMETHOD(device_suspend, bus_generic_suspend), DEVMETHOD(device_resume, bus_generic_resume), /* Bus interface */ DEVMETHOD(bus_print_child, nexus_print_child), DEVMETHOD(bus_add_child, nexus_add_child), DEVMETHOD(bus_alloc_resource, nexus_alloc_resource), DEVMETHOD(bus_adjust_resource, nexus_adjust_resource), DEVMETHOD(bus_release_resource, nexus_release_resource), DEVMETHOD(bus_activate_resource, nexus_activate_resource), DEVMETHOD(bus_deactivate_resource, nexus_deactivate_resource), DEVMETHOD(bus_map_resource, nexus_map_resource), DEVMETHOD(bus_unmap_resource, nexus_unmap_resource), DEVMETHOD(bus_setup_intr, nexus_setup_intr), DEVMETHOD(bus_teardown_intr, nexus_teardown_intr), DEVMETHOD(bus_suspend_intr, nexus_suspend_intr), DEVMETHOD(bus_resume_intr, nexus_resume_intr), #ifdef SMP DEVMETHOD(bus_bind_intr, nexus_bind_intr), #endif DEVMETHOD(bus_config_intr, nexus_config_intr), DEVMETHOD(bus_describe_intr, nexus_describe_intr), DEVMETHOD(bus_get_resource_list, nexus_get_reslist), DEVMETHOD(bus_set_resource, nexus_set_resource), DEVMETHOD(bus_get_resource, nexus_get_resource), DEVMETHOD(bus_delete_resource, nexus_delete_resource), DEVMETHOD(bus_get_cpus, nexus_get_cpus), /* pcib interface */ #if defined(DEV_APIC) && defined(DEV_PCI) DEVMETHOD(pcib_alloc_msi, nexus_alloc_msi), DEVMETHOD(pcib_release_msi, nexus_release_msi), DEVMETHOD(pcib_alloc_msix, nexus_alloc_msix), DEVMETHOD(pcib_release_msix, nexus_release_msix), DEVMETHOD(pcib_map_msi, nexus_map_msi), #endif { 0, 0 } }; DEFINE_CLASS_0(nexus, nexus_driver, nexus_methods, 1); static devclass_t nexus_devclass; DRIVER_MODULE(nexus, root, nexus_driver, nexus_devclass, 0, 0); static int nexus_probe(device_t dev) { device_quiet(dev); /* suppress attach message for neatness */ return (BUS_PROBE_GENERIC); } void nexus_init_resources(void) { int irq; /* * XXX working notes: * * - IRQ resource creation should be moved to the PIC/APIC driver. * - DRQ resource creation should be moved to the DMAC driver. * - The above should be sorted to probe earlier than any child buses. * * - Leave I/O and memory creation here, as child probes may need them. * (especially eg. ACPI) */ /* * IRQ's are on the mainboard on old systems, but on the ISA part * of PCI->ISA bridges. There would be multiple sets of IRQs on * multi-ISA-bus systems. PCI interrupts are routed to the ISA * component, so in a way, PCI can be a partial child of an ISA bus(!). * APIC interrupts are global though. */ irq_rman.rm_start = 0; irq_rman.rm_type = RMAN_ARRAY; irq_rman.rm_descr = "Interrupt request lines"; irq_rman.rm_end = num_io_irqs - 1; if (rman_init(&irq_rman)) panic("nexus_init_resources irq_rman"); /* * We search for regions of existing IRQs and add those to the IRQ * resource manager. */ for (irq = 0; irq < num_io_irqs; irq++) if (intr_lookup_source(irq) != NULL) if (rman_manage_region(&irq_rman, irq, irq) != 0) panic("nexus_init_resources irq_rman add"); /* * ISA DMA on PCI systems is implemented in the ISA part of each * PCI->ISA bridge and the channels can be duplicated if there are * multiple bridges. (eg: laptops with docking stations) */ drq_rman.rm_start = 0; drq_rman.rm_end = 7; drq_rman.rm_type = RMAN_ARRAY; drq_rman.rm_descr = "DMA request lines"; /* XXX drq 0 not available on some machines */ if (rman_init(&drq_rman) || rman_manage_region(&drq_rman, drq_rman.rm_start, drq_rman.rm_end)) panic("nexus_init_resources drq_rman"); /* * However, IO ports and Memory truely are global at this level, * as are APIC interrupts (however many IO APICS there turn out * to be on large systems..) */ port_rman.rm_start = 0; port_rman.rm_end = 0xffff; port_rman.rm_type = RMAN_ARRAY; port_rman.rm_descr = "I/O ports"; if (rman_init(&port_rman) || rman_manage_region(&port_rman, 0, 0xffff)) panic("nexus_init_resources port_rman"); mem_rman.rm_start = 0; mem_rman.rm_end = cpu_getmaxphyaddr(); mem_rman.rm_type = RMAN_ARRAY; mem_rman.rm_descr = "I/O memory addresses"; if (rman_init(&mem_rman) || rman_manage_region(&mem_rman, 0, mem_rman.rm_end)) panic("nexus_init_resources mem_rman"); } static int nexus_attach(device_t dev) { nexus_init_resources(); bus_generic_probe(dev); /* * Explicitly add the legacy0 device here. Other platform * types (such as ACPI), use their own nexus(4) subclass * driver to override this routine and add their own root bus. */ if (BUS_ADD_CHILD(dev, 10, "legacy", 0) == NULL) panic("legacy: could not attach"); bus_generic_attach(dev); return 0; } static int nexus_print_all_resources(device_t dev) { struct nexus_device *ndev = DEVTONX(dev); struct resource_list *rl = &ndev->nx_resources; int retval = 0; if (STAILQ_FIRST(rl)) retval += printf(" at"); retval += resource_list_print_type(rl, "port", SYS_RES_IOPORT, "%#jx"); retval += resource_list_print_type(rl, "iomem", SYS_RES_MEMORY, "%#jx"); retval += resource_list_print_type(rl, "irq", SYS_RES_IRQ, "%jd"); return retval; } static int nexus_print_child(device_t bus, device_t child) { int retval = 0; retval += bus_print_child_header(bus, child); retval += nexus_print_all_resources(child); if (device_get_flags(child)) retval += printf(" flags %#x", device_get_flags(child)); retval += printf("\n"); return (retval); } static device_t nexus_add_child(device_t bus, u_int order, const char *name, int unit) { device_t child; struct nexus_device *ndev; ndev = malloc(sizeof(struct nexus_device), M_NEXUSDEV, M_NOWAIT|M_ZERO); if (!ndev) return(0); resource_list_init(&ndev->nx_resources); child = device_add_child_ordered(bus, order, name, unit); /* should we free this in nexus_child_detached? */ device_set_ivars(child, ndev); return(child); } static struct rman * nexus_rman(int type) { switch (type) { case SYS_RES_IRQ: return (&irq_rman); case SYS_RES_DRQ: return (&drq_rman); case SYS_RES_IOPORT: return (&port_rman); case SYS_RES_MEMORY: return (&mem_rman); default: return (NULL); } } /* * Allocate a resource on behalf of child. NB: child is usually going to be a * child of one of our descendants, not a direct child of nexus0. * (Exceptions include npx.) */ static struct resource * nexus_alloc_resource(device_t bus, device_t child, int type, int *rid, rman_res_t start, rman_res_t end, rman_res_t count, u_int flags) { struct nexus_device *ndev = DEVTONX(child); struct resource *rv; struct resource_list_entry *rle; struct rman *rm; int needactivate = flags & RF_ACTIVE; /* * If this is an allocation of the "default" range for a given * RID, and we know what the resources for this device are * (ie. they aren't maintained by a child bus), then work out * the start/end values. */ if (RMAN_IS_DEFAULT_RANGE(start, end) && (count == 1)) { if (device_get_parent(child) != bus || ndev == NULL) return(NULL); rle = resource_list_find(&ndev->nx_resources, type, *rid); if (rle == NULL) return(NULL); start = rle->start; end = rle->end; count = rle->count; } flags &= ~RF_ACTIVE; rm = nexus_rman(type); if (rm == NULL) return (NULL); rv = rman_reserve_resource(rm, start, end, count, flags, child); if (rv == NULL) return 0; rman_set_rid(rv, *rid); if (needactivate) { if (bus_activate_resource(child, type, *rid, rv)) { rman_release_resource(rv); return 0; } } return rv; } static int nexus_adjust_resource(device_t bus, device_t child, int type, struct resource *r, rman_res_t start, rman_res_t end) { struct rman *rm; rm = nexus_rman(type); if (rm == NULL) return (ENXIO); if (!rman_is_region_manager(r, rm)) return (EINVAL); return (rman_adjust_resource(r, start, end)); } static int nexus_activate_resource(device_t bus, device_t child, int type, int rid, struct resource *r) { struct resource_map map; int error; error = rman_activate_resource(r); if (error != 0) return (error); if (!(rman_get_flags(r) & RF_UNMAPPED) && (type == SYS_RES_MEMORY || type == SYS_RES_IOPORT)) { error = nexus_map_resource(bus, child, type, r, NULL, &map); if (error) { rman_deactivate_resource(r); return (error); } rman_set_mapping(r,&map); } return (0); } static int nexus_deactivate_resource(device_t bus, device_t child, int type, int rid, struct resource *r) { struct resource_map map; int error; error = rman_deactivate_resource(r); if (error) return (error); if (!(rman_get_flags(r) & RF_UNMAPPED) && (type == SYS_RES_MEMORY || type == SYS_RES_IOPORT)) { rman_get_mapping(r, &map); nexus_unmap_resource(bus, child, type, r, &map); } return (0); } static int nexus_map_resource(device_t bus, device_t child, int type, struct resource *r, struct resource_map_request *argsp, struct resource_map *map) { struct resource_map_request args; rman_res_t end, length, start; /* Resources must be active to be mapped. */ if (!(rman_get_flags(r) & RF_ACTIVE)) return (ENXIO); /* Mappings are only supported on I/O and memory resources. */ switch (type) { case SYS_RES_IOPORT: case SYS_RES_MEMORY: break; default: return (EINVAL); } resource_init_map_request(&args); if (argsp != NULL) bcopy(argsp, &args, imin(argsp->size, args.size)); start = rman_get_start(r) + args.offset; if (args.length == 0) length = rman_get_size(r); else length = args.length; end = start + length - 1; if (start > rman_get_end(r) || start < rman_get_start(r)) return (EINVAL); if (end > rman_get_end(r) || end < start) return (EINVAL); /* * If this is a memory resource, map it into the kernel. */ switch (type) { case SYS_RES_IOPORT: map->r_bushandle = start; map->r_bustag = X86_BUS_SPACE_IO; map->r_size = length; map->r_vaddr = NULL; break; case SYS_RES_MEMORY: map->r_vaddr = pmap_mapdev_attr(start, length, args.memattr); map->r_bustag = X86_BUS_SPACE_MEM; map->r_size = length; /* * The handle is the virtual address. */ map->r_bushandle = (bus_space_handle_t)map->r_vaddr; break; } return (0); } static int nexus_unmap_resource(device_t bus, device_t child, int type, struct resource *r, struct resource_map *map) { /* * If this is a memory resource, unmap it. */ switch (type) { case SYS_RES_MEMORY: pmap_unmapdev((vm_offset_t)map->r_vaddr, map->r_size); /* FALLTHROUGH */ case SYS_RES_IOPORT: break; default: return (EINVAL); } return (0); } static int nexus_release_resource(device_t bus, device_t child, int type, int rid, struct resource *r) { if (rman_get_flags(r) & RF_ACTIVE) { int error = bus_deactivate_resource(child, type, rid, r); if (error) return error; } return (rman_release_resource(r)); } /* * Currently this uses the really grody interface from kern/kern_intr.c * (which really doesn't belong in kern/anything.c). Eventually, all of * the code in kern_intr.c and machdep_intr.c should get moved here, since * this is going to be the official interface. */ static int nexus_setup_intr(device_t bus, device_t child, struct resource *irq, int flags, driver_filter_t filter, void (*ihand)(void *), void *arg, void **cookiep) { int error, domain; /* somebody tried to setup an irq that failed to allocate! */ if (irq == NULL) panic("nexus_setup_intr: NULL irq resource!"); *cookiep = NULL; if ((rman_get_flags(irq) & RF_SHAREABLE) == 0) flags |= INTR_EXCL; /* * We depend here on rman_activate_resource() being idempotent. */ error = rman_activate_resource(irq); if (error) return (error); if (bus_get_domain(child, &domain) != 0) domain = 0; error = intr_add_handler(device_get_nameunit(child), rman_get_start(irq), filter, ihand, arg, flags, cookiep, domain); if (error == 0) rman_set_irq_cookie(irq, *cookiep); return (error); } static int nexus_teardown_intr(device_t dev, device_t child, struct resource *r, void *ih) { int error; error = intr_remove_handler(ih); if (error == 0) rman_set_irq_cookie(r, NULL); return (error); } static int nexus_suspend_intr(device_t dev, device_t child, struct resource *irq) { return (intr_event_suspend_handler(rman_get_irq_cookie(irq))); } static int nexus_resume_intr(device_t dev, device_t child, struct resource *irq) { return (intr_event_resume_handler(rman_get_irq_cookie(irq))); } #ifdef SMP static int nexus_bind_intr(device_t dev, device_t child, struct resource *irq, int cpu) { return (intr_bind(rman_get_start(irq), cpu)); } #endif static int nexus_config_intr(device_t dev, int irq, enum intr_trigger trig, enum intr_polarity pol) { return (intr_config_intr(irq, trig, pol)); } static int nexus_describe_intr(device_t dev, device_t child, struct resource *irq, void *cookie, const char *descr) { return (intr_describe(rman_get_start(irq), cookie, descr)); } static struct resource_list * nexus_get_reslist(device_t dev, device_t child) { struct nexus_device *ndev = DEVTONX(child); return (&ndev->nx_resources); } static int nexus_set_resource(device_t dev, device_t child, int type, int rid, rman_res_t start, rman_res_t count) { struct nexus_device *ndev = DEVTONX(child); struct resource_list *rl = &ndev->nx_resources; /* XXX this should return a success/failure indicator */ resource_list_add(rl, type, rid, start, start + count - 1, count); return(0); } static int nexus_get_resource(device_t dev, device_t child, int type, int rid, rman_res_t *startp, rman_res_t *countp) { struct nexus_device *ndev = DEVTONX(child); struct resource_list *rl = &ndev->nx_resources; struct resource_list_entry *rle; rle = resource_list_find(rl, type, rid); if (!rle) return(ENOENT); if (startp) *startp = rle->start; if (countp) *countp = rle->count; return(0); } static void nexus_delete_resource(device_t dev, device_t child, int type, int rid) { struct nexus_device *ndev = DEVTONX(child); struct resource_list *rl = &ndev->nx_resources; resource_list_delete(rl, type, rid); } static int nexus_get_cpus(device_t dev, device_t child, enum cpu_sets op, size_t setsize, cpuset_t *cpuset) { switch (op) { #ifdef SMP case INTR_CPUS: if (setsize != sizeof(cpuset_t)) return (EINVAL); *cpuset = intr_cpus; return (0); #endif default: return (bus_generic_get_cpus(dev, child, op, setsize, cpuset)); } } /* Called from the MSI code to add new IRQs to the IRQ rman. */ void nexus_add_irq(u_long irq) { if (rman_manage_region(&irq_rman, irq, irq) != 0) panic("%s: failed", __func__); } #if defined(DEV_APIC) && defined(DEV_PCI) static int nexus_alloc_msix(device_t pcib, device_t dev, int *irq) { return (msix_alloc(dev, irq)); } static int nexus_release_msix(device_t pcib, device_t dev, int irq) { return (msix_release(irq)); } static int nexus_alloc_msi(device_t pcib, device_t dev, int count, int maxcount, int *irqs) { return (msi_alloc(dev, count, maxcount, irqs)); } static int nexus_release_msi(device_t pcib, device_t dev, int count, int *irqs) { return (msi_release(irqs, count)); } static int nexus_map_msi(device_t pcib, device_t dev, int irq, uint64_t *addr, uint32_t *data) { return (msi_map(irq, addr, data)); } #endif /* DEV_APIC && DEV_PCI */ /* Placeholder for system RAM. */ static void ram_identify(driver_t *driver, device_t parent) { if (resource_disabled("ram", 0)) return; if (BUS_ADD_CHILD(parent, 0, "ram", 0) == NULL) panic("ram_identify"); } static int ram_probe(device_t dev) { device_quiet(dev); device_set_desc(dev, "System RAM"); return (0); } static int ram_attach(device_t dev) { struct bios_smap *smapbase, *smap, *smapend; struct resource *res; rman_res_t length; vm_paddr_t *p; caddr_t kmdp; uint32_t smapsize; int error, rid; /* Retrieve the system memory map from the loader. */ kmdp = preload_search_by_type("elf kernel"); if (kmdp == NULL) kmdp = preload_search_by_type(ELF_KERN_STR); smapbase = (struct bios_smap *)preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_SMAP); if (smapbase != NULL) { smapsize = *((u_int32_t *)smapbase - 1); smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize); rid = 0; for (smap = smapbase; smap < smapend; smap++) { if (smap->type != SMAP_TYPE_MEMORY || smap->length == 0) continue; if (smap->base > mem_rman.rm_end) continue; length = smap->base + smap->length > mem_rman.rm_end ? mem_rman.rm_end - smap->base : smap->length; error = bus_set_resource(dev, SYS_RES_MEMORY, rid, smap->base, length); if (error) panic( "ram_attach: resource %d failed set with %d", rid, error); res = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid, 0); if (res == NULL) panic("ram_attach: resource %d failed to attach", rid); rid++; } return (0); } /* * If the system map is not available, fall back to using * dump_avail[]. We use the dump_avail[] array rather than * phys_avail[] for the memory map as phys_avail[] contains * holes for kernel memory, page 0, the message buffer, and * the dcons buffer. We test the end address in the loop * instead of the start since the start address for the first * segment is 0. */ for (rid = 0, p = dump_avail; p[1] != 0; rid++, p += 2) { if (p[0] > mem_rman.rm_end) break; length = (p[1] > mem_rman.rm_end ? mem_rman.rm_end : p[1]) - p[0]; error = bus_set_resource(dev, SYS_RES_MEMORY, rid, p[0], length); if (error) panic("ram_attach: resource %d failed set with %d", rid, error); res = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid, 0); if (res == NULL) panic("ram_attach: resource %d failed to attach", rid); } return (0); } static device_method_t ram_methods[] = { /* Device interface */ DEVMETHOD(device_identify, ram_identify), DEVMETHOD(device_probe, ram_probe), DEVMETHOD(device_attach, ram_attach), { 0, 0 } }; static driver_t ram_driver = { "ram", ram_methods, 1, /* no softc */ }; static devclass_t ram_devclass; DRIVER_MODULE(ram, nexus, ram_driver, ram_devclass, 0, 0); #ifdef DEV_ISA /* * Placeholder which claims PnP 'devices' which describe system * resources. */ static struct isa_pnp_id sysresource_ids[] = { { 0x010cd041 /* PNP0c01 */, "System Memory" }, { 0x020cd041 /* PNP0c02 */, "System Resource" }, { 0 } }; static int sysresource_probe(device_t dev) { int result; if ((result = ISA_PNP_PROBE(device_get_parent(dev), dev, sysresource_ids)) <= 0) { device_quiet(dev); } return(result); } static int sysresource_attach(device_t dev) { return(0); } static device_method_t sysresource_methods[] = { /* Device interface */ DEVMETHOD(device_probe, sysresource_probe), DEVMETHOD(device_attach, sysresource_attach), DEVMETHOD(device_detach, bus_generic_detach), DEVMETHOD(device_shutdown, bus_generic_shutdown), DEVMETHOD(device_suspend, bus_generic_suspend), DEVMETHOD(device_resume, bus_generic_resume), { 0, 0 } }; static driver_t sysresource_driver = { "sysresource", sysresource_methods, 1, /* no softc */ }; static devclass_t sysresource_devclass; DRIVER_MODULE(sysresource, isa, sysresource_driver, sysresource_devclass, 0, 0); ISA_PNP_INFO(sysresource_ids); #endif /* DEV_ISA */