Index: head/sys/amd64/amd64/machdep.c =================================================================== --- head/sys/amd64/amd64/machdep.c (revision 173360) +++ head/sys/amd64/amd64/machdep.c (revision 173361) @@ -1,1912 +1,1912 @@ /*- * Copyright (c) 2003 Peter Wemm. * Copyright (c) 1992 Terrence R. Lambert. * Copyright (c) 1982, 1987, 1990 The Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91 */ #include __FBSDID("$FreeBSD$"); #include "opt_atalk.h" #include "opt_atpic.h" #include "opt_compat.h" #include "opt_cpu.h" #include "opt_ddb.h" #include "opt_inet.h" #include "opt_ipx.h" #include "opt_isa.h" #include "opt_kstack_pages.h" #include "opt_maxmem.h" #include "opt_msgbuf.h" #include "opt_perfmon.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #ifndef KDB #error KDB must be enabled in order for DDB to work! #endif #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef PERFMON #include #endif #include #ifdef SMP #include #endif #ifdef DEV_ATPIC #include #else #include #endif #include #include /* Sanity check for __curthread() */ CTASSERT(offsetof(struct pcpu, pc_curthread) == 0); extern u_int64_t hammer_time(u_int64_t, u_int64_t); extern void dblfault_handler(void); extern void printcpuinfo(void); /* XXX header file */ extern void identify_cpu(void); extern void panicifcpuunsupported(void); #define CS_SECURE(cs) (ISPL(cs) == SEL_UPL) #define EFL_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0) static void cpu_startup(void *); static void get_fpcontext(struct thread *td, mcontext_t *mcp); static int set_fpcontext(struct thread *td, const mcontext_t *mcp); SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL) #ifdef DDB extern vm_offset_t ksym_start, ksym_end; #endif int _udatasel, _ucodesel, _ucode32sel; int cold = 1; long Maxmem = 0; long realmem = 0; /* * The number of PHYSMAP entries must be one less than the number of * PHYSSEG entries because the PHYSMAP entry that spans the largest * physical address that is accessible by ISA DMA is split into two * PHYSSEG entries. */ #define PHYSMAP_SIZE (2 * (VM_PHYSSEG_MAX - 1)) vm_paddr_t phys_avail[PHYSMAP_SIZE + 2]; vm_paddr_t dump_avail[PHYSMAP_SIZE + 2]; /* must be 2 less so 0 0 can signal end of chunks */ #define PHYS_AVAIL_ARRAY_END ((sizeof(phys_avail) / sizeof(phys_avail[0])) - 2) #define DUMP_AVAIL_ARRAY_END ((sizeof(dump_avail) / sizeof(dump_avail[0])) - 2) struct kva_md_info kmi; static struct trapframe proc0_tf; struct region_descriptor r_gdt, r_idt; struct pcpu __pcpu[MAXCPU]; struct mtx icu_lock; struct mem_range_softc mem_range_softc; static void cpu_startup(dummy) void *dummy; { /* * Good {morning,afternoon,evening,night}. */ startrtclock(); printcpuinfo(); panicifcpuunsupported(); #ifdef PERFMON perfmon_init(); #endif printf("usable memory = %ju (%ju MB)\n", ptoa((uintmax_t)physmem), ptoa((uintmax_t)physmem) / 1048576); realmem = Maxmem; /* * Display any holes after the first chunk of extended memory. */ if (bootverbose) { int indx; printf("Physical memory chunk(s):\n"); for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) { vm_paddr_t size; size = phys_avail[indx + 1] - phys_avail[indx]; printf( "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n", (uintmax_t)phys_avail[indx], (uintmax_t)phys_avail[indx + 1] - 1, (uintmax_t)size, (uintmax_t)size / PAGE_SIZE); } } vm_ksubmap_init(&kmi); printf("avail memory = %ju (%ju MB)\n", ptoa((uintmax_t)cnt.v_free_count), ptoa((uintmax_t)cnt.v_free_count) / 1048576); /* * Set up buffers, so they can be used to read disk labels. */ bufinit(); vm_pager_bufferinit(); cpu_setregs(); } /* * Send an interrupt to process. * * Stack is set up to allow sigcode stored * at top to call routine, followed by kcall * to sigreturn routine below. After sigreturn * resets the signal mask, the stack, and the * frame pointer, it returns to the user * specified pc, psl. */ void sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) { struct sigframe sf, *sfp; struct proc *p; struct thread *td; struct sigacts *psp; char *sp; struct trapframe *regs; int sig; int oonstack; td = curthread; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); sig = ksi->ksi_signo; psp = p->p_sigacts; mtx_assert(&psp->ps_mtx, MA_OWNED); regs = td->td_frame; oonstack = sigonstack(regs->tf_rsp); /* Save user context. */ bzero(&sf, sizeof(sf)); sf.sf_uc.uc_sigmask = *mask; sf.sf_uc.uc_stack = td->td_sigstk; sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE; sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0; bcopy(regs, &sf.sf_uc.uc_mcontext.mc_rdi, sizeof(*regs)); sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); /* magic */ get_fpcontext(td, &sf.sf_uc.uc_mcontext); fpstate_drop(td); /* Allocate space for the signal handler context. */ if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { sp = td->td_sigstk.ss_sp + td->td_sigstk.ss_size - sizeof(struct sigframe); #if defined(COMPAT_43) td->td_sigstk.ss_flags |= SS_ONSTACK; #endif } else sp = (char *)regs->tf_rsp - sizeof(struct sigframe) - 128; /* Align to 16 bytes. */ sfp = (struct sigframe *)((unsigned long)sp & ~0xFul); /* Translate the signal if appropriate. */ if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize) sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)]; /* Build the argument list for the signal handler. */ regs->tf_rdi = sig; /* arg 1 in %rdi */ regs->tf_rdx = (register_t)&sfp->sf_uc; /* arg 3 in %rdx */ if (SIGISMEMBER(psp->ps_siginfo, sig)) { /* Signal handler installed with SA_SIGINFO. */ regs->tf_rsi = (register_t)&sfp->sf_si; /* arg 2 in %rsi */ sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher; /* Fill in POSIX parts */ sf.sf_si = ksi->ksi_info; sf.sf_si.si_signo = sig; /* maybe a translated signal */ regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */ } else { /* Old FreeBSD-style arguments. */ regs->tf_rsi = ksi->ksi_code; /* arg 2 in %rsi */ regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */ sf.sf_ahu.sf_handler = catcher; } mtx_unlock(&psp->ps_mtx); PROC_UNLOCK(p); /* * Copy the sigframe out to the user's stack. */ if (copyout(&sf, sfp, sizeof(*sfp)) != 0) { #ifdef DEBUG printf("process %ld has trashed its stack\n", (long)p->p_pid); #endif PROC_LOCK(p); sigexit(td, SIGILL); } regs->tf_rsp = (long)sfp; regs->tf_rip = PS_STRINGS - *(p->p_sysent->sv_szsigcode); regs->tf_rflags &= ~PSL_T; regs->tf_cs = _ucodesel; PROC_LOCK(p); mtx_lock(&psp->ps_mtx); } /* * System call to cleanup state after a signal * has been taken. Reset signal mask and * stack state from context left by sendsig (above). * Return to previous pc and psl as specified by * context left by sendsig. Check carefully to * make sure that the user has not modified the * state to gain improper privileges. * * MPSAFE */ int sigreturn(td, uap) struct thread *td; struct sigreturn_args /* { const struct __ucontext *sigcntxp; } */ *uap; { ucontext_t uc; struct proc *p = td->td_proc; struct trapframe *regs; const ucontext_t *ucp; long rflags; int cs, error, ret; ksiginfo_t ksi; error = copyin(uap->sigcntxp, &uc, sizeof(uc)); if (error != 0) return (error); ucp = &uc; regs = td->td_frame; rflags = ucp->uc_mcontext.mc_rflags; /* * Don't allow users to change privileged or reserved flags. */ /* * XXX do allow users to change the privileged flag PSL_RF. * The cpu sets PSL_RF in tf_rflags for faults. Debuggers * should sometimes set it there too. tf_rflags is kept in * the signal context during signal handling and there is no * other place to remember it, so the PSL_RF bit may be * corrupted by the signal handler without us knowing. * Corruption of the PSL_RF bit at worst causes one more or * one less debugger trap, so allowing it is fairly harmless. */ if (!EFL_SECURE(rflags & ~PSL_RF, regs->tf_rflags & ~PSL_RF)) { printf("sigreturn: rflags = 0x%lx\n", rflags); return (EINVAL); } /* * Don't allow users to load a valid privileged %cs. Let the * hardware check for invalid selectors, excess privilege in * other selectors, invalid %eip's and invalid %esp's. */ cs = ucp->uc_mcontext.mc_cs; if (!CS_SECURE(cs)) { printf("sigreturn: cs = 0x%x\n", cs); ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; ksi.ksi_trapno = T_PROTFLT; ksi.ksi_addr = (void *)regs->tf_rip; trapsignal(td, &ksi); return (EINVAL); } ret = set_fpcontext(td, &ucp->uc_mcontext); if (ret != 0) return (ret); bcopy(&ucp->uc_mcontext.mc_rdi, regs, sizeof(*regs)); PROC_LOCK(p); #if defined(COMPAT_43) if (ucp->uc_mcontext.mc_onstack & 1) td->td_sigstk.ss_flags |= SS_ONSTACK; else td->td_sigstk.ss_flags &= ~SS_ONSTACK; #endif td->td_sigmask = ucp->uc_sigmask; SIG_CANTMASK(td->td_sigmask); signotify(td); PROC_UNLOCK(p); td->td_pcb->pcb_flags |= PCB_FULLCTX; return (EJUSTRETURN); } #ifdef COMPAT_FREEBSD4 int freebsd4_sigreturn(struct thread *td, struct freebsd4_sigreturn_args *uap) { return sigreturn(td, (struct sigreturn_args *)uap); } #endif /* * Machine dependent boot() routine * * I haven't seen anything to put here yet * Possibly some stuff might be grafted back here from boot() */ void cpu_boot(int howto) { } /* Get current clock frequency for the given cpu id. */ int cpu_est_clockrate(int cpu_id, uint64_t *rate) { register_t reg; uint64_t tsc1, tsc2; if (pcpu_find(cpu_id) == NULL || rate == NULL) return (EINVAL); /* If we're booting, trust the rate calibrated moments ago. */ if (cold) { *rate = tsc_freq; return (0); } #ifdef SMP /* Schedule ourselves on the indicated cpu. */ thread_lock(curthread); sched_bind(curthread, cpu_id); thread_unlock(curthread); #endif /* Calibrate by measuring a short delay. */ reg = intr_disable(); tsc1 = rdtsc(); DELAY(1000); tsc2 = rdtsc(); intr_restore(reg); #ifdef SMP thread_lock(curthread); sched_unbind(curthread); thread_unlock(curthread); #endif /* * Calculate the difference in readings, convert to Mhz, and * subtract 0.5% of the total. Empirical testing has shown that * overhead in DELAY() works out to approximately this value. */ tsc2 -= tsc1; *rate = tsc2 * 1000 - tsc2 * 5; return (0); } /* * Shutdown the CPU as much as possible */ void cpu_halt(void) { for (;;) __asm__ ("hlt"); } /* * Hook to idle the CPU when possible. In the SMP case we default to * off because a halted cpu will not currently pick up a new thread in the * run queue until the next timer tick. If turned on this will result in * approximately a 4.2% loss in real time performance in buildworld tests * (but improves user and sys times oddly enough), and saves approximately * 5% in power consumption on an idle machine (tests w/2xCPU 1.1GHz P3). * * XXX we need to have a cpu mask of idle cpus and generate an IPI or * otherwise generate some sort of interrupt to wake up cpus sitting in HLT. * Then we can have our cake and eat it too. * * XXX I'm turning it on for SMP as well by default for now. It seems to * help lock contention somewhat, and this is critical for HTT. -Peter */ static int cpu_idle_hlt = 1; TUNABLE_INT("machdep.cpu_idle_hlt", &cpu_idle_hlt); SYSCTL_INT(_machdep, OID_AUTO, cpu_idle_hlt, CTLFLAG_RW, &cpu_idle_hlt, 0, "Idle loop HLT enable"); static void cpu_idle_default(void) { /* * we must absolutely guarentee that hlt is the * absolute next instruction after sti or we * introduce a timing window. */ __asm __volatile("sti; hlt"); } /* * Note that we have to be careful here to avoid a race between checking * sched_runnable() and actually halting. If we don't do this, we may waste * the time between calling hlt and the next interrupt even though there * is a runnable process. */ void cpu_idle(void) { #ifdef SMP if (mp_grab_cpu_hlt()) return; #endif if (cpu_idle_hlt) { disable_intr(); if (sched_runnable()) enable_intr(); else (*cpu_idle_hook)(); } } /* Other subsystems (e.g., ACPI) can hook this later. */ void (*cpu_idle_hook)(void) = cpu_idle_default; /* * Clear registers on exec */ void exec_setregs(td, entry, stack, ps_strings) struct thread *td; u_long entry; u_long stack; u_long ps_strings; { struct trapframe *regs = td->td_frame; struct pcb *pcb = td->td_pcb; critical_enter(); wrmsr(MSR_FSBASE, 0); wrmsr(MSR_KGSBASE, 0); /* User value while we're in the kernel */ pcb->pcb_fsbase = 0; pcb->pcb_gsbase = 0; critical_exit(); load_ds(_udatasel); load_es(_udatasel); load_fs(_udatasel); load_gs(_udatasel); pcb->pcb_ds = _udatasel; pcb->pcb_es = _udatasel; pcb->pcb_fs = _udatasel; pcb->pcb_gs = _udatasel; bzero((char *)regs, sizeof(struct trapframe)); regs->tf_rip = entry; regs->tf_rsp = ((stack - 8) & ~0xFul) + 8; regs->tf_rdi = stack; /* argv */ regs->tf_rflags = PSL_USER | (regs->tf_rflags & PSL_T); regs->tf_ss = _udatasel; regs->tf_cs = _ucodesel; /* * Reset the hardware debug registers if they were in use. * They won't have any meaning for the newly exec'd process. */ if (pcb->pcb_flags & PCB_DBREGS) { pcb->pcb_dr0 = 0; pcb->pcb_dr1 = 0; pcb->pcb_dr2 = 0; pcb->pcb_dr3 = 0; pcb->pcb_dr6 = 0; pcb->pcb_dr7 = 0; if (pcb == PCPU_GET(curpcb)) { /* * Clear the debug registers on the running * CPU, otherwise they will end up affecting * the next process we switch to. */ reset_dbregs(); } pcb->pcb_flags &= ~PCB_DBREGS; } /* * Drop the FP state if we hold it, so that the process gets a * clean FP state if it uses the FPU again. */ fpstate_drop(td); } void cpu_setregs(void) { register_t cr0; cr0 = rcr0(); /* * CR0_MP, CR0_NE and CR0_TS are also set by npx_probe() for the * BSP. See the comments there about why we set them. */ cr0 |= CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM; load_cr0(cr0); } /* * Initialize amd64 and configure to run kernel */ /* * Initialize segments & interrupt table */ struct user_segment_descriptor gdt[NGDT * MAXCPU];/* global descriptor table */ static struct gate_descriptor idt0[NIDT]; struct gate_descriptor *idt = &idt0[0]; /* interrupt descriptor table */ static char dblfault_stack[PAGE_SIZE] __aligned(16); struct amd64tss common_tss[MAXCPU]; /* software prototypes -- in more palatable form */ struct soft_segment_descriptor gdt_segs[] = { /* GNULL_SEL 0 Null Descriptor */ { 0x0, /* segment base address */ 0x0, /* length */ 0, /* segment type */ 0, /* segment descriptor priority level */ 0, /* segment descriptor present */ 0, /* long */ 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GCODE_SEL 1 Code Descriptor for kernel */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMERA, /* segment type */ SEL_KPL, /* segment descriptor priority level */ 1, /* segment descriptor present */ 1, /* long */ 0, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GDATA_SEL 2 Data Descriptor for kernel */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMRWA, /* segment type */ SEL_KPL, /* segment descriptor priority level */ 1, /* segment descriptor present */ 1, /* long */ 0, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GUCODE32_SEL 3 32 bit Code Descriptor for user */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMERA, /* segment type */ SEL_UPL, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, /* long */ 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GUDATA_SEL 4 32/64 bit Data Descriptor for user */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMRWA, /* segment type */ SEL_UPL, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, /* long */ 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GUCODE_SEL 5 64 bit Code Descriptor for user */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMERA, /* segment type */ SEL_UPL, /* segment descriptor priority level */ 1, /* segment descriptor present */ 1, /* long */ 0, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GPROC0_SEL 6 Proc 0 Tss Descriptor */ { 0x0, /* segment base address */ sizeof(struct amd64tss)-1,/* length - all address space */ SDT_SYSTSS, /* segment type */ SEL_KPL, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, /* long */ 0, /* unused - default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* Actually, the TSS is a system descriptor which is double size */ { 0x0, /* segment base address */ 0x0, /* length */ 0, /* segment type */ 0, /* segment descriptor priority level */ 0, /* segment descriptor present */ 0, /* long */ 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GUGS32_SEL 8 32 bit GS Descriptor for user */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMRWA, /* segment type */ SEL_UPL, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, /* long */ 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, }; void setidt(idx, func, typ, dpl, ist) int idx; inthand_t *func; int typ; int dpl; int ist; { struct gate_descriptor *ip; ip = idt + idx; ip->gd_looffset = (uintptr_t)func; ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL); ip->gd_ist = ist; ip->gd_xx = 0; ip->gd_type = typ; ip->gd_dpl = dpl; ip->gd_p = 1; ip->gd_hioffset = ((uintptr_t)func)>>16 ; } extern inthand_t IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl), IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm), IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot), IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align), IDTVEC(xmm), IDTVEC(dblfault), IDTVEC(fast_syscall), IDTVEC(fast_syscall32); void sdtossd(sd, ssd) struct user_segment_descriptor *sd; struct soft_segment_descriptor *ssd; { ssd->ssd_base = (sd->sd_hibase << 24) | sd->sd_lobase; ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit; ssd->ssd_type = sd->sd_type; ssd->ssd_dpl = sd->sd_dpl; ssd->ssd_p = sd->sd_p; ssd->ssd_long = sd->sd_long; ssd->ssd_def32 = sd->sd_def32; ssd->ssd_gran = sd->sd_gran; } void ssdtosd(ssd, sd) struct soft_segment_descriptor *ssd; struct user_segment_descriptor *sd; { sd->sd_lobase = (ssd->ssd_base) & 0xffffff; sd->sd_hibase = (ssd->ssd_base >> 24) & 0xff; sd->sd_lolimit = (ssd->ssd_limit) & 0xffff; sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf; sd->sd_type = ssd->ssd_type; sd->sd_dpl = ssd->ssd_dpl; sd->sd_p = ssd->ssd_p; sd->sd_long = ssd->ssd_long; sd->sd_def32 = ssd->ssd_def32; sd->sd_gran = ssd->ssd_gran; } void ssdtosyssd(ssd, sd) struct soft_segment_descriptor *ssd; struct system_segment_descriptor *sd; { sd->sd_lobase = (ssd->ssd_base) & 0xffffff; sd->sd_hibase = (ssd->ssd_base >> 24) & 0xfffffffffful; sd->sd_lolimit = (ssd->ssd_limit) & 0xffff; sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf; sd->sd_type = ssd->ssd_type; sd->sd_dpl = ssd->ssd_dpl; sd->sd_p = ssd->ssd_p; sd->sd_gran = ssd->ssd_gran; } #if !defined(DEV_ATPIC) && defined(DEV_ISA) #include u_int isa_irq_pending(void) { return (0); } #endif u_int basemem; /* * Populate the (physmap) array with base/bound pairs describing the * available physical memory in the system, then test this memory and * build the phys_avail array describing the actually-available memory. * * If we cannot accurately determine the physical memory map, then use * value from the 0xE801 call, and failing that, the RTC. * * Total memory size may be set by the kernel environment variable * hw.physmem or the compile-time define MAXMEM. * * XXX first should be vm_paddr_t. */ static void getmemsize(caddr_t kmdp, u_int64_t first) { int i, off, physmap_idx, pa_indx, da_indx; vm_paddr_t pa, physmap[PHYSMAP_SIZE]; u_long physmem_tunable; pt_entry_t *pte; struct bios_smap *smapbase, *smap, *smapend; u_int32_t smapsize; quad_t dcons_addr, dcons_size; bzero(physmap, sizeof(physmap)); basemem = 0; physmap_idx = 0; /* * get memory map from INT 15:E820, kindly supplied by the loader. * * subr_module.c says: * "Consumer may safely assume that size value precedes data." * ie: an int32_t immediately precedes smap. */ smapbase = (struct bios_smap *)preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_SMAP); if (smapbase == NULL) panic("No BIOS smap info from loader!"); smapsize = *((u_int32_t *)smapbase - 1); smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize); for (smap = smapbase; smap < smapend; smap++) { if (boothowto & RB_VERBOSE) printf("SMAP type=%02x base=%016lx len=%016lx\n", smap->type, smap->base, smap->length); if (smap->type != SMAP_TYPE_MEMORY) continue; if (smap->length == 0) continue; for (i = 0; i <= physmap_idx; i += 2) { if (smap->base < physmap[i + 1]) { if (boothowto & RB_VERBOSE) printf( "Overlapping or non-monotonic memory region, ignoring second region\n"); continue; } } if (smap->base == physmap[physmap_idx + 1]) { physmap[physmap_idx + 1] += smap->length; continue; } physmap_idx += 2; if (physmap_idx == PHYSMAP_SIZE) { printf( "Too many segments in the physical address map, giving up\n"); break; } physmap[physmap_idx] = smap->base; physmap[physmap_idx + 1] = smap->base + smap->length; } /* * Find the 'base memory' segment for SMP */ basemem = 0; for (i = 0; i <= physmap_idx; i += 2) { if (physmap[i] == 0x00000000) { basemem = physmap[i + 1] / 1024; break; } } if (basemem == 0) panic("BIOS smap did not include a basemem segment!"); #ifdef SMP /* make hole for AP bootstrap code */ physmap[1] = mp_bootaddress(physmap[1] / 1024); #endif /* * Maxmem isn't the "maximum memory", it's one larger than the * highest page of the physical address space. It should be * called something like "Maxphyspage". We may adjust this * based on ``hw.physmem'' and the results of the memory test. */ Maxmem = atop(physmap[physmap_idx + 1]); #ifdef MAXMEM Maxmem = MAXMEM / 4; #endif if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable)) Maxmem = atop(physmem_tunable); /* * Don't allow MAXMEM or hw.physmem to extend the amount of memory * in the system. */ if (Maxmem > atop(physmap[physmap_idx + 1])) Maxmem = atop(physmap[physmap_idx + 1]); if (atop(physmap[physmap_idx + 1]) != Maxmem && (boothowto & RB_VERBOSE)) printf("Physical memory use set to %ldK\n", Maxmem * 4); /* call pmap initialization to make new kernel address space */ pmap_bootstrap(&first); /* * Size up each available chunk of physical memory. */ physmap[0] = PAGE_SIZE; /* mask off page 0 */ pa_indx = 0; da_indx = 1; phys_avail[pa_indx++] = physmap[0]; phys_avail[pa_indx] = physmap[0]; dump_avail[da_indx] = physmap[0]; pte = CMAP1; /* * Get dcons buffer address */ if (getenv_quad("dcons.addr", &dcons_addr) == 0 || getenv_quad("dcons.size", &dcons_size) == 0) dcons_addr = 0; /* * physmap is in bytes, so when converting to page boundaries, * round up the start address and round down the end address. */ for (i = 0; i <= physmap_idx; i += 2) { vm_paddr_t end; end = ptoa((vm_paddr_t)Maxmem); if (physmap[i + 1] < end) end = trunc_page(physmap[i + 1]); for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) { int tmp, page_bad, full; int *ptr = (int *)CADDR1; full = FALSE; /* * block out kernel memory as not available. */ if (pa >= 0x100000 && pa < first) goto do_dump_avail; /* * block out dcons buffer */ if (dcons_addr > 0 && pa >= trunc_page(dcons_addr) && pa < dcons_addr + dcons_size) goto do_dump_avail; page_bad = FALSE; /* * map page into kernel: valid, read/write,non-cacheable */ *pte = pa | PG_V | PG_RW | PG_N; invltlb(); tmp = *(int *)ptr; /* * Test for alternating 1's and 0's */ *(volatile int *)ptr = 0xaaaaaaaa; if (*(volatile int *)ptr != 0xaaaaaaaa) page_bad = TRUE; /* * Test for alternating 0's and 1's */ *(volatile int *)ptr = 0x55555555; if (*(volatile int *)ptr != 0x55555555) page_bad = TRUE; /* * Test for all 1's */ *(volatile int *)ptr = 0xffffffff; if (*(volatile int *)ptr != 0xffffffff) page_bad = TRUE; /* * Test for all 0's */ *(volatile int *)ptr = 0x0; if (*(volatile int *)ptr != 0x0) page_bad = TRUE; /* * Restore original value. */ *(int *)ptr = tmp; /* * Adjust array of valid/good pages. */ if (page_bad == TRUE) continue; /* * If this good page is a continuation of the * previous set of good pages, then just increase * the end pointer. Otherwise start a new chunk. * Note that "end" points one higher than end, * making the range >= start and < end. * If we're also doing a speculative memory * test and we at or past the end, bump up Maxmem * so that we keep going. The first bad page * will terminate the loop. */ if (phys_avail[pa_indx] == pa) { phys_avail[pa_indx] += PAGE_SIZE; } else { pa_indx++; if (pa_indx == PHYS_AVAIL_ARRAY_END) { printf( "Too many holes in the physical address space, giving up\n"); pa_indx--; full = TRUE; goto do_dump_avail; } phys_avail[pa_indx++] = pa; /* start */ phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */ } physmem++; do_dump_avail: if (dump_avail[da_indx] == pa) { dump_avail[da_indx] += PAGE_SIZE; } else { da_indx++; if (da_indx == DUMP_AVAIL_ARRAY_END) { da_indx--; goto do_next; } dump_avail[da_indx++] = pa; /* start */ dump_avail[da_indx] = pa + PAGE_SIZE; /* end */ } do_next: if (full) break; } } *pte = 0; invltlb(); /* * XXX * The last chunk must contain at least one page plus the message * buffer to avoid complicating other code (message buffer address * calculation, etc.). */ while (phys_avail[pa_indx - 1] + PAGE_SIZE + round_page(MSGBUF_SIZE) >= phys_avail[pa_indx]) { physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]); phys_avail[pa_indx--] = 0; phys_avail[pa_indx--] = 0; } Maxmem = atop(phys_avail[pa_indx]); /* Trim off space for the message buffer. */ phys_avail[pa_indx] -= round_page(MSGBUF_SIZE); /* Map the message buffer. */ for (off = 0; off < round_page(MSGBUF_SIZE); off += PAGE_SIZE) pmap_kenter((vm_offset_t)msgbufp + off, phys_avail[pa_indx] + off); } u_int64_t hammer_time(u_int64_t modulep, u_int64_t physfree) { caddr_t kmdp; int gsel_tss, x; struct pcpu *pc; u_int64_t msr; char *env; thread0.td_kstack = physfree + KERNBASE; bzero((void *)thread0.td_kstack, KSTACK_PAGES * PAGE_SIZE); physfree += KSTACK_PAGES * PAGE_SIZE; thread0.td_pcb = (struct pcb *) (thread0.td_kstack + KSTACK_PAGES * PAGE_SIZE) - 1; /* * This may be done better later if it gets more high level * components in it. If so just link td->td_proc here. */ - proc_linkup(&proc0, &thread0); + proc_linkup0(&proc0, &thread0); preload_metadata = (caddr_t)(uintptr_t)(modulep + KERNBASE); preload_bootstrap_relocate(KERNBASE); kmdp = preload_search_by_type("elf kernel"); if (kmdp == NULL) kmdp = preload_search_by_type("elf64 kernel"); boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int); kern_envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *) + KERNBASE; #ifdef DDB ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t); ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t); #endif /* Init basic tunables, hz etc */ init_param1(); /* * make gdt memory segments */ gdt_segs[GPROC0_SEL].ssd_base = (uintptr_t)&common_tss[0]; for (x = 0; x < NGDT; x++) { if (x != GPROC0_SEL && x != (GPROC0_SEL + 1)) ssdtosd(&gdt_segs[x], &gdt[x]); } ssdtosyssd(&gdt_segs[GPROC0_SEL], (struct system_segment_descriptor *)&gdt[GPROC0_SEL]); r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1; r_gdt.rd_base = (long) gdt; lgdt(&r_gdt); pc = &__pcpu[0]; wrmsr(MSR_FSBASE, 0); /* User value */ wrmsr(MSR_GSBASE, (u_int64_t)pc); wrmsr(MSR_KGSBASE, 0); /* User value while in the kernel */ pcpu_init(pc, 0, sizeof(struct pcpu)); PCPU_SET(prvspace, pc); PCPU_SET(curthread, &thread0); PCPU_SET(curpcb, thread0.td_pcb); PCPU_SET(tssp, &common_tss[0]); /* * Initialize mutexes. * * icu_lock: in order to allow an interrupt to occur in a critical * section, to set pcpu->ipending (etc...) properly, we * must be able to get the icu lock, so it can't be * under witness. */ mutex_init(); mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS); /* exceptions */ for (x = 0; x < NIDT; x++) setidt(x, &IDTVEC(rsvd), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_DE, &IDTVEC(div), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_NMI, &IDTVEC(nmi), SDT_SYSIGT, SEL_KPL, 1); setidt(IDT_BP, &IDTVEC(bpt), SDT_SYSIGT, SEL_UPL, 0); setidt(IDT_OF, &IDTVEC(ofl), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_BR, &IDTVEC(bnd), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_UD, &IDTVEC(ill), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_NM, &IDTVEC(dna), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1); setidt(IDT_FPUGP, &IDTVEC(fpusegm), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_TS, &IDTVEC(tss), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_NP, &IDTVEC(missing), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_SS, &IDTVEC(stk), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_GP, &IDTVEC(prot), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_PF, &IDTVEC(page), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_MF, &IDTVEC(fpu), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_AC, &IDTVEC(align), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_MC, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_XF, &IDTVEC(xmm), SDT_SYSIGT, SEL_KPL, 0); r_idt.rd_limit = sizeof(idt0) - 1; r_idt.rd_base = (long) idt; lidt(&r_idt); /* * Initialize the i8254 before the console so that console * initialization can use DELAY(). */ i8254_init(); /* * Initialize the console before we print anything out. */ cninit(); #ifdef DEV_ISA #ifdef DEV_ATPIC elcr_probe(); atpic_startup(); #else /* Reset and mask the atpics and leave them shut down. */ atpic_reset(); /* * Point the ICU spurious interrupt vectors at the APIC spurious * interrupt handler. */ setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0); #endif #else #error "have you forgotten the isa device?"; #endif kdb_init(); #ifdef KDB if (boothowto & RB_KDB) kdb_enter("Boot flags requested debugger"); #endif identify_cpu(); /* Final stage of CPU initialization */ initializecpu(); /* Initialize CPU registers */ /* make an initial tss so cpu can get interrupt stack on syscall! */ common_tss[0].tss_rsp0 = thread0.td_kstack + \ KSTACK_PAGES * PAGE_SIZE - sizeof(struct pcb); /* Ensure the stack is aligned to 16 bytes */ common_tss[0].tss_rsp0 &= ~0xFul; PCPU_SET(rsp0, common_tss[0].tss_rsp0); /* doublefault stack space, runs on ist1 */ common_tss[0].tss_ist1 = (long)&dblfault_stack[sizeof(dblfault_stack)]; /* Set the IO permission bitmap (empty due to tss seg limit) */ common_tss[0].tss_iobase = sizeof(struct amd64tss); gsel_tss = GSEL(GPROC0_SEL, SEL_KPL); ltr(gsel_tss); /* Set up the fast syscall stuff */ msr = rdmsr(MSR_EFER) | EFER_SCE; wrmsr(MSR_EFER, msr); wrmsr(MSR_LSTAR, (u_int64_t)IDTVEC(fast_syscall)); wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32)); msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) | ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48); wrmsr(MSR_STAR, msr); wrmsr(MSR_SF_MASK, PSL_NT|PSL_T|PSL_I|PSL_C|PSL_D); getmemsize(kmdp, physfree); init_param2(physmem); /* now running on new page tables, configured,and u/iom is accessible */ msgbufinit(msgbufp, MSGBUF_SIZE); fpuinit(); /* transfer to user mode */ _ucodesel = GSEL(GUCODE_SEL, SEL_UPL); _udatasel = GSEL(GUDATA_SEL, SEL_UPL); _ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL); /* setup proc 0's pcb */ thread0.td_pcb->pcb_flags = 0; /* XXXKSE */ thread0.td_pcb->pcb_cr3 = KPML4phys; thread0.td_frame = &proc0_tf; env = getenv("kernelname"); if (env != NULL) strlcpy(kernelname, env, sizeof(kernelname)); /* Location of kernel stack for locore */ return ((u_int64_t)thread0.td_pcb); } void cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size) { pcpu->pc_acpi_id = 0xffffffff; } void spinlock_enter(void) { struct thread *td; td = curthread; if (td->td_md.md_spinlock_count == 0) td->td_md.md_saved_flags = intr_disable(); td->td_md.md_spinlock_count++; critical_enter(); } void spinlock_exit(void) { struct thread *td; td = curthread; critical_exit(); td->td_md.md_spinlock_count--; if (td->td_md.md_spinlock_count == 0) intr_restore(td->td_md.md_saved_flags); } /* * Construct a PCB from a trapframe. This is called from kdb_trap() where * we want to start a backtrace from the function that caused us to enter * the debugger. We have the context in the trapframe, but base the trace * on the PCB. The PCB doesn't have to be perfect, as long as it contains * enough for a backtrace. */ void makectx(struct trapframe *tf, struct pcb *pcb) { pcb->pcb_r12 = tf->tf_r12; pcb->pcb_r13 = tf->tf_r13; pcb->pcb_r14 = tf->tf_r14; pcb->pcb_r15 = tf->tf_r15; pcb->pcb_rbp = tf->tf_rbp; pcb->pcb_rbx = tf->tf_rbx; pcb->pcb_rip = tf->tf_rip; pcb->pcb_rsp = (ISPL(tf->tf_cs)) ? tf->tf_rsp : (long)(tf + 1) - 8; } int ptrace_set_pc(struct thread *td, unsigned long addr) { td->td_frame->tf_rip = addr; return (0); } int ptrace_single_step(struct thread *td) { td->td_frame->tf_rflags |= PSL_T; return (0); } int ptrace_clear_single_step(struct thread *td) { td->td_frame->tf_rflags &= ~PSL_T; return (0); } int fill_regs(struct thread *td, struct reg *regs) { struct trapframe *tp; tp = td->td_frame; regs->r_r15 = tp->tf_r15; regs->r_r14 = tp->tf_r14; regs->r_r13 = tp->tf_r13; regs->r_r12 = tp->tf_r12; regs->r_r11 = tp->tf_r11; regs->r_r10 = tp->tf_r10; regs->r_r9 = tp->tf_r9; regs->r_r8 = tp->tf_r8; regs->r_rdi = tp->tf_rdi; regs->r_rsi = tp->tf_rsi; regs->r_rbp = tp->tf_rbp; regs->r_rbx = tp->tf_rbx; regs->r_rdx = tp->tf_rdx; regs->r_rcx = tp->tf_rcx; regs->r_rax = tp->tf_rax; regs->r_rip = tp->tf_rip; regs->r_cs = tp->tf_cs; regs->r_rflags = tp->tf_rflags; regs->r_rsp = tp->tf_rsp; regs->r_ss = tp->tf_ss; return (0); } int set_regs(struct thread *td, struct reg *regs) { struct trapframe *tp; register_t rflags; tp = td->td_frame; rflags = regs->r_rflags & 0xffffffff; if (!EFL_SECURE(rflags, tp->tf_rflags) || !CS_SECURE(regs->r_cs)) return (EINVAL); tp->tf_r15 = regs->r_r15; tp->tf_r14 = regs->r_r14; tp->tf_r13 = regs->r_r13; tp->tf_r12 = regs->r_r12; tp->tf_r11 = regs->r_r11; tp->tf_r10 = regs->r_r10; tp->tf_r9 = regs->r_r9; tp->tf_r8 = regs->r_r8; tp->tf_rdi = regs->r_rdi; tp->tf_rsi = regs->r_rsi; tp->tf_rbp = regs->r_rbp; tp->tf_rbx = regs->r_rbx; tp->tf_rdx = regs->r_rdx; tp->tf_rcx = regs->r_rcx; tp->tf_rax = regs->r_rax; tp->tf_rip = regs->r_rip; tp->tf_cs = regs->r_cs; tp->tf_rflags = rflags; tp->tf_rsp = regs->r_rsp; tp->tf_ss = regs->r_ss; td->td_pcb->pcb_flags |= PCB_FULLCTX; return (0); } /* XXX check all this stuff! */ /* externalize from sv_xmm */ static void fill_fpregs_xmm(struct savefpu *sv_xmm, struct fpreg *fpregs) { struct envxmm *penv_fpreg = (struct envxmm *)&fpregs->fpr_env; struct envxmm *penv_xmm = &sv_xmm->sv_env; int i; /* pcb -> fpregs */ bzero(fpregs, sizeof(*fpregs)); /* FPU control/status */ penv_fpreg->en_cw = penv_xmm->en_cw; penv_fpreg->en_sw = penv_xmm->en_sw; penv_fpreg->en_tw = penv_xmm->en_tw; penv_fpreg->en_opcode = penv_xmm->en_opcode; penv_fpreg->en_rip = penv_xmm->en_rip; penv_fpreg->en_rdp = penv_xmm->en_rdp; penv_fpreg->en_mxcsr = penv_xmm->en_mxcsr; penv_fpreg->en_mxcsr_mask = penv_xmm->en_mxcsr_mask; /* FPU registers */ for (i = 0; i < 8; ++i) bcopy(sv_xmm->sv_fp[i].fp_acc.fp_bytes, fpregs->fpr_acc[i], 10); /* SSE registers */ for (i = 0; i < 16; ++i) bcopy(sv_xmm->sv_xmm[i].xmm_bytes, fpregs->fpr_xacc[i], 16); } /* internalize from fpregs into sv_xmm */ static void set_fpregs_xmm(struct fpreg *fpregs, struct savefpu *sv_xmm) { struct envxmm *penv_xmm = &sv_xmm->sv_env; struct envxmm *penv_fpreg = (struct envxmm *)&fpregs->fpr_env; int i; /* fpregs -> pcb */ /* FPU control/status */ penv_xmm->en_cw = penv_fpreg->en_cw; penv_xmm->en_sw = penv_fpreg->en_sw; penv_xmm->en_tw = penv_fpreg->en_tw; penv_xmm->en_opcode = penv_fpreg->en_opcode; penv_xmm->en_rip = penv_fpreg->en_rip; penv_xmm->en_rdp = penv_fpreg->en_rdp; penv_xmm->en_mxcsr = penv_fpreg->en_mxcsr; penv_xmm->en_mxcsr_mask = penv_fpreg->en_mxcsr_mask & cpu_mxcsr_mask; /* FPU registers */ for (i = 0; i < 8; ++i) bcopy(fpregs->fpr_acc[i], sv_xmm->sv_fp[i].fp_acc.fp_bytes, 10); /* SSE registers */ for (i = 0; i < 16; ++i) bcopy(fpregs->fpr_xacc[i], sv_xmm->sv_xmm[i].xmm_bytes, 16); } /* externalize from td->pcb */ int fill_fpregs(struct thread *td, struct fpreg *fpregs) { fill_fpregs_xmm(&td->td_pcb->pcb_save, fpregs); return (0); } /* internalize to td->pcb */ int set_fpregs(struct thread *td, struct fpreg *fpregs) { set_fpregs_xmm(fpregs, &td->td_pcb->pcb_save); return (0); } /* * Get machine context. */ int get_mcontext(struct thread *td, mcontext_t *mcp, int flags) { struct trapframe *tp; tp = td->td_frame; PROC_LOCK(curthread->td_proc); mcp->mc_onstack = sigonstack(tp->tf_rsp); PROC_UNLOCK(curthread->td_proc); mcp->mc_r15 = tp->tf_r15; mcp->mc_r14 = tp->tf_r14; mcp->mc_r13 = tp->tf_r13; mcp->mc_r12 = tp->tf_r12; mcp->mc_r11 = tp->tf_r11; mcp->mc_r10 = tp->tf_r10; mcp->mc_r9 = tp->tf_r9; mcp->mc_r8 = tp->tf_r8; mcp->mc_rdi = tp->tf_rdi; mcp->mc_rsi = tp->tf_rsi; mcp->mc_rbp = tp->tf_rbp; mcp->mc_rbx = tp->tf_rbx; mcp->mc_rcx = tp->tf_rcx; mcp->mc_rflags = tp->tf_rflags; if (flags & GET_MC_CLEAR_RET) { mcp->mc_rax = 0; mcp->mc_rdx = 0; mcp->mc_rflags &= ~PSL_C; } else { mcp->mc_rax = tp->tf_rax; mcp->mc_rdx = tp->tf_rdx; } mcp->mc_rip = tp->tf_rip; mcp->mc_cs = tp->tf_cs; mcp->mc_rsp = tp->tf_rsp; mcp->mc_ss = tp->tf_ss; mcp->mc_len = sizeof(*mcp); get_fpcontext(td, mcp); return (0); } /* * Set machine context. * * However, we don't set any but the user modifiable flags, and we won't * touch the cs selector. */ int set_mcontext(struct thread *td, const mcontext_t *mcp) { struct trapframe *tp; long rflags; int ret; tp = td->td_frame; if (mcp->mc_len != sizeof(*mcp)) return (EINVAL); rflags = (mcp->mc_rflags & PSL_USERCHANGE) | (tp->tf_rflags & ~PSL_USERCHANGE); ret = set_fpcontext(td, mcp); if (ret != 0) return (ret); tp->tf_r15 = mcp->mc_r15; tp->tf_r14 = mcp->mc_r14; tp->tf_r13 = mcp->mc_r13; tp->tf_r12 = mcp->mc_r12; tp->tf_r11 = mcp->mc_r11; tp->tf_r10 = mcp->mc_r10; tp->tf_r9 = mcp->mc_r9; tp->tf_r8 = mcp->mc_r8; tp->tf_rdi = mcp->mc_rdi; tp->tf_rsi = mcp->mc_rsi; tp->tf_rbp = mcp->mc_rbp; tp->tf_rbx = mcp->mc_rbx; tp->tf_rdx = mcp->mc_rdx; tp->tf_rcx = mcp->mc_rcx; tp->tf_rax = mcp->mc_rax; tp->tf_rip = mcp->mc_rip; tp->tf_rflags = rflags; tp->tf_rsp = mcp->mc_rsp; tp->tf_ss = mcp->mc_ss; td->td_pcb->pcb_flags |= PCB_FULLCTX; return (0); } static void get_fpcontext(struct thread *td, mcontext_t *mcp) { mcp->mc_ownedfp = fpugetregs(td, (struct savefpu *)&mcp->mc_fpstate); mcp->mc_fpformat = fpuformat(); } static int set_fpcontext(struct thread *td, const mcontext_t *mcp) { struct savefpu *fpstate; if (mcp->mc_fpformat == _MC_FPFMT_NODEV) return (0); else if (mcp->mc_fpformat != _MC_FPFMT_XMM) return (EINVAL); else if (mcp->mc_ownedfp == _MC_FPOWNED_NONE) /* We don't care what state is left in the FPU or PCB. */ fpstate_drop(td); else if (mcp->mc_ownedfp == _MC_FPOWNED_FPU || mcp->mc_ownedfp == _MC_FPOWNED_PCB) { /* * XXX we violate the dubious requirement that fpusetregs() * be called with interrupts disabled. * XXX obsolete on trap-16 systems? */ fpstate = (struct savefpu *)&mcp->mc_fpstate; fpstate->sv_env.en_mxcsr &= cpu_mxcsr_mask; fpusetregs(td, fpstate); } else return (EINVAL); return (0); } void fpstate_drop(struct thread *td) { register_t s; s = intr_disable(); if (PCPU_GET(fpcurthread) == td) fpudrop(); /* * XXX force a full drop of the fpu. The above only drops it if we * owned it. * * XXX I don't much like fpugetregs()'s semantics of doing a full * drop. Dropping only to the pcb matches fnsave's behaviour. * We only need to drop to !PCB_INITDONE in sendsig(). But * sendsig() is the only caller of fpugetregs()... perhaps we just * have too many layers. */ curthread->td_pcb->pcb_flags &= ~PCB_FPUINITDONE; intr_restore(s); } int fill_dbregs(struct thread *td, struct dbreg *dbregs) { struct pcb *pcb; if (td == NULL) { dbregs->dr[0] = rdr0(); dbregs->dr[1] = rdr1(); dbregs->dr[2] = rdr2(); dbregs->dr[3] = rdr3(); dbregs->dr[6] = rdr6(); dbregs->dr[7] = rdr7(); } else { pcb = td->td_pcb; dbregs->dr[0] = pcb->pcb_dr0; dbregs->dr[1] = pcb->pcb_dr1; dbregs->dr[2] = pcb->pcb_dr2; dbregs->dr[3] = pcb->pcb_dr3; dbregs->dr[6] = pcb->pcb_dr6; dbregs->dr[7] = pcb->pcb_dr7; } dbregs->dr[4] = 0; dbregs->dr[5] = 0; dbregs->dr[8] = 0; dbregs->dr[9] = 0; dbregs->dr[10] = 0; dbregs->dr[11] = 0; dbregs->dr[12] = 0; dbregs->dr[13] = 0; dbregs->dr[14] = 0; dbregs->dr[15] = 0; return (0); } int set_dbregs(struct thread *td, struct dbreg *dbregs) { struct pcb *pcb; int i; if (td == NULL) { load_dr0(dbregs->dr[0]); load_dr1(dbregs->dr[1]); load_dr2(dbregs->dr[2]); load_dr3(dbregs->dr[3]); load_dr6(dbregs->dr[6]); load_dr7(dbregs->dr[7]); } else { /* * Don't let an illegal value for dr7 get set. Specifically, * check for undefined settings. Setting these bit patterns * result in undefined behaviour and can lead to an unexpected * TRCTRAP or a general protection fault right here. * Upper bits of dr6 and dr7 must not be set */ for (i = 0; i < 4; i++) { if (DBREG_DR7_ACCESS(dbregs->dr[7], i) == 0x02) return (EINVAL); if (td->td_frame->tf_cs == _ucode32sel && DBREG_DR7_LEN(dbregs->dr[7], i) == DBREG_DR7_LEN_8) return (EINVAL); } if ((dbregs->dr[6] & 0xffffffff00000000ul) != 0 || (dbregs->dr[7] & 0xffffffff00000000ul) != 0) return (EINVAL); pcb = td->td_pcb; /* * Don't let a process set a breakpoint that is not within the * process's address space. If a process could do this, it * could halt the system by setting a breakpoint in the kernel * (if ddb was enabled). Thus, we need to check to make sure * that no breakpoints are being enabled for addresses outside * process's address space. * * XXX - what about when the watched area of the user's * address space is written into from within the kernel * ... wouldn't that still cause a breakpoint to be generated * from within kernel mode? */ if (DBREG_DR7_ENABLED(dbregs->dr[7], 0)) { /* dr0 is enabled */ if (dbregs->dr[0] >= VM_MAXUSER_ADDRESS) return (EINVAL); } if (DBREG_DR7_ENABLED(dbregs->dr[7], 1)) { /* dr1 is enabled */ if (dbregs->dr[1] >= VM_MAXUSER_ADDRESS) return (EINVAL); } if (DBREG_DR7_ENABLED(dbregs->dr[7], 2)) { /* dr2 is enabled */ if (dbregs->dr[2] >= VM_MAXUSER_ADDRESS) return (EINVAL); } if (DBREG_DR7_ENABLED(dbregs->dr[7], 3)) { /* dr3 is enabled */ if (dbregs->dr[3] >= VM_MAXUSER_ADDRESS) return (EINVAL); } pcb->pcb_dr0 = dbregs->dr[0]; pcb->pcb_dr1 = dbregs->dr[1]; pcb->pcb_dr2 = dbregs->dr[2]; pcb->pcb_dr3 = dbregs->dr[3]; pcb->pcb_dr6 = dbregs->dr[6]; pcb->pcb_dr7 = dbregs->dr[7]; pcb->pcb_flags |= PCB_DBREGS; } return (0); } void reset_dbregs(void) { load_dr7(0); /* Turn off the control bits first */ load_dr0(0); load_dr1(0); load_dr2(0); load_dr3(0); load_dr6(0); } /* * Return > 0 if a hardware breakpoint has been hit, and the * breakpoint was in user space. Return 0, otherwise. */ int user_dbreg_trap(void) { u_int64_t dr7, dr6; /* debug registers dr6 and dr7 */ u_int64_t bp; /* breakpoint bits extracted from dr6 */ int nbp; /* number of breakpoints that triggered */ caddr_t addr[4]; /* breakpoint addresses */ int i; dr7 = rdr7(); if ((dr7 & 0x000000ff) == 0) { /* * all GE and LE bits in the dr7 register are zero, * thus the trap couldn't have been caused by the * hardware debug registers */ return 0; } nbp = 0; dr6 = rdr6(); bp = dr6 & 0x0000000f; if (!bp) { /* * None of the breakpoint bits are set meaning this * trap was not caused by any of the debug registers */ return 0; } /* * at least one of the breakpoints were hit, check to see * which ones and if any of them are user space addresses */ if (bp & 0x01) { addr[nbp++] = (caddr_t)rdr0(); } if (bp & 0x02) { addr[nbp++] = (caddr_t)rdr1(); } if (bp & 0x04) { addr[nbp++] = (caddr_t)rdr2(); } if (bp & 0x08) { addr[nbp++] = (caddr_t)rdr3(); } for (i = 0; i < nbp; i++) { if (addr[i] < (caddr_t)VM_MAXUSER_ADDRESS) { /* * addr[i] is in user space */ return nbp; } } /* * None of the breakpoints are in user space. */ return 0; } #ifdef KDB /* * Provide inb() and outb() as functions. They are normally only * available as macros calling inlined functions, thus cannot be * called from the debugger. * * The actual code is stolen from , and de-inlined. */ #undef inb #undef outb /* silence compiler warnings */ u_char inb(u_int); void outb(u_int, u_char); u_char inb(u_int port) { u_char data; /* * We use %%dx and not %1 here because i/o is done at %dx and not at * %edx, while gcc generates inferior code (movw instead of movl) * if we tell it to load (u_short) port. */ __asm __volatile("inb %%dx,%0" : "=a" (data) : "d" (port)); return (data); } void outb(u_int port, u_char data) { u_char al; /* * Use an unnecessary assignment to help gcc's register allocator. * This make a large difference for gcc-1.40 and a tiny difference * for gcc-2.6.0. For gcc-1.40, al had to be ``asm("ax")'' for * best results. gcc-2.6.0 can't handle this. */ al = data; __asm __volatile("outb %0,%%dx" : : "a" (al), "d" (port)); } #endif /* KDB */ Index: head/sys/amd64/amd64/pmap.c =================================================================== --- head/sys/amd64/amd64/pmap.c (revision 173360) +++ head/sys/amd64/amd64/pmap.c (revision 173361) @@ -1,3463 +1,3465 @@ /*- * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. * Copyright (c) 1994 David Greenman * All rights reserved. * Copyright (c) 2003 Peter Wemm * All rights reserved. * Copyright (c) 2005 Alan L. Cox * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department and William Jolitz of UUNET Technologies Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 */ /*- * Copyright (c) 2003 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Jake Burkholder, * Safeport Network Services, and Network Associates Laboratories, the * Security Research Division of Network Associates, Inc. under * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA * CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * Manages physical address maps. * * In addition to hardware address maps, this * module is called upon to provide software-use-only * maps which may or may not be stored in the same * form as hardware maps. These pseudo-maps are * used to store intermediate results from copy * operations to and from address spaces. * * Since the information managed by this module is * also stored by the logical address mapping module, * this module may throw away valid virtual-to-physical * mappings at almost any time. However, invalidations * of virtual-to-physical mappings must be done as * requested. * * In order to cope with hardware architectures which * make virtual-to-physical map invalidates expensive, * this module may delay invalidate or reduced protection * operations until such time as they are actually * necessary. This module is given full information as * to which processors are currently using which maps, * and to when physical maps must be made correct. */ #include "opt_msgbuf.h" #include "opt_pmap.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SMP #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SMP #include #endif #ifndef PMAP_SHPGPERPROC #define PMAP_SHPGPERPROC 200 #endif #if defined(DIAGNOSTIC) #define PMAP_DIAGNOSTIC #endif #if !defined(PMAP_DIAGNOSTIC) #define PMAP_INLINE __inline #else #define PMAP_INLINE #endif #define PV_STATS #ifdef PV_STATS #define PV_STAT(x) do { x ; } while (0) #else #define PV_STAT(x) do { } while (0) #endif struct pmap kernel_pmap_store; vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ static int nkpt; static int ndmpdp; static vm_paddr_t dmaplimit; vm_offset_t kernel_vm_end; pt_entry_t pg_nx; static u_int64_t KPTphys; /* phys addr of kernel level 1 */ static u_int64_t KPDphys; /* phys addr of kernel level 2 */ u_int64_t KPDPphys; /* phys addr of kernel level 3 */ u_int64_t KPML4phys; /* phys addr of kernel level 4 */ static u_int64_t DMPDphys; /* phys addr of direct mapped level 2 */ static u_int64_t DMPDPphys; /* phys addr of direct mapped level 3 */ /* * Data for the pv entry allocation mechanism */ static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0; static int shpgperproc = PMAP_SHPGPERPROC; /* * All those kernel PT submaps that BSD is so fond of */ pt_entry_t *CMAP1 = 0; caddr_t CADDR1 = 0; struct msgbuf *msgbufp = 0; /* * Crashdump maps. */ static caddr_t crashdumpmap; static void free_pv_entry(pmap_t pmap, pv_entry_t pv); static pv_entry_t get_pv_entry(pmap_t locked_pmap, int try); static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, vm_page_t mpte); static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva, pd_entry_t ptepde, vm_page_t *free); static void pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, vm_page_t *free); static void pmap_remove_entry(struct pmap *pmap, vm_page_t m, vm_offset_t va); static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m); static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m); static vm_page_t pmap_allocpde(pmap_t pmap, vm_offset_t va, int flags); static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags); static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, int flags); static int _pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_page_t* free); static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, vm_page_t *); static vm_offset_t pmap_kmem_choose(vm_offset_t addr); CTASSERT(1 << PDESHIFT == sizeof(pd_entry_t)); CTASSERT(1 << PTESHIFT == sizeof(pt_entry_t)); /* * Move the kernel virtual free pointer to the next * 2MB. This is used to help improve performance * by using a large (2MB) page for much of the kernel * (.text, .data, .bss) */ static vm_offset_t pmap_kmem_choose(vm_offset_t addr) { vm_offset_t newaddr = addr; newaddr = (addr + (NBPDR - 1)) & ~(NBPDR - 1); return newaddr; } /********************/ /* Inline functions */ /********************/ /* Return a non-clipped PD index for a given VA */ static __inline vm_pindex_t pmap_pde_pindex(vm_offset_t va) { return va >> PDRSHIFT; } /* Return various clipped indexes for a given VA */ static __inline vm_pindex_t pmap_pte_index(vm_offset_t va) { return ((va >> PAGE_SHIFT) & ((1ul << NPTEPGSHIFT) - 1)); } static __inline vm_pindex_t pmap_pde_index(vm_offset_t va) { return ((va >> PDRSHIFT) & ((1ul << NPDEPGSHIFT) - 1)); } static __inline vm_pindex_t pmap_pdpe_index(vm_offset_t va) { return ((va >> PDPSHIFT) & ((1ul << NPDPEPGSHIFT) - 1)); } static __inline vm_pindex_t pmap_pml4e_index(vm_offset_t va) { return ((va >> PML4SHIFT) & ((1ul << NPML4EPGSHIFT) - 1)); } /* Return a pointer to the PML4 slot that corresponds to a VA */ static __inline pml4_entry_t * pmap_pml4e(pmap_t pmap, vm_offset_t va) { if (!pmap) return NULL; return (&pmap->pm_pml4[pmap_pml4e_index(va)]); } /* Return a pointer to the PDP slot that corresponds to a VA */ static __inline pdp_entry_t * pmap_pml4e_to_pdpe(pml4_entry_t *pml4e, vm_offset_t va) { pdp_entry_t *pdpe; pdpe = (pdp_entry_t *)PHYS_TO_DMAP(*pml4e & PG_FRAME); return (&pdpe[pmap_pdpe_index(va)]); } /* Return a pointer to the PDP slot that corresponds to a VA */ static __inline pdp_entry_t * pmap_pdpe(pmap_t pmap, vm_offset_t va) { pml4_entry_t *pml4e; pml4e = pmap_pml4e(pmap, va); if (pml4e == NULL || (*pml4e & PG_V) == 0) return NULL; return (pmap_pml4e_to_pdpe(pml4e, va)); } /* Return a pointer to the PD slot that corresponds to a VA */ static __inline pd_entry_t * pmap_pdpe_to_pde(pdp_entry_t *pdpe, vm_offset_t va) { pd_entry_t *pde; pde = (pd_entry_t *)PHYS_TO_DMAP(*pdpe & PG_FRAME); return (&pde[pmap_pde_index(va)]); } /* Return a pointer to the PD slot that corresponds to a VA */ static __inline pd_entry_t * pmap_pde(pmap_t pmap, vm_offset_t va) { pdp_entry_t *pdpe; pdpe = pmap_pdpe(pmap, va); if (pdpe == NULL || (*pdpe & PG_V) == 0) return NULL; return (pmap_pdpe_to_pde(pdpe, va)); } /* Return a pointer to the PT slot that corresponds to a VA */ static __inline pt_entry_t * pmap_pde_to_pte(pd_entry_t *pde, vm_offset_t va) { pt_entry_t *pte; pte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME); return (&pte[pmap_pte_index(va)]); } /* Return a pointer to the PT slot that corresponds to a VA */ static __inline pt_entry_t * pmap_pte(pmap_t pmap, vm_offset_t va) { pd_entry_t *pde; pde = pmap_pde(pmap, va); if (pde == NULL || (*pde & PG_V) == 0) return NULL; if ((*pde & PG_PS) != 0) /* compat with i386 pmap_pte() */ return ((pt_entry_t *)pde); return (pmap_pde_to_pte(pde, va)); } static __inline pt_entry_t * pmap_pte_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *ptepde) { pd_entry_t *pde; pde = pmap_pde(pmap, va); if (pde == NULL || (*pde & PG_V) == 0) return NULL; *ptepde = *pde; if ((*pde & PG_PS) != 0) /* compat with i386 pmap_pte() */ return ((pt_entry_t *)pde); return (pmap_pde_to_pte(pde, va)); } PMAP_INLINE pt_entry_t * vtopte(vm_offset_t va) { u_int64_t mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1); return (PTmap + ((va >> PAGE_SHIFT) & mask)); } static __inline pd_entry_t * vtopde(vm_offset_t va) { u_int64_t mask = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1); return (PDmap + ((va >> PDRSHIFT) & mask)); } static u_int64_t allocpages(vm_paddr_t *firstaddr, int n) { u_int64_t ret; ret = *firstaddr; bzero((void *)ret, n * PAGE_SIZE); *firstaddr += n * PAGE_SIZE; return (ret); } static void create_pagetables(vm_paddr_t *firstaddr) { int i; /* Allocate pages */ KPTphys = allocpages(firstaddr, NKPT); KPML4phys = allocpages(firstaddr, 1); KPDPphys = allocpages(firstaddr, NKPML4E); KPDphys = allocpages(firstaddr, NKPDPE); ndmpdp = (ptoa(Maxmem) + NBPDP - 1) >> PDPSHIFT; if (ndmpdp < 4) /* Minimum 4GB of dirmap */ ndmpdp = 4; DMPDPphys = allocpages(firstaddr, NDMPML4E); DMPDphys = allocpages(firstaddr, ndmpdp); dmaplimit = (vm_paddr_t)ndmpdp << PDPSHIFT; /* Fill in the underlying page table pages */ /* Read-only from zero to physfree */ /* XXX not fully used, underneath 2M pages */ for (i = 0; (i << PAGE_SHIFT) < *firstaddr; i++) { ((pt_entry_t *)KPTphys)[i] = i << PAGE_SHIFT; ((pt_entry_t *)KPTphys)[i] |= PG_RW | PG_V | PG_G; } /* Now map the page tables at their location within PTmap */ for (i = 0; i < NKPT; i++) { ((pd_entry_t *)KPDphys)[i] = KPTphys + (i << PAGE_SHIFT); ((pd_entry_t *)KPDphys)[i] |= PG_RW | PG_V; } /* Map from zero to end of allocations under 2M pages */ /* This replaces some of the KPTphys entries above */ for (i = 0; (i << PDRSHIFT) < *firstaddr; i++) { ((pd_entry_t *)KPDphys)[i] = i << PDRSHIFT; ((pd_entry_t *)KPDphys)[i] |= PG_RW | PG_V | PG_PS | PG_G; } /* And connect up the PD to the PDP */ for (i = 0; i < NKPDPE; i++) { ((pdp_entry_t *)KPDPphys)[i + KPDPI] = KPDphys + (i << PAGE_SHIFT); ((pdp_entry_t *)KPDPphys)[i + KPDPI] |= PG_RW | PG_V | PG_U; } /* Now set up the direct map space using 2MB pages */ for (i = 0; i < NPDEPG * ndmpdp; i++) { ((pd_entry_t *)DMPDphys)[i] = (vm_paddr_t)i << PDRSHIFT; ((pd_entry_t *)DMPDphys)[i] |= PG_RW | PG_V | PG_PS | PG_G; } /* And the direct map space's PDP */ for (i = 0; i < ndmpdp; i++) { ((pdp_entry_t *)DMPDPphys)[i] = DMPDphys + (i << PAGE_SHIFT); ((pdp_entry_t *)DMPDPphys)[i] |= PG_RW | PG_V | PG_U; } /* And recursively map PML4 to itself in order to get PTmap */ ((pdp_entry_t *)KPML4phys)[PML4PML4I] = KPML4phys; ((pdp_entry_t *)KPML4phys)[PML4PML4I] |= PG_RW | PG_V | PG_U; /* Connect the Direct Map slot up to the PML4 */ ((pdp_entry_t *)KPML4phys)[DMPML4I] = DMPDPphys; ((pdp_entry_t *)KPML4phys)[DMPML4I] |= PG_RW | PG_V | PG_U; /* Connect the KVA slot up to the PML4 */ ((pdp_entry_t *)KPML4phys)[KPML4I] = KPDPphys; ((pdp_entry_t *)KPML4phys)[KPML4I] |= PG_RW | PG_V | PG_U; } /* * Bootstrap the system enough to run with virtual memory. * * On amd64 this is called after mapping has already been enabled * and just syncs the pmap module with what has already been done. * [We can't call it easily with mapping off since the kernel is not * mapped with PA == VA, hence we would have to relocate every address * from the linked base (virtual) address "KERNBASE" to the actual * (physical) address starting relative to 0] */ void pmap_bootstrap(vm_paddr_t *firstaddr) { vm_offset_t va; pt_entry_t *pte, *unused; /* * Create an initial set of page tables to run the kernel in. */ create_pagetables(firstaddr); virtual_avail = (vm_offset_t) KERNBASE + *firstaddr; virtual_avail = pmap_kmem_choose(virtual_avail); virtual_end = VM_MAX_KERNEL_ADDRESS; /* XXX do %cr0 as well */ load_cr4(rcr4() | CR4_PGE | CR4_PSE); load_cr3(KPML4phys); /* * Initialize the kernel pmap (which is statically allocated). */ PMAP_LOCK_INIT(kernel_pmap); kernel_pmap->pm_pml4 = (pdp_entry_t *) (KERNBASE + KPML4phys); kernel_pmap->pm_active = -1; /* don't allow deactivation */ TAILQ_INIT(&kernel_pmap->pm_pvchunk); nkpt = NKPT; /* * Reserve some special page table entries/VA space for temporary * mapping of pages. */ #define SYSMAP(c, p, v, n) \ v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n); va = virtual_avail; pte = vtopte(va); /* * CMAP1 is only used for the memory test. */ SYSMAP(caddr_t, CMAP1, CADDR1, 1) /* * Crashdump maps. */ SYSMAP(caddr_t, unused, crashdumpmap, MAXDUMPPGS) /* * msgbufp is used to map the system message buffer. */ SYSMAP(struct msgbuf *, unused, msgbufp, atop(round_page(MSGBUF_SIZE))) virtual_avail = va; *CMAP1 = 0; invltlb(); /* Initialize the PAT MSR. */ pmap_init_pat(); } /* * Setup the PAT MSR. */ void pmap_init_pat(void) { uint64_t pat_msr; /* Bail if this CPU doesn't implement PAT. */ if (!(cpu_feature & CPUID_PAT)) panic("no PAT??"); #ifdef PAT_WORKS /* * Leave the indices 0-3 at the default of WB, WT, UC, and UC-. * Program 4 and 5 as WP and WC. * Leave 6 and 7 as UC and UC-. */ pat_msr = rdmsr(MSR_PAT); pat_msr &= ~(PAT_MASK(4) | PAT_MASK(5)); pat_msr |= PAT_VALUE(4, PAT_WRITE_PROTECTED) | PAT_VALUE(5, PAT_WRITE_COMBINING); #else /* * Due to some Intel errata, we can only safely use the lower 4 * PAT entries. Thus, just replace PAT Index 2 with WC instead * of UC-. * * Intel Pentium III Processor Specification Update * Errata E.27 (Upper Four PAT Entries Not Usable With Mode B * or Mode C Paging) * * Intel Pentium IV Processor Specification Update * Errata N46 (PAT Index MSB May Be Calculated Incorrectly) */ pat_msr = rdmsr(MSR_PAT); pat_msr &= ~PAT_MASK(2); pat_msr |= PAT_VALUE(2, PAT_WRITE_COMBINING); #endif wrmsr(MSR_PAT, pat_msr); } /* * Initialize a vm_page's machine-dependent fields. */ void pmap_page_init(vm_page_t m) { TAILQ_INIT(&m->md.pv_list); m->md.pv_list_count = 0; } /* * Initialize the pmap module. * Called by vm_init, to initialize any structures that the pmap * system needs to map virtual memory. */ void pmap_init(void) { /* * Initialize the address space (zone) for the pv entries. Set a * high water mark so that the system can recover from excessive * numbers of pv entries. */ TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc); pv_entry_max = shpgperproc * maxproc + cnt.v_page_count; TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max); pv_entry_high_water = 9 * (pv_entry_max / 10); } SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters"); static int pmap_pventry_proc(SYSCTL_HANDLER_ARGS) { int error; error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req); if (error == 0 && req->newptr) { shpgperproc = (pv_entry_max - cnt.v_page_count) / maxproc; pv_entry_high_water = 9 * (pv_entry_max / 10); } return (error); } SYSCTL_PROC(_vm_pmap, OID_AUTO, pv_entry_max, CTLTYPE_INT|CTLFLAG_RW, &pv_entry_max, 0, pmap_pventry_proc, "IU", "Max number of PV entries"); static int pmap_shpgperproc_proc(SYSCTL_HANDLER_ARGS) { int error; error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req); if (error == 0 && req->newptr) { pv_entry_max = shpgperproc * maxproc + cnt.v_page_count; pv_entry_high_water = 9 * (pv_entry_max / 10); } return (error); } SYSCTL_PROC(_vm_pmap, OID_AUTO, shpgperproc, CTLTYPE_INT|CTLFLAG_RW, &shpgperproc, 0, pmap_shpgperproc_proc, "IU", "Page share factor per proc"); /*************************************************** * Low level helper routines..... ***************************************************/ /* * Determine the appropriate bits to set in a PTE or PDE for a specified * caching mode. */ static int pmap_cache_bits(int mode, boolean_t is_pde) { int pat_flag, pat_index, cache_bits; /* The PAT bit is different for PTE's and PDE's. */ pat_flag = is_pde ? PG_PDE_PAT : PG_PTE_PAT; /* If we don't support PAT, map extended modes to older ones. */ if (!(cpu_feature & CPUID_PAT)) { switch (mode) { case PAT_UNCACHEABLE: case PAT_WRITE_THROUGH: case PAT_WRITE_BACK: break; case PAT_UNCACHED: case PAT_WRITE_COMBINING: case PAT_WRITE_PROTECTED: mode = PAT_UNCACHEABLE; break; } } /* Map the caching mode to a PAT index. */ switch (mode) { #ifdef PAT_WORKS case PAT_UNCACHEABLE: pat_index = 3; break; case PAT_WRITE_THROUGH: pat_index = 1; break; case PAT_WRITE_BACK: pat_index = 0; break; case PAT_UNCACHED: pat_index = 2; break; case PAT_WRITE_COMBINING: pat_index = 5; break; case PAT_WRITE_PROTECTED: pat_index = 4; break; #else case PAT_UNCACHED: case PAT_UNCACHEABLE: case PAT_WRITE_PROTECTED: pat_index = 3; break; case PAT_WRITE_THROUGH: pat_index = 1; break; case PAT_WRITE_BACK: pat_index = 0; break; case PAT_WRITE_COMBINING: pat_index = 2; break; #endif default: panic("Unknown caching mode %d\n", mode); } /* Map the 3-bit index value into the PAT, PCD, and PWT bits. */ cache_bits = 0; if (pat_index & 0x4) cache_bits |= pat_flag; if (pat_index & 0x2) cache_bits |= PG_NC_PCD; if (pat_index & 0x1) cache_bits |= PG_NC_PWT; return (cache_bits); } #ifdef SMP /* * For SMP, these functions have to use the IPI mechanism for coherence. */ void pmap_invalidate_page(pmap_t pmap, vm_offset_t va) { u_int cpumask; u_int other_cpus; sched_pin(); if (pmap == kernel_pmap || pmap->pm_active == all_cpus) { invlpg(va); smp_invlpg(va); } else { cpumask = PCPU_GET(cpumask); other_cpus = PCPU_GET(other_cpus); if (pmap->pm_active & cpumask) invlpg(va); if (pmap->pm_active & other_cpus) smp_masked_invlpg(pmap->pm_active & other_cpus, va); } sched_unpin(); } void pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { u_int cpumask; u_int other_cpus; vm_offset_t addr; sched_pin(); if (pmap == kernel_pmap || pmap->pm_active == all_cpus) { for (addr = sva; addr < eva; addr += PAGE_SIZE) invlpg(addr); smp_invlpg_range(sva, eva); } else { cpumask = PCPU_GET(cpumask); other_cpus = PCPU_GET(other_cpus); if (pmap->pm_active & cpumask) for (addr = sva; addr < eva; addr += PAGE_SIZE) invlpg(addr); if (pmap->pm_active & other_cpus) smp_masked_invlpg_range(pmap->pm_active & other_cpus, sva, eva); } sched_unpin(); } void pmap_invalidate_all(pmap_t pmap) { u_int cpumask; u_int other_cpus; sched_pin(); if (pmap == kernel_pmap || pmap->pm_active == all_cpus) { invltlb(); smp_invltlb(); } else { cpumask = PCPU_GET(cpumask); other_cpus = PCPU_GET(other_cpus); if (pmap->pm_active & cpumask) invltlb(); if (pmap->pm_active & other_cpus) smp_masked_invltlb(pmap->pm_active & other_cpus); } sched_unpin(); } void pmap_invalidate_cache(void) { sched_pin(); wbinvd(); smp_cache_flush(); sched_unpin(); } #else /* !SMP */ /* * Normal, non-SMP, invalidation functions. * We inline these within pmap.c for speed. */ PMAP_INLINE void pmap_invalidate_page(pmap_t pmap, vm_offset_t va) { if (pmap == kernel_pmap || pmap->pm_active) invlpg(va); } PMAP_INLINE void pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t addr; if (pmap == kernel_pmap || pmap->pm_active) for (addr = sva; addr < eva; addr += PAGE_SIZE) invlpg(addr); } PMAP_INLINE void pmap_invalidate_all(pmap_t pmap) { if (pmap == kernel_pmap || pmap->pm_active) invltlb(); } PMAP_INLINE void pmap_invalidate_cache(void) { wbinvd(); } #endif /* !SMP */ /* * Are we current address space or kernel? */ static __inline int pmap_is_current(pmap_t pmap) { return (pmap == kernel_pmap || (pmap->pm_pml4[PML4PML4I] & PG_FRAME) == (PML4pml4e[0] & PG_FRAME)); } /* * Routine: pmap_extract * Function: * Extract the physical page address associated * with the given map/virtual_address pair. */ vm_paddr_t pmap_extract(pmap_t pmap, vm_offset_t va) { vm_paddr_t rtval; pt_entry_t *pte; pd_entry_t pde, *pdep; rtval = 0; PMAP_LOCK(pmap); pdep = pmap_pde(pmap, va); if (pdep != NULL) { pde = *pdep; if (pde) { if ((pde & PG_PS) != 0) { rtval = (pde & PG_PS_FRAME) | (va & PDRMASK); PMAP_UNLOCK(pmap); return rtval; } pte = pmap_pde_to_pte(pdep, va); rtval = (*pte & PG_FRAME) | (va & PAGE_MASK); } } PMAP_UNLOCK(pmap); return (rtval); } /* * Routine: pmap_extract_and_hold * Function: * Atomically extract and hold the physical page * with the given pmap and virtual address pair * if that mapping permits the given protection. */ vm_page_t pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) { pd_entry_t pde, *pdep; pt_entry_t pte; vm_page_t m; m = NULL; vm_page_lock_queues(); PMAP_LOCK(pmap); pdep = pmap_pde(pmap, va); if (pdep != NULL && (pde = *pdep)) { if (pde & PG_PS) { if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) { m = PHYS_TO_VM_PAGE((pde & PG_PS_FRAME) | (va & PDRMASK)); vm_page_hold(m); } } else { pte = *pmap_pde_to_pte(pdep, va); if ((pte & PG_V) && ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) { m = PHYS_TO_VM_PAGE(pte & PG_FRAME); vm_page_hold(m); } } } vm_page_unlock_queues(); PMAP_UNLOCK(pmap); return (m); } vm_paddr_t pmap_kextract(vm_offset_t va) { pd_entry_t *pde; vm_paddr_t pa; if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) { pa = DMAP_TO_PHYS(va); } else { pde = vtopde(va); if (*pde & PG_PS) { pa = (*pde & PG_PS_FRAME) | (va & PDRMASK); } else { pa = *vtopte(va); pa = (pa & PG_FRAME) | (va & PAGE_MASK); } } return pa; } /*************************************************** * Low level mapping routines..... ***************************************************/ /* * Add a wired page to the kva. * Note: not SMP coherent. */ PMAP_INLINE void pmap_kenter(vm_offset_t va, vm_paddr_t pa) { pt_entry_t *pte; pte = vtopte(va); pte_store(pte, pa | PG_RW | PG_V | PG_G); } PMAP_INLINE void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode) { pt_entry_t *pte; pte = vtopte(va); pte_store(pte, pa | PG_RW | PG_V | PG_G | pmap_cache_bits(mode, 0)); } /* * Remove a page from the kernel pagetables. * Note: not SMP coherent. */ PMAP_INLINE void pmap_kremove(vm_offset_t va) { pt_entry_t *pte; pte = vtopte(va); pte_clear(pte); } /* * Used to map a range of physical addresses into kernel * virtual address space. * * The value passed in '*virt' is a suggested virtual address for * the mapping. Architectures which can support a direct-mapped * physical to virtual region can return the appropriate address * within that region, leaving '*virt' unchanged. Other * architectures should map the pages starting at '*virt' and * update '*virt' with the first usable address after the mapped * region. */ vm_offset_t pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) { return PHYS_TO_DMAP(start); } /* * Add a list of wired pages to the kva * this routine is only used for temporary * kernel mappings that do not need to have * page modification or references recorded. * Note that old mappings are simply written * over. The page *must* be wired. * Note: SMP coherent. Uses a ranged shootdown IPI. */ void pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count) { pt_entry_t *endpte, oldpte, *pte; oldpte = 0; pte = vtopte(sva); endpte = pte + count; while (pte < endpte) { oldpte |= *pte; pte_store(pte, VM_PAGE_TO_PHYS(*ma) | PG_G | PG_RW | PG_V); pte++; ma++; } if ((oldpte & PG_V) != 0) pmap_invalidate_range(kernel_pmap, sva, sva + count * PAGE_SIZE); } /* * This routine tears out page mappings from the * kernel -- it is meant only for temporary mappings. * Note: SMP coherent. Uses a ranged shootdown IPI. */ void pmap_qremove(vm_offset_t sva, int count) { vm_offset_t va; va = sva; while (count-- > 0) { pmap_kremove(va); va += PAGE_SIZE; } pmap_invalidate_range(kernel_pmap, sva, va); } /*************************************************** * Page table page management routines..... ***************************************************/ static PMAP_INLINE void pmap_free_zero_pages(vm_page_t free) { vm_page_t m; while (free != NULL) { m = free; free = m->right; vm_page_free_zero(m); } } /* * This routine unholds page table pages, and if the hold count * drops to zero, then it decrements the wire count. */ static PMAP_INLINE int pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_page_t *free) { --m->wire_count; if (m->wire_count == 0) return _pmap_unwire_pte_hold(pmap, va, m, free); else return 0; } static int _pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_page_t *free) { vm_offset_t pteva; /* * unmap the page table page */ if (m->pindex >= (NUPDE + NUPDPE)) { /* PDP page */ pml4_entry_t *pml4; pml4 = pmap_pml4e(pmap, va); pteva = (vm_offset_t) PDPmap + amd64_ptob(m->pindex - (NUPDE + NUPDPE)); *pml4 = 0; } else if (m->pindex >= NUPDE) { /* PD page */ pdp_entry_t *pdp; pdp = pmap_pdpe(pmap, va); pteva = (vm_offset_t) PDmap + amd64_ptob(m->pindex - NUPDE); *pdp = 0; } else { /* PTE page */ pd_entry_t *pd; pd = pmap_pde(pmap, va); pteva = (vm_offset_t) PTmap + amd64_ptob(m->pindex); *pd = 0; } --pmap->pm_stats.resident_count; if (m->pindex < NUPDE) { /* We just released a PT, unhold the matching PD */ vm_page_t pdpg; pdpg = PHYS_TO_VM_PAGE(*pmap_pdpe(pmap, va) & PG_FRAME); pmap_unwire_pte_hold(pmap, va, pdpg, free); } if (m->pindex >= NUPDE && m->pindex < (NUPDE + NUPDPE)) { /* We just released a PD, unhold the matching PDP */ vm_page_t pdppg; pdppg = PHYS_TO_VM_PAGE(*pmap_pml4e(pmap, va) & PG_FRAME); pmap_unwire_pte_hold(pmap, va, pdppg, free); } /* * Do an invltlb to make the invalidated mapping * take effect immediately. */ pmap_invalidate_page(pmap, pteva); /* * Put page on a list so that it is released after * *ALL* TLB shootdown is done */ m->right = *free; *free = m; atomic_subtract_int(&cnt.v_wire_count, 1); return 1; } /* * After removing a page table entry, this routine is used to * conditionally free the page, and manage the hold/wire counts. */ static int pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde, vm_page_t *free) { vm_page_t mpte; if (va >= VM_MAXUSER_ADDRESS) return 0; KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0")); mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME); return pmap_unwire_pte_hold(pmap, va, mpte, free); } void pmap_pinit0(pmap_t pmap) { PMAP_LOCK_INIT(pmap); pmap->pm_pml4 = (pml4_entry_t *)(KERNBASE + KPML4phys); pmap->pm_active = 0; TAILQ_INIT(&pmap->pm_pvchunk); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); } /* * Initialize a preallocated and zeroed pmap structure, * such as one in a vmspace structure. */ -void +int pmap_pinit(pmap_t pmap) { vm_page_t pml4pg; static vm_pindex_t color; PMAP_LOCK_INIT(pmap); /* * allocate the page directory page */ while ((pml4pg = vm_page_alloc(NULL, color++, VM_ALLOC_NOOBJ | VM_ALLOC_NORMAL | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) VM_WAIT; pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pg)); if ((pml4pg->flags & PG_ZERO) == 0) pagezero(pmap->pm_pml4); /* Wire in kernel global address entries. */ pmap->pm_pml4[KPML4I] = KPDPphys | PG_RW | PG_V | PG_U; pmap->pm_pml4[DMPML4I] = DMPDPphys | PG_RW | PG_V | PG_U; /* install self-referential address mapping entry(s) */ pmap->pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pml4pg) | PG_V | PG_RW | PG_A | PG_M; pmap->pm_active = 0; TAILQ_INIT(&pmap->pm_pvchunk); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); + + return (1); } /* * this routine is called if the page table page is not * mapped correctly. * * Note: If a page allocation fails at page table level two or three, * one or two pages may be held during the wait, only to be released * afterwards. This conservative approach is easily argued to avoid * race conditions. */ static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, int flags) { vm_page_t m, pdppg, pdpg; KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT || (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK, ("_pmap_allocpte: flags is neither M_NOWAIT nor M_WAITOK")); /* * Allocate a page table page. */ if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) { if (flags & M_WAITOK) { PMAP_UNLOCK(pmap); vm_page_unlock_queues(); VM_WAIT; vm_page_lock_queues(); PMAP_LOCK(pmap); } /* * Indicate the need to retry. While waiting, the page table * page may have been allocated. */ return (NULL); } if ((m->flags & PG_ZERO) == 0) pmap_zero_page(m); /* * Map the pagetable page into the process address space, if * it isn't already there. */ pmap->pm_stats.resident_count++; if (ptepindex >= (NUPDE + NUPDPE)) { pml4_entry_t *pml4; vm_pindex_t pml4index; /* Wire up a new PDPE page */ pml4index = ptepindex - (NUPDE + NUPDPE); pml4 = &pmap->pm_pml4[pml4index]; *pml4 = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; } else if (ptepindex >= NUPDE) { vm_pindex_t pml4index; vm_pindex_t pdpindex; pml4_entry_t *pml4; pdp_entry_t *pdp; /* Wire up a new PDE page */ pdpindex = ptepindex - NUPDE; pml4index = pdpindex >> NPML4EPGSHIFT; pml4 = &pmap->pm_pml4[pml4index]; if ((*pml4 & PG_V) == 0) { /* Have to allocate a new pdp, recurse */ if (_pmap_allocpte(pmap, NUPDE + NUPDPE + pml4index, flags) == NULL) { --m->wire_count; vm_page_free(m); return (NULL); } } else { /* Add reference to pdp page */ pdppg = PHYS_TO_VM_PAGE(*pml4 & PG_FRAME); pdppg->wire_count++; } pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME); /* Now find the pdp page */ pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)]; *pdp = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; } else { vm_pindex_t pml4index; vm_pindex_t pdpindex; pml4_entry_t *pml4; pdp_entry_t *pdp; pd_entry_t *pd; /* Wire up a new PTE page */ pdpindex = ptepindex >> NPDPEPGSHIFT; pml4index = pdpindex >> NPML4EPGSHIFT; /* First, find the pdp and check that its valid. */ pml4 = &pmap->pm_pml4[pml4index]; if ((*pml4 & PG_V) == 0) { /* Have to allocate a new pd, recurse */ if (_pmap_allocpte(pmap, NUPDE + pdpindex, flags) == NULL) { --m->wire_count; vm_page_free(m); return (NULL); } pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME); pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)]; } else { pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME); pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)]; if ((*pdp & PG_V) == 0) { /* Have to allocate a new pd, recurse */ if (_pmap_allocpte(pmap, NUPDE + pdpindex, flags) == NULL) { --m->wire_count; vm_page_free(m); return (NULL); } } else { /* Add reference to the pd page */ pdpg = PHYS_TO_VM_PAGE(*pdp & PG_FRAME); pdpg->wire_count++; } } pd = (pd_entry_t *)PHYS_TO_DMAP(*pdp & PG_FRAME); /* Now we know where the page directory page is */ pd = &pd[ptepindex & ((1ul << NPDEPGSHIFT) - 1)]; *pd = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; } return m; } static vm_page_t pmap_allocpde(pmap_t pmap, vm_offset_t va, int flags) { vm_pindex_t pdpindex, ptepindex; pdp_entry_t *pdpe; vm_page_t pdpg; KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT || (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK, ("pmap_allocpde: flags is neither M_NOWAIT nor M_WAITOK")); retry: pdpe = pmap_pdpe(pmap, va); if (pdpe != NULL && (*pdpe & PG_V) != 0) { /* Add a reference to the pd page. */ pdpg = PHYS_TO_VM_PAGE(*pdpe & PG_FRAME); pdpg->wire_count++; } else { /* Allocate a pd page. */ ptepindex = pmap_pde_pindex(va); pdpindex = ptepindex >> NPDPEPGSHIFT; pdpg = _pmap_allocpte(pmap, NUPDE + pdpindex, flags); if (pdpg == NULL && (flags & M_WAITOK)) goto retry; } return (pdpg); } static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags) { vm_pindex_t ptepindex; pd_entry_t *pd; vm_page_t m, free; KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT || (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK, ("pmap_allocpte: flags is neither M_NOWAIT nor M_WAITOK")); /* * Calculate pagetable page index */ ptepindex = pmap_pde_pindex(va); retry: /* * Get the page directory entry */ pd = pmap_pde(pmap, va); /* * This supports switching from a 2MB page to a * normal 4K page. */ if (pd != 0 && (*pd & (PG_PS | PG_V)) == (PG_PS | PG_V)) { *pd = 0; pd = 0; pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; free = NULL; pmap_unuse_pt(pmap, va, *pmap_pdpe(pmap, va), &free); pmap_invalidate_all(kernel_pmap); pmap_free_zero_pages(free); } /* * If the page table page is mapped, we just increment the * hold count, and activate it. */ if (pd != 0 && (*pd & PG_V) != 0) { m = PHYS_TO_VM_PAGE(*pd & PG_FRAME); m->wire_count++; } else { /* * Here if the pte page isn't mapped, or if it has been * deallocated. */ m = _pmap_allocpte(pmap, ptepindex, flags); if (m == NULL && (flags & M_WAITOK)) goto retry; } return (m); } /*************************************************** * Pmap allocation/deallocation routines. ***************************************************/ /* * Release any resources held by the given physical map. * Called when a pmap initialized by pmap_pinit is being released. * Should only be called if the map contains no valid mappings. */ void pmap_release(pmap_t pmap) { vm_page_t m; KASSERT(pmap->pm_stats.resident_count == 0, ("pmap_release: pmap resident count %ld != 0", pmap->pm_stats.resident_count)); m = PHYS_TO_VM_PAGE(pmap->pm_pml4[PML4PML4I] & PG_FRAME); pmap->pm_pml4[KPML4I] = 0; /* KVA */ pmap->pm_pml4[DMPML4I] = 0; /* Direct Map */ pmap->pm_pml4[PML4PML4I] = 0; /* Recursive Mapping */ m->wire_count--; atomic_subtract_int(&cnt.v_wire_count, 1); vm_page_free_zero(m); PMAP_LOCK_DESTROY(pmap); } static int kvm_size(SYSCTL_HANDLER_ARGS) { unsigned long ksize = VM_MAX_KERNEL_ADDRESS - KERNBASE; return sysctl_handle_long(oidp, &ksize, 0, req); } SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD, 0, 0, kvm_size, "LU", "Size of KVM"); static int kvm_free(SYSCTL_HANDLER_ARGS) { unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end; return sysctl_handle_long(oidp, &kfree, 0, req); } SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD, 0, 0, kvm_free, "LU", "Amount of KVM free"); /* * grow the number of kernel page table entries, if needed */ void pmap_growkernel(vm_offset_t addr) { vm_paddr_t paddr; vm_page_t nkpg; pd_entry_t *pde, newpdir; pdp_entry_t newpdp; mtx_assert(&kernel_map->system_mtx, MA_OWNED); if (kernel_vm_end == 0) { kernel_vm_end = KERNBASE; nkpt = 0; while ((*pmap_pde(kernel_pmap, kernel_vm_end) & PG_V) != 0) { kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); nkpt++; if (kernel_vm_end - 1 >= kernel_map->max_offset) { kernel_vm_end = kernel_map->max_offset; break; } } } addr = roundup2(addr, PAGE_SIZE * NPTEPG); if (addr - 1 >= kernel_map->max_offset) addr = kernel_map->max_offset; while (kernel_vm_end < addr) { pde = pmap_pde(kernel_pmap, kernel_vm_end); if (pde == NULL) { /* We need a new PDP entry */ nkpg = vm_page_alloc(NULL, nkpt, VM_ALLOC_NOOBJ | VM_ALLOC_SYSTEM | VM_ALLOC_WIRED); if (!nkpg) panic("pmap_growkernel: no memory to grow kernel"); pmap_zero_page(nkpg); paddr = VM_PAGE_TO_PHYS(nkpg); newpdp = (pdp_entry_t) (paddr | PG_V | PG_RW | PG_A | PG_M); *pmap_pdpe(kernel_pmap, kernel_vm_end) = newpdp; continue; /* try again */ } if ((*pde & PG_V) != 0) { kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); if (kernel_vm_end - 1 >= kernel_map->max_offset) { kernel_vm_end = kernel_map->max_offset; break; } continue; } /* * This index is bogus, but out of the way */ nkpg = vm_page_alloc(NULL, nkpt, VM_ALLOC_NOOBJ | VM_ALLOC_SYSTEM | VM_ALLOC_WIRED); if (!nkpg) panic("pmap_growkernel: no memory to grow kernel"); nkpt++; pmap_zero_page(nkpg); paddr = VM_PAGE_TO_PHYS(nkpg); newpdir = (pd_entry_t) (paddr | PG_V | PG_RW | PG_A | PG_M); *pmap_pde(kernel_pmap, kernel_vm_end) = newpdir; kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); if (kernel_vm_end - 1 >= kernel_map->max_offset) { kernel_vm_end = kernel_map->max_offset; break; } } } /*************************************************** * page management routines. ***************************************************/ CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE); CTASSERT(_NPCM == 3); CTASSERT(_NPCPV == 168); static __inline struct pv_chunk * pv_to_chunk(pv_entry_t pv) { return (struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK); } #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap) #define PC_FREE0 0xfffffffffffffffful #define PC_FREE1 0xfffffffffffffffful #define PC_FREE2 0x000000fffffffffful static uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1, PC_FREE2 }; SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0, "Current number of pv entries"); #ifdef PV_STATS static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail; SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0, "Current number of pv entry chunks"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0, "Current number of pv entry chunks allocated"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0, "Current number of pv entry chunks frees"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0, "Number of times tried to get a chunk page but failed."); static long pv_entry_frees, pv_entry_allocs; static int pv_entry_spare; SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0, "Current number of pv entry frees"); SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0, "Current number of pv entry allocs"); SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0, "Current number of spare pv entries"); static int pmap_collect_inactive, pmap_collect_active; SYSCTL_INT(_vm_pmap, OID_AUTO, pmap_collect_inactive, CTLFLAG_RD, &pmap_collect_inactive, 0, "Current number times pmap_collect called on inactive queue"); SYSCTL_INT(_vm_pmap, OID_AUTO, pmap_collect_active, CTLFLAG_RD, &pmap_collect_active, 0, "Current number times pmap_collect called on active queue"); #endif /* * We are in a serious low memory condition. Resort to * drastic measures to free some pages so we can allocate * another pv entry chunk. This is normally called to * unmap inactive pages, and if necessary, active pages. */ static void pmap_collect(pmap_t locked_pmap, struct vpgqueues *vpq) { pd_entry_t ptepde; pmap_t pmap; pt_entry_t *pte, tpte; pv_entry_t next_pv, pv; vm_offset_t va; vm_page_t m, free; TAILQ_FOREACH(m, &vpq->pl, pageq) { if (m->hold_count || m->busy) continue; TAILQ_FOREACH_SAFE(pv, &m->md.pv_list, pv_list, next_pv) { va = pv->pv_va; pmap = PV_PMAP(pv); /* Avoid deadlock and lock recursion. */ if (pmap > locked_pmap) PMAP_LOCK(pmap); else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap)) continue; pmap->pm_stats.resident_count--; pte = pmap_pte_pde(pmap, va, &ptepde); if (pte == NULL) { panic("null pte in pmap_collect"); } tpte = pte_load_clear(pte); KASSERT((tpte & PG_W) == 0, ("pmap_collect: wired pte %#lx", tpte)); if (tpte & PG_A) vm_page_flag_set(m, PG_REFERENCED); if (tpte & PG_M) { KASSERT((tpte & PG_RW), ("pmap_collect: modified page not writable: va: %#lx, pte: %#lx", va, tpte)); vm_page_dirty(m); } free = NULL; pmap_unuse_pt(pmap, va, ptepde, &free); pmap_invalidate_page(pmap, va); pmap_free_zero_pages(free); TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); if (TAILQ_EMPTY(&m->md.pv_list)) vm_page_flag_clear(m, PG_WRITEABLE); m->md.pv_list_count--; free_pv_entry(pmap, pv); if (pmap != locked_pmap) PMAP_UNLOCK(pmap); } } } /* * free the pv_entry back to the free list */ static void free_pv_entry(pmap_t pmap, pv_entry_t pv) { vm_page_t m; struct pv_chunk *pc; int idx, field, bit; mtx_assert(&vm_page_queue_mtx, MA_OWNED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); PV_STAT(pv_entry_frees++); PV_STAT(pv_entry_spare++); pv_entry_count--; pc = pv_to_chunk(pv); idx = pv - &pc->pc_pventry[0]; field = idx / 64; bit = idx % 64; pc->pc_map[field] |= 1ul << bit; /* move to head of list */ TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1 || pc->pc_map[2] != PC_FREE2) return; PV_STAT(pv_entry_spare -= _NPCPV); PV_STAT(pc_chunk_count--); PV_STAT(pc_chunk_frees++); /* entire chunk is free, return it */ TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); dump_drop_page(m->phys_addr); vm_page_free(m); } /* * get a new pv_entry, allocating a block from the system * when needed. */ static pv_entry_t get_pv_entry(pmap_t pmap, int try) { static const struct timeval printinterval = { 60, 0 }; static struct timeval lastprint; static vm_pindex_t colour; int bit, field; pv_entry_t pv; struct pv_chunk *pc; vm_page_t m; PMAP_LOCK_ASSERT(pmap, MA_OWNED); mtx_assert(&vm_page_queue_mtx, MA_OWNED); PV_STAT(pv_entry_allocs++); pv_entry_count++; if (pv_entry_count > pv_entry_high_water) if (ratecheck(&lastprint, &printinterval)) printf("Approaching the limit on PV entries, consider " "increasing either the vm.pmap.shpgperproc or the " "vm.pmap.pv_entry_max sysctl.\n"); pc = TAILQ_FIRST(&pmap->pm_pvchunk); if (pc != NULL) { for (field = 0; field < _NPCM; field++) { if (pc->pc_map[field]) { bit = bsfq(pc->pc_map[field]); break; } } if (field < _NPCM) { pv = &pc->pc_pventry[field * 64 + bit]; pc->pc_map[field] &= ~(1ul << bit); /* If this was the last item, move it to tail */ if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); } PV_STAT(pv_entry_spare--); return (pv); } } /* No free items, allocate another chunk */ m = vm_page_alloc(NULL, colour, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ); if (m == NULL) { if (try) { pv_entry_count--; PV_STAT(pc_chunk_tryfail++); return (NULL); } /* * Reclaim pv entries: At first, destroy mappings to inactive * pages. After that, if a pv chunk entry is still needed, * destroy mappings to active pages. */ PV_STAT(pmap_collect_inactive++); pmap_collect(pmap, &vm_page_queues[PQ_INACTIVE]); m = vm_page_alloc(NULL, colour, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ); if (m == NULL) { PV_STAT(pmap_collect_active++); pmap_collect(pmap, &vm_page_queues[PQ_ACTIVE]); m = vm_page_alloc(NULL, colour, VM_ALLOC_SYSTEM | VM_ALLOC_NOOBJ); if (m == NULL) panic("get_pv_entry: increase vm.pmap.shpgperproc"); } } PV_STAT(pc_chunk_count++); PV_STAT(pc_chunk_allocs++); colour++; dump_add_page(m->phys_addr); pc = (void *)PHYS_TO_DMAP(m->phys_addr); pc->pc_pmap = pmap; pc->pc_map[0] = PC_FREE0 & ~1ul; /* preallocated bit 0 */ pc->pc_map[1] = PC_FREE1; pc->pc_map[2] = PC_FREE2; pv = &pc->pc_pventry[0]; TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); PV_STAT(pv_entry_spare += _NPCPV - 1); return (pv); } static void pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va) { pv_entry_t pv; PMAP_LOCK_ASSERT(pmap, MA_OWNED); mtx_assert(&vm_page_queue_mtx, MA_OWNED); TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { if (pmap == PV_PMAP(pv) && va == pv->pv_va) break; } KASSERT(pv != NULL, ("pmap_remove_entry: pv not found")); TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); m->md.pv_list_count--; if (TAILQ_EMPTY(&m->md.pv_list)) vm_page_flag_clear(m, PG_WRITEABLE); free_pv_entry(pmap, pv); } /* * Create a pv entry for page at pa for * (pmap, va). */ static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m) { pv_entry_t pv; PMAP_LOCK_ASSERT(pmap, MA_OWNED); mtx_assert(&vm_page_queue_mtx, MA_OWNED); pv = get_pv_entry(pmap, FALSE); pv->pv_va = va; TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); m->md.pv_list_count++; } /* * Conditionally create a pv entry. */ static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m) { pv_entry_t pv; PMAP_LOCK_ASSERT(pmap, MA_OWNED); mtx_assert(&vm_page_queue_mtx, MA_OWNED); if (pv_entry_count < pv_entry_high_water && (pv = get_pv_entry(pmap, TRUE)) != NULL) { pv->pv_va = va; TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); m->md.pv_list_count++; return (TRUE); } else return (FALSE); } /* * pmap_remove_pte: do the things to unmap a page in a process */ static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va, pd_entry_t ptepde, vm_page_t *free) { pt_entry_t oldpte; vm_page_t m; PMAP_LOCK_ASSERT(pmap, MA_OWNED); oldpte = pte_load_clear(ptq); if (oldpte & PG_W) pmap->pm_stats.wired_count -= 1; /* * Machines that don't support invlpg, also don't support * PG_G. */ if (oldpte & PG_G) pmap_invalidate_page(kernel_pmap, va); pmap->pm_stats.resident_count -= 1; if (oldpte & PG_MANAGED) { m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME); if (oldpte & PG_M) { KASSERT((oldpte & PG_RW), ("pmap_remove_pte: modified page not writable: va: %#lx, pte: %#lx", va, oldpte)); vm_page_dirty(m); } if (oldpte & PG_A) vm_page_flag_set(m, PG_REFERENCED); pmap_remove_entry(pmap, m, va); } return (pmap_unuse_pt(pmap, va, ptepde, free)); } /* * Remove a single page from a process address space */ static void pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, vm_page_t *free) { pt_entry_t *pte; PMAP_LOCK_ASSERT(pmap, MA_OWNED); if ((*pde & PG_V) == 0) return; pte = pmap_pde_to_pte(pde, va); if ((*pte & PG_V) == 0) return; pmap_remove_pte(pmap, pte, va, *pde, free); pmap_invalidate_page(pmap, va); } /* * Remove the given range of addresses from the specified map. * * It is assumed that the start and end are properly * rounded to the page size. */ void pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t va_next; pml4_entry_t *pml4e; pdp_entry_t *pdpe; pd_entry_t ptpaddr, *pde; pt_entry_t *pte; vm_page_t free = NULL; int anyvalid; /* * Perform an unsynchronized read. This is, however, safe. */ if (pmap->pm_stats.resident_count == 0) return; anyvalid = 0; vm_page_lock_queues(); PMAP_LOCK(pmap); /* * special handling of removing one page. a very * common operation and easy to short circuit some * code. */ if (sva + PAGE_SIZE == eva) { pde = pmap_pde(pmap, sva); if (pde && (*pde & PG_PS) == 0) { pmap_remove_page(pmap, sva, pde, &free); goto out; } } for (; sva < eva; sva = va_next) { if (pmap->pm_stats.resident_count == 0) break; pml4e = pmap_pml4e(pmap, sva); if ((*pml4e & PG_V) == 0) { va_next = (sva + NBPML4) & ~PML4MASK; continue; } pdpe = pmap_pml4e_to_pdpe(pml4e, sva); if ((*pdpe & PG_V) == 0) { va_next = (sva + NBPDP) & ~PDPMASK; continue; } /* * Calculate index for next page table. */ va_next = (sva + NBPDR) & ~PDRMASK; pde = pmap_pdpe_to_pde(pdpe, sva); ptpaddr = *pde; /* * Weed out invalid mappings. */ if (ptpaddr == 0) continue; /* * Check for large page. */ if ((ptpaddr & PG_PS) != 0) { *pde = 0; pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; pmap_unuse_pt(pmap, sva, *pdpe, &free); anyvalid = 1; continue; } /* * Limit our scan to either the end of the va represented * by the current page table page, or to the end of the * range being removed. */ if (va_next > eva) va_next = eva; for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, sva += PAGE_SIZE) { if (*pte == 0) continue; /* * The TLB entry for a PG_G mapping is invalidated * by pmap_remove_pte(). */ if ((*pte & PG_G) == 0) anyvalid = 1; if (pmap_remove_pte(pmap, pte, sva, ptpaddr, &free)) break; } } out: if (anyvalid) pmap_invalidate_all(pmap); vm_page_unlock_queues(); PMAP_UNLOCK(pmap); pmap_free_zero_pages(free); } /* * Routine: pmap_remove_all * Function: * Removes this physical page from * all physical maps in which it resides. * Reflects back modify bits to the pager. * * Notes: * Original versions of this routine were very * inefficient because they iteratively called * pmap_remove (slow...) */ void pmap_remove_all(vm_page_t m) { pv_entry_t pv; pmap_t pmap; pt_entry_t *pte, tpte; pd_entry_t ptepde; vm_page_t free; #if defined(PMAP_DIAGNOSTIC) /* * XXX This makes pmap_remove_all() illegal for non-managed pages! */ if (m->flags & PG_FICTITIOUS) { panic("pmap_remove_all: illegal for unmanaged page, va: 0x%lx", VM_PAGE_TO_PHYS(m)); } #endif mtx_assert(&vm_page_queue_mtx, MA_OWNED); while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pmap->pm_stats.resident_count--; pte = pmap_pte_pde(pmap, pv->pv_va, &ptepde); if (pte == NULL) { panic("null pte in pmap_remove_all"); } tpte = pte_load_clear(pte); if (tpte & PG_W) pmap->pm_stats.wired_count--; if (tpte & PG_A) vm_page_flag_set(m, PG_REFERENCED); /* * Update the vm_page_t clean and reference bits. */ if (tpte & PG_M) { KASSERT((tpte & PG_RW), ("pmap_remove_all: modified page not writable: va: %#lx, pte: %#lx", pv->pv_va, tpte)); vm_page_dirty(m); } free = NULL; pmap_unuse_pt(pmap, pv->pv_va, ptepde, &free); pmap_invalidate_page(pmap, pv->pv_va); pmap_free_zero_pages(free); TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); m->md.pv_list_count--; free_pv_entry(pmap, pv); PMAP_UNLOCK(pmap); } vm_page_flag_clear(m, PG_WRITEABLE); } /* * Set the physical protection on the * specified range of this map as requested. */ void pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) { vm_offset_t va_next; pml4_entry_t *pml4e; pdp_entry_t *pdpe; pd_entry_t ptpaddr, *pde; pt_entry_t *pte; int anychanged; if ((prot & VM_PROT_READ) == VM_PROT_NONE) { pmap_remove(pmap, sva, eva); return; } if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) == (VM_PROT_WRITE|VM_PROT_EXECUTE)) return; anychanged = 0; vm_page_lock_queues(); PMAP_LOCK(pmap); for (; sva < eva; sva = va_next) { pml4e = pmap_pml4e(pmap, sva); if ((*pml4e & PG_V) == 0) { va_next = (sva + NBPML4) & ~PML4MASK; continue; } pdpe = pmap_pml4e_to_pdpe(pml4e, sva); if ((*pdpe & PG_V) == 0) { va_next = (sva + NBPDP) & ~PDPMASK; continue; } va_next = (sva + NBPDR) & ~PDRMASK; pde = pmap_pdpe_to_pde(pdpe, sva); ptpaddr = *pde; /* * Weed out invalid mappings. */ if (ptpaddr == 0) continue; /* * Check for large page. */ if ((ptpaddr & PG_PS) != 0) { if ((prot & VM_PROT_WRITE) == 0) *pde &= ~(PG_M|PG_RW); if ((prot & VM_PROT_EXECUTE) == 0) *pde |= pg_nx; anychanged = 1; continue; } if (va_next > eva) va_next = eva; for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, sva += PAGE_SIZE) { pt_entry_t obits, pbits; vm_page_t m; retry: obits = pbits = *pte; if ((pbits & PG_V) == 0) continue; if (pbits & PG_MANAGED) { m = NULL; if (pbits & PG_A) { m = PHYS_TO_VM_PAGE(pbits & PG_FRAME); vm_page_flag_set(m, PG_REFERENCED); pbits &= ~PG_A; } if ((pbits & PG_M) != 0) { if (m == NULL) m = PHYS_TO_VM_PAGE(pbits & PG_FRAME); vm_page_dirty(m); } } if ((prot & VM_PROT_WRITE) == 0) pbits &= ~(PG_RW | PG_M); if ((prot & VM_PROT_EXECUTE) == 0) pbits |= pg_nx; if (pbits != obits) { if (!atomic_cmpset_long(pte, obits, pbits)) goto retry; if (obits & PG_G) pmap_invalidate_page(pmap, sva); else anychanged = 1; } } } if (anychanged) pmap_invalidate_all(pmap); vm_page_unlock_queues(); PMAP_UNLOCK(pmap); } /* * Insert the given physical page (p) at * the specified virtual address (v) in the * target physical map with the protection requested. * * If specified, the page will be wired down, meaning * that the related pte can not be reclaimed. * * NB: This is the only routine which MAY NOT lazy-evaluate * or lose information. That is, this routine must actually * insert this page into the given map NOW. */ void pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, boolean_t wired) { vm_paddr_t pa; pd_entry_t *pde; pt_entry_t *pte; vm_paddr_t opa; pt_entry_t origpte, newpte; vm_page_t mpte, om; boolean_t invlva; va = trunc_page(va); #ifdef PMAP_DIAGNOSTIC if (va > VM_MAX_KERNEL_ADDRESS) panic("pmap_enter: toobig"); if ((va >= UPT_MIN_ADDRESS) && (va < UPT_MAX_ADDRESS)) panic("pmap_enter: invalid to pmap_enter page table pages (va: 0x%lx)", va); #endif mpte = NULL; vm_page_lock_queues(); PMAP_LOCK(pmap); /* * In the case that a page table page is not * resident, we are creating it here. */ if (va < VM_MAXUSER_ADDRESS) { mpte = pmap_allocpte(pmap, va, M_WAITOK); } #if 0 && defined(PMAP_DIAGNOSTIC) else { pd_entry_t *pdeaddr = pmap_pde(pmap, va); origpte = *pdeaddr; if ((origpte & PG_V) == 0) { panic("pmap_enter: invalid kernel page table page, pde=%p, va=%p\n", origpte, va); } } #endif pde = pmap_pde(pmap, va); if (pde != NULL) { if ((*pde & PG_PS) != 0) panic("pmap_enter: attempted pmap_enter on 2MB page"); pte = pmap_pde_to_pte(pde, va); } else pte = NULL; /* * Page Directory table entry not valid, we need a new PT page */ if (pte == NULL) panic("pmap_enter: invalid page directory va=%#lx\n", va); pa = VM_PAGE_TO_PHYS(m); om = NULL; origpte = *pte; opa = origpte & PG_FRAME; /* * Mapping has not changed, must be protection or wiring change. */ if (origpte && (opa == pa)) { /* * Wiring change, just update stats. We don't worry about * wiring PT pages as they remain resident as long as there * are valid mappings in them. Hence, if a user page is wired, * the PT page will be also. */ if (wired && ((origpte & PG_W) == 0)) pmap->pm_stats.wired_count++; else if (!wired && (origpte & PG_W)) pmap->pm_stats.wired_count--; /* * Remove extra pte reference */ if (mpte) mpte->wire_count--; /* * We might be turning off write access to the page, * so we go ahead and sense modify status. */ if (origpte & PG_MANAGED) { om = m; pa |= PG_MANAGED; } goto validate; } /* * Mapping has changed, invalidate old range and fall through to * handle validating new mapping. */ if (opa) { if (origpte & PG_W) pmap->pm_stats.wired_count--; if (origpte & PG_MANAGED) { om = PHYS_TO_VM_PAGE(opa); pmap_remove_entry(pmap, om, va); } if (mpte != NULL) { mpte->wire_count--; KASSERT(mpte->wire_count > 0, ("pmap_enter: missing reference to page table page," " va: 0x%lx", va)); } } else pmap->pm_stats.resident_count++; /* * Enter on the PV list if part of our managed memory. */ if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0) { KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva, ("pmap_enter: managed mapping within the clean submap")); pmap_insert_entry(pmap, va, m); pa |= PG_MANAGED; } /* * Increment counters */ if (wired) pmap->pm_stats.wired_count++; validate: /* * Now validate mapping with desired protection/wiring. */ newpte = (pt_entry_t)(pa | PG_V); if ((prot & VM_PROT_WRITE) != 0) { newpte |= PG_RW; vm_page_flag_set(m, PG_WRITEABLE); } if ((prot & VM_PROT_EXECUTE) == 0) newpte |= pg_nx; if (wired) newpte |= PG_W; if (va < VM_MAXUSER_ADDRESS) newpte |= PG_U; if (pmap == kernel_pmap) newpte |= PG_G; /* * if the mapping or permission bits are different, we need * to update the pte. */ if ((origpte & ~(PG_M|PG_A)) != newpte) { if (origpte & PG_V) { invlva = FALSE; origpte = pte_load_store(pte, newpte | PG_A); if (origpte & PG_A) { if (origpte & PG_MANAGED) vm_page_flag_set(om, PG_REFERENCED); if (opa != VM_PAGE_TO_PHYS(m) || ((origpte & PG_NX) == 0 && (newpte & PG_NX))) invlva = TRUE; } if (origpte & PG_M) { KASSERT((origpte & PG_RW), ("pmap_enter: modified page not writable: va: %#lx, pte: %#lx", va, origpte)); if ((origpte & PG_MANAGED) != 0) vm_page_dirty(om); if ((newpte & PG_RW) == 0) invlva = TRUE; } if (invlva) pmap_invalidate_page(pmap, va); } else pte_store(pte, newpte | PG_A); } vm_page_unlock_queues(); PMAP_UNLOCK(pmap); } /* * Maps a sequence of resident pages belonging to the same object. * The sequence begins with the given page m_start. This page is * mapped at the given virtual address start. Each subsequent page is * mapped at a virtual address that is offset from start by the same * amount as the page is offset from m_start within the object. The * last page in the sequence is the page with the largest offset from * m_start that can be mapped at a virtual address less than the given * virtual address end. Not every virtual page between start and end * is mapped; only those for which a resident page exists with the * corresponding offset from m_start are mapped. */ void pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, vm_page_t m_start, vm_prot_t prot) { vm_page_t m, mpte; vm_pindex_t diff, psize; VM_OBJECT_LOCK_ASSERT(m_start->object, MA_OWNED); psize = atop(end - start); mpte = NULL; m = m_start; PMAP_LOCK(pmap); while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { mpte = pmap_enter_quick_locked(pmap, start + ptoa(diff), m, prot, mpte); m = TAILQ_NEXT(m, listq); } PMAP_UNLOCK(pmap); } /* * this code makes some *MAJOR* assumptions: * 1. Current pmap & pmap exists. * 2. Not wired. * 3. Read access. * 4. No page table pages. * but is *MUCH* faster than pmap_enter... */ void pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) { PMAP_LOCK(pmap); (void) pmap_enter_quick_locked(pmap, va, m, prot, NULL); PMAP_UNLOCK(pmap); } static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, vm_page_t mpte) { vm_page_t free; pt_entry_t *pte; vm_paddr_t pa; KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva || (m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) != 0, ("pmap_enter_quick_locked: managed mapping within the clean submap")); mtx_assert(&vm_page_queue_mtx, MA_OWNED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * In the case that a page table page is not * resident, we are creating it here. */ if (va < VM_MAXUSER_ADDRESS) { vm_pindex_t ptepindex; pd_entry_t *ptepa; /* * Calculate pagetable page index */ ptepindex = pmap_pde_pindex(va); if (mpte && (mpte->pindex == ptepindex)) { mpte->wire_count++; } else { /* * Get the page directory entry */ ptepa = pmap_pde(pmap, va); /* * If the page table page is mapped, we just increment * the hold count, and activate it. */ if (ptepa && (*ptepa & PG_V) != 0) { if (*ptepa & PG_PS) panic("pmap_enter_quick: unexpected mapping into 2MB page"); mpte = PHYS_TO_VM_PAGE(*ptepa & PG_FRAME); mpte->wire_count++; } else { mpte = _pmap_allocpte(pmap, ptepindex, M_NOWAIT); if (mpte == NULL) return (mpte); } } } else { mpte = NULL; } /* * This call to vtopte makes the assumption that we are * entering the page into the current pmap. In order to support * quick entry into any pmap, one would likely use pmap_pte. * But that isn't as quick as vtopte. */ pte = vtopte(va); if (*pte) { if (mpte != NULL) { mpte->wire_count--; mpte = NULL; } return (mpte); } /* * Enter on the PV list if part of our managed memory. */ if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0 && !pmap_try_insert_pv_entry(pmap, va, m)) { if (mpte != NULL) { free = NULL; if (pmap_unwire_pte_hold(pmap, va, mpte, &free)) { pmap_invalidate_page(pmap, va); pmap_free_zero_pages(free); } mpte = NULL; } return (mpte); } /* * Increment counters */ pmap->pm_stats.resident_count++; pa = VM_PAGE_TO_PHYS(m); if ((prot & VM_PROT_EXECUTE) == 0) pa |= pg_nx; /* * Now validate mapping with RO protection */ if (m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) pte_store(pte, pa | PG_V | PG_U); else pte_store(pte, pa | PG_V | PG_U | PG_MANAGED); return mpte; } /* * Make a temporary mapping for a physical address. This is only intended * to be used for panic dumps. */ void * pmap_kenter_temporary(vm_paddr_t pa, int i) { vm_offset_t va; va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE); pmap_kenter(va, pa); invlpg(va); return ((void *)crashdumpmap); } /* * This code maps large physical mmap regions into the * processor address space. Note that some shortcuts * are taken, but the code works. */ void pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, vm_pindex_t pindex, vm_size_t size) { vm_offset_t va; vm_page_t p, pdpg; VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); KASSERT(object->type == OBJT_DEVICE, ("pmap_object_init_pt: non-device object")); if (((addr & (NBPDR - 1)) == 0) && ((size & (NBPDR - 1)) == 0)) { vm_page_t m[1]; pd_entry_t ptepa, *pde; PMAP_LOCK(pmap); pde = pmap_pde(pmap, addr); if (pde != 0 && (*pde & PG_V) != 0) goto out; PMAP_UNLOCK(pmap); retry: p = vm_page_lookup(object, pindex); if (p != NULL) { if (vm_page_sleep_if_busy(p, FALSE, "init4p")) goto retry; } else { p = vm_page_alloc(object, pindex, VM_ALLOC_NORMAL); if (p == NULL) return; m[0] = p; if (vm_pager_get_pages(object, m, 1, 0) != VM_PAGER_OK) { vm_page_lock_queues(); vm_page_free(p); vm_page_unlock_queues(); return; } p = vm_page_lookup(object, pindex); vm_page_lock_queues(); vm_page_wakeup(p); vm_page_unlock_queues(); } ptepa = VM_PAGE_TO_PHYS(p); if (ptepa & (NBPDR - 1)) return; p->valid = VM_PAGE_BITS_ALL; PMAP_LOCK(pmap); for (va = addr; va < addr + size; va += NBPDR) { while ((pdpg = pmap_allocpde(pmap, va, M_NOWAIT)) == NULL) { PMAP_UNLOCK(pmap); vm_page_lock_queues(); vm_page_busy(p); vm_page_unlock_queues(); VM_OBJECT_UNLOCK(object); VM_WAIT; VM_OBJECT_LOCK(object); vm_page_lock_queues(); vm_page_wakeup(p); vm_page_unlock_queues(); PMAP_LOCK(pmap); } pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg)); pde = &pde[pmap_pde_index(va)]; if ((*pde & PG_V) == 0) { pde_store(pde, ptepa | PG_PS | PG_M | PG_A | PG_U | PG_RW | PG_V); pmap->pm_stats.resident_count += NBPDR / PAGE_SIZE; } else { pdpg->wire_count--; KASSERT(pdpg->wire_count > 0, ("pmap_object_init_pt: missing reference " "to page directory page, va: 0x%lx", va)); } ptepa += NBPDR; } pmap_invalidate_all(pmap); out: PMAP_UNLOCK(pmap); } } /* * Routine: pmap_change_wiring * Function: Change the wiring attribute for a map/virtual-address * pair. * In/out conditions: * The mapping must already exist in the pmap. */ void pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired) { pt_entry_t *pte; /* * Wiring is not a hardware characteristic so there is no need to * invalidate TLB. */ PMAP_LOCK(pmap); pte = pmap_pte(pmap, va); if (wired && (*pte & PG_W) == 0) { pmap->pm_stats.wired_count++; atomic_set_long(pte, PG_W); } else if (!wired && (*pte & PG_W) != 0) { pmap->pm_stats.wired_count--; atomic_clear_long(pte, PG_W); } PMAP_UNLOCK(pmap); } /* * Copy the range specified by src_addr/len * from the source map to the range dst_addr/len * in the destination map. * * This routine is only advisory and need not do anything. */ void pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, vm_offset_t src_addr) { vm_page_t free; vm_offset_t addr; vm_offset_t end_addr = src_addr + len; vm_offset_t va_next; if (dst_addr != src_addr) return; if (!pmap_is_current(src_pmap)) return; vm_page_lock_queues(); if (dst_pmap < src_pmap) { PMAP_LOCK(dst_pmap); PMAP_LOCK(src_pmap); } else { PMAP_LOCK(src_pmap); PMAP_LOCK(dst_pmap); } for (addr = src_addr; addr < end_addr; addr = va_next) { pt_entry_t *src_pte, *dst_pte; vm_page_t dstmpde, dstmpte, srcmpte; pml4_entry_t *pml4e; pdp_entry_t *pdpe; pd_entry_t srcptepaddr, *pde; if (addr >= UPT_MIN_ADDRESS) panic("pmap_copy: invalid to pmap_copy page tables"); pml4e = pmap_pml4e(src_pmap, addr); if ((*pml4e & PG_V) == 0) { va_next = (addr + NBPML4) & ~PML4MASK; continue; } pdpe = pmap_pml4e_to_pdpe(pml4e, addr); if ((*pdpe & PG_V) == 0) { va_next = (addr + NBPDP) & ~PDPMASK; continue; } va_next = (addr + NBPDR) & ~PDRMASK; pde = pmap_pdpe_to_pde(pdpe, addr); srcptepaddr = *pde; if (srcptepaddr == 0) continue; if (srcptepaddr & PG_PS) { dstmpde = pmap_allocpde(dst_pmap, addr, M_NOWAIT); if (dstmpde == NULL) break; pde = (pd_entry_t *) PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpde)); pde = &pde[pmap_pde_index(addr)]; if (*pde == 0) { *pde = srcptepaddr & ~PG_W; dst_pmap->pm_stats.resident_count += NBPDR / PAGE_SIZE; } else dstmpde->wire_count--; continue; } srcmpte = PHYS_TO_VM_PAGE(srcptepaddr & PG_FRAME); if (srcmpte->wire_count == 0) panic("pmap_copy: source page table page is unused"); if (va_next > end_addr) va_next = end_addr; src_pte = vtopte(addr); while (addr < va_next) { pt_entry_t ptetemp; ptetemp = *src_pte; /* * we only virtual copy managed pages */ if ((ptetemp & PG_MANAGED) != 0) { dstmpte = pmap_allocpte(dst_pmap, addr, M_NOWAIT); if (dstmpte == NULL) break; dst_pte = (pt_entry_t *) PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte)); dst_pte = &dst_pte[pmap_pte_index(addr)]; if (*dst_pte == 0 && pmap_try_insert_pv_entry(dst_pmap, addr, PHYS_TO_VM_PAGE(ptetemp & PG_FRAME))) { /* * Clear the wired, modified, and * accessed (referenced) bits * during the copy. */ *dst_pte = ptetemp & ~(PG_W | PG_M | PG_A); dst_pmap->pm_stats.resident_count++; } else { free = NULL; if (pmap_unwire_pte_hold(dst_pmap, addr, dstmpte, &free)) { pmap_invalidate_page(dst_pmap, addr); pmap_free_zero_pages(free); } } if (dstmpte->wire_count >= srcmpte->wire_count) break; } addr += PAGE_SIZE; src_pte++; } } vm_page_unlock_queues(); PMAP_UNLOCK(src_pmap); PMAP_UNLOCK(dst_pmap); } /* * pmap_zero_page zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. */ void pmap_zero_page(vm_page_t m) { vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); pagezero((void *)va); } /* * pmap_zero_page_area zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. * * off and size may not cover an area beyond a single hardware page. */ void pmap_zero_page_area(vm_page_t m, int off, int size) { vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); if (off == 0 && size == PAGE_SIZE) pagezero((void *)va); else bzero((char *)va + off, size); } /* * pmap_zero_page_idle zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. This * is intended to be called from the vm_pagezero process only and * outside of Giant. */ void pmap_zero_page_idle(vm_page_t m) { vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); pagezero((void *)va); } /* * pmap_copy_page copies the specified (machine independent) * page by mapping the page into virtual memory and using * bcopy to copy the page, one machine dependent page at a * time. */ void pmap_copy_page(vm_page_t msrc, vm_page_t mdst) { vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc)); vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst)); pagecopy((void *)src, (void *)dst); } /* * Returns true if the pmap's pv is one of the first * 16 pvs linked to from this page. This count may * be changed upwards or downwards in the future; it * is only necessary that true be returned for a small * subset of pmaps for proper page aging. */ boolean_t pmap_page_exists_quick(pmap_t pmap, vm_page_t m) { pv_entry_t pv; int loops = 0; if (m->flags & PG_FICTITIOUS) return FALSE; mtx_assert(&vm_page_queue_mtx, MA_OWNED); TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { if (PV_PMAP(pv) == pmap) { return TRUE; } loops++; if (loops >= 16) break; } return (FALSE); } /* * Remove all pages from specified address space * this aids process exit speeds. Also, this code * is special cased for current process only, but * can have the more generic (and slightly slower) * mode enabled. This is much faster than pmap_remove * in the case of running down an entire address space. */ void pmap_remove_pages(pmap_t pmap) { pt_entry_t *pte, tpte; vm_page_t m, free = NULL; pv_entry_t pv; struct pv_chunk *pc, *npc; int field, idx; int64_t bit; uint64_t inuse, bitmask; int allfree; if (pmap != vmspace_pmap(curthread->td_proc->p_vmspace)) { printf("warning: pmap_remove_pages called with non-current pmap\n"); return; } vm_page_lock_queues(); PMAP_LOCK(pmap); TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { allfree = 1; for (field = 0; field < _NPCM; field++) { inuse = (~(pc->pc_map[field])) & pc_freemask[field]; while (inuse != 0) { bit = bsfq(inuse); bitmask = 1UL << bit; idx = field * 64 + bit; pv = &pc->pc_pventry[idx]; inuse &= ~bitmask; pte = vtopte(pv->pv_va); tpte = *pte; if (tpte == 0) { printf( "TPTE at %p IS ZERO @ VA %08lx\n", pte, pv->pv_va); panic("bad pte"); } /* * We cannot remove wired pages from a process' mapping at this time */ if (tpte & PG_W) { allfree = 0; continue; } m = PHYS_TO_VM_PAGE(tpte & PG_FRAME); KASSERT(m->phys_addr == (tpte & PG_FRAME), ("vm_page_t %p phys_addr mismatch %016jx %016jx", m, (uintmax_t)m->phys_addr, (uintmax_t)tpte)); KASSERT(m < &vm_page_array[vm_page_array_size], ("pmap_remove_pages: bad tpte %#jx", (uintmax_t)tpte)); pmap->pm_stats.resident_count--; pte_clear(pte); /* * Update the vm_page_t clean/reference bits. */ if (tpte & PG_M) vm_page_dirty(m); /* Mark free */ PV_STAT(pv_entry_frees++); PV_STAT(pv_entry_spare++); pv_entry_count--; pc->pc_map[field] |= bitmask; m->md.pv_list_count--; TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); if (TAILQ_EMPTY(&m->md.pv_list)) vm_page_flag_clear(m, PG_WRITEABLE); pmap_unuse_pt(pmap, pv->pv_va, *vtopde(pv->pv_va), &free); } } if (allfree) { PV_STAT(pv_entry_spare -= _NPCPV); PV_STAT(pc_chunk_count--); PV_STAT(pc_chunk_frees++); TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); dump_drop_page(m->phys_addr); vm_page_free(m); } } pmap_invalidate_all(pmap); vm_page_unlock_queues(); PMAP_UNLOCK(pmap); pmap_free_zero_pages(free); } /* * pmap_is_modified: * * Return whether or not the specified physical page was modified * in any physical maps. */ boolean_t pmap_is_modified(vm_page_t m) { pv_entry_t pv; pt_entry_t *pte; pmap_t pmap; boolean_t rv; rv = FALSE; if (m->flags & PG_FICTITIOUS) return (rv); mtx_assert(&vm_page_queue_mtx, MA_OWNED); TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte = pmap_pte(pmap, pv->pv_va); rv = (*pte & PG_M) != 0; PMAP_UNLOCK(pmap); if (rv) break; } return (rv); } /* * pmap_is_prefaultable: * * Return whether or not the specified virtual address is elgible * for prefault. */ boolean_t pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) { pd_entry_t *pde; pt_entry_t *pte; boolean_t rv; rv = FALSE; PMAP_LOCK(pmap); pde = pmap_pde(pmap, addr); if (pde != NULL && (*pde & PG_V)) { pte = vtopte(addr); rv = (*pte & PG_V) == 0; } PMAP_UNLOCK(pmap); return (rv); } /* * Clear the write and modified bits in each of the given page's mappings. */ void pmap_remove_write(vm_page_t m) { pv_entry_t pv; pmap_t pmap; pt_entry_t oldpte, *pte; if ((m->flags & PG_FICTITIOUS) != 0 || (m->flags & PG_WRITEABLE) == 0) return; mtx_assert(&vm_page_queue_mtx, MA_OWNED); TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte = pmap_pte(pmap, pv->pv_va); retry: oldpte = *pte; if (oldpte & PG_RW) { if (!atomic_cmpset_long(pte, oldpte, oldpte & ~(PG_RW | PG_M))) goto retry; if ((oldpte & PG_M) != 0) vm_page_dirty(m); pmap_invalidate_page(pmap, pv->pv_va); } PMAP_UNLOCK(pmap); } vm_page_flag_clear(m, PG_WRITEABLE); } /* * pmap_ts_referenced: * * Return a count of reference bits for a page, clearing those bits. * It is not necessary for every reference bit to be cleared, but it * is necessary that 0 only be returned when there are truly no * reference bits set. * * XXX: The exact number of bits to check and clear is a matter that * should be tested and standardized at some point in the future for * optimal aging of shared pages. */ int pmap_ts_referenced(vm_page_t m) { pv_entry_t pv, pvf, pvn; pmap_t pmap; pt_entry_t *pte; int rtval = 0; if (m->flags & PG_FICTITIOUS) return (rtval); mtx_assert(&vm_page_queue_mtx, MA_OWNED); if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { pvf = pv; do { pvn = TAILQ_NEXT(pv, pv_list); TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte = pmap_pte(pmap, pv->pv_va); if ((*pte & PG_A) != 0) { atomic_clear_long(pte, PG_A); pmap_invalidate_page(pmap, pv->pv_va); rtval++; if (rtval > 4) pvn = NULL; } PMAP_UNLOCK(pmap); } while ((pv = pvn) != NULL && pv != pvf); } return (rtval); } /* * Clear the modify bits on the specified physical page. */ void pmap_clear_modify(vm_page_t m) { pv_entry_t pv; pmap_t pmap; pt_entry_t *pte; if ((m->flags & PG_FICTITIOUS) != 0) return; mtx_assert(&vm_page_queue_mtx, MA_OWNED); TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte = pmap_pte(pmap, pv->pv_va); if (*pte & PG_M) { atomic_clear_long(pte, PG_M); pmap_invalidate_page(pmap, pv->pv_va); } PMAP_UNLOCK(pmap); } } /* * pmap_clear_reference: * * Clear the reference bit on the specified physical page. */ void pmap_clear_reference(vm_page_t m) { pv_entry_t pv; pmap_t pmap; pt_entry_t *pte; if ((m->flags & PG_FICTITIOUS) != 0) return; mtx_assert(&vm_page_queue_mtx, MA_OWNED); TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte = pmap_pte(pmap, pv->pv_va); if (*pte & PG_A) { atomic_clear_long(pte, PG_A); pmap_invalidate_page(pmap, pv->pv_va); } PMAP_UNLOCK(pmap); } } /* * Miscellaneous support routines follow */ /* Adjust the cache mode for a 4KB page mapped via a PTE. */ static __inline void pmap_pte_attr(vm_offset_t va, int mode) { pt_entry_t *pte; u_int opte, npte; pte = vtopte(va); /* * The cache mode bits are all in the low 32-bits of the * PTE, so we can just spin on updating the low 32-bits. */ do { opte = *(u_int *)pte; npte = opte & ~(PG_PTE_PAT | PG_NC_PCD | PG_NC_PWT); npte |= pmap_cache_bits(mode, 0); } while (npte != opte && !atomic_cmpset_int((u_int *)pte, opte, npte)); } /* Adjust the cache mode for a 2MB page mapped via a PDE. */ static __inline void pmap_pde_attr(vm_offset_t va, int mode) { pd_entry_t *pde; u_int opde, npde; pde = pmap_pde(kernel_pmap, va); /* * The cache mode bits are all in the low 32-bits of the * PDE, so we can just spin on updating the low 32-bits. */ do { opde = *(u_int *)pde; npde = opde & ~(PG_PDE_PAT | PG_NC_PCD | PG_NC_PWT); npde |= pmap_cache_bits(mode, 1); } while (npde != opde && !atomic_cmpset_int((u_int *)pde, opde, npde)); } /* * Map a set of physical memory pages into the kernel virtual * address space. Return a pointer to where it is mapped. This * routine is intended to be used for mapping device memory, * NOT real memory. */ void * pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode) { vm_offset_t va, tmpva, offset; /* * If this fits within the direct map window and use WB caching * mode, use the direct map. */ if (pa < dmaplimit && (pa + size) < dmaplimit && mode == PAT_WRITE_BACK) return ((void *)PHYS_TO_DMAP(pa)); offset = pa & PAGE_MASK; size = roundup(offset + size, PAGE_SIZE); va = kmem_alloc_nofault(kernel_map, size); if (!va) panic("pmap_mapdev: Couldn't alloc kernel virtual memory"); pa = trunc_page(pa); for (tmpva = va; size > 0; ) { pmap_kenter_attr(tmpva, pa, mode); size -= PAGE_SIZE; tmpva += PAGE_SIZE; pa += PAGE_SIZE; } pmap_invalidate_range(kernel_pmap, va, tmpva); pmap_invalidate_cache(); return ((void *)(va + offset)); } void * pmap_mapdev(vm_paddr_t pa, vm_size_t size) { return (pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE)); } void * pmap_mapbios(vm_paddr_t pa, vm_size_t size) { return (pmap_mapdev_attr(pa, size, PAT_WRITE_BACK)); } void pmap_unmapdev(vm_offset_t va, vm_size_t size) { vm_offset_t base, offset, tmpva; /* If we gave a direct map region in pmap_mapdev, do nothing */ if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) return; base = trunc_page(va); offset = va & PAGE_MASK; size = roundup(offset + size, PAGE_SIZE); for (tmpva = base; tmpva < (base + size); tmpva += PAGE_SIZE) pmap_kremove(tmpva); pmap_invalidate_range(kernel_pmap, va, tmpva); kmem_free(kernel_map, base, size); } int pmap_change_attr(va, size, mode) vm_offset_t va; vm_size_t size; int mode; { vm_offset_t base, offset, tmpva; pd_entry_t *pde; pt_entry_t *pte; base = trunc_page(va); offset = va & PAGE_MASK; size = roundup(offset + size, PAGE_SIZE); /* Only supported on kernel virtual addresses. */ if (base <= VM_MAXUSER_ADDRESS) return (EINVAL); /* * XXX: We have to support tearing 2MB pages down into 4k pages if * needed here. */ /* Pages that aren't mapped aren't supported. */ for (tmpva = base; tmpva < (base + size); ) { pde = pmap_pde(kernel_pmap, tmpva); if (*pde == 0) return (EINVAL); if (*pde & PG_PS) { /* Handle 2MB pages that are completely contained. */ if (size >= NBPDR) { tmpva += NBPDR; continue; } return (EINVAL); } pte = vtopte(va); if (*pte == 0) return (EINVAL); tmpva += PAGE_SIZE; } /* * Ok, all the pages exist, so run through them updating their * cache mode. */ for (tmpva = base; size > 0; ) { pde = pmap_pde(kernel_pmap, tmpva); if (*pde & PG_PS) { pmap_pde_attr(tmpva, mode); tmpva += NBPDR; size -= NBPDR; } else { pmap_pte_attr(tmpva, mode); tmpva += PAGE_SIZE; size -= PAGE_SIZE; } } /* * Flush CPU caches to make sure any data isn't cached that shouldn't * be, etc. */ pmap_invalidate_range(kernel_pmap, base, tmpva); pmap_invalidate_cache(); return (0); } /* * perform the pmap work for mincore */ int pmap_mincore(pmap_t pmap, vm_offset_t addr) { pt_entry_t *ptep, pte; vm_page_t m; int val = 0; PMAP_LOCK(pmap); ptep = pmap_pte(pmap, addr); pte = (ptep != NULL) ? *ptep : 0; PMAP_UNLOCK(pmap); if (pte != 0) { vm_paddr_t pa; val = MINCORE_INCORE; if ((pte & PG_MANAGED) == 0) return val; pa = pte & PG_FRAME; m = PHYS_TO_VM_PAGE(pa); /* * Modified by us */ if (pte & PG_M) val |= MINCORE_MODIFIED|MINCORE_MODIFIED_OTHER; else { /* * Modified by someone else */ vm_page_lock_queues(); if (m->dirty || pmap_is_modified(m)) val |= MINCORE_MODIFIED_OTHER; vm_page_unlock_queues(); } /* * Referenced by us */ if (pte & PG_A) val |= MINCORE_REFERENCED|MINCORE_REFERENCED_OTHER; else { /* * Referenced by someone else */ vm_page_lock_queues(); if ((m->flags & PG_REFERENCED) || pmap_ts_referenced(m)) { val |= MINCORE_REFERENCED_OTHER; vm_page_flag_set(m, PG_REFERENCED); } vm_page_unlock_queues(); } } return val; } void pmap_activate(struct thread *td) { pmap_t pmap, oldpmap; u_int64_t cr3; critical_enter(); pmap = vmspace_pmap(td->td_proc->p_vmspace); oldpmap = PCPU_GET(curpmap); #ifdef SMP if (oldpmap) /* XXX FIXME */ atomic_clear_int(&oldpmap->pm_active, PCPU_GET(cpumask)); atomic_set_int(&pmap->pm_active, PCPU_GET(cpumask)); #else if (oldpmap) /* XXX FIXME */ oldpmap->pm_active &= ~PCPU_GET(cpumask); pmap->pm_active |= PCPU_GET(cpumask); #endif cr3 = vtophys(pmap->pm_pml4); td->td_pcb->pcb_cr3 = cr3; load_cr3(cr3); critical_exit(); } vm_offset_t pmap_addr_hint(vm_object_t obj, vm_offset_t addr, vm_size_t size) { if ((obj == NULL) || (size < NBPDR) || (obj->type != OBJT_DEVICE)) { return addr; } addr = (addr + (NBPDR - 1)) & ~(NBPDR - 1); return addr; } Index: head/sys/arm/arm/pmap.c =================================================================== --- head/sys/arm/arm/pmap.c (revision 173360) +++ head/sys/arm/arm/pmap.c (revision 173361) @@ -1,4905 +1,4906 @@ /* From: $NetBSD: pmap.c,v 1.148 2004/04/03 04:35:48 bsh Exp $ */ /*- * Copyright 2004 Olivier Houchard. * Copyright 2003 Wasabi Systems, Inc. * All rights reserved. * * Written by Steve C. Woodford for Wasabi Systems, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed for the NetBSD Project by * Wasabi Systems, Inc. * 4. The name of Wasabi Systems, Inc. may not be used to endorse * or promote products derived from this software without specific prior * written permission. * * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /*- * Copyright (c) 2002-2003 Wasabi Systems, Inc. * Copyright (c) 2001 Richard Earnshaw * Copyright (c) 2001-2002 Christopher Gilbert * All rights reserved. * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the company nor the name of the author may be used to * endorse or promote products derived from this software without specific * prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /*- * Copyright (c) 1999 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Charles M. Hannum. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the NetBSD * Foundation, Inc. and its contributors. * 4. Neither the name of The NetBSD Foundation nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /*- * Copyright (c) 1994-1998 Mark Brinicombe. * Copyright (c) 1994 Brini. * All rights reserved. * * This code is derived from software written for Brini by Mark Brinicombe * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Mark Brinicombe. * 4. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * * RiscBSD kernel project * * pmap.c * * Machine dependant vm stuff * * Created : 20/09/94 */ /* * Special compilation symbols * PMAP_DEBUG - Build in pmap_debug_level code */ /* Include header files */ #include "opt_vm.h" #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef PMAP_DEBUG #define PDEBUG(_lev_,_stat_) \ if (pmap_debug_level >= (_lev_)) \ ((_stat_)) #define dprintf printf int pmap_debug_level = 0; #define PMAP_INLINE #else /* PMAP_DEBUG */ #define PDEBUG(_lev_,_stat_) /* Nothing */ #define dprintf(x, arg...) #define PMAP_INLINE __inline #endif /* PMAP_DEBUG */ extern struct pv_addr systempage; /* * Internal function prototypes */ static void pmap_free_pv_entry (pv_entry_t); static pv_entry_t pmap_get_pv_entry(void); static void pmap_enter_locked(pmap_t, vm_offset_t, vm_page_t, vm_prot_t, boolean_t, int); static void pmap_vac_me_harder(struct vm_page *, pmap_t, vm_offset_t); static void pmap_vac_me_kpmap(struct vm_page *, pmap_t, vm_offset_t); static void pmap_vac_me_user(struct vm_page *, pmap_t, vm_offset_t); static void pmap_alloc_l1(pmap_t); static void pmap_free_l1(pmap_t); static void pmap_use_l1(pmap_t); static int pmap_clearbit(struct vm_page *, u_int); static struct l2_bucket *pmap_get_l2_bucket(pmap_t, vm_offset_t); static struct l2_bucket *pmap_alloc_l2_bucket(pmap_t, vm_offset_t); static void pmap_free_l2_bucket(pmap_t, struct l2_bucket *, u_int); static vm_offset_t kernel_pt_lookup(vm_paddr_t); static MALLOC_DEFINE(M_VMPMAP, "pmap", "PMAP L1"); vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ vm_offset_t pmap_curmaxkvaddr; vm_paddr_t kernel_l1pa; extern void *end; vm_offset_t kernel_vm_end = 0; struct pmap kernel_pmap_store; pmap_t kernel_pmap; static pt_entry_t *csrc_pte, *cdst_pte; static vm_offset_t csrcp, cdstp; static struct mtx cmtx; static void pmap_init_l1(struct l1_ttable *, pd_entry_t *); /* * These routines are called when the CPU type is identified to set up * the PTE prototypes, cache modes, etc. * * The variables are always here, just in case LKMs need to reference * them (though, they shouldn't). */ pt_entry_t pte_l1_s_cache_mode; pt_entry_t pte_l1_s_cache_mode_pt; pt_entry_t pte_l1_s_cache_mask; pt_entry_t pte_l2_l_cache_mode; pt_entry_t pte_l2_l_cache_mode_pt; pt_entry_t pte_l2_l_cache_mask; pt_entry_t pte_l2_s_cache_mode; pt_entry_t pte_l2_s_cache_mode_pt; pt_entry_t pte_l2_s_cache_mask; pt_entry_t pte_l2_s_prot_u; pt_entry_t pte_l2_s_prot_w; pt_entry_t pte_l2_s_prot_mask; pt_entry_t pte_l1_s_proto; pt_entry_t pte_l1_c_proto; pt_entry_t pte_l2_s_proto; void (*pmap_copy_page_func)(vm_paddr_t, vm_paddr_t); void (*pmap_zero_page_func)(vm_paddr_t, int, int); /* * Which pmap is currently 'live' in the cache * * XXXSCW: Fix for SMP ... */ union pmap_cache_state *pmap_cache_state; struct msgbuf *msgbufp = 0; extern void bcopy_page(vm_offset_t, vm_offset_t); extern void bzero_page(vm_offset_t); extern vm_offset_t alloc_firstaddr; char *_tmppt; /* * Metadata for L1 translation tables. */ struct l1_ttable { /* Entry on the L1 Table list */ SLIST_ENTRY(l1_ttable) l1_link; /* Entry on the L1 Least Recently Used list */ TAILQ_ENTRY(l1_ttable) l1_lru; /* Track how many domains are allocated from this L1 */ volatile u_int l1_domain_use_count; /* * A free-list of domain numbers for this L1. * We avoid using ffs() and a bitmap to track domains since ffs() * is slow on ARM. */ u_int8_t l1_domain_first; u_int8_t l1_domain_free[PMAP_DOMAINS]; /* Physical address of this L1 page table */ vm_paddr_t l1_physaddr; /* KVA of this L1 page table */ pd_entry_t *l1_kva; }; /* * Convert a virtual address into its L1 table index. That is, the * index used to locate the L2 descriptor table pointer in an L1 table. * This is basically used to index l1->l1_kva[]. * * Each L2 descriptor table represents 1MB of VA space. */ #define L1_IDX(va) (((vm_offset_t)(va)) >> L1_S_SHIFT) /* * L1 Page Tables are tracked using a Least Recently Used list. * - New L1s are allocated from the HEAD. * - Freed L1s are added to the TAIl. * - Recently accessed L1s (where an 'access' is some change to one of * the userland pmaps which owns this L1) are moved to the TAIL. */ static TAILQ_HEAD(, l1_ttable) l1_lru_list; /* * A list of all L1 tables */ static SLIST_HEAD(, l1_ttable) l1_list; static struct mtx l1_lru_lock; /* * The l2_dtable tracks L2_BUCKET_SIZE worth of L1 slots. * * This is normally 16MB worth L2 page descriptors for any given pmap. * Reference counts are maintained for L2 descriptors so they can be * freed when empty. */ struct l2_dtable { /* The number of L2 page descriptors allocated to this l2_dtable */ u_int l2_occupancy; /* List of L2 page descriptors */ struct l2_bucket { pt_entry_t *l2b_kva; /* KVA of L2 Descriptor Table */ vm_paddr_t l2b_phys; /* Physical address of same */ u_short l2b_l1idx; /* This L2 table's L1 index */ u_short l2b_occupancy; /* How many active descriptors */ } l2_bucket[L2_BUCKET_SIZE]; }; /* pmap_kenter_internal flags */ #define KENTER_CACHE 0x1 #define KENTER_USER 0x2 /* * Given an L1 table index, calculate the corresponding l2_dtable index * and bucket index within the l2_dtable. */ #define L2_IDX(l1idx) (((l1idx) >> L2_BUCKET_LOG2) & \ (L2_SIZE - 1)) #define L2_BUCKET(l1idx) ((l1idx) & (L2_BUCKET_SIZE - 1)) /* * Given a virtual address, this macro returns the * virtual address required to drop into the next L2 bucket. */ #define L2_NEXT_BUCKET(va) (((va) & L1_S_FRAME) + L1_S_SIZE) /* * L2 allocation. */ #define pmap_alloc_l2_dtable() \ (void*)uma_zalloc(l2table_zone, M_NOWAIT|M_USE_RESERVE) #define pmap_free_l2_dtable(l2) \ uma_zfree(l2table_zone, l2) /* * We try to map the page tables write-through, if possible. However, not * all CPUs have a write-through cache mode, so on those we have to sync * the cache when we frob page tables. * * We try to evaluate this at compile time, if possible. However, it's * not always possible to do that, hence this run-time var. */ int pmap_needs_pte_sync; /* * Macro to determine if a mapping might be resident in the * instruction cache and/or TLB */ #define PV_BEEN_EXECD(f) (((f) & (PVF_REF | PVF_EXEC)) == (PVF_REF | PVF_EXEC)) /* * Macro to determine if a mapping might be resident in the * data cache and/or TLB */ #define PV_BEEN_REFD(f) (((f) & PVF_REF) != 0) #ifndef PMAP_SHPGPERPROC #define PMAP_SHPGPERPROC 200 #endif #define pmap_is_current(pm) ((pm) == pmap_kernel() || \ curproc->p_vmspace->vm_map.pmap == (pm)) static uma_zone_t pvzone; uma_zone_t l2zone; static uma_zone_t l2table_zone; static vm_offset_t pmap_kernel_l2dtable_kva; static vm_offset_t pmap_kernel_l2ptp_kva; static vm_paddr_t pmap_kernel_l2ptp_phys; static struct vm_object pvzone_obj; static int pv_entry_count=0, pv_entry_max=0, pv_entry_high_water=0; /* * This list exists for the benefit of pmap_map_chunk(). It keeps track * of the kernel L2 tables during bootstrap, so that pmap_map_chunk() can * find them as necessary. * * Note that the data on this list MUST remain valid after initarm() returns, * as pmap_bootstrap() uses it to contruct L2 table metadata. */ SLIST_HEAD(, pv_addr) kernel_pt_list = SLIST_HEAD_INITIALIZER(kernel_pt_list); static void pmap_init_l1(struct l1_ttable *l1, pd_entry_t *l1pt) { int i; l1->l1_kva = l1pt; l1->l1_domain_use_count = 0; l1->l1_domain_first = 1; for (i = 0; i < PMAP_DOMAINS; i++) l1->l1_domain_free[i] = i + 2; /* * Copy the kernel's L1 entries to each new L1. */ if (l1pt != pmap_kernel()->pm_l1->l1_kva) memcpy(l1pt, pmap_kernel()->pm_l1->l1_kva, L1_TABLE_SIZE); if ((l1->l1_physaddr = pmap_extract(pmap_kernel(), (vm_offset_t)l1pt)) == 0) panic("pmap_init_l1: can't get PA of L1 at %p", l1pt); SLIST_INSERT_HEAD(&l1_list, l1, l1_link); TAILQ_INSERT_TAIL(&l1_lru_list, l1, l1_lru); } static vm_offset_t kernel_pt_lookup(vm_paddr_t pa) { struct pv_addr *pv; SLIST_FOREACH(pv, &kernel_pt_list, pv_list) { if (pv->pv_pa == pa) return (pv->pv_va); } return (0); } #if (ARM_MMU_GENERIC + ARM_MMU_SA1) != 0 void pmap_pte_init_generic(void) { pte_l1_s_cache_mode = L1_S_B|L1_S_C; pte_l1_s_cache_mask = L1_S_CACHE_MASK_generic; pte_l2_l_cache_mode = L2_B|L2_C; pte_l2_l_cache_mask = L2_L_CACHE_MASK_generic; pte_l2_s_cache_mode = L2_B|L2_C; pte_l2_s_cache_mask = L2_S_CACHE_MASK_generic; /* * If we have a write-through cache, set B and C. If * we have a write-back cache, then we assume setting * only C will make those pages write-through. */ if (cpufuncs.cf_dcache_wb_range == (void *) cpufunc_nullop) { pte_l1_s_cache_mode_pt = L1_S_B|L1_S_C; pte_l2_l_cache_mode_pt = L2_B|L2_C; pte_l2_s_cache_mode_pt = L2_B|L2_C; } else { pte_l1_s_cache_mode_pt = L1_S_C; pte_l2_l_cache_mode_pt = L2_C; pte_l2_s_cache_mode_pt = L2_C; } pte_l2_s_prot_u = L2_S_PROT_U_generic; pte_l2_s_prot_w = L2_S_PROT_W_generic; pte_l2_s_prot_mask = L2_S_PROT_MASK_generic; pte_l1_s_proto = L1_S_PROTO_generic; pte_l1_c_proto = L1_C_PROTO_generic; pte_l2_s_proto = L2_S_PROTO_generic; pmap_copy_page_func = pmap_copy_page_generic; pmap_zero_page_func = pmap_zero_page_generic; } #if defined(CPU_ARM8) void pmap_pte_init_arm8(void) { /* * ARM8 is compatible with generic, but we need to use * the page tables uncached. */ pmap_pte_init_generic(); pte_l1_s_cache_mode_pt = 0; pte_l2_l_cache_mode_pt = 0; pte_l2_s_cache_mode_pt = 0; } #endif /* CPU_ARM8 */ #if defined(CPU_ARM9) && defined(ARM9_CACHE_WRITE_THROUGH) void pmap_pte_init_arm9(void) { /* * ARM9 is compatible with generic, but we want to use * write-through caching for now. */ pmap_pte_init_generic(); pte_l1_s_cache_mode = L1_S_C; pte_l2_l_cache_mode = L2_C; pte_l2_s_cache_mode = L2_C; pte_l1_s_cache_mode_pt = L1_S_C; pte_l2_l_cache_mode_pt = L2_C; pte_l2_s_cache_mode_pt = L2_C; } #endif /* CPU_ARM9 */ #endif /* (ARM_MMU_GENERIC + ARM_MMU_SA1) != 0 */ #if defined(CPU_ARM10) void pmap_pte_init_arm10(void) { /* * ARM10 is compatible with generic, but we want to use * write-through caching for now. */ pmap_pte_init_generic(); pte_l1_s_cache_mode = L1_S_B | L1_S_C; pte_l2_l_cache_mode = L2_B | L2_C; pte_l2_s_cache_mode = L2_B | L2_C; pte_l1_s_cache_mode_pt = L1_S_C; pte_l2_l_cache_mode_pt = L2_C; pte_l2_s_cache_mode_pt = L2_C; } #endif /* CPU_ARM10 */ #if ARM_MMU_SA1 == 1 void pmap_pte_init_sa1(void) { /* * The StrongARM SA-1 cache does not have a write-through * mode. So, do the generic initialization, then reset * the page table cache mode to B=1,C=1, and note that * the PTEs need to be sync'd. */ pmap_pte_init_generic(); pte_l1_s_cache_mode_pt = L1_S_B|L1_S_C; pte_l2_l_cache_mode_pt = L2_B|L2_C; pte_l2_s_cache_mode_pt = L2_B|L2_C; pmap_needs_pte_sync = 1; } #endif /* ARM_MMU_SA1 == 1*/ #if ARM_MMU_XSCALE == 1 #if (ARM_NMMUS > 1) || defined (CPU_XSCALE_CORE3) static u_int xscale_use_minidata; #endif void pmap_pte_init_xscale(void) { uint32_t auxctl; int write_through = 0; pte_l1_s_cache_mode = L1_S_B|L1_S_C|L1_S_XSCALE_P; pte_l1_s_cache_mask = L1_S_CACHE_MASK_xscale; pte_l2_l_cache_mode = L2_B|L2_C; pte_l2_l_cache_mask = L2_L_CACHE_MASK_xscale; pte_l2_s_cache_mode = L2_B|L2_C; pte_l2_s_cache_mask = L2_S_CACHE_MASK_xscale; pte_l1_s_cache_mode_pt = L1_S_C; pte_l2_l_cache_mode_pt = L2_C; pte_l2_s_cache_mode_pt = L2_C; #ifdef XSCALE_CACHE_READ_WRITE_ALLOCATE /* * The XScale core has an enhanced mode where writes that * miss the cache cause a cache line to be allocated. This * is significantly faster than the traditional, write-through * behavior of this case. */ pte_l1_s_cache_mode |= L1_S_XSCALE_TEX(TEX_XSCALE_X); pte_l2_l_cache_mode |= L2_XSCALE_L_TEX(TEX_XSCALE_X); pte_l2_s_cache_mode |= L2_XSCALE_T_TEX(TEX_XSCALE_X); #endif /* XSCALE_CACHE_READ_WRITE_ALLOCATE */ #ifdef XSCALE_CACHE_WRITE_THROUGH /* * Some versions of the XScale core have various bugs in * their cache units, the work-around for which is to run * the cache in write-through mode. Unfortunately, this * has a major (negative) impact on performance. So, we * go ahead and run fast-and-loose, in the hopes that we * don't line up the planets in a way that will trip the * bugs. * * However, we give you the option to be slow-but-correct. */ write_through = 1; #elif defined(XSCALE_CACHE_WRITE_BACK) /* force write back cache mode */ write_through = 0; #elif defined(CPU_XSCALE_PXA2X0) /* * Intel PXA2[15]0 processors are known to have a bug in * write-back cache on revision 4 and earlier (stepping * A[01] and B[012]). Fixed for C0 and later. */ { uint32_t id, type; id = cpufunc_id(); type = id & ~(CPU_ID_XSCALE_COREREV_MASK|CPU_ID_REVISION_MASK); if (type == CPU_ID_PXA250 || type == CPU_ID_PXA210) { if ((id & CPU_ID_REVISION_MASK) < 5) { /* write through for stepping A0-1 and B0-2 */ write_through = 1; } } } #endif /* XSCALE_CACHE_WRITE_THROUGH */ if (write_through) { pte_l1_s_cache_mode = L1_S_C; pte_l2_l_cache_mode = L2_C; pte_l2_s_cache_mode = L2_C; } #if (ARM_NMMUS > 1) xscale_use_minidata = 1; #endif pte_l2_s_prot_u = L2_S_PROT_U_xscale; pte_l2_s_prot_w = L2_S_PROT_W_xscale; pte_l2_s_prot_mask = L2_S_PROT_MASK_xscale; pte_l1_s_proto = L1_S_PROTO_xscale; pte_l1_c_proto = L1_C_PROTO_xscale; pte_l2_s_proto = L2_S_PROTO_xscale; #ifdef CPU_XSCALE_CORE3 pmap_copy_page_func = pmap_copy_page_generic; pmap_zero_page_func = pmap_zero_page_generic; xscale_use_minidata = 0; /* Make sure it is L2-cachable */ pte_l1_s_cache_mode |= L1_S_XSCALE_TEX(TEX_XSCALE_T); pte_l1_s_cache_mode_pt = pte_l1_s_cache_mode &~ L1_S_XSCALE_P; pte_l2_l_cache_mode |= L2_XSCALE_L_TEX(TEX_XSCALE_T) ; pte_l2_l_cache_mode_pt = pte_l1_s_cache_mode; pte_l2_s_cache_mode |= L2_XSCALE_T_TEX(TEX_XSCALE_T); pte_l2_s_cache_mode_pt = pte_l2_s_cache_mode; #else pmap_copy_page_func = pmap_copy_page_xscale; pmap_zero_page_func = pmap_zero_page_xscale; #endif /* * Disable ECC protection of page table access, for now. */ __asm __volatile("mrc p15, 0, %0, c1, c0, 1" : "=r" (auxctl)); auxctl &= ~XSCALE_AUXCTL_P; __asm __volatile("mcr p15, 0, %0, c1, c0, 1" : : "r" (auxctl)); } /* * xscale_setup_minidata: * * Set up the mini-data cache clean area. We require the * caller to allocate the right amount of physically and * virtually contiguous space. */ extern vm_offset_t xscale_minidata_clean_addr; extern vm_size_t xscale_minidata_clean_size; /* already initialized */ void xscale_setup_minidata(vm_offset_t l1pt, vm_offset_t va, vm_paddr_t pa) { pd_entry_t *pde = (pd_entry_t *) l1pt; pt_entry_t *pte; vm_size_t size; uint32_t auxctl; xscale_minidata_clean_addr = va; /* Round it to page size. */ size = (xscale_minidata_clean_size + L2_S_OFFSET) & L2_S_FRAME; for (; size != 0; va += L2_S_SIZE, pa += L2_S_SIZE, size -= L2_S_SIZE) { pte = (pt_entry_t *) kernel_pt_lookup( pde[L1_IDX(va)] & L1_C_ADDR_MASK); if (pte == NULL) panic("xscale_setup_minidata: can't find L2 table for " "VA 0x%08x", (u_int32_t) va); pte[l2pte_index(va)] = L2_S_PROTO | pa | L2_S_PROT(PTE_KERNEL, VM_PROT_READ) | L2_C | L2_XSCALE_T_TEX(TEX_XSCALE_X); } /* * Configure the mini-data cache for write-back with * read/write-allocate. * * NOTE: In order to reconfigure the mini-data cache, we must * make sure it contains no valid data! In order to do that, * we must issue a global data cache invalidate command! * * WE ASSUME WE ARE RUNNING UN-CACHED WHEN THIS ROUTINE IS CALLED! * THIS IS VERY IMPORTANT! */ /* Invalidate data and mini-data. */ __asm __volatile("mcr p15, 0, %0, c7, c6, 0" : : "r" (0)); __asm __volatile("mrc p15, 0, %0, c1, c0, 1" : "=r" (auxctl)); auxctl = (auxctl & ~XSCALE_AUXCTL_MD_MASK) | XSCALE_AUXCTL_MD_WB_RWA; __asm __volatile("mcr p15, 0, %0, c1, c0, 1" : : "r" (auxctl)); } #endif /* * Allocate an L1 translation table for the specified pmap. * This is called at pmap creation time. */ static void pmap_alloc_l1(pmap_t pm) { struct l1_ttable *l1; u_int8_t domain; /* * Remove the L1 at the head of the LRU list */ mtx_lock(&l1_lru_lock); l1 = TAILQ_FIRST(&l1_lru_list); TAILQ_REMOVE(&l1_lru_list, l1, l1_lru); /* * Pick the first available domain number, and update * the link to the next number. */ domain = l1->l1_domain_first; l1->l1_domain_first = l1->l1_domain_free[domain]; /* * If there are still free domain numbers in this L1, * put it back on the TAIL of the LRU list. */ if (++l1->l1_domain_use_count < PMAP_DOMAINS) TAILQ_INSERT_TAIL(&l1_lru_list, l1, l1_lru); mtx_unlock(&l1_lru_lock); /* * Fix up the relevant bits in the pmap structure */ pm->pm_l1 = l1; pm->pm_domain = domain; } /* * Free an L1 translation table. * This is called at pmap destruction time. */ static void pmap_free_l1(pmap_t pm) { struct l1_ttable *l1 = pm->pm_l1; mtx_lock(&l1_lru_lock); /* * If this L1 is currently on the LRU list, remove it. */ if (l1->l1_domain_use_count < PMAP_DOMAINS) TAILQ_REMOVE(&l1_lru_list, l1, l1_lru); /* * Free up the domain number which was allocated to the pmap */ l1->l1_domain_free[pm->pm_domain] = l1->l1_domain_first; l1->l1_domain_first = pm->pm_domain; l1->l1_domain_use_count--; /* * The L1 now must have at least 1 free domain, so add * it back to the LRU list. If the use count is zero, * put it at the head of the list, otherwise it goes * to the tail. */ if (l1->l1_domain_use_count == 0) { TAILQ_INSERT_HEAD(&l1_lru_list, l1, l1_lru); } else TAILQ_INSERT_TAIL(&l1_lru_list, l1, l1_lru); mtx_unlock(&l1_lru_lock); } static PMAP_INLINE void pmap_use_l1(pmap_t pm) { struct l1_ttable *l1; /* * Do nothing if we're in interrupt context. * Access to an L1 by the kernel pmap must not affect * the LRU list. */ if (pm == pmap_kernel()) return; l1 = pm->pm_l1; /* * If the L1 is not currently on the LRU list, just return */ if (l1->l1_domain_use_count == PMAP_DOMAINS) return; mtx_lock(&l1_lru_lock); /* * Check the use count again, now that we've acquired the lock */ if (l1->l1_domain_use_count == PMAP_DOMAINS) { mtx_unlock(&l1_lru_lock); return; } /* * Move the L1 to the back of the LRU list */ TAILQ_REMOVE(&l1_lru_list, l1, l1_lru); TAILQ_INSERT_TAIL(&l1_lru_list, l1, l1_lru); mtx_unlock(&l1_lru_lock); } /* * Returns a pointer to the L2 bucket associated with the specified pmap * and VA, or NULL if no L2 bucket exists for the address. */ static PMAP_INLINE struct l2_bucket * pmap_get_l2_bucket(pmap_t pm, vm_offset_t va) { struct l2_dtable *l2; struct l2_bucket *l2b; u_short l1idx; l1idx = L1_IDX(va); if ((l2 = pm->pm_l2[L2_IDX(l1idx)]) == NULL || (l2b = &l2->l2_bucket[L2_BUCKET(l1idx)])->l2b_kva == NULL) return (NULL); return (l2b); } /* * Returns a pointer to the L2 bucket associated with the specified pmap * and VA. * * If no L2 bucket exists, perform the necessary allocations to put an L2 * bucket/page table in place. * * Note that if a new L2 bucket/page was allocated, the caller *must* * increment the bucket occupancy counter appropriately *before* * releasing the pmap's lock to ensure no other thread or cpu deallocates * the bucket/page in the meantime. */ static struct l2_bucket * pmap_alloc_l2_bucket(pmap_t pm, vm_offset_t va) { struct l2_dtable *l2; struct l2_bucket *l2b; u_short l1idx; l1idx = L1_IDX(va); PMAP_ASSERT_LOCKED(pm); mtx_assert(&vm_page_queue_mtx, MA_OWNED); if ((l2 = pm->pm_l2[L2_IDX(l1idx)]) == NULL) { /* * No mapping at this address, as there is * no entry in the L1 table. * Need to allocate a new l2_dtable. */ again_l2table: PMAP_UNLOCK(pm); vm_page_unlock_queues(); if ((l2 = pmap_alloc_l2_dtable()) == NULL) { vm_page_lock_queues(); PMAP_LOCK(pm); return (NULL); } vm_page_lock_queues(); PMAP_LOCK(pm); if (pm->pm_l2[L2_IDX(l1idx)] != NULL) { PMAP_UNLOCK(pm); vm_page_unlock_queues(); uma_zfree(l2table_zone, l2); vm_page_lock_queues(); PMAP_LOCK(pm); l2 = pm->pm_l2[L2_IDX(l1idx)]; if (l2 == NULL) goto again_l2table; /* * Someone already allocated the l2_dtable while * we were doing the same. */ } else { bzero(l2, sizeof(*l2)); /* * Link it into the parent pmap */ pm->pm_l2[L2_IDX(l1idx)] = l2; } } l2b = &l2->l2_bucket[L2_BUCKET(l1idx)]; /* * Fetch pointer to the L2 page table associated with the address. */ if (l2b->l2b_kva == NULL) { pt_entry_t *ptep; /* * No L2 page table has been allocated. Chances are, this * is because we just allocated the l2_dtable, above. */ again_ptep: PMAP_UNLOCK(pm); vm_page_unlock_queues(); ptep = (void*)uma_zalloc(l2zone, M_NOWAIT|M_USE_RESERVE); vm_page_lock_queues(); PMAP_LOCK(pm); if (l2b->l2b_kva != 0) { /* We lost the race. */ PMAP_UNLOCK(pm); vm_page_unlock_queues(); uma_zfree(l2zone, ptep); vm_page_lock_queues(); PMAP_LOCK(pm); if (l2b->l2b_kva == 0) goto again_ptep; return (l2b); } l2b->l2b_phys = vtophys(ptep); if (ptep == NULL) { /* * Oops, no more L2 page tables available at this * time. We may need to deallocate the l2_dtable * if we allocated a new one above. */ if (l2->l2_occupancy == 0) { pm->pm_l2[L2_IDX(l1idx)] = NULL; pmap_free_l2_dtable(l2); } return (NULL); } l2->l2_occupancy++; l2b->l2b_kva = ptep; l2b->l2b_l1idx = l1idx; } return (l2b); } static PMAP_INLINE void #ifndef PMAP_INCLUDE_PTE_SYNC pmap_free_l2_ptp(pt_entry_t *l2) #else pmap_free_l2_ptp(boolean_t need_sync, pt_entry_t *l2) #endif { #ifdef PMAP_INCLUDE_PTE_SYNC /* * Note: With a write-back cache, we may need to sync this * L2 table before re-using it. * This is because it may have belonged to a non-current * pmap, in which case the cache syncs would have been * skipped when the pages were being unmapped. If the * L2 table were then to be immediately re-allocated to * the *current* pmap, it may well contain stale mappings * which have not yet been cleared by a cache write-back * and so would still be visible to the mmu. */ if (need_sync) PTE_SYNC_RANGE(l2, L2_TABLE_SIZE_REAL / sizeof(pt_entry_t)); #endif uma_zfree(l2zone, l2); } /* * One or more mappings in the specified L2 descriptor table have just been * invalidated. * * Garbage collect the metadata and descriptor table itself if necessary. * * The pmap lock must be acquired when this is called (not necessary * for the kernel pmap). */ static void pmap_free_l2_bucket(pmap_t pm, struct l2_bucket *l2b, u_int count) { struct l2_dtable *l2; pd_entry_t *pl1pd, l1pd; pt_entry_t *ptep; u_short l1idx; /* * Update the bucket's reference count according to how many * PTEs the caller has just invalidated. */ l2b->l2b_occupancy -= count; /* * Note: * * Level 2 page tables allocated to the kernel pmap are never freed * as that would require checking all Level 1 page tables and * removing any references to the Level 2 page table. See also the * comment elsewhere about never freeing bootstrap L2 descriptors. * * We make do with just invalidating the mapping in the L2 table. * * This isn't really a big deal in practice and, in fact, leads * to a performance win over time as we don't need to continually * alloc/free. */ if (l2b->l2b_occupancy > 0 || pm == pmap_kernel()) return; /* * There are no more valid mappings in this level 2 page table. * Go ahead and NULL-out the pointer in the bucket, then * free the page table. */ l1idx = l2b->l2b_l1idx; ptep = l2b->l2b_kva; l2b->l2b_kva = NULL; pl1pd = &pm->pm_l1->l1_kva[l1idx]; /* * If the L1 slot matches the pmap's domain * number, then invalidate it. */ l1pd = *pl1pd & (L1_TYPE_MASK | L1_C_DOM_MASK); if (l1pd == (L1_C_DOM(pm->pm_domain) | L1_TYPE_C)) { *pl1pd = 0; PTE_SYNC(pl1pd); } /* * Release the L2 descriptor table back to the pool cache. */ #ifndef PMAP_INCLUDE_PTE_SYNC pmap_free_l2_ptp(ptep); #else pmap_free_l2_ptp(!pmap_is_current(pm), ptep); #endif /* * Update the reference count in the associated l2_dtable */ l2 = pm->pm_l2[L2_IDX(l1idx)]; if (--l2->l2_occupancy > 0) return; /* * There are no more valid mappings in any of the Level 1 * slots managed by this l2_dtable. Go ahead and NULL-out * the pointer in the parent pmap and free the l2_dtable. */ pm->pm_l2[L2_IDX(l1idx)] = NULL; pmap_free_l2_dtable(l2); } /* * Pool cache constructors for L2 descriptor tables, metadata and pmap * structures. */ static int pmap_l2ptp_ctor(void *mem, int size, void *arg, int flags) { #ifndef PMAP_INCLUDE_PTE_SYNC struct l2_bucket *l2b; pt_entry_t *ptep, pte; #ifdef ARM_USE_SMALL_ALLOC pd_entry_t *pde; #endif vm_offset_t va = (vm_offset_t)mem & ~PAGE_MASK; /* * The mappings for these page tables were initially made using * pmap_kenter() by the pool subsystem. Therefore, the cache- * mode will not be right for page table mappings. To avoid * polluting the pmap_kenter() code with a special case for * page tables, we simply fix up the cache-mode here if it's not * correct. */ #ifdef ARM_USE_SMALL_ALLOC pde = &kernel_pmap->pm_l1->l1_kva[L1_IDX(va)]; if (!l1pte_section_p(*pde)) { #endif l2b = pmap_get_l2_bucket(pmap_kernel(), va); ptep = &l2b->l2b_kva[l2pte_index(va)]; pte = *ptep; if ((pte & L2_S_CACHE_MASK) != pte_l2_s_cache_mode_pt) { /* * Page tables must have the cache-mode set to * Write-Thru. */ *ptep = (pte & ~L2_S_CACHE_MASK) | pte_l2_s_cache_mode_pt; PTE_SYNC(ptep); cpu_tlb_flushD_SE(va); cpu_cpwait(); } #ifdef ARM_USE_SMALL_ALLOC } #endif #endif memset(mem, 0, L2_TABLE_SIZE_REAL); PTE_SYNC_RANGE(mem, L2_TABLE_SIZE_REAL / sizeof(pt_entry_t)); return (0); } /* * A bunch of routines to conditionally flush the caches/TLB depending * on whether the specified pmap actually needs to be flushed at any * given time. */ static PMAP_INLINE void pmap_tlb_flushID_SE(pmap_t pm, vm_offset_t va) { if (pmap_is_current(pm)) cpu_tlb_flushID_SE(va); } static PMAP_INLINE void pmap_tlb_flushD_SE(pmap_t pm, vm_offset_t va) { if (pmap_is_current(pm)) cpu_tlb_flushD_SE(va); } static PMAP_INLINE void pmap_tlb_flushID(pmap_t pm) { if (pmap_is_current(pm)) cpu_tlb_flushID(); } static PMAP_INLINE void pmap_tlb_flushD(pmap_t pm) { if (pmap_is_current(pm)) cpu_tlb_flushD(); } static PMAP_INLINE void pmap_idcache_wbinv_range(pmap_t pm, vm_offset_t va, vm_size_t len) { if (pmap_is_current(pm)) cpu_idcache_wbinv_range(va, len); } static PMAP_INLINE void pmap_dcache_wb_range(pmap_t pm, vm_offset_t va, vm_size_t len, boolean_t do_inv, boolean_t rd_only) { if (pmap_is_current(pm)) { if (do_inv) { if (rd_only) cpu_dcache_inv_range(va, len); else cpu_dcache_wbinv_range(va, len); } else if (!rd_only) cpu_dcache_wb_range(va, len); } } static PMAP_INLINE void pmap_idcache_wbinv_all(pmap_t pm) { if (pmap_is_current(pm)) cpu_idcache_wbinv_all(); } static PMAP_INLINE void pmap_dcache_wbinv_all(pmap_t pm) { if (pmap_is_current(pm)) cpu_dcache_wbinv_all(); } /* * PTE_SYNC_CURRENT: * * Make sure the pte is written out to RAM. * We need to do this for one of two cases: * - We're dealing with the kernel pmap * - There is no pmap active in the cache/tlb. * - The specified pmap is 'active' in the cache/tlb. */ #ifdef PMAP_INCLUDE_PTE_SYNC #define PTE_SYNC_CURRENT(pm, ptep) \ do { \ if (PMAP_NEEDS_PTE_SYNC && \ pmap_is_current(pm)) \ PTE_SYNC(ptep); \ } while (/*CONSTCOND*/0) #else #define PTE_SYNC_CURRENT(pm, ptep) /* nothing */ #endif /* * Since we have a virtually indexed cache, we may need to inhibit caching if * there is more than one mapping and at least one of them is writable. * Since we purge the cache on every context switch, we only need to check for * other mappings within the same pmap, or kernel_pmap. * This function is also called when a page is unmapped, to possibly reenable * caching on any remaining mappings. * * The code implements the following logic, where: * * KW = # of kernel read/write pages * KR = # of kernel read only pages * UW = # of user read/write pages * UR = # of user read only pages * * KC = kernel mapping is cacheable * UC = user mapping is cacheable * * KW=0,KR=0 KW=0,KR>0 KW=1,KR=0 KW>1,KR>=0 * +--------------------------------------------- * UW=0,UR=0 | --- KC=1 KC=1 KC=0 * UW=0,UR>0 | UC=1 KC=1,UC=1 KC=0,UC=0 KC=0,UC=0 * UW=1,UR=0 | UC=1 KC=0,UC=0 KC=0,UC=0 KC=0,UC=0 * UW>1,UR>=0 | UC=0 KC=0,UC=0 KC=0,UC=0 KC=0,UC=0 */ static const int pmap_vac_flags[4][4] = { {-1, 0, 0, PVF_KNC}, {0, 0, PVF_NC, PVF_NC}, {0, PVF_NC, PVF_NC, PVF_NC}, {PVF_UNC, PVF_NC, PVF_NC, PVF_NC} }; static PMAP_INLINE int pmap_get_vac_flags(const struct vm_page *pg) { int kidx, uidx; kidx = 0; if (pg->md.kro_mappings || pg->md.krw_mappings > 1) kidx |= 1; if (pg->md.krw_mappings) kidx |= 2; uidx = 0; if (pg->md.uro_mappings || pg->md.urw_mappings > 1) uidx |= 1; if (pg->md.urw_mappings) uidx |= 2; return (pmap_vac_flags[uidx][kidx]); } static __inline void pmap_vac_me_harder(struct vm_page *pg, pmap_t pm, vm_offset_t va) { int nattr; mtx_assert(&vm_page_queue_mtx, MA_OWNED); nattr = pmap_get_vac_flags(pg); if (nattr < 0) { pg->md.pvh_attrs &= ~PVF_NC; return; } if (nattr == 0 && (pg->md.pvh_attrs & PVF_NC) == 0) { return; } if (pm == pmap_kernel()) pmap_vac_me_kpmap(pg, pm, va); else pmap_vac_me_user(pg, pm, va); pg->md.pvh_attrs = (pg->md.pvh_attrs & ~PVF_NC) | nattr; } static void pmap_vac_me_kpmap(struct vm_page *pg, pmap_t pm, vm_offset_t va) { u_int u_cacheable, u_entries; struct pv_entry *pv; pmap_t last_pmap = pm; /* * Pass one, see if there are both kernel and user pmaps for * this page. Calculate whether there are user-writable or * kernel-writable pages. */ u_cacheable = 0; TAILQ_FOREACH(pv, &pg->md.pv_list, pv_list) { if (pv->pv_pmap != pm && (pv->pv_flags & PVF_NC) == 0) u_cacheable++; } u_entries = pg->md.urw_mappings + pg->md.uro_mappings; /* * We know we have just been updating a kernel entry, so if * all user pages are already cacheable, then there is nothing * further to do. */ if (pg->md.k_mappings == 0 && u_cacheable == u_entries) return; if (u_entries) { /* * Scan over the list again, for each entry, if it * might not be set correctly, call pmap_vac_me_user * to recalculate the settings. */ TAILQ_FOREACH(pv, &pg->md.pv_list, pv_list) { /* * We know kernel mappings will get set * correctly in other calls. We also know * that if the pmap is the same as last_pmap * then we've just handled this entry. */ if (pv->pv_pmap == pm || pv->pv_pmap == last_pmap) continue; /* * If there are kernel entries and this page * is writable but non-cacheable, then we can * skip this entry also. */ if (pg->md.k_mappings && (pv->pv_flags & (PVF_NC | PVF_WRITE)) == (PVF_NC | PVF_WRITE)) continue; /* * Similarly if there are no kernel-writable * entries and the page is already * read-only/cacheable. */ if (pg->md.krw_mappings == 0 && (pv->pv_flags & (PVF_NC | PVF_WRITE)) == 0) continue; /* * For some of the remaining cases, we know * that we must recalculate, but for others we * can't tell if they are correct or not, so * we recalculate anyway. */ pmap_vac_me_user(pg, (last_pmap = pv->pv_pmap), 0); } if (pg->md.k_mappings == 0) return; } pmap_vac_me_user(pg, pm, va); } static void pmap_vac_me_user(struct vm_page *pg, pmap_t pm, vm_offset_t va) { pmap_t kpmap = pmap_kernel(); struct pv_entry *pv, *npv; struct l2_bucket *l2b; pt_entry_t *ptep, pte; u_int entries = 0; u_int writable = 0; u_int cacheable_entries = 0; u_int kern_cacheable = 0; u_int other_writable = 0; /* * Count mappings and writable mappings in this pmap. * Include kernel mappings as part of our own. * Keep a pointer to the first one. */ npv = TAILQ_FIRST(&pg->md.pv_list); TAILQ_FOREACH(pv, &pg->md.pv_list, pv_list) { /* Count mappings in the same pmap */ if (pm == pv->pv_pmap || kpmap == pv->pv_pmap) { if (entries++ == 0) npv = pv; /* Cacheable mappings */ if ((pv->pv_flags & PVF_NC) == 0) { cacheable_entries++; if (kpmap == pv->pv_pmap) kern_cacheable++; } /* Writable mappings */ if (pv->pv_flags & PVF_WRITE) ++writable; } else if (pv->pv_flags & PVF_WRITE) other_writable = 1; } /* * Enable or disable caching as necessary. * Note: the first entry might be part of the kernel pmap, * so we can't assume this is indicative of the state of the * other (maybe non-kpmap) entries. */ if ((entries > 1 && writable) || (entries > 0 && pm == kpmap && other_writable)) { if (cacheable_entries == 0) return; for (pv = npv; pv; pv = TAILQ_NEXT(pv, pv_list)) { if ((pm != pv->pv_pmap && kpmap != pv->pv_pmap) || (pv->pv_flags & PVF_NC)) continue; pv->pv_flags |= PVF_NC; l2b = pmap_get_l2_bucket(pv->pv_pmap, pv->pv_va); ptep = &l2b->l2b_kva[l2pte_index(pv->pv_va)]; pte = *ptep & ~L2_S_CACHE_MASK; if ((va != pv->pv_va || pm != pv->pv_pmap) && l2pte_valid(pte)) { if (PV_BEEN_EXECD(pv->pv_flags)) { pmap_idcache_wbinv_range(pv->pv_pmap, pv->pv_va, PAGE_SIZE); pmap_tlb_flushID_SE(pv->pv_pmap, pv->pv_va); } else if (PV_BEEN_REFD(pv->pv_flags)) { pmap_dcache_wb_range(pv->pv_pmap, pv->pv_va, PAGE_SIZE, TRUE, (pv->pv_flags & PVF_WRITE) == 0); pmap_tlb_flushD_SE(pv->pv_pmap, pv->pv_va); } } *ptep = pte; PTE_SYNC_CURRENT(pv->pv_pmap, ptep); } cpu_cpwait(); } else if (entries > cacheable_entries) { /* * Turn cacheing back on for some pages. If it is a kernel * page, only do so if there are no other writable pages. */ for (pv = npv; pv; pv = TAILQ_NEXT(pv, pv_list)) { if (!(pv->pv_flags & PVF_NC) || (pm != pv->pv_pmap && (kpmap != pv->pv_pmap || other_writable))) continue; pv->pv_flags &= ~PVF_NC; l2b = pmap_get_l2_bucket(pv->pv_pmap, pv->pv_va); ptep = &l2b->l2b_kva[l2pte_index(pv->pv_va)]; pte = (*ptep & ~L2_S_CACHE_MASK) | pte_l2_s_cache_mode; if (l2pte_valid(pte)) { if (PV_BEEN_EXECD(pv->pv_flags)) { pmap_tlb_flushID_SE(pv->pv_pmap, pv->pv_va); } else if (PV_BEEN_REFD(pv->pv_flags)) { pmap_tlb_flushD_SE(pv->pv_pmap, pv->pv_va); } } *ptep = pte; PTE_SYNC_CURRENT(pv->pv_pmap, ptep); } } } /* * Modify pte bits for all ptes corresponding to the given physical address. * We use `maskbits' rather than `clearbits' because we're always passing * constants and the latter would require an extra inversion at run-time. */ static int pmap_clearbit(struct vm_page *pg, u_int maskbits) { struct l2_bucket *l2b; struct pv_entry *pv; pt_entry_t *ptep, npte, opte; pmap_t pm; vm_offset_t va; u_int oflags; int count = 0; mtx_assert(&vm_page_queue_mtx, MA_OWNED); /* * Clear saved attributes (modify, reference) */ pg->md.pvh_attrs &= ~(maskbits & (PVF_MOD | PVF_REF)); if (TAILQ_EMPTY(&pg->md.pv_list)) { return (0); } /* * Loop over all current mappings setting/clearing as appropos */ TAILQ_FOREACH(pv, &pg->md.pv_list, pv_list) { va = pv->pv_va; pm = pv->pv_pmap; oflags = pv->pv_flags; pv->pv_flags &= ~maskbits; PMAP_LOCK(pm); l2b = pmap_get_l2_bucket(pm, va); ptep = &l2b->l2b_kva[l2pte_index(va)]; npte = opte = *ptep; if (maskbits & (PVF_WRITE|PVF_MOD)) { if ((pv->pv_flags & PVF_NC)) { /* * Entry is not cacheable: * * Don't turn caching on again if this is a * modified emulation. This would be * inconsitent with the settings created by * pmap_vac_me_harder(). Otherwise, it's safe * to re-enable cacheing. * * There's no need to call pmap_vac_me_harder() * here: all pages are losing their write * permission. */ if (maskbits & PVF_WRITE) { npte |= pte_l2_s_cache_mode; pv->pv_flags &= ~PVF_NC; } } else if (opte & L2_S_PROT_W) { vm_page_dirty(pg); /* * Entry is writable/cacheable: check if pmap * is current if it is flush it, otherwise it * won't be in the cache */ if (PV_BEEN_EXECD(oflags)) pmap_idcache_wbinv_range(pm, pv->pv_va, PAGE_SIZE); else if (PV_BEEN_REFD(oflags)) pmap_dcache_wb_range(pm, pv->pv_va, PAGE_SIZE, (maskbits & PVF_REF) ? TRUE : FALSE, FALSE); } /* make the pte read only */ npte &= ~L2_S_PROT_W; if (maskbits & PVF_WRITE) { /* * Keep alias accounting up to date */ if (pv->pv_pmap == pmap_kernel()) { if (oflags & PVF_WRITE) { pg->md.krw_mappings--; pg->md.kro_mappings++; } } else if (oflags & PVF_WRITE) { pg->md.urw_mappings--; pg->md.uro_mappings++; } } } if (maskbits & PVF_REF) { if ((pv->pv_flags & PVF_NC) == 0 && (maskbits & (PVF_WRITE|PVF_MOD)) == 0) { /* * Check npte here; we may have already * done the wbinv above, and the validity * of the PTE is the same for opte and * npte. */ if (npte & L2_S_PROT_W) { if (PV_BEEN_EXECD(oflags)) pmap_idcache_wbinv_range(pm, pv->pv_va, PAGE_SIZE); else if (PV_BEEN_REFD(oflags)) pmap_dcache_wb_range(pm, pv->pv_va, PAGE_SIZE, TRUE, FALSE); } else if ((npte & L2_TYPE_MASK) != L2_TYPE_INV) { /* XXXJRT need idcache_inv_range */ if (PV_BEEN_EXECD(oflags)) pmap_idcache_wbinv_range(pm, pv->pv_va, PAGE_SIZE); else if (PV_BEEN_REFD(oflags)) pmap_dcache_wb_range(pm, pv->pv_va, PAGE_SIZE, TRUE, TRUE); } } /* * Make the PTE invalid so that we will take a * page fault the next time the mapping is * referenced. */ npte &= ~L2_TYPE_MASK; npte |= L2_TYPE_INV; } if (npte != opte) { count++; *ptep = npte; PTE_SYNC(ptep); /* Flush the TLB entry if a current pmap. */ if (PV_BEEN_EXECD(oflags)) pmap_tlb_flushID_SE(pm, pv->pv_va); else if (PV_BEEN_REFD(oflags)) pmap_tlb_flushD_SE(pm, pv->pv_va); } PMAP_UNLOCK(pm); } if (maskbits & PVF_WRITE) vm_page_flag_clear(pg, PG_WRITEABLE); return (count); } /* * main pv_entry manipulation functions: * pmap_enter_pv: enter a mapping onto a vm_page list * pmap_remove_pv: remove a mappiing from a vm_page list * * NOTE: pmap_enter_pv expects to lock the pvh itself * pmap_remove_pv expects te caller to lock the pvh before calling */ /* * pmap_enter_pv: enter a mapping onto a vm_page lst * * => caller should hold the proper lock on pmap_main_lock * => caller should have pmap locked * => we will gain the lock on the vm_page and allocate the new pv_entry * => caller should adjust ptp's wire_count before calling * => caller should not adjust pmap's wire_count */ static void pmap_enter_pv(struct vm_page *pg, struct pv_entry *pve, pmap_t pm, vm_offset_t va, u_int flags) { mtx_assert(&vm_page_queue_mtx, MA_OWNED); PMAP_ASSERT_LOCKED(pm); pve->pv_pmap = pm; pve->pv_va = va; pve->pv_flags = flags; TAILQ_INSERT_HEAD(&pg->md.pv_list, pve, pv_list); TAILQ_INSERT_HEAD(&pm->pm_pvlist, pve, pv_plist); pg->md.pvh_attrs |= flags & (PVF_REF | PVF_MOD); if (pm == pmap_kernel()) { if (flags & PVF_WRITE) pg->md.krw_mappings++; else pg->md.kro_mappings++; } if (flags & PVF_WRITE) pg->md.urw_mappings++; else pg->md.uro_mappings++; pg->md.pv_list_count++; if (pve->pv_flags & PVF_WIRED) ++pm->pm_stats.wired_count; vm_page_flag_set(pg, PG_REFERENCED); } /* * * pmap_find_pv: Find a pv entry * * => caller should hold lock on vm_page */ static PMAP_INLINE struct pv_entry * pmap_find_pv(struct vm_page *pg, pmap_t pm, vm_offset_t va) { struct pv_entry *pv; mtx_assert(&vm_page_queue_mtx, MA_OWNED); TAILQ_FOREACH(pv, &pg->md.pv_list, pv_list) if (pm == pv->pv_pmap && va == pv->pv_va) break; return (pv); } /* * vector_page_setprot: * * Manipulate the protection of the vector page. */ void vector_page_setprot(int prot) { struct l2_bucket *l2b; pt_entry_t *ptep; l2b = pmap_get_l2_bucket(pmap_kernel(), vector_page); ptep = &l2b->l2b_kva[l2pte_index(vector_page)]; *ptep = (*ptep & ~L1_S_PROT_MASK) | L2_S_PROT(PTE_KERNEL, prot); PTE_SYNC(ptep); cpu_tlb_flushD_SE(vector_page); cpu_cpwait(); } /* * pmap_remove_pv: try to remove a mapping from a pv_list * * => caller should hold proper lock on pmap_main_lock * => pmap should be locked * => caller should hold lock on vm_page [so that attrs can be adjusted] * => caller should adjust ptp's wire_count and free PTP if needed * => caller should NOT adjust pmap's wire_count * => we return the removed pve */ static void pmap_nuke_pv(struct vm_page *pg, pmap_t pm, struct pv_entry *pve) { mtx_assert(&vm_page_queue_mtx, MA_OWNED); PMAP_ASSERT_LOCKED(pm); TAILQ_REMOVE(&pg->md.pv_list, pve, pv_list); TAILQ_REMOVE(&pm->pm_pvlist, pve, pv_plist); if (pve->pv_flags & PVF_WIRED) --pm->pm_stats.wired_count; pg->md.pv_list_count--; if (pg->md.pvh_attrs & PVF_MOD) vm_page_dirty(pg); if (pm == pmap_kernel()) { if (pve->pv_flags & PVF_WRITE) pg->md.krw_mappings--; else pg->md.kro_mappings--; } else if (pve->pv_flags & PVF_WRITE) pg->md.urw_mappings--; else pg->md.uro_mappings--; if (TAILQ_FIRST(&pg->md.pv_list) == NULL || (pg->md.krw_mappings == 0 && pg->md.urw_mappings == 0)) { pg->md.pvh_attrs &= ~PVF_MOD; if (TAILQ_FIRST(&pg->md.pv_list) == NULL) pg->md.pvh_attrs &= ~PVF_REF; vm_page_flag_clear(pg, PG_WRITEABLE); } if (TAILQ_FIRST(&pg->md.pv_list)) vm_page_flag_set(pg, PG_REFERENCED); if (pve->pv_flags & PVF_WRITE) pmap_vac_me_harder(pg, pm, 0); } static struct pv_entry * pmap_remove_pv(struct vm_page *pg, pmap_t pm, vm_offset_t va) { struct pv_entry *pve; mtx_assert(&vm_page_queue_mtx, MA_OWNED); pve = TAILQ_FIRST(&pg->md.pv_list); while (pve) { if (pve->pv_pmap == pm && pve->pv_va == va) { /* match? */ pmap_nuke_pv(pg, pm, pve); break; } pve = TAILQ_NEXT(pve, pv_list); } return(pve); /* return removed pve */ } /* * * pmap_modify_pv: Update pv flags * * => caller should hold lock on vm_page [so that attrs can be adjusted] * => caller should NOT adjust pmap's wire_count * => caller must call pmap_vac_me_harder() if writable status of a page * may have changed. * => we return the old flags * * Modify a physical-virtual mapping in the pv table */ static u_int pmap_modify_pv(struct vm_page *pg, pmap_t pm, vm_offset_t va, u_int clr_mask, u_int set_mask) { struct pv_entry *npv; u_int flags, oflags; PMAP_ASSERT_LOCKED(pm); mtx_assert(&vm_page_queue_mtx, MA_OWNED); if ((npv = pmap_find_pv(pg, pm, va)) == NULL) return (0); /* * There is at least one VA mapping this page. */ if (clr_mask & (PVF_REF | PVF_MOD)) pg->md.pvh_attrs |= set_mask & (PVF_REF | PVF_MOD); oflags = npv->pv_flags; npv->pv_flags = flags = (oflags & ~clr_mask) | set_mask; if ((flags ^ oflags) & PVF_WIRED) { if (flags & PVF_WIRED) ++pm->pm_stats.wired_count; else --pm->pm_stats.wired_count; } if ((flags ^ oflags) & PVF_WRITE) { if (pm == pmap_kernel()) { if (flags & PVF_WRITE) { pg->md.krw_mappings++; pg->md.kro_mappings--; } else { pg->md.kro_mappings++; pg->md.krw_mappings--; } } else if (flags & PVF_WRITE) { pg->md.urw_mappings++; pg->md.uro_mappings--; } else { pg->md.uro_mappings++; pg->md.urw_mappings--; } if (pg->md.krw_mappings == 0 && pg->md.urw_mappings == 0) { pg->md.pvh_attrs &= ~PVF_MOD; vm_page_flag_clear(pg, PG_WRITEABLE); } pmap_vac_me_harder(pg, pm, 0); } return (oflags); } /* Function to set the debug level of the pmap code */ #ifdef PMAP_DEBUG void pmap_debug(int level) { pmap_debug_level = level; dprintf("pmap_debug: level=%d\n", pmap_debug_level); } #endif /* PMAP_DEBUG */ void pmap_pinit0(struct pmap *pmap) { PDEBUG(1, printf("pmap_pinit0: pmap = %08x\n", (u_int32_t) pmap)); dprintf("pmap_pinit0: pmap = %08x, pm_pdir = %08x\n", (u_int32_t) pmap, (u_int32_t) pmap->pm_pdir); bcopy(kernel_pmap, pmap, sizeof(*pmap)); bzero(&pmap->pm_mtx, sizeof(pmap->pm_mtx)); PMAP_LOCK_INIT(pmap); } /* * Initialize a vm_page's machine-dependent fields. */ void pmap_page_init(vm_page_t m) { TAILQ_INIT(&m->md.pv_list); m->md.pv_list_count = 0; } /* * Initialize the pmap module. * Called by vm_init, to initialize any structures that the pmap * system needs to map virtual memory. */ void pmap_init(void) { int shpgperproc = PMAP_SHPGPERPROC; PDEBUG(1, printf("pmap_init: phys_start = %08x\n")); /* * init the pv free list */ pvzone = uma_zcreate("PV ENTRY", sizeof (struct pv_entry), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM | UMA_ZONE_NOFREE); /* * Now it is safe to enable pv_table recording. */ PDEBUG(1, printf("pmap_init: done!\n")); TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc); pv_entry_max = shpgperproc * maxproc + cnt.v_page_count; pv_entry_high_water = 9 * (pv_entry_max / 10); l2zone = uma_zcreate("L2 Table", L2_TABLE_SIZE_REAL, pmap_l2ptp_ctor, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM | UMA_ZONE_NOFREE); l2table_zone = uma_zcreate("L2 Table", sizeof(struct l2_dtable), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM | UMA_ZONE_NOFREE); uma_zone_set_obj(pvzone, &pvzone_obj, pv_entry_max); } int pmap_fault_fixup(pmap_t pm, vm_offset_t va, vm_prot_t ftype, int user) { struct l2_dtable *l2; struct l2_bucket *l2b; pd_entry_t *pl1pd, l1pd; pt_entry_t *ptep, pte; vm_paddr_t pa; u_int l1idx; int rv = 0; l1idx = L1_IDX(va); vm_page_lock_queues(); PMAP_LOCK(pm); /* * If there is no l2_dtable for this address, then the process * has no business accessing it. * * Note: This will catch userland processes trying to access * kernel addresses. */ l2 = pm->pm_l2[L2_IDX(l1idx)]; if (l2 == NULL) goto out; /* * Likewise if there is no L2 descriptor table */ l2b = &l2->l2_bucket[L2_BUCKET(l1idx)]; if (l2b->l2b_kva == NULL) goto out; /* * Check the PTE itself. */ ptep = &l2b->l2b_kva[l2pte_index(va)]; pte = *ptep; if (pte == 0) goto out; /* * Catch a userland access to the vector page mapped at 0x0 */ if (user && (pte & L2_S_PROT_U) == 0) goto out; if (va == vector_page) goto out; pa = l2pte_pa(pte); if ((ftype & VM_PROT_WRITE) && (pte & L2_S_PROT_W) == 0) { /* * This looks like a good candidate for "page modified" * emulation... */ struct pv_entry *pv; struct vm_page *pg; /* Extract the physical address of the page */ if ((pg = PHYS_TO_VM_PAGE(pa)) == NULL) { goto out; } /* Get the current flags for this page. */ pv = pmap_find_pv(pg, pm, va); if (pv == NULL) { goto out; } /* * Do the flags say this page is writable? If not then it * is a genuine write fault. If yes then the write fault is * our fault as we did not reflect the write access in the * PTE. Now we know a write has occurred we can correct this * and also set the modified bit */ if ((pv->pv_flags & PVF_WRITE) == 0) { goto out; } pg->md.pvh_attrs |= PVF_REF | PVF_MOD; vm_page_dirty(pg); pv->pv_flags |= PVF_REF | PVF_MOD; /* * Re-enable write permissions for the page. No need to call * pmap_vac_me_harder(), since this is just a * modified-emulation fault, and the PVF_WRITE bit isn't * changing. We've already set the cacheable bits based on * the assumption that we can write to this page. */ *ptep = (pte & ~L2_TYPE_MASK) | L2_S_PROTO | L2_S_PROT_W; PTE_SYNC(ptep); rv = 1; } else if ((pte & L2_TYPE_MASK) == L2_TYPE_INV) { /* * This looks like a good candidate for "page referenced" * emulation. */ struct pv_entry *pv; struct vm_page *pg; /* Extract the physical address of the page */ if ((pg = PHYS_TO_VM_PAGE(pa)) == NULL) goto out; /* Get the current flags for this page. */ pv = pmap_find_pv(pg, pm, va); if (pv == NULL) goto out; pg->md.pvh_attrs |= PVF_REF; pv->pv_flags |= PVF_REF; *ptep = (pte & ~L2_TYPE_MASK) | L2_S_PROTO; PTE_SYNC(ptep); rv = 1; } /* * We know there is a valid mapping here, so simply * fix up the L1 if necessary. */ pl1pd = &pm->pm_l1->l1_kva[l1idx]; l1pd = l2b->l2b_phys | L1_C_DOM(pm->pm_domain) | L1_C_PROTO; if (*pl1pd != l1pd) { *pl1pd = l1pd; PTE_SYNC(pl1pd); rv = 1; } #ifdef CPU_SA110 /* * There are bugs in the rev K SA110. This is a check for one * of them. */ if (rv == 0 && curcpu()->ci_arm_cputype == CPU_ID_SA110 && curcpu()->ci_arm_cpurev < 3) { /* Always current pmap */ if (l2pte_valid(pte)) { extern int kernel_debug; if (kernel_debug & 1) { struct proc *p = curlwp->l_proc; printf("prefetch_abort: page is already " "mapped - pte=%p *pte=%08x\n", ptep, pte); printf("prefetch_abort: pc=%08lx proc=%p " "process=%s\n", va, p, p->p_comm); printf("prefetch_abort: far=%08x fs=%x\n", cpu_faultaddress(), cpu_faultstatus()); } #ifdef DDB if (kernel_debug & 2) Debugger(); #endif rv = 1; } } #endif /* CPU_SA110 */ #ifdef DEBUG /* * If 'rv == 0' at this point, it generally indicates that there is a * stale TLB entry for the faulting address. This happens when two or * more processes are sharing an L1. Since we don't flush the TLB on * a context switch between such processes, we can take domain faults * for mappings which exist at the same VA in both processes. EVEN IF * WE'VE RECENTLY FIXED UP THE CORRESPONDING L1 in pmap_enter(), for * example. * * This is extremely likely to happen if pmap_enter() updated the L1 * entry for a recently entered mapping. In this case, the TLB is * flushed for the new mapping, but there may still be TLB entries for * other mappings belonging to other processes in the 1MB range * covered by the L1 entry. * * Since 'rv == 0', we know that the L1 already contains the correct * value, so the fault must be due to a stale TLB entry. * * Since we always need to flush the TLB anyway in the case where we * fixed up the L1, or frobbed the L2 PTE, we effectively deal with * stale TLB entries dynamically. * * However, the above condition can ONLY happen if the current L1 is * being shared. If it happens when the L1 is unshared, it indicates * that other parts of the pmap are not doing their job WRT managing * the TLB. */ if (rv == 0 && pm->pm_l1->l1_domain_use_count == 1) { extern int last_fault_code; printf("fixup: pm %p, va 0x%lx, ftype %d - nothing to do!\n", pm, va, ftype); printf("fixup: l2 %p, l2b %p, ptep %p, pl1pd %p\n", l2, l2b, ptep, pl1pd); printf("fixup: pte 0x%x, l1pd 0x%x, last code 0x%x\n", pte, l1pd, last_fault_code); #ifdef DDB Debugger(); #endif } #endif cpu_tlb_flushID_SE(va); cpu_cpwait(); rv = 1; out: vm_page_unlock_queues(); PMAP_UNLOCK(pm); return (rv); } void pmap_postinit(void) { struct l2_bucket *l2b; struct l1_ttable *l1; pd_entry_t *pl1pt; pt_entry_t *ptep, pte; vm_offset_t va, eva; u_int loop, needed; needed = (maxproc / PMAP_DOMAINS) + ((maxproc % PMAP_DOMAINS) ? 1 : 0); needed -= 1; l1 = malloc(sizeof(*l1) * needed, M_VMPMAP, M_WAITOK); for (loop = 0; loop < needed; loop++, l1++) { /* Allocate a L1 page table */ va = (vm_offset_t)contigmalloc(L1_TABLE_SIZE, M_VMPMAP, 0, 0x0, 0xffffffff, L1_TABLE_SIZE, 0); if (va == 0) panic("Cannot allocate L1 KVM"); eva = va + L1_TABLE_SIZE; pl1pt = (pd_entry_t *)va; while (va < eva) { l2b = pmap_get_l2_bucket(pmap_kernel(), va); ptep = &l2b->l2b_kva[l2pte_index(va)]; pte = *ptep; pte = (pte & ~L2_S_CACHE_MASK) | pte_l2_s_cache_mode_pt; *ptep = pte; PTE_SYNC(ptep); cpu_tlb_flushD_SE(va); va += PAGE_SIZE; } pmap_init_l1(l1, pl1pt); } #ifdef DEBUG printf("pmap_postinit: Allocated %d static L1 descriptor tables\n", needed); #endif } /* * This is used to stuff certain critical values into the PCB where they * can be accessed quickly from cpu_switch() et al. */ void pmap_set_pcb_pagedir(pmap_t pm, struct pcb *pcb) { struct l2_bucket *l2b; pcb->pcb_pagedir = pm->pm_l1->l1_physaddr; pcb->pcb_dacr = (DOMAIN_CLIENT << (PMAP_DOMAIN_KERNEL * 2)) | (DOMAIN_CLIENT << (pm->pm_domain * 2)); if (vector_page < KERNBASE) { pcb->pcb_pl1vec = &pm->pm_l1->l1_kva[L1_IDX(vector_page)]; l2b = pmap_get_l2_bucket(pm, vector_page); pcb->pcb_l1vec = l2b->l2b_phys | L1_C_PROTO | L1_C_DOM(pm->pm_domain) | L1_C_DOM(PMAP_DOMAIN_KERNEL); } else pcb->pcb_pl1vec = NULL; } void pmap_activate(struct thread *td) { pmap_t pm; struct pcb *pcb; pm = vmspace_pmap(td->td_proc->p_vmspace); pcb = td->td_pcb; critical_enter(); pmap_set_pcb_pagedir(pm, pcb); if (td == curthread) { u_int cur_dacr, cur_ttb; __asm __volatile("mrc p15, 0, %0, c2, c0, 0" : "=r"(cur_ttb)); __asm __volatile("mrc p15, 0, %0, c3, c0, 0" : "=r"(cur_dacr)); cur_ttb &= ~(L1_TABLE_SIZE - 1); if (cur_ttb == (u_int)pcb->pcb_pagedir && cur_dacr == pcb->pcb_dacr) { /* * No need to switch address spaces. */ critical_exit(); return; } /* * We MUST, I repeat, MUST fix up the L1 entry corresponding * to 'vector_page' in the incoming L1 table before switching * to it otherwise subsequent interrupts/exceptions (including * domain faults!) will jump into hyperspace. */ if (pcb->pcb_pl1vec) { *pcb->pcb_pl1vec = pcb->pcb_l1vec; /* * Don't need to PTE_SYNC() at this point since * cpu_setttb() is about to flush both the cache * and the TLB. */ } cpu_domains(pcb->pcb_dacr); cpu_setttb(pcb->pcb_pagedir); } critical_exit(); } static int pmap_set_pt_cache_mode(pd_entry_t *kl1, vm_offset_t va) { pd_entry_t *pdep, pde; pt_entry_t *ptep, pte; vm_offset_t pa; int rv = 0; /* * Make sure the descriptor itself has the correct cache mode */ pdep = &kl1[L1_IDX(va)]; pde = *pdep; if (l1pte_section_p(pde)) { if ((pde & L1_S_CACHE_MASK) != pte_l1_s_cache_mode_pt) { *pdep = (pde & ~L1_S_CACHE_MASK) | pte_l1_s_cache_mode_pt; PTE_SYNC(pdep); cpu_dcache_wbinv_range((vm_offset_t)pdep, sizeof(*pdep)); rv = 1; } } else { pa = (vm_paddr_t)(pde & L1_C_ADDR_MASK); ptep = (pt_entry_t *)kernel_pt_lookup(pa); if (ptep == NULL) panic("pmap_bootstrap: No L2 for L2 @ va %p\n", ptep); ptep = &ptep[l2pte_index(va)]; pte = *ptep; if ((pte & L2_S_CACHE_MASK) != pte_l2_s_cache_mode_pt) { *ptep = (pte & ~L2_S_CACHE_MASK) | pte_l2_s_cache_mode_pt; PTE_SYNC(ptep); cpu_dcache_wbinv_range((vm_offset_t)ptep, sizeof(*ptep)); rv = 1; } } return (rv); } static void pmap_alloc_specials(vm_offset_t *availp, int pages, vm_offset_t *vap, pt_entry_t **ptep) { vm_offset_t va = *availp; struct l2_bucket *l2b; if (ptep) { l2b = pmap_get_l2_bucket(pmap_kernel(), va); if (l2b == NULL) panic("pmap_alloc_specials: no l2b for 0x%x", va); *ptep = &l2b->l2b_kva[l2pte_index(va)]; } *vap = va; *availp = va + (PAGE_SIZE * pages); } /* * Bootstrap the system enough to run with virtual memory. * * On the arm this is called after mapping has already been enabled * and just syncs the pmap module with what has already been done. * [We can't call it easily with mapping off since the kernel is not * mapped with PA == VA, hence we would have to relocate every address * from the linked base (virtual) address "KERNBASE" to the actual * (physical) address starting relative to 0] */ #define PMAP_STATIC_L2_SIZE 16 #ifdef ARM_USE_SMALL_ALLOC extern struct mtx smallalloc_mtx; #endif void pmap_bootstrap(vm_offset_t firstaddr, vm_offset_t lastaddr, struct pv_addr *l1pt) { static struct l1_ttable static_l1; static struct l2_dtable static_l2[PMAP_STATIC_L2_SIZE]; struct l1_ttable *l1 = &static_l1; struct l2_dtable *l2; struct l2_bucket *l2b; pd_entry_t pde; pd_entry_t *kernel_l1pt = (pd_entry_t *)l1pt->pv_va; pt_entry_t *ptep; vm_paddr_t pa; vm_offset_t va; vm_size_t size; int l1idx, l2idx, l2next = 0; PDEBUG(1, printf("firstaddr = %08x, loadaddr = %08x\n", firstaddr, loadaddr)); virtual_avail = firstaddr; kernel_pmap = &kernel_pmap_store; kernel_pmap->pm_l1 = l1; kernel_l1pa = l1pt->pv_pa; /* * Scan the L1 translation table created by initarm() and create * the required metadata for all valid mappings found in it. */ for (l1idx = 0; l1idx < (L1_TABLE_SIZE / sizeof(pd_entry_t)); l1idx++) { pde = kernel_l1pt[l1idx]; /* * We're only interested in Coarse mappings. * pmap_extract() can deal with section mappings without * recourse to checking L2 metadata. */ if ((pde & L1_TYPE_MASK) != L1_TYPE_C) continue; /* * Lookup the KVA of this L2 descriptor table */ pa = (vm_paddr_t)(pde & L1_C_ADDR_MASK); ptep = (pt_entry_t *)kernel_pt_lookup(pa); if (ptep == NULL) { panic("pmap_bootstrap: No L2 for va 0x%x, pa 0x%lx", (u_int)l1idx << L1_S_SHIFT, (long unsigned int)pa); } /* * Fetch the associated L2 metadata structure. * Allocate a new one if necessary. */ if ((l2 = kernel_pmap->pm_l2[L2_IDX(l1idx)]) == NULL) { if (l2next == PMAP_STATIC_L2_SIZE) panic("pmap_bootstrap: out of static L2s"); kernel_pmap->pm_l2[L2_IDX(l1idx)] = l2 = &static_l2[l2next++]; } /* * One more L1 slot tracked... */ l2->l2_occupancy++; /* * Fill in the details of the L2 descriptor in the * appropriate bucket. */ l2b = &l2->l2_bucket[L2_BUCKET(l1idx)]; l2b->l2b_kva = ptep; l2b->l2b_phys = pa; l2b->l2b_l1idx = l1idx; /* * Establish an initial occupancy count for this descriptor */ for (l2idx = 0; l2idx < (L2_TABLE_SIZE_REAL / sizeof(pt_entry_t)); l2idx++) { if ((ptep[l2idx] & L2_TYPE_MASK) != L2_TYPE_INV) { l2b->l2b_occupancy++; } } /* * Make sure the descriptor itself has the correct cache mode. * If not, fix it, but whine about the problem. Port-meisters * should consider this a clue to fix up their initarm() * function. :) */ if (pmap_set_pt_cache_mode(kernel_l1pt, (vm_offset_t)ptep)) { printf("pmap_bootstrap: WARNING! wrong cache mode for " "L2 pte @ %p\n", ptep); } } /* * Ensure the primary (kernel) L1 has the correct cache mode for * a page table. Bitch if it is not correctly set. */ for (va = (vm_offset_t)kernel_l1pt; va < ((vm_offset_t)kernel_l1pt + L1_TABLE_SIZE); va += PAGE_SIZE) { if (pmap_set_pt_cache_mode(kernel_l1pt, va)) printf("pmap_bootstrap: WARNING! wrong cache mode for " "primary L1 @ 0x%x\n", va); } cpu_dcache_wbinv_all(); cpu_tlb_flushID(); cpu_cpwait(); PMAP_LOCK_INIT(kernel_pmap); kernel_pmap->pm_active = -1; kernel_pmap->pm_domain = PMAP_DOMAIN_KERNEL; TAILQ_INIT(&kernel_pmap->pm_pvlist); /* * Reserve some special page table entries/VA space for temporary * mapping of pages. */ #define SYSMAP(c, p, v, n) \ v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n); pmap_alloc_specials(&virtual_avail, 1, &csrcp, &csrc_pte); pmap_set_pt_cache_mode(kernel_l1pt, (vm_offset_t)csrc_pte); pmap_alloc_specials(&virtual_avail, 1, &cdstp, &cdst_pte); pmap_set_pt_cache_mode(kernel_l1pt, (vm_offset_t)cdst_pte); size = ((lastaddr - pmap_curmaxkvaddr) + L1_S_OFFSET) / L1_S_SIZE; pmap_alloc_specials(&virtual_avail, round_page(size * L2_TABLE_SIZE_REAL) / PAGE_SIZE, &pmap_kernel_l2ptp_kva, NULL); size = (size + (L2_BUCKET_SIZE - 1)) / L2_BUCKET_SIZE; pmap_alloc_specials(&virtual_avail, round_page(size * sizeof(struct l2_dtable)) / PAGE_SIZE, &pmap_kernel_l2dtable_kva, NULL); pmap_alloc_specials(&virtual_avail, 1, (vm_offset_t*)&_tmppt, NULL); SLIST_INIT(&l1_list); TAILQ_INIT(&l1_lru_list); mtx_init(&l1_lru_lock, "l1 list lock", NULL, MTX_DEF); pmap_init_l1(l1, kernel_l1pt); cpu_dcache_wbinv_all(); virtual_avail = round_page(virtual_avail); virtual_end = lastaddr; kernel_vm_end = pmap_curmaxkvaddr; arm_nocache_startaddr = lastaddr; mtx_init(&cmtx, "TMP mappings mtx", NULL, MTX_DEF); #ifdef ARM_USE_SMALL_ALLOC mtx_init(&smallalloc_mtx, "Small alloc page list", NULL, MTX_DEF); arm_init_smallalloc(); #endif pmap_set_pcb_pagedir(kernel_pmap, thread0.td_pcb); } /*************************************************** * Pmap allocation/deallocation routines. ***************************************************/ /* * Release any resources held by the given physical map. * Called when a pmap initialized by pmap_pinit is being released. * Should only be called if the map contains no valid mappings. */ void pmap_release(pmap_t pmap) { struct pcb *pcb; pmap_idcache_wbinv_all(pmap); pmap_tlb_flushID(pmap); cpu_cpwait(); if (vector_page < KERNBASE) { struct pcb *curpcb = PCPU_GET(curpcb); pcb = thread0.td_pcb; if (pmap_is_current(pmap)) { /* * Frob the L1 entry corresponding to the vector * page so that it contains the kernel pmap's domain * number. This will ensure pmap_remove() does not * pull the current vector page out from under us. */ critical_enter(); *pcb->pcb_pl1vec = pcb->pcb_l1vec; cpu_domains(pcb->pcb_dacr); cpu_setttb(pcb->pcb_pagedir); critical_exit(); } pmap_remove(pmap, vector_page, vector_page + PAGE_SIZE); /* * Make sure cpu_switch(), et al, DTRT. This is safe to do * since this process has no remaining mappings of its own. */ curpcb->pcb_pl1vec = pcb->pcb_pl1vec; curpcb->pcb_l1vec = pcb->pcb_l1vec; curpcb->pcb_dacr = pcb->pcb_dacr; curpcb->pcb_pagedir = pcb->pcb_pagedir; } pmap_free_l1(pmap); PMAP_LOCK_DESTROY(pmap); dprintf("pmap_release()\n"); } /* * Helper function for pmap_grow_l2_bucket() */ static __inline int pmap_grow_map(vm_offset_t va, pt_entry_t cache_mode, vm_paddr_t *pap) { struct l2_bucket *l2b; pt_entry_t *ptep; vm_paddr_t pa; struct vm_page *pg; pg = vm_page_alloc(NULL, 0, VM_ALLOC_NOOBJ | VM_ALLOC_WIRED); if (pg == NULL) return (1); pa = VM_PAGE_TO_PHYS(pg); if (pap) *pap = pa; l2b = pmap_get_l2_bucket(pmap_kernel(), va); ptep = &l2b->l2b_kva[l2pte_index(va)]; *ptep = L2_S_PROTO | pa | cache_mode | L2_S_PROT(PTE_KERNEL, VM_PROT_READ | VM_PROT_WRITE); PTE_SYNC(ptep); return (0); } /* * This is the same as pmap_alloc_l2_bucket(), except that it is only * used by pmap_growkernel(). */ static __inline struct l2_bucket * pmap_grow_l2_bucket(pmap_t pm, vm_offset_t va) { struct l2_dtable *l2; struct l2_bucket *l2b; struct l1_ttable *l1; pd_entry_t *pl1pd; u_short l1idx; vm_offset_t nva; l1idx = L1_IDX(va); if ((l2 = pm->pm_l2[L2_IDX(l1idx)]) == NULL) { /* * No mapping at this address, as there is * no entry in the L1 table. * Need to allocate a new l2_dtable. */ nva = pmap_kernel_l2dtable_kva; if ((nva & PAGE_MASK) == 0) { /* * Need to allocate a backing page */ if (pmap_grow_map(nva, pte_l2_s_cache_mode, NULL)) return (NULL); } l2 = (struct l2_dtable *)nva; nva += sizeof(struct l2_dtable); if ((nva & PAGE_MASK) < (pmap_kernel_l2dtable_kva & PAGE_MASK)) { /* * The new l2_dtable straddles a page boundary. * Map in another page to cover it. */ if (pmap_grow_map(nva, pte_l2_s_cache_mode, NULL)) return (NULL); } pmap_kernel_l2dtable_kva = nva; /* * Link it into the parent pmap */ pm->pm_l2[L2_IDX(l1idx)] = l2; memset(l2, 0, sizeof(*l2)); } l2b = &l2->l2_bucket[L2_BUCKET(l1idx)]; /* * Fetch pointer to the L2 page table associated with the address. */ if (l2b->l2b_kva == NULL) { pt_entry_t *ptep; /* * No L2 page table has been allocated. Chances are, this * is because we just allocated the l2_dtable, above. */ nva = pmap_kernel_l2ptp_kva; ptep = (pt_entry_t *)nva; if ((nva & PAGE_MASK) == 0) { /* * Need to allocate a backing page */ if (pmap_grow_map(nva, pte_l2_s_cache_mode_pt, &pmap_kernel_l2ptp_phys)) return (NULL); PTE_SYNC_RANGE(ptep, PAGE_SIZE / sizeof(pt_entry_t)); } memset(ptep, 0, L2_TABLE_SIZE_REAL); l2->l2_occupancy++; l2b->l2b_kva = ptep; l2b->l2b_l1idx = l1idx; l2b->l2b_phys = pmap_kernel_l2ptp_phys; pmap_kernel_l2ptp_kva += L2_TABLE_SIZE_REAL; pmap_kernel_l2ptp_phys += L2_TABLE_SIZE_REAL; } /* Distribute new L1 entry to all other L1s */ SLIST_FOREACH(l1, &l1_list, l1_link) { pl1pd = &l1->l1_kva[L1_IDX(va)]; *pl1pd = l2b->l2b_phys | L1_C_DOM(PMAP_DOMAIN_KERNEL) | L1_C_PROTO; PTE_SYNC(pl1pd); } return (l2b); } /* * grow the number of kernel page table entries, if needed */ void pmap_growkernel(vm_offset_t addr) { pmap_t kpm = pmap_kernel(); if (addr <= pmap_curmaxkvaddr) return; /* we are OK */ /* * whoops! we need to add kernel PTPs */ /* Map 1MB at a time */ for (; pmap_curmaxkvaddr < addr; pmap_curmaxkvaddr += L1_S_SIZE) pmap_grow_l2_bucket(kpm, pmap_curmaxkvaddr); /* * flush out the cache, expensive but growkernel will happen so * rarely */ cpu_dcache_wbinv_all(); cpu_tlb_flushD(); cpu_cpwait(); kernel_vm_end = pmap_curmaxkvaddr; } /* * Remove all pages from specified address space * this aids process exit speeds. Also, this code * is special cased for current process only, but * can have the more generic (and slightly slower) * mode enabled. This is much faster than pmap_remove * in the case of running down an entire address space. */ void pmap_remove_pages(pmap_t pmap) { struct pv_entry *pv, *npv; struct l2_bucket *l2b = NULL; vm_page_t m; pt_entry_t *pt; vm_page_lock_queues(); PMAP_LOCK(pmap); for (pv = TAILQ_FIRST(&pmap->pm_pvlist); pv; pv = npv) { if (pv->pv_flags & PVF_WIRED) { /* The page is wired, cannot remove it now. */ npv = TAILQ_NEXT(pv, pv_plist); continue; } pmap->pm_stats.resident_count--; l2b = pmap_get_l2_bucket(pmap, pv->pv_va); KASSERT(l2b != NULL, ("No L2 bucket in pmap_remove_pages")); pt = &l2b->l2b_kva[l2pte_index(pv->pv_va)]; m = PHYS_TO_VM_PAGE(*pt & L2_ADDR_MASK); #ifdef ARM_USE_SMALL_ALLOC KASSERT((vm_offset_t)m >= alloc_firstaddr, ("Trying to access non-existent page va %x pte %x", pv->pv_va, *pt)); #else KASSERT((vm_offset_t)m >= KERNBASE, ("Trying to access non-existent page va %x pte %x", pv->pv_va, *pt)); #endif *pt = 0; PTE_SYNC(pt); npv = TAILQ_NEXT(pv, pv_plist); pmap_nuke_pv(m, pmap, pv); if (TAILQ_EMPTY(&m->md.pv_list)) vm_page_flag_clear(m, PG_WRITEABLE); pmap_free_pv_entry(pv); pmap_free_l2_bucket(pmap, l2b, 1); } vm_page_unlock_queues(); cpu_idcache_wbinv_all(); cpu_tlb_flushID(); cpu_cpwait(); PMAP_UNLOCK(pmap); } /*************************************************** * Low level mapping routines..... ***************************************************/ #ifdef ARM_HAVE_SUPERSECTIONS /* Map a super section into the KVA. */ void pmap_kenter_supersection(vm_offset_t va, uint64_t pa, int flags) { pd_entry_t pd = L1_S_PROTO | L1_S_SUPERSEC | (pa & L1_SUP_FRAME) | (((pa >> 32) & 0xf) << 20) | L1_S_PROT(PTE_KERNEL, VM_PROT_READ|VM_PROT_WRITE) | L1_S_DOM(PMAP_DOMAIN_KERNEL); struct l1_ttable *l1; vm_offset_t va0, va_end; KASSERT(((va | pa) & L1_SUP_OFFSET) == 0, ("Not a valid super section mapping")); if (flags & SECTION_CACHE) pd |= pte_l1_s_cache_mode; else if (flags & SECTION_PT) pd |= pte_l1_s_cache_mode_pt; va0 = va & L1_SUP_FRAME; va_end = va + L1_SUP_SIZE; SLIST_FOREACH(l1, &l1_list, l1_link) { va = va0; for (; va < va_end; va += L1_S_SIZE) { l1->l1_kva[L1_IDX(va)] = pd; PTE_SYNC(&l1->l1_kva[L1_IDX(va)]); } } } #endif /* Map a section into the KVA. */ void pmap_kenter_section(vm_offset_t va, vm_offset_t pa, int flags) { pd_entry_t pd = L1_S_PROTO | pa | L1_S_PROT(PTE_KERNEL, VM_PROT_READ|VM_PROT_WRITE) | L1_S_DOM(PMAP_DOMAIN_KERNEL); struct l1_ttable *l1; KASSERT(((va | pa) & L1_S_OFFSET) == 0, ("Not a valid section mapping")); if (flags & SECTION_CACHE) pd |= pte_l1_s_cache_mode; else if (flags & SECTION_PT) pd |= pte_l1_s_cache_mode_pt; SLIST_FOREACH(l1, &l1_list, l1_link) { l1->l1_kva[L1_IDX(va)] = pd; PTE_SYNC(&l1->l1_kva[L1_IDX(va)]); } } /* * add a wired page to the kva * note that in order for the mapping to take effect -- you * should do a invltlb after doing the pmap_kenter... */ static PMAP_INLINE void pmap_kenter_internal(vm_offset_t va, vm_offset_t pa, int flags) { struct l2_bucket *l2b; pt_entry_t *pte; pt_entry_t opte; PDEBUG(1, printf("pmap_kenter: va = %08x, pa = %08x\n", (uint32_t) va, (uint32_t) pa)); l2b = pmap_get_l2_bucket(pmap_kernel(), va); if (l2b == NULL) l2b = pmap_grow_l2_bucket(pmap_kernel(), va); KASSERT(l2b != NULL, ("No L2 Bucket")); pte = &l2b->l2b_kva[l2pte_index(va)]; opte = *pte; PDEBUG(1, printf("pmap_kenter: pte = %08x, opte = %08x, npte = %08x\n", (uint32_t) pte, opte, *pte)); if (l2pte_valid(opte)) { cpu_dcache_wbinv_range(va, PAGE_SIZE); cpu_tlb_flushD_SE(va); cpu_cpwait(); } else { if (opte == 0) l2b->l2b_occupancy++; } *pte = L2_S_PROTO | pa | L2_S_PROT(PTE_KERNEL, VM_PROT_READ | VM_PROT_WRITE); if (flags & KENTER_CACHE) *pte |= pte_l2_s_cache_mode; if (flags & KENTER_USER) *pte |= L2_S_PROT_U; PTE_SYNC(pte); } void pmap_kenter(vm_offset_t va, vm_paddr_t pa) { pmap_kenter_internal(va, pa, KENTER_CACHE); } void pmap_kenter_nocache(vm_offset_t va, vm_paddr_t pa) { pmap_kenter_internal(va, pa, 0); } void pmap_kenter_user(vm_offset_t va, vm_paddr_t pa) { pmap_kenter_internal(va, pa, KENTER_CACHE|KENTER_USER); /* * Call pmap_fault_fixup now, to make sure we'll have no exception * at the first use of the new address, or bad things will happen, * as we use one of these addresses in the exception handlers. */ pmap_fault_fixup(pmap_kernel(), va, VM_PROT_READ|VM_PROT_WRITE, 1); } /* * remove a page rom the kernel pagetables */ void pmap_kremove(vm_offset_t va) { struct l2_bucket *l2b; pt_entry_t *pte, opte; l2b = pmap_get_l2_bucket(pmap_kernel(), va); if (!l2b) return; KASSERT(l2b != NULL, ("No L2 Bucket")); pte = &l2b->l2b_kva[l2pte_index(va)]; opte = *pte; if (l2pte_valid(opte)) { cpu_dcache_wbinv_range(va, PAGE_SIZE); cpu_tlb_flushD_SE(va); cpu_cpwait(); *pte = 0; } } /* * Used to map a range of physical addresses into kernel * virtual address space. * * The value passed in '*virt' is a suggested virtual address for * the mapping. Architectures which can support a direct-mapped * physical to virtual region can return the appropriate address * within that region, leaving '*virt' unchanged. Other * architectures should map the pages starting at '*virt' and * update '*virt' with the first usable address after the mapped * region. */ vm_offset_t pmap_map(vm_offset_t *virt, vm_offset_t start, vm_offset_t end, int prot) { #ifdef ARM_USE_SMALL_ALLOC return (arm_ptovirt(start)); #else vm_offset_t sva = *virt; vm_offset_t va = sva; PDEBUG(1, printf("pmap_map: virt = %08x, start = %08x, end = %08x, " "prot = %d\n", (uint32_t) *virt, (uint32_t) start, (uint32_t) end, prot)); while (start < end) { pmap_kenter(va, start); va += PAGE_SIZE; start += PAGE_SIZE; } *virt = va; return (sva); #endif } static void pmap_wb_page(vm_page_t m) { struct pv_entry *pv; TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) pmap_dcache_wb_range(pv->pv_pmap, pv->pv_va, PAGE_SIZE, FALSE, (pv->pv_flags & PVF_WRITE) == 0); } static void pmap_inv_page(vm_page_t m) { struct pv_entry *pv; TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) pmap_dcache_wb_range(pv->pv_pmap, pv->pv_va, PAGE_SIZE, TRUE, TRUE); } /* * Add a list of wired pages to the kva * this routine is only used for temporary * kernel mappings that do not need to have * page modification or references recorded. * Note that old mappings are simply written * over. The page *must* be wired. */ void pmap_qenter(vm_offset_t va, vm_page_t *m, int count) { int i; for (i = 0; i < count; i++) { pmap_wb_page(m[i]); pmap_kenter_internal(va, VM_PAGE_TO_PHYS(m[i]), KENTER_CACHE); va += PAGE_SIZE; } } /* * this routine jerks page mappings from the * kernel -- it is meant only for temporary mappings. */ void pmap_qremove(vm_offset_t va, int count) { vm_paddr_t pa; int i; for (i = 0; i < count; i++) { pa = vtophys(va); if (pa) { pmap_inv_page(PHYS_TO_VM_PAGE(pa)); pmap_kremove(va); } va += PAGE_SIZE; } } /* * pmap_object_init_pt preloads the ptes for a given object * into the specified pmap. This eliminates the blast of soft * faults on process startup and immediately after an mmap. */ void pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, vm_pindex_t pindex, vm_size_t size) { VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); KASSERT(object->type == OBJT_DEVICE, ("pmap_object_init_pt: non-device object")); } /* * pmap_is_prefaultable: * * Return whether or not the specified virtual address is elgible * for prefault. */ boolean_t pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) { pd_entry_t *pde; pt_entry_t *pte; if (!pmap_get_pde_pte(pmap, addr, &pde, &pte)) return (FALSE); KASSERT(pte != NULL, ("Valid mapping but no pte ?")); if (*pte == 0) return (TRUE); return (FALSE); } /* * Fetch pointers to the PDE/PTE for the given pmap/VA pair. * Returns TRUE if the mapping exists, else FALSE. * * NOTE: This function is only used by a couple of arm-specific modules. * It is not safe to take any pmap locks here, since we could be right * in the middle of debugging the pmap anyway... * * It is possible for this routine to return FALSE even though a valid * mapping does exist. This is because we don't lock, so the metadata * state may be inconsistent. * * NOTE: We can return a NULL *ptp in the case where the L1 pde is * a "section" mapping. */ boolean_t pmap_get_pde_pte(pmap_t pm, vm_offset_t va, pd_entry_t **pdp, pt_entry_t **ptp) { struct l2_dtable *l2; pd_entry_t *pl1pd, l1pd; pt_entry_t *ptep; u_short l1idx; if (pm->pm_l1 == NULL) return (FALSE); l1idx = L1_IDX(va); *pdp = pl1pd = &pm->pm_l1->l1_kva[l1idx]; l1pd = *pl1pd; if (l1pte_section_p(l1pd)) { *ptp = NULL; return (TRUE); } if (pm->pm_l2 == NULL) return (FALSE); l2 = pm->pm_l2[L2_IDX(l1idx)]; if (l2 == NULL || (ptep = l2->l2_bucket[L2_BUCKET(l1idx)].l2b_kva) == NULL) { return (FALSE); } *ptp = &ptep[l2pte_index(va)]; return (TRUE); } /* * Routine: pmap_remove_all * Function: * Removes this physical page from * all physical maps in which it resides. * Reflects back modify bits to the pager. * * Notes: * Original versions of this routine were very * inefficient because they iteratively called * pmap_remove (slow...) */ void pmap_remove_all(vm_page_t m) { pv_entry_t pv; pt_entry_t *ptep, pte; struct l2_bucket *l2b; boolean_t flush = FALSE; pmap_t curpm; int flags = 0; #if defined(PMAP_DEBUG) /* * XXX this makes pmap_page_protect(NONE) illegal for non-managed * pages! */ if (m->flags & PG_FICTITIOUS) { panic("pmap_page_protect: illegal for unmanaged page, va: 0x%x", VM_PAGE_TO_PHYS(m)); } #endif if (TAILQ_EMPTY(&m->md.pv_list)) return; mtx_assert(&vm_page_queue_mtx, MA_OWNED); curpm = vmspace_pmap(curproc->p_vmspace); while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { if (flush == FALSE && (pv->pv_pmap == curpm || pv->pv_pmap == pmap_kernel())) flush = TRUE; PMAP_LOCK(pv->pv_pmap); l2b = pmap_get_l2_bucket(pv->pv_pmap, pv->pv_va); KASSERT(l2b != NULL, ("No l2 bucket")); ptep = &l2b->l2b_kva[l2pte_index(pv->pv_va)]; pte = *ptep; *ptep = 0; PTE_SYNC_CURRENT(pv->pv_pmap, ptep); pmap_free_l2_bucket(pv->pv_pmap, l2b, 1); if (pv->pv_flags & PVF_WIRED) pv->pv_pmap->pm_stats.wired_count--; pv->pv_pmap->pm_stats.resident_count--; flags |= pv->pv_flags; pmap_nuke_pv(m, pv->pv_pmap, pv); PMAP_UNLOCK(pv->pv_pmap); pmap_free_pv_entry(pv); } if (flush) { if (PV_BEEN_EXECD(flags)) pmap_tlb_flushID(curpm); else pmap_tlb_flushD(curpm); } vm_page_flag_clear(m, PG_WRITEABLE); } /* * Set the physical protection on the * specified range of this map as requested. */ void pmap_protect(pmap_t pm, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) { struct l2_bucket *l2b; pt_entry_t *ptep, pte; vm_offset_t next_bucket; u_int flags; int flush; if ((prot & VM_PROT_READ) == 0) { pmap_remove(pm, sva, eva); return; } if (prot & VM_PROT_WRITE) { /* * If this is a read->write transition, just ignore it and let * vm_fault() take care of it later. */ return; } vm_page_lock_queues(); PMAP_LOCK(pm); /* * OK, at this point, we know we're doing write-protect operation. * If the pmap is active, write-back the range. */ pmap_dcache_wb_range(pm, sva, eva - sva, FALSE, FALSE); flush = ((eva - sva) >= (PAGE_SIZE * 4)) ? 0 : -1; flags = 0; while (sva < eva) { next_bucket = L2_NEXT_BUCKET(sva); if (next_bucket > eva) next_bucket = eva; l2b = pmap_get_l2_bucket(pm, sva); if (l2b == NULL) { sva = next_bucket; continue; } ptep = &l2b->l2b_kva[l2pte_index(sva)]; while (sva < next_bucket) { if ((pte = *ptep) != 0 && (pte & L2_S_PROT_W) != 0) { struct vm_page *pg; u_int f; pg = PHYS_TO_VM_PAGE(l2pte_pa(pte)); pte &= ~L2_S_PROT_W; *ptep = pte; PTE_SYNC(ptep); if (pg != NULL) { f = pmap_modify_pv(pg, pm, sva, PVF_WRITE, 0); pmap_vac_me_harder(pg, pm, sva); vm_page_dirty(pg); } else f = PVF_REF | PVF_EXEC; if (flush >= 0) { flush++; flags |= f; } else if (PV_BEEN_EXECD(f)) pmap_tlb_flushID_SE(pm, sva); else if (PV_BEEN_REFD(f)) pmap_tlb_flushD_SE(pm, sva); } sva += PAGE_SIZE; ptep++; } } if (flush) { if (PV_BEEN_EXECD(flags)) pmap_tlb_flushID(pm); else if (PV_BEEN_REFD(flags)) pmap_tlb_flushD(pm); } vm_page_unlock_queues(); PMAP_UNLOCK(pm); } /* * Insert the given physical page (p) at * the specified virtual address (v) in the * target physical map with the protection requested. * * If specified, the page will be wired down, meaning * that the related pte can not be reclaimed. * * NB: This is the only routine which MAY NOT lazy-evaluate * or lose information. That is, this routine must actually * insert this page into the given map NOW. */ void pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, boolean_t wired) { vm_page_lock_queues(); PMAP_LOCK(pmap); pmap_enter_locked(pmap, va, m, prot, wired, M_WAITOK); vm_page_unlock_queues(); PMAP_UNLOCK(pmap); } /* * The page queues and pmap must be locked. */ static void pmap_enter_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, boolean_t wired, int flags) { struct l2_bucket *l2b = NULL; struct vm_page *opg; struct pv_entry *pve = NULL; pt_entry_t *ptep, npte, opte; u_int nflags; u_int oflags; vm_paddr_t pa; PMAP_ASSERT_LOCKED(pmap); mtx_assert(&vm_page_queue_mtx, MA_OWNED); if (va == vector_page) { pa = systempage.pv_pa; m = NULL; } else pa = VM_PAGE_TO_PHYS(m); nflags = 0; if (prot & VM_PROT_WRITE) nflags |= PVF_WRITE; if (prot & VM_PROT_EXECUTE) nflags |= PVF_EXEC; if (wired) nflags |= PVF_WIRED; PDEBUG(1, printf("pmap_enter: pmap = %08x, va = %08x, m = %08x, prot = %x, " "wired = %x\n", (uint32_t) pmap, va, (uint32_t) m, prot, wired)); if (pmap == pmap_kernel()) { l2b = pmap_get_l2_bucket(pmap, va); if (l2b == NULL) l2b = pmap_grow_l2_bucket(pmap, va); } else { do_l2b_alloc: l2b = pmap_alloc_l2_bucket(pmap, va); if (l2b == NULL) { if (flags & M_WAITOK) { PMAP_UNLOCK(pmap); vm_page_unlock_queues(); VM_WAIT; vm_page_lock_queues(); PMAP_LOCK(pmap); goto do_l2b_alloc; } return; } } ptep = &l2b->l2b_kva[l2pte_index(va)]; opte = *ptep; npte = pa; oflags = 0; if (opte) { /* * There is already a mapping at this address. * If the physical address is different, lookup the * vm_page. */ if (l2pte_pa(opte) != pa) opg = PHYS_TO_VM_PAGE(l2pte_pa(opte)); else opg = m; } else opg = NULL; if ((prot & (VM_PROT_ALL)) || (!m || m->md.pvh_attrs & PVF_REF)) { /* * - The access type indicates that we don't need * to do referenced emulation. * OR * - The physical page has already been referenced * so no need to re-do referenced emulation here. */ npte |= L2_S_PROTO; nflags |= PVF_REF; if (m && ((prot & VM_PROT_WRITE) != 0 || (m->md.pvh_attrs & PVF_MOD))) { /* * This is a writable mapping, and the * page's mod state indicates it has * already been modified. Make it * writable from the outset. */ nflags |= PVF_MOD; if (!(m->md.pvh_attrs & PVF_MOD)) vm_page_dirty(m); } if (m && opte) vm_page_flag_set(m, PG_REFERENCED); } else { /* * Need to do page referenced emulation. */ npte |= L2_TYPE_INV; } if (prot & VM_PROT_WRITE) { npte |= L2_S_PROT_W; if (m != NULL) vm_page_flag_set(m, PG_WRITEABLE); } npte |= pte_l2_s_cache_mode; if (m && m == opg) { /* * We're changing the attrs of an existing mapping. */ oflags = pmap_modify_pv(m, pmap, va, PVF_WRITE | PVF_EXEC | PVF_WIRED | PVF_MOD | PVF_REF, nflags); /* * We may need to flush the cache if we're * doing rw-ro... */ if (pmap_is_current(pmap) && (oflags & PVF_NC) == 0 && (opte & L2_S_PROT_W) != 0 && (prot & VM_PROT_WRITE) == 0) cpu_dcache_wb_range(va, PAGE_SIZE); } else { /* * New mapping, or changing the backing page * of an existing mapping. */ if (opg) { /* * Replacing an existing mapping with a new one. * It is part of our managed memory so we * must remove it from the PV list */ pve = pmap_remove_pv(opg, pmap, va); if (m && (m->flags & (PG_UNMANAGED | PG_FICTITIOUS)) && pve) pmap_free_pv_entry(pve); else if (!pve && !(m->flags & (PG_UNMANAGED | PG_FICTITIOUS))) pve = pmap_get_pv_entry(); KASSERT(pve != NULL || m->flags & (PG_UNMANAGED | PG_FICTITIOUS), ("No pv")); oflags = pve->pv_flags; /* * If the old mapping was valid (ref/mod * emulation creates 'invalid' mappings * initially) then make sure to frob * the cache. */ if ((oflags & PVF_NC) == 0 && l2pte_valid(opte)) { if (PV_BEEN_EXECD(oflags)) { pmap_idcache_wbinv_range(pmap, va, PAGE_SIZE); } else if (PV_BEEN_REFD(oflags)) { pmap_dcache_wb_range(pmap, va, PAGE_SIZE, TRUE, (oflags & PVF_WRITE) == 0); } } } else if (m && !(m->flags & (PG_UNMANAGED | PG_FICTITIOUS))) if ((pve = pmap_get_pv_entry()) == NULL) { panic("pmap_enter: no pv entries"); } if (m && !(m->flags & (PG_UNMANAGED | PG_FICTITIOUS))) { KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva, ("pmap_enter: managed mapping within the clean submap")); pmap_enter_pv(m, pve, pmap, va, nflags); } } /* * Make sure userland mappings get the right permissions */ if (pmap != pmap_kernel() && va != vector_page) { npte |= L2_S_PROT_U; } /* * Keep the stats up to date */ if (opte == 0) { l2b->l2b_occupancy++; pmap->pm_stats.resident_count++; } /* * If this is just a wiring change, the two PTEs will be * identical, so there's no need to update the page table. */ if (npte != opte) { boolean_t is_cached = pmap_is_current(pmap); *ptep = npte; if (is_cached) { /* * We only need to frob the cache/tlb if this pmap * is current */ PTE_SYNC(ptep); if (L1_IDX(va) != L1_IDX(vector_page) && l2pte_valid(npte)) { /* * This mapping is likely to be accessed as * soon as we return to userland. Fix up the * L1 entry to avoid taking another * page/domain fault. */ pd_entry_t *pl1pd, l1pd; pl1pd = &pmap->pm_l1->l1_kva[L1_IDX(va)]; l1pd = l2b->l2b_phys | L1_C_DOM(pmap->pm_domain) | L1_C_PROTO; if (*pl1pd != l1pd) { *pl1pd = l1pd; PTE_SYNC(pl1pd); } } } if (PV_BEEN_EXECD(oflags)) pmap_tlb_flushID_SE(pmap, va); else if (PV_BEEN_REFD(oflags)) pmap_tlb_flushD_SE(pmap, va); if (m) pmap_vac_me_harder(m, pmap, va); } } /* * Maps a sequence of resident pages belonging to the same object. * The sequence begins with the given page m_start. This page is * mapped at the given virtual address start. Each subsequent page is * mapped at a virtual address that is offset from start by the same * amount as the page is offset from m_start within the object. The * last page in the sequence is the page with the largest offset from * m_start that can be mapped at a virtual address less than the given * virtual address end. Not every virtual page between start and end * is mapped; only those for which a resident page exists with the * corresponding offset from m_start are mapped. */ void pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, vm_page_t m_start, vm_prot_t prot) { vm_page_t m; vm_pindex_t diff, psize; psize = atop(end - start); m = m_start; PMAP_LOCK(pmap); while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { pmap_enter_locked(pmap, start + ptoa(diff), m, prot & (VM_PROT_READ | VM_PROT_EXECUTE), FALSE, M_NOWAIT); m = TAILQ_NEXT(m, listq); } PMAP_UNLOCK(pmap); } /* * this code makes some *MAJOR* assumptions: * 1. Current pmap & pmap exists. * 2. Not wired. * 3. Read access. * 4. No page table pages. * but is *MUCH* faster than pmap_enter... */ void pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) { PMAP_LOCK(pmap); pmap_enter_locked(pmap, va, m, prot & (VM_PROT_READ | VM_PROT_EXECUTE), FALSE, M_NOWAIT); PMAP_UNLOCK(pmap); } /* * Routine: pmap_change_wiring * Function: Change the wiring attribute for a map/virtual-address * pair. * In/out conditions: * The mapping must already exist in the pmap. */ void pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired) { struct l2_bucket *l2b; pt_entry_t *ptep, pte; vm_page_t pg; vm_page_lock_queues(); PMAP_LOCK(pmap); l2b = pmap_get_l2_bucket(pmap, va); KASSERT(l2b, ("No l2b bucket in pmap_change_wiring")); ptep = &l2b->l2b_kva[l2pte_index(va)]; pte = *ptep; pg = PHYS_TO_VM_PAGE(l2pte_pa(pte)); if (pg) pmap_modify_pv(pg, pmap, va, PVF_WIRED, wired); vm_page_unlock_queues(); PMAP_UNLOCK(pmap); } /* * Copy the range specified by src_addr/len * from the source map to the range dst_addr/len * in the destination map. * * This routine is only advisory and need not do anything. */ void pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, vm_offset_t src_addr) { } /* * Routine: pmap_extract * Function: * Extract the physical page address associated * with the given map/virtual_address pair. */ vm_paddr_t pmap_extract(pmap_t pm, vm_offset_t va) { struct l2_dtable *l2; pd_entry_t l1pd; pt_entry_t *ptep, pte; vm_paddr_t pa; u_int l1idx; l1idx = L1_IDX(va); PMAP_LOCK(pm); l1pd = pm->pm_l1->l1_kva[l1idx]; if (l1pte_section_p(l1pd)) { /* * These should only happen for pmap_kernel() */ KASSERT(pm == pmap_kernel(), ("huh")); /* XXX: what to do about the bits > 32 ? */ if (l1pd & L1_S_SUPERSEC) pa = (l1pd & L1_SUP_FRAME) | (va & L1_SUP_OFFSET); else pa = (l1pd & L1_S_FRAME) | (va & L1_S_OFFSET); } else { /* * Note that we can't rely on the validity of the L1 * descriptor as an indication that a mapping exists. * We have to look it up in the L2 dtable. */ l2 = pm->pm_l2[L2_IDX(l1idx)]; if (l2 == NULL || (ptep = l2->l2_bucket[L2_BUCKET(l1idx)].l2b_kva) == NULL) { PMAP_UNLOCK(pm); return (0); } ptep = &ptep[l2pte_index(va)]; pte = *ptep; if (pte == 0) { PMAP_UNLOCK(pm); return (0); } switch (pte & L2_TYPE_MASK) { case L2_TYPE_L: pa = (pte & L2_L_FRAME) | (va & L2_L_OFFSET); break; default: pa = (pte & L2_S_FRAME) | (va & L2_S_OFFSET); break; } } PMAP_UNLOCK(pm); return (pa); } /* * Atomically extract and hold the physical page with the given * pmap and virtual address pair if that mapping permits the given * protection. * */ vm_page_t pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) { struct l2_dtable *l2; pd_entry_t l1pd; pt_entry_t *ptep, pte; vm_paddr_t pa; vm_page_t m = NULL; u_int l1idx; l1idx = L1_IDX(va); vm_page_lock_queues(); PMAP_LOCK(pmap); l1pd = pmap->pm_l1->l1_kva[l1idx]; if (l1pte_section_p(l1pd)) { /* * These should only happen for pmap_kernel() */ KASSERT(pmap == pmap_kernel(), ("huh")); /* XXX: what to do about the bits > 32 ? */ if (l1pd & L1_S_SUPERSEC) pa = (l1pd & L1_SUP_FRAME) | (va & L1_SUP_OFFSET); else pa = (l1pd & L1_S_FRAME) | (va & L1_S_OFFSET); if (l1pd & L1_S_PROT_W || (prot & VM_PROT_WRITE) == 0) { m = PHYS_TO_VM_PAGE(pa); vm_page_hold(m); } } else { /* * Note that we can't rely on the validity of the L1 * descriptor as an indication that a mapping exists. * We have to look it up in the L2 dtable. */ l2 = pmap->pm_l2[L2_IDX(l1idx)]; if (l2 == NULL || (ptep = l2->l2_bucket[L2_BUCKET(l1idx)].l2b_kva) == NULL) { PMAP_UNLOCK(pmap); vm_page_unlock_queues(); return (NULL); } ptep = &ptep[l2pte_index(va)]; pte = *ptep; if (pte == 0) { PMAP_UNLOCK(pmap); vm_page_unlock_queues(); return (NULL); } if (pte & L2_S_PROT_W || (prot & VM_PROT_WRITE) == 0) { switch (pte & L2_TYPE_MASK) { case L2_TYPE_L: pa = (pte & L2_L_FRAME) | (va & L2_L_OFFSET); break; default: pa = (pte & L2_S_FRAME) | (va & L2_S_OFFSET); break; } m = PHYS_TO_VM_PAGE(pa); vm_page_hold(m); } } PMAP_UNLOCK(pmap); vm_page_unlock_queues(); return (m); } /* * Initialize a preallocated and zeroed pmap structure, * such as one in a vmspace structure. */ -void +int pmap_pinit(pmap_t pmap) { PDEBUG(1, printf("pmap_pinit: pmap = %08x\n", (uint32_t) pmap)); PMAP_LOCK_INIT(pmap); pmap_alloc_l1(pmap); bzero(pmap->pm_l2, sizeof(pmap->pm_l2)); pmap->pm_count = 1; pmap->pm_active = 0; TAILQ_INIT(&pmap->pm_pvlist); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); pmap->pm_stats.resident_count = 1; if (vector_page < KERNBASE) { pmap_enter(pmap, vector_page, PHYS_TO_VM_PAGE(systempage.pv_pa), VM_PROT_READ, 1); } + return (1); } /*************************************************** * page management routines. ***************************************************/ static void pmap_free_pv_entry(pv_entry_t pv) { pv_entry_count--; uma_zfree(pvzone, pv); } /* * get a new pv_entry, allocating a block from the system * when needed. * the memory allocation is performed bypassing the malloc code * because of the possibility of allocations at interrupt time. */ static pv_entry_t pmap_get_pv_entry(void) { pv_entry_t ret_value; pv_entry_count++; if (pv_entry_count > pv_entry_high_water) pagedaemon_wakeup(); ret_value = uma_zalloc(pvzone, M_NOWAIT); return ret_value; } /* * Remove the given range of addresses from the specified map. * * It is assumed that the start and end are properly * rounded to the page size. */ #define PMAP_REMOVE_CLEAN_LIST_SIZE 3 void pmap_remove(pmap_t pm, vm_offset_t sva, vm_offset_t eva) { struct l2_bucket *l2b; vm_offset_t next_bucket; pt_entry_t *ptep; u_int cleanlist_idx, total, cnt; struct { vm_offset_t va; pt_entry_t *pte; } cleanlist[PMAP_REMOVE_CLEAN_LIST_SIZE]; u_int mappings, is_exec, is_refd; int flushall = 0; /* * we lock in the pmap => pv_head direction */ vm_page_lock_queues(); PMAP_LOCK(pm); if (!pmap_is_current(pm)) { cleanlist_idx = PMAP_REMOVE_CLEAN_LIST_SIZE + 1; } else cleanlist_idx = 0; total = 0; while (sva < eva) { /* * Do one L2 bucket's worth at a time. */ next_bucket = L2_NEXT_BUCKET(sva); if (next_bucket > eva) next_bucket = eva; l2b = pmap_get_l2_bucket(pm, sva); if (l2b == NULL) { sva = next_bucket; continue; } ptep = &l2b->l2b_kva[l2pte_index(sva)]; mappings = 0; while (sva < next_bucket) { struct vm_page *pg; pt_entry_t pte; vm_paddr_t pa; pte = *ptep; if (pte == 0) { /* * Nothing here, move along */ sva += PAGE_SIZE; ptep++; continue; } pm->pm_stats.resident_count--; pa = l2pte_pa(pte); is_exec = 0; is_refd = 1; /* * Update flags. In a number of circumstances, * we could cluster a lot of these and do a * number of sequential pages in one go. */ if ((pg = PHYS_TO_VM_PAGE(pa)) != NULL) { struct pv_entry *pve; pve = pmap_remove_pv(pg, pm, sva); if (pve) { is_exec = PV_BEEN_EXECD(pve->pv_flags); is_refd = PV_BEEN_REFD(pve->pv_flags); pmap_free_pv_entry(pve); } } if (!l2pte_valid(pte)) { *ptep = 0; PTE_SYNC_CURRENT(pm, ptep); sva += PAGE_SIZE; ptep++; mappings++; continue; } if (cleanlist_idx < PMAP_REMOVE_CLEAN_LIST_SIZE) { /* Add to the clean list. */ cleanlist[cleanlist_idx].pte = ptep; cleanlist[cleanlist_idx].va = sva | (is_exec & 1); cleanlist_idx++; } else if (cleanlist_idx == PMAP_REMOVE_CLEAN_LIST_SIZE) { /* Nuke everything if needed. */ pmap_idcache_wbinv_all(pm); pmap_tlb_flushID(pm); /* * Roll back the previous PTE list, * and zero out the current PTE. */ for (cnt = 0; cnt < PMAP_REMOVE_CLEAN_LIST_SIZE; cnt++) { *cleanlist[cnt].pte = 0; } *ptep = 0; PTE_SYNC(ptep); cleanlist_idx++; flushall = 1; } else { *ptep = 0; PTE_SYNC(ptep); if (is_exec) pmap_tlb_flushID_SE(pm, sva); else if (is_refd) pmap_tlb_flushD_SE(pm, sva); } sva += PAGE_SIZE; ptep++; mappings++; } /* * Deal with any left overs */ if (cleanlist_idx <= PMAP_REMOVE_CLEAN_LIST_SIZE) { total += cleanlist_idx; for (cnt = 0; cnt < cleanlist_idx; cnt++) { vm_offset_t clva = cleanlist[cnt].va & ~1; if (cleanlist[cnt].va & 1) { pmap_idcache_wbinv_range(pm, clva, PAGE_SIZE); pmap_tlb_flushID_SE(pm, clva); } else { pmap_dcache_wb_range(pm, clva, PAGE_SIZE, TRUE, FALSE); pmap_tlb_flushD_SE(pm, clva); } *cleanlist[cnt].pte = 0; PTE_SYNC_CURRENT(pm, cleanlist[cnt].pte); } if (total <= PMAP_REMOVE_CLEAN_LIST_SIZE) cleanlist_idx = 0; else { /* * We are removing so much entries it's just * easier to flush the whole cache. */ cleanlist_idx = PMAP_REMOVE_CLEAN_LIST_SIZE + 1; pmap_idcache_wbinv_all(pm); flushall = 1; } } pmap_free_l2_bucket(pm, l2b, mappings); } vm_page_unlock_queues(); if (flushall) cpu_tlb_flushID(); PMAP_UNLOCK(pm); } /* * pmap_zero_page() * * Zero a given physical page by mapping it at a page hook point. * In doing the zero page op, the page we zero is mapped cachable, as with * StrongARM accesses to non-cached pages are non-burst making writing * _any_ bulk data very slow. */ #if (ARM_MMU_GENERIC + ARM_MMU_SA1) != 0 || defined(CPU_XSCALE_CORE3) void pmap_zero_page_generic(vm_paddr_t phys, int off, int size) { #ifdef ARM_USE_SMALL_ALLOC char *dstpg; #endif #ifdef DEBUG struct vm_page *pg = PHYS_TO_VM_PAGE(phys); if (pg->md.pvh_list != NULL) panic("pmap_zero_page: page has mappings"); #endif if (_arm_bzero && size >= _min_bzero_size && _arm_bzero((void *)(phys + off), size, IS_PHYSICAL) == 0) return; #ifdef ARM_USE_SMALL_ALLOC dstpg = (char *)arm_ptovirt(phys); if (off || size != PAGE_SIZE) { bzero(dstpg + off, size); cpu_dcache_wbinv_range((vm_offset_t)(dstpg + off), size); } else { bzero_page((vm_offset_t)dstpg); cpu_dcache_wbinv_range((vm_offset_t)dstpg, PAGE_SIZE); } #else mtx_lock(&cmtx); /* * Hook in the page, zero it, and purge the cache for that * zeroed page. Invalidate the TLB as needed. */ *cdst_pte = L2_S_PROTO | phys | L2_S_PROT(PTE_KERNEL, VM_PROT_WRITE) | pte_l2_s_cache_mode; PTE_SYNC(cdst_pte); cpu_tlb_flushD_SE(cdstp); cpu_cpwait(); if (off || size != PAGE_SIZE) { bzero((void *)(cdstp + off), size); cpu_dcache_wbinv_range(cdstp + off, size); } else { bzero_page(cdstp); cpu_dcache_wbinv_range(cdstp, PAGE_SIZE); } mtx_unlock(&cmtx); #endif } #endif /* (ARM_MMU_GENERIC + ARM_MMU_SA1) != 0 */ #if ARM_MMU_XSCALE == 1 void pmap_zero_page_xscale(vm_paddr_t phys, int off, int size) { #ifdef ARM_USE_SMALL_ALLOC char *dstpg; #endif if (_arm_bzero && size >= _min_bzero_size && _arm_bzero((void *)(phys + off), size, IS_PHYSICAL) == 0) return; #ifdef ARM_USE_SMALL_ALLOC dstpg = (char *)arm_ptovirt(phys); if (off || size != PAGE_SIZE) { bzero(dstpg + off, size); cpu_dcache_wbinv_range((vm_offset_t)(dstpg + off), size); } else { bzero_page((vm_offset_t)dstpg); cpu_dcache_wbinv_range((vm_offset_t)dstpg, PAGE_SIZE); } #else mtx_lock(&cmtx); /* * Hook in the page, zero it, and purge the cache for that * zeroed page. Invalidate the TLB as needed. */ *cdst_pte = L2_S_PROTO | phys | L2_S_PROT(PTE_KERNEL, VM_PROT_WRITE) | L2_C | L2_XSCALE_T_TEX(TEX_XSCALE_X); /* mini-data */ PTE_SYNC(cdst_pte); cpu_tlb_flushD_SE(cdstp); cpu_cpwait(); if (off || size != PAGE_SIZE) bzero((void *)(cdstp + off), size); else bzero_page(cdstp); mtx_unlock(&cmtx); xscale_cache_clean_minidata(); #endif } /* * Change the PTEs for the specified kernel mappings such that they * will use the mini data cache instead of the main data cache. */ void pmap_use_minicache(vm_offset_t va, vm_size_t size) { struct l2_bucket *l2b; pt_entry_t *ptep, *sptep, pte; vm_offset_t next_bucket, eva; #if (ARM_NMMUS > 1) || defined(CPU_XSCALE_CORE3) if (xscale_use_minidata == 0) return; #endif eva = va + size; while (va < eva) { next_bucket = L2_NEXT_BUCKET(va); if (next_bucket > eva) next_bucket = eva; l2b = pmap_get_l2_bucket(pmap_kernel(), va); sptep = ptep = &l2b->l2b_kva[l2pte_index(va)]; while (va < next_bucket) { pte = *ptep; if (!l2pte_minidata(pte)) { cpu_dcache_wbinv_range(va, PAGE_SIZE); cpu_tlb_flushD_SE(va); *ptep = pte & ~L2_B; } ptep++; va += PAGE_SIZE; } PTE_SYNC_RANGE(sptep, (u_int)(ptep - sptep)); } cpu_cpwait(); } #endif /* ARM_MMU_XSCALE == 1 */ /* * pmap_zero_page zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. */ void pmap_zero_page(vm_page_t m) { pmap_zero_page_func(VM_PAGE_TO_PHYS(m), 0, PAGE_SIZE); } /* * pmap_zero_page_area zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. * * off and size may not cover an area beyond a single hardware page. */ void pmap_zero_page_area(vm_page_t m, int off, int size) { pmap_zero_page_func(VM_PAGE_TO_PHYS(m), off, size); } /* * pmap_zero_page_idle zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. This * is intended to be called from the vm_pagezero process only and * outside of Giant. */ void pmap_zero_page_idle(vm_page_t m) { pmap_zero_page(m); } #if 0 /* * pmap_clean_page() * * This is a local function used to work out the best strategy to clean * a single page referenced by its entry in the PV table. It's used by * pmap_copy_page, pmap_zero page and maybe some others later on. * * Its policy is effectively: * o If there are no mappings, we don't bother doing anything with the cache. * o If there is one mapping, we clean just that page. * o If there are multiple mappings, we clean the entire cache. * * So that some functions can be further optimised, it returns 0 if it didn't * clean the entire cache, or 1 if it did. * * XXX One bug in this routine is that if the pv_entry has a single page * mapped at 0x00000000 a whole cache clean will be performed rather than * just the 1 page. Since this should not occur in everyday use and if it does * it will just result in not the most efficient clean for the page. */ static int pmap_clean_page(struct pv_entry *pv, boolean_t is_src) { pmap_t pm, pm_to_clean = NULL; struct pv_entry *npv; u_int cache_needs_cleaning = 0; u_int flags = 0; vm_offset_t page_to_clean = 0; if (pv == NULL) { /* nothing mapped in so nothing to flush */ return (0); } /* * Since we flush the cache each time we change to a different * user vmspace, we only need to flush the page if it is in the * current pmap. */ if (curthread) pm = vmspace_pmap(curproc->p_vmspace); else pm = pmap_kernel(); for (npv = pv; npv; npv = TAILQ_NEXT(npv, pv_list)) { if (npv->pv_pmap == pmap_kernel() || npv->pv_pmap == pm) { flags |= npv->pv_flags; /* * The page is mapped non-cacheable in * this map. No need to flush the cache. */ if (npv->pv_flags & PVF_NC) { #ifdef DIAGNOSTIC if (cache_needs_cleaning) panic("pmap_clean_page: " "cache inconsistency"); #endif break; } else if (is_src && (npv->pv_flags & PVF_WRITE) == 0) continue; if (cache_needs_cleaning) { page_to_clean = 0; break; } else { page_to_clean = npv->pv_va; pm_to_clean = npv->pv_pmap; } cache_needs_cleaning = 1; } } if (page_to_clean) { if (PV_BEEN_EXECD(flags)) pmap_idcache_wbinv_range(pm_to_clean, page_to_clean, PAGE_SIZE); else pmap_dcache_wb_range(pm_to_clean, page_to_clean, PAGE_SIZE, !is_src, (flags & PVF_WRITE) == 0); } else if (cache_needs_cleaning) { if (PV_BEEN_EXECD(flags)) pmap_idcache_wbinv_all(pm); else pmap_dcache_wbinv_all(pm); return (1); } return (0); } #endif /* * pmap_copy_page copies the specified (machine independent) * page by mapping the page into virtual memory and using * bcopy to copy the page, one machine dependent page at a * time. */ /* * pmap_copy_page() * * Copy one physical page into another, by mapping the pages into * hook points. The same comment regarding cachability as in * pmap_zero_page also applies here. */ #if (ARM_MMU_GENERIC + ARM_MMU_SA1) != 0 || defined (CPU_XSCALE_CORE3) void pmap_copy_page_generic(vm_paddr_t src, vm_paddr_t dst) { #if 0 struct vm_page *src_pg = PHYS_TO_VM_PAGE(src); #endif #ifdef DEBUG struct vm_page *dst_pg = PHYS_TO_VM_PAGE(dst); if (dst_pg->md.pvh_list != NULL) panic("pmap_copy_page: dst page has mappings"); #endif /* * Clean the source page. Hold the source page's lock for * the duration of the copy so that no other mappings can * be created while we have a potentially aliased mapping. */ #if 0 /* * XXX: Not needed while we call cpu_dcache_wbinv_all() in * pmap_copy_page(). */ (void) pmap_clean_page(TAILQ_FIRST(&src_pg->md.pv_list), TRUE); #endif /* * Map the pages into the page hook points, copy them, and purge * the cache for the appropriate page. Invalidate the TLB * as required. */ mtx_lock(&cmtx); *csrc_pte = L2_S_PROTO | src | L2_S_PROT(PTE_KERNEL, VM_PROT_READ) | pte_l2_s_cache_mode; PTE_SYNC(csrc_pte); *cdst_pte = L2_S_PROTO | dst | L2_S_PROT(PTE_KERNEL, VM_PROT_WRITE) | pte_l2_s_cache_mode; PTE_SYNC(cdst_pte); cpu_tlb_flushD_SE(csrcp); cpu_tlb_flushD_SE(cdstp); cpu_cpwait(); bcopy_page(csrcp, cdstp); mtx_unlock(&cmtx); cpu_dcache_inv_range(csrcp, PAGE_SIZE); cpu_dcache_wbinv_range(cdstp, PAGE_SIZE); } #endif /* (ARM_MMU_GENERIC + ARM_MMU_SA1) != 0 */ #if ARM_MMU_XSCALE == 1 void pmap_copy_page_xscale(vm_paddr_t src, vm_paddr_t dst) { #if 0 /* XXX: Only needed for pmap_clean_page(), which is commented out. */ struct vm_page *src_pg = PHYS_TO_VM_PAGE(src); #endif #ifdef DEBUG struct vm_page *dst_pg = PHYS_TO_VM_PAGE(dst); if (dst_pg->md.pvh_list != NULL) panic("pmap_copy_page: dst page has mappings"); #endif /* * Clean the source page. Hold the source page's lock for * the duration of the copy so that no other mappings can * be created while we have a potentially aliased mapping. */ #if 0 /* * XXX: Not needed while we call cpu_dcache_wbinv_all() in * pmap_copy_page(). */ (void) pmap_clean_page(TAILQ_FIRST(&src_pg->md.pv_list), TRUE); #endif /* * Map the pages into the page hook points, copy them, and purge * the cache for the appropriate page. Invalidate the TLB * as required. */ mtx_lock(&cmtx); *csrc_pte = L2_S_PROTO | src | L2_S_PROT(PTE_KERNEL, VM_PROT_READ) | L2_C | L2_XSCALE_T_TEX(TEX_XSCALE_X); /* mini-data */ PTE_SYNC(csrc_pte); *cdst_pte = L2_S_PROTO | dst | L2_S_PROT(PTE_KERNEL, VM_PROT_WRITE) | L2_C | L2_XSCALE_T_TEX(TEX_XSCALE_X); /* mini-data */ PTE_SYNC(cdst_pte); cpu_tlb_flushD_SE(csrcp); cpu_tlb_flushD_SE(cdstp); cpu_cpwait(); bcopy_page(csrcp, cdstp); mtx_unlock(&cmtx); xscale_cache_clean_minidata(); } #endif /* ARM_MMU_XSCALE == 1 */ void pmap_copy_page(vm_page_t src, vm_page_t dst) { #ifdef ARM_USE_SMALL_ALLOC vm_offset_t srcpg, dstpg; #endif cpu_dcache_wbinv_all(); if (_arm_memcpy && PAGE_SIZE >= _min_memcpy_size && _arm_memcpy((void *)VM_PAGE_TO_PHYS(dst), (void *)VM_PAGE_TO_PHYS(src), PAGE_SIZE, IS_PHYSICAL) == 0) return; #ifdef ARM_USE_SMALL_ALLOC srcpg = arm_ptovirt(VM_PAGE_TO_PHYS(src)); dstpg = arm_ptovirt(VM_PAGE_TO_PHYS(dst)); bcopy_page(srcpg, dstpg); cpu_dcache_wbinv_range(dstpg, PAGE_SIZE); #else pmap_copy_page_func(VM_PAGE_TO_PHYS(src), VM_PAGE_TO_PHYS(dst)); #endif } /* * this routine returns true if a physical page resides * in the given pmap. */ boolean_t pmap_page_exists_quick(pmap_t pmap, vm_page_t m) { pv_entry_t pv; int loops = 0; if (m->flags & PG_FICTITIOUS) return (FALSE); /* * Not found, check current mappings returning immediately */ for (pv = TAILQ_FIRST(&m->md.pv_list); pv; pv = TAILQ_NEXT(pv, pv_list)) { if (pv->pv_pmap == pmap) { return (TRUE); } loops++; if (loops >= 16) break; } return (FALSE); } /* * pmap_ts_referenced: * * Return the count of reference bits for a page, clearing all of them. */ int pmap_ts_referenced(vm_page_t m) { if (m->flags & PG_FICTITIOUS) return (0); return (pmap_clearbit(m, PVF_REF)); } boolean_t pmap_is_modified(vm_page_t m) { if (m->md.pvh_attrs & PVF_MOD) return (TRUE); return(FALSE); } /* * Clear the modify bits on the specified physical page. */ void pmap_clear_modify(vm_page_t m) { if (m->md.pvh_attrs & PVF_MOD) pmap_clearbit(m, PVF_MOD); } /* * pmap_clear_reference: * * Clear the reference bit on the specified physical page. */ void pmap_clear_reference(vm_page_t m) { if (m->md.pvh_attrs & PVF_REF) pmap_clearbit(m, PVF_REF); } /* * Clear the write and modified bits in each of the given page's mappings. */ void pmap_remove_write(vm_page_t m) { if (m->flags & PG_WRITEABLE) pmap_clearbit(m, PVF_WRITE); } /* * perform the pmap work for mincore */ int pmap_mincore(pmap_t pmap, vm_offset_t addr) { printf("pmap_mincore()\n"); return (0); } vm_offset_t pmap_addr_hint(vm_object_t obj, vm_offset_t addr, vm_size_t size) { return(addr); } /* * Map a set of physical memory pages into the kernel virtual * address space. Return a pointer to where it is mapped. This * routine is intended to be used for mapping device memory, * NOT real memory. */ void * pmap_mapdev(vm_offset_t pa, vm_size_t size) { vm_offset_t va, tmpva, offset; offset = pa & PAGE_MASK; size = roundup(size, PAGE_SIZE); GIANT_REQUIRED; va = kmem_alloc_nofault(kernel_map, size); if (!va) panic("pmap_mapdev: Couldn't alloc kernel virtual memory"); for (tmpva = va; size > 0;) { pmap_kenter_internal(tmpva, pa, 0); size -= PAGE_SIZE; tmpva += PAGE_SIZE; pa += PAGE_SIZE; } return ((void *)(va + offset)); } #define BOOTSTRAP_DEBUG /* * pmap_map_section: * * Create a single section mapping. */ void pmap_map_section(vm_offset_t l1pt, vm_offset_t va, vm_offset_t pa, int prot, int cache) { pd_entry_t *pde = (pd_entry_t *) l1pt; pd_entry_t fl; KASSERT(((va | pa) & L1_S_OFFSET) == 0, ("ouin2")); switch (cache) { case PTE_NOCACHE: default: fl = 0; break; case PTE_CACHE: fl = pte_l1_s_cache_mode; break; case PTE_PAGETABLE: fl = pte_l1_s_cache_mode_pt; break; } pde[va >> L1_S_SHIFT] = L1_S_PROTO | pa | L1_S_PROT(PTE_KERNEL, prot) | fl | L1_S_DOM(PMAP_DOMAIN_KERNEL); PTE_SYNC(&pde[va >> L1_S_SHIFT]); } /* * pmap_link_l2pt: * * Link the L2 page table specified by l2pv.pv_pa into the L1 * page table at the slot for "va". */ void pmap_link_l2pt(vm_offset_t l1pt, vm_offset_t va, struct pv_addr *l2pv) { pd_entry_t *pde = (pd_entry_t *) l1pt, proto; u_int slot = va >> L1_S_SHIFT; proto = L1_S_DOM(PMAP_DOMAIN_KERNEL) | L1_C_PROTO; #ifdef VERBOSE_INIT_ARM printf("pmap_link_l2pt: pa=0x%x va=0x%x\n", l2pv->pv_pa, l2pv->pv_va); #endif pde[slot + 0] = proto | (l2pv->pv_pa + 0x000); PTE_SYNC(&pde[slot]); SLIST_INSERT_HEAD(&kernel_pt_list, l2pv, pv_list); } /* * pmap_map_entry * * Create a single page mapping. */ void pmap_map_entry(vm_offset_t l1pt, vm_offset_t va, vm_offset_t pa, int prot, int cache) { pd_entry_t *pde = (pd_entry_t *) l1pt; pt_entry_t fl; pt_entry_t *pte; KASSERT(((va | pa) & PAGE_MASK) == 0, ("ouin")); switch (cache) { case PTE_NOCACHE: default: fl = 0; break; case PTE_CACHE: fl = pte_l2_s_cache_mode; break; case PTE_PAGETABLE: fl = pte_l2_s_cache_mode_pt; break; } if ((pde[va >> L1_S_SHIFT] & L1_TYPE_MASK) != L1_TYPE_C) panic("pmap_map_entry: no L2 table for VA 0x%08x", va); pte = (pt_entry_t *) kernel_pt_lookup(pde[L1_IDX(va)] & L1_C_ADDR_MASK); if (pte == NULL) panic("pmap_map_entry: can't find L2 table for VA 0x%08x", va); pte[l2pte_index(va)] = L2_S_PROTO | pa | L2_S_PROT(PTE_KERNEL, prot) | fl; PTE_SYNC(&pte[l2pte_index(va)]); } /* * pmap_map_chunk: * * Map a chunk of memory using the most efficient mappings * possible (section. large page, small page) into the * provided L1 and L2 tables at the specified virtual address. */ vm_size_t pmap_map_chunk(vm_offset_t l1pt, vm_offset_t va, vm_offset_t pa, vm_size_t size, int prot, int cache) { pd_entry_t *pde = (pd_entry_t *) l1pt; pt_entry_t *pte, f1, f2s, f2l; vm_size_t resid; int i; resid = (size + (PAGE_SIZE - 1)) & ~(PAGE_SIZE - 1); if (l1pt == 0) panic("pmap_map_chunk: no L1 table provided"); #ifdef VERBOSE_INIT_ARM printf("pmap_map_chunk: pa=0x%x va=0x%x size=0x%x resid=0x%x " "prot=0x%x cache=%d\n", pa, va, size, resid, prot, cache); #endif switch (cache) { case PTE_NOCACHE: default: f1 = 0; f2l = 0; f2s = 0; break; case PTE_CACHE: f1 = pte_l1_s_cache_mode; f2l = pte_l2_l_cache_mode; f2s = pte_l2_s_cache_mode; break; case PTE_PAGETABLE: f1 = pte_l1_s_cache_mode_pt; f2l = pte_l2_l_cache_mode_pt; f2s = pte_l2_s_cache_mode_pt; break; } size = resid; while (resid > 0) { /* See if we can use a section mapping. */ if (L1_S_MAPPABLE_P(va, pa, resid)) { #ifdef VERBOSE_INIT_ARM printf("S"); #endif pde[va >> L1_S_SHIFT] = L1_S_PROTO | pa | L1_S_PROT(PTE_KERNEL, prot) | f1 | L1_S_DOM(PMAP_DOMAIN_KERNEL); PTE_SYNC(&pde[va >> L1_S_SHIFT]); va += L1_S_SIZE; pa += L1_S_SIZE; resid -= L1_S_SIZE; continue; } /* * Ok, we're going to use an L2 table. Make sure * one is actually in the corresponding L1 slot * for the current VA. */ if ((pde[va >> L1_S_SHIFT] & L1_TYPE_MASK) != L1_TYPE_C) panic("pmap_map_chunk: no L2 table for VA 0x%08x", va); pte = (pt_entry_t *) kernel_pt_lookup( pde[L1_IDX(va)] & L1_C_ADDR_MASK); if (pte == NULL) panic("pmap_map_chunk: can't find L2 table for VA" "0x%08x", va); /* See if we can use a L2 large page mapping. */ if (L2_L_MAPPABLE_P(va, pa, resid)) { #ifdef VERBOSE_INIT_ARM printf("L"); #endif for (i = 0; i < 16; i++) { pte[l2pte_index(va) + i] = L2_L_PROTO | pa | L2_L_PROT(PTE_KERNEL, prot) | f2l; PTE_SYNC(&pte[l2pte_index(va) + i]); } va += L2_L_SIZE; pa += L2_L_SIZE; resid -= L2_L_SIZE; continue; } /* Use a small page mapping. */ #ifdef VERBOSE_INIT_ARM printf("P"); #endif pte[l2pte_index(va)] = L2_S_PROTO | pa | L2_S_PROT(PTE_KERNEL, prot) | f2s; PTE_SYNC(&pte[l2pte_index(va)]); va += PAGE_SIZE; pa += PAGE_SIZE; resid -= PAGE_SIZE; } #ifdef VERBOSE_INIT_ARM printf("\n"); #endif return (size); } /********************** Static device map routines ***************************/ static const struct pmap_devmap *pmap_devmap_table; /* * Register the devmap table. This is provided in case early console * initialization needs to register mappings created by bootstrap code * before pmap_devmap_bootstrap() is called. */ void pmap_devmap_register(const struct pmap_devmap *table) { pmap_devmap_table = table; } /* * Map all of the static regions in the devmap table, and remember * the devmap table so other parts of the kernel can look up entries * later. */ void pmap_devmap_bootstrap(vm_offset_t l1pt, const struct pmap_devmap *table) { int i; pmap_devmap_table = table; for (i = 0; pmap_devmap_table[i].pd_size != 0; i++) { #ifdef VERBOSE_INIT_ARM printf("devmap: %08x -> %08x @ %08x\n", pmap_devmap_table[i].pd_pa, pmap_devmap_table[i].pd_pa + pmap_devmap_table[i].pd_size - 1, pmap_devmap_table[i].pd_va); #endif pmap_map_chunk(l1pt, pmap_devmap_table[i].pd_va, pmap_devmap_table[i].pd_pa, pmap_devmap_table[i].pd_size, pmap_devmap_table[i].pd_prot, pmap_devmap_table[i].pd_cache); } } const struct pmap_devmap * pmap_devmap_find_pa(vm_paddr_t pa, vm_size_t size) { int i; if (pmap_devmap_table == NULL) return (NULL); for (i = 0; pmap_devmap_table[i].pd_size != 0; i++) { if (pa >= pmap_devmap_table[i].pd_pa && pa + size <= pmap_devmap_table[i].pd_pa + pmap_devmap_table[i].pd_size) return (&pmap_devmap_table[i]); } return (NULL); } const struct pmap_devmap * pmap_devmap_find_va(vm_offset_t va, vm_size_t size) { int i; if (pmap_devmap_table == NULL) return (NULL); for (i = 0; pmap_devmap_table[i].pd_size != 0; i++) { if (va >= pmap_devmap_table[i].pd_va && va + size <= pmap_devmap_table[i].pd_va + pmap_devmap_table[i].pd_size) return (&pmap_devmap_table[i]); } return (NULL); } Index: head/sys/arm/at91/kb920x_machdep.c =================================================================== --- head/sys/arm/at91/kb920x_machdep.c (revision 173360) +++ head/sys/arm/at91/kb920x_machdep.c (revision 173361) @@ -1,495 +1,495 @@ /*- * Copyright (c) 1994-1998 Mark Brinicombe. * Copyright (c) 1994 Brini. * All rights reserved. * * This code is derived from software written for Brini by Mark Brinicombe * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Brini. * 4. The name of the company nor the name of the author may be used to * endorse or promote products derived from this software without specific * prior written permission. * * THIS SOFTWARE IS PROVIDED BY BRINI ``AS IS'' AND ANY EXPRESS OR IMPLIED * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL BRINI OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * RiscBSD kernel project * * machdep.c * * Machine dependant functions for kernel setup * * This file needs a lot of work. * * Created : 17/09/94 */ #include "opt_msgbuf.h" #include "opt_ddb.h" #include "opt_at91.h" #include __FBSDID("$FreeBSD$"); #define _ARM32_BUS_DMA_PRIVATE #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define KERNEL_PT_SYS 0 /* Page table for mapping proc0 zero page */ #define KERNEL_PT_KERN 1 #define KERNEL_PT_KERN_NUM 22 #define KERNEL_PT_AFKERNEL KERNEL_PT_KERN + KERNEL_PT_KERN_NUM /* L2 table for mapping after kernel */ #define KERNEL_PT_AFKERNEL_NUM 5 /* this should be evenly divisable by PAGE_SIZE / L2_TABLE_SIZE_REAL (or 4) */ #define NUM_KERNEL_PTS (KERNEL_PT_AFKERNEL + KERNEL_PT_AFKERNEL_NUM) /* Define various stack sizes in pages */ #define IRQ_STACK_SIZE 1 #define ABT_STACK_SIZE 1 #define UND_STACK_SIZE 1 extern u_int data_abort_handler_address; extern u_int prefetch_abort_handler_address; extern u_int undefined_handler_address; struct pv_addr kernel_pt_table[NUM_KERNEL_PTS]; extern void *_end; extern int *end; struct pcpu __pcpu; struct pcpu *pcpup = &__pcpu; /* Physical and virtual addresses for some global pages */ vm_paddr_t phys_avail[10]; vm_paddr_t dump_avail[4]; vm_offset_t physical_pages; struct pv_addr systempage; struct pv_addr msgbufpv; struct pv_addr irqstack; struct pv_addr undstack; struct pv_addr abtstack; struct pv_addr kernelstack; static struct trapframe proc0_tf; /* Static device mappings. */ static const struct pmap_devmap kb920x_devmap[] = { /* * Map the on-board devices VA == PA so that we can access them * with the MMU on or off. */ { /* * This at least maps the interrupt controller, the UART * and the timer. Other devices should use newbus to * map their memory anyway. */ 0xdff00000, 0xfff00000, 0x100000, VM_PROT_READ|VM_PROT_WRITE, PTE_NOCACHE, }, /* * We can't just map the OHCI registers VA == PA, because * AT91RM92_OHCI_BASE belongs to the userland address space. * We could just choose a different virtual address, but a better * solution would probably be to just use pmap_mapdev() to allocate * KVA, as we don't need the OHCI controller before the vm * initialization is done. However, the AT91 resource allocation * system doesn't know how to use pmap_mapdev() yet. */ #if 1 { /* * Add the ohci controller, and anything else that might be * on this chip select for a VA/PA mapping. */ AT91RM92_OHCI_BASE, AT91RM92_OHCI_PA_BASE, AT91RM92_OHCI_SIZE, VM_PROT_READ|VM_PROT_WRITE, PTE_NOCACHE, }, #endif { 0, 0, 0, 0, 0, } }; #define SDRAM_START 0xa0000000 #ifdef DDB extern vm_offset_t ksym_start, ksym_end; #endif static long ramsize(void) { uint32_t *SDRAMC = (uint32_t *)(AT91RM92_BASE + AT91RM92_SDRAMC_BASE); uint32_t cr, mr; int banks, rows, cols, bw; cr = SDRAMC[AT91RM92_SDRAMC_CR / 4]; mr = SDRAMC[AT91RM92_SDRAMC_MR / 4]; bw = (mr & AT91RM92_SDRAMC_MR_DBW_16) ? 1 : 2; banks = (cr & AT91RM92_SDRAMC_CR_NB_4) ? 2 : 1; rows = ((cr & AT91RM92_SDRAMC_CR_NR_MASK) >> 2) + 11; cols = (cr & AT91RM92_SDRAMC_CR_NC_MASK) + 8; return (1 << (cols + rows + banks + bw)); } static long board_init(void) { /* * Since the USART supprots RS-485 multidrop mode, it allows the * TX pins to float. However, for RS-232 operations, we don't want * these pins to float. Instead, they should be pulled up to avoid * mismatches. Linux does something similar when it configures the * TX lines. This implies that we also allow the RX lines to float * rather than be in the state they are left in by the boot loader. * Since they are input pins, I think that this is the right thing * to do. */ /* PIOA's A periph: Turn USART 0 and 2's TX/RX pins */ at91_pio_use_periph_a(AT91RM92_PIOA_BASE, AT91C_PA18_RXD0 | AT91C_PA22_RXD2, 0); at91_pio_use_periph_a(AT91RM92_PIOA_BASE, AT91C_PA17_TXD0 | AT91C_PA23_TXD2, 1); /* PIOA's B periph: Turn USART 3's TX/RX pins */ at91_pio_use_periph_b(AT91RM92_PIOA_BASE, AT91C_PA6_RXD3, 0); at91_pio_use_periph_b(AT91RM92_PIOA_BASE, AT91C_PA5_TXD3, 1); #ifdef AT91_TSC /* We're using TC0's A1 and A2 input */ at91_pio_use_periph_b(AT91RM92_PIOA_BASE, AT91C_PA19_TIOA1 | AT91C_PA21_TIOA2, 0); #endif /* PIOB's A periph: Turn USART 1's TX/RX pins */ at91_pio_use_periph_a(AT91RM92_PIOB_BASE, AT91C_PB21_RXD1, 0); at91_pio_use_periph_a(AT91RM92_PIOB_BASE, AT91C_PB20_TXD1, 1); /* Pin assignment */ #ifdef AT91_TSC /* Assert PA24 low -- talk to rubidium */ at91_pio_use_gpio(AT91RM92_PIOA_BASE, AT91C_PIO_PA24); at91_pio_gpio_output(AT91RM92_PIOA_BASE, AT91C_PIO_PA24, 0); at91_pio_gpio_clear(AT91RM92_PIOA_BASE, AT91C_PIO_PA24); at91_pio_use_gpio(AT91RM92_PIOB_BASE, AT91C_PIO_PB16 | AT91C_PIO_PB17 | AT91C_PIO_PB18 | AT91C_PIO_PB19); #endif return (ramsize()); } void * initarm(void *arg, void *arg2) { struct pv_addr kernel_l1pt; int loop; u_int l1pagetable; vm_offset_t freemempos; vm_offset_t afterkern; int i; uint32_t fake_preload[35]; uint32_t memsize; vm_offset_t lastaddr; #ifdef DDB vm_offset_t zstart = 0, zend = 0; #endif i = 0; set_cpufuncs(); fake_preload[i++] = MODINFO_NAME; fake_preload[i++] = strlen("elf kernel") + 1; strcpy((char*)&fake_preload[i++], "elf kernel"); i += 2; fake_preload[i++] = MODINFO_TYPE; fake_preload[i++] = strlen("elf kernel") + 1; strcpy((char*)&fake_preload[i++], "elf kernel"); i += 2; fake_preload[i++] = MODINFO_ADDR; fake_preload[i++] = sizeof(vm_offset_t); fake_preload[i++] = KERNVIRTADDR; fake_preload[i++] = MODINFO_SIZE; fake_preload[i++] = sizeof(uint32_t); fake_preload[i++] = (uint32_t)&end - KERNVIRTADDR; #ifdef DDB if (*(uint32_t *)KERNVIRTADDR == MAGIC_TRAMP_NUMBER) { fake_preload[i++] = MODINFO_METADATA|MODINFOMD_SSYM; fake_preload[i++] = sizeof(vm_offset_t); fake_preload[i++] = *(uint32_t *)(KERNVIRTADDR + 4); fake_preload[i++] = MODINFO_METADATA|MODINFOMD_ESYM; fake_preload[i++] = sizeof(vm_offset_t); fake_preload[i++] = *(uint32_t *)(KERNVIRTADDR + 8); lastaddr = *(uint32_t *)(KERNVIRTADDR + 8); zend = lastaddr; zstart = *(uint32_t *)(KERNVIRTADDR + 4); ksym_start = zstart; ksym_end = zend; } else #endif lastaddr = (vm_offset_t)&end; fake_preload[i++] = 0; fake_preload[i] = 0; preload_metadata = (void *)fake_preload; pcpu_init(pcpup, 0, sizeof(struct pcpu)); PCPU_SET(curthread, &thread0); #define KERNEL_TEXT_BASE (KERNBASE) freemempos = (lastaddr + PAGE_MASK) & ~PAGE_MASK; /* Define a macro to simplify memory allocation */ #define valloc_pages(var, np) \ alloc_pages((var).pv_va, (np)); \ (var).pv_pa = (var).pv_va + (KERNPHYSADDR - KERNVIRTADDR); #define alloc_pages(var, np) \ (var) = freemempos; \ freemempos += (np * PAGE_SIZE); \ memset((char *)(var), 0, ((np) * PAGE_SIZE)); while (((freemempos - L1_TABLE_SIZE) & (L1_TABLE_SIZE - 1)) != 0) freemempos += PAGE_SIZE; valloc_pages(kernel_l1pt, L1_TABLE_SIZE / PAGE_SIZE); for (loop = 0; loop < NUM_KERNEL_PTS; ++loop) { if (!(loop % (PAGE_SIZE / L2_TABLE_SIZE_REAL))) { valloc_pages(kernel_pt_table[loop], L2_TABLE_SIZE / PAGE_SIZE); } else { kernel_pt_table[loop].pv_va = freemempos - (loop % (PAGE_SIZE / L2_TABLE_SIZE_REAL)) * L2_TABLE_SIZE_REAL; kernel_pt_table[loop].pv_pa = kernel_pt_table[loop].pv_va - KERNVIRTADDR + KERNPHYSADDR; } i++; } /* * Allocate a page for the system page mapped to V0x00000000 * This page will just contain the system vectors and can be * shared by all processes. */ valloc_pages(systempage, 1); /* Allocate stacks for all modes */ valloc_pages(irqstack, IRQ_STACK_SIZE); valloc_pages(abtstack, ABT_STACK_SIZE); valloc_pages(undstack, UND_STACK_SIZE); valloc_pages(kernelstack, KSTACK_PAGES); valloc_pages(msgbufpv, round_page(MSGBUF_SIZE) / PAGE_SIZE); /* * Now we start construction of the L1 page table * We start by mapping the L2 page tables into the L1. * This means that we can replace L1 mappings later on if necessary */ l1pagetable = kernel_l1pt.pv_va; /* Map the L2 pages tables in the L1 page table */ pmap_link_l2pt(l1pagetable, ARM_VECTORS_HIGH, &kernel_pt_table[KERNEL_PT_SYS]); for (i = 0; i < KERNEL_PT_KERN_NUM; i++) pmap_link_l2pt(l1pagetable, KERNBASE + i * 0x100000, &kernel_pt_table[KERNEL_PT_KERN + i]); pmap_map_chunk(l1pagetable, KERNBASE, PHYSADDR, (((uint32_t)(lastaddr) - KERNBASE) + PAGE_SIZE) & ~(PAGE_SIZE - 1), VM_PROT_READ|VM_PROT_WRITE, PTE_CACHE); afterkern = round_page((lastaddr + L1_S_SIZE) & ~(L1_S_SIZE - 1)); for (i = 0; i < KERNEL_PT_AFKERNEL_NUM; i++) { pmap_link_l2pt(l1pagetable, afterkern + i * 0x00100000, &kernel_pt_table[KERNEL_PT_AFKERNEL + i]); } /* Map the vector page. */ pmap_map_entry(l1pagetable, ARM_VECTORS_HIGH, systempage.pv_pa, VM_PROT_READ|VM_PROT_WRITE, PTE_CACHE); /* Map the stack pages */ pmap_map_chunk(l1pagetable, irqstack.pv_va, irqstack.pv_pa, IRQ_STACK_SIZE * PAGE_SIZE, VM_PROT_READ|VM_PROT_WRITE, PTE_CACHE); pmap_map_chunk(l1pagetable, abtstack.pv_va, abtstack.pv_pa, ABT_STACK_SIZE * PAGE_SIZE, VM_PROT_READ|VM_PROT_WRITE, PTE_CACHE); pmap_map_chunk(l1pagetable, undstack.pv_va, undstack.pv_pa, UND_STACK_SIZE * PAGE_SIZE, VM_PROT_READ|VM_PROT_WRITE, PTE_CACHE); pmap_map_chunk(l1pagetable, kernelstack.pv_va, kernelstack.pv_pa, KSTACK_PAGES * PAGE_SIZE, VM_PROT_READ|VM_PROT_WRITE, PTE_CACHE); pmap_map_chunk(l1pagetable, kernel_l1pt.pv_va, kernel_l1pt.pv_pa, L1_TABLE_SIZE, VM_PROT_READ|VM_PROT_WRITE, PTE_PAGETABLE); pmap_map_chunk(l1pagetable, msgbufpv.pv_va, msgbufpv.pv_pa, MSGBUF_SIZE, VM_PROT_READ|VM_PROT_WRITE, PTE_CACHE); for (loop = 0; loop < NUM_KERNEL_PTS; ++loop) { pmap_map_chunk(l1pagetable, kernel_pt_table[loop].pv_va, kernel_pt_table[loop].pv_pa, L2_TABLE_SIZE, VM_PROT_READ|VM_PROT_WRITE, PTE_PAGETABLE); } pmap_devmap_bootstrap(l1pagetable, kb920x_devmap); cpu_domains((DOMAIN_CLIENT << (PMAP_DOMAIN_KERNEL*2)) | DOMAIN_CLIENT); setttb(kernel_l1pt.pv_pa); cpu_tlb_flushID(); cpu_domains(DOMAIN_CLIENT << (PMAP_DOMAIN_KERNEL*2)); cninit(); memsize = board_init(); physmem = memsize / PAGE_SIZE; /* * Pages were allocated during the secondary bootstrap for the * stacks for different CPU modes. * We must now set the r13 registers in the different CPU modes to * point to these stacks. * Since the ARM stacks use STMFD etc. we must set r13 to the top end * of the stack memory. */ cpu_control(CPU_CONTROL_MMU_ENABLE, CPU_CONTROL_MMU_ENABLE); set_stackptr(PSR_IRQ32_MODE, irqstack.pv_va + IRQ_STACK_SIZE * PAGE_SIZE); set_stackptr(PSR_ABT32_MODE, abtstack.pv_va + ABT_STACK_SIZE * PAGE_SIZE); set_stackptr(PSR_UND32_MODE, undstack.pv_va + UND_STACK_SIZE * PAGE_SIZE); /* * We must now clean the cache again.... * Cleaning may be done by reading new data to displace any * dirty data in the cache. This will have happened in setttb() * but since we are boot strapping the addresses used for the read * may have just been remapped and thus the cache could be out * of sync. A re-clean after the switch will cure this. * After booting there are no gross reloations of the kernel thus * this problem will not occur after initarm(). */ cpu_idcache_wbinv_all(); /* Set stack for exception handlers */ data_abort_handler_address = (u_int)data_abort_handler; prefetch_abort_handler_address = (u_int)prefetch_abort_handler; undefined_handler_address = (u_int)undefinedinstruction_bounce; undefined_init(); - proc_linkup(&proc0, &thread0); + proc_linkup0(&proc0, &thread0); thread0.td_kstack = kernelstack.pv_va; thread0.td_pcb = (struct pcb *) (thread0.td_kstack + KSTACK_PAGES * PAGE_SIZE) - 1; thread0.td_pcb->pcb_flags = 0; thread0.td_frame = &proc0_tf; pcpup->pc_curpcb = thread0.td_pcb; arm_vector_init(ARM_VECTORS_HIGH, ARM_VEC_ALL); pmap_curmaxkvaddr = afterkern + 0x100000 * (KERNEL_PT_KERN_NUM - 1); /* * ARM_USE_SMALL_ALLOC uses dump_avail, so it must be filled before * calling pmap_bootstrap. */ dump_avail[0] = PHYSADDR; dump_avail[1] = PHYSADDR + memsize; dump_avail[2] = 0; dump_avail[3] = 0; pmap_bootstrap(freemempos, KERNVIRTADDR + 3 * memsize, &kernel_l1pt); msgbufp = (void*)msgbufpv.pv_va; msgbufinit(msgbufp, MSGBUF_SIZE); mutex_init(); i = 0; #if PHYSADDR != KERNPHYSADDR phys_avail[i++] = PHYSADDR; phys_avail[i++] = KERNPHYSADDR; #endif phys_avail[i++] = virtual_avail - KERNVIRTADDR + KERNPHYSADDR; phys_avail[i++] = PHYSADDR + memsize; phys_avail[i++] = 0; phys_avail[i++] = 0; /* Do basic tuning, hz etc */ init_param1(); init_param2(physmem); kdb_init(); return ((void *)(kernelstack.pv_va + USPACE_SVC_STACK_TOP - sizeof(struct pcb))); } Index: head/sys/arm/sa11x0/assabet_machdep.c =================================================================== --- head/sys/arm/sa11x0/assabet_machdep.c (revision 173360) +++ head/sys/arm/sa11x0/assabet_machdep.c (revision 173361) @@ -1,455 +1,455 @@ /* $NetBSD: hpc_machdep.c,v 1.70 2003/09/16 08:18:22 agc Exp $ */ /*- * Copyright (c) 1994-1998 Mark Brinicombe. * Copyright (c) 1994 Brini. * All rights reserved. * * This code is derived from software written for Brini by Mark Brinicombe * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Brini. * 4. The name of the company nor the name of the author may be used to * endorse or promote products derived from this software without specific * prior written permission. * * THIS SOFTWARE IS PROVIDED BY BRINI ``AS IS'' AND ANY EXPRESS OR IMPLIED * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL BRINI OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * RiscBSD kernel project * * machdep.c * * Machine dependant functions for kernel setup * * This file needs a lot of work. * * Created : 17/09/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_md.h" #define _ARM32_BUS_DMA_PRIVATE #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define MDROOT_ADDR 0xd0400000 #define KERNEL_PT_VMEM 0 /* Page table for mapping video memory */ #define KERNEL_PT_SYS 0 /* Page table for mapping proc0 zero page */ #define KERNEL_PT_IO 3 /* Page table for mapping IO */ #define KERNEL_PT_IRQ 2 /* Page table for mapping irq handler */ #define KERNEL_PT_KERNEL 1 /* Page table for mapping kernel */ #define KERNEL_PT_L1 4 /* Page table for mapping l1pt */ #define KERNEL_PT_VMDATA 5 /* Page tables for mapping kernel VM */ #define KERNEL_PT_VMDATA_NUM 7 /* start with 16MB of KVM */ #define NUM_KERNEL_PTS (KERNEL_PT_VMDATA + KERNEL_PT_VMDATA_NUM) /* Define various stack sizes in pages */ #define IRQ_STACK_SIZE 1 #define ABT_STACK_SIZE 1 #ifdef IPKDB #define UND_STACK_SIZE 2 #else #define UND_STACK_SIZE 1 #endif #define KERNEL_VM_BASE (KERNBASE + 0x00100000) #define KERNEL_VM_SIZE 0x05000000 extern u_int data_abort_handler_address; extern u_int prefetch_abort_handler_address; extern u_int undefined_handler_address; struct pv_addr kernel_pt_table[NUM_KERNEL_PTS]; extern void *_end; extern vm_offset_t sa1110_uart_vaddr; extern vm_offset_t sa1_cache_clean_addr; extern int *end; struct pcpu __pcpu; struct pcpu *pcpup = &__pcpu; #ifndef MD_ROOT_SIZE #define MD_ROOT_SIZE 65535 #endif /* Physical and virtual addresses for some global pages */ vm_paddr_t phys_avail[10]; vm_paddr_t dump_avail[4]; vm_paddr_t physical_start; vm_paddr_t physical_end; vm_paddr_t physical_freestart; vm_offset_t physical_pages; struct pv_addr systempage; struct pv_addr irqstack; struct pv_addr undstack; struct pv_addr abtstack; struct pv_addr kernelstack; static struct trapframe proc0_tf; /* Static device mappings. */ static const struct pmap_devmap assabet_devmap[] = { /* * Map the on-board devices VA == PA so that we can access them * with the MMU on or off. */ { SACOM1_VBASE, SACOM1_BASE, SACOM1_SIZE, VM_PROT_READ|VM_PROT_WRITE, PTE_NOCACHE, }, { SAIPIC_BASE, SAIPIC_BASE, SAIPIC_SIZE, VM_PROT_READ|VM_PROT_WRITE, PTE_NOCACHE, }, { 0, 0, 0, 0, 0, } }; struct arm32_dma_range * bus_dma_get_range(void) { return (NULL); } int bus_dma_get_range_nb(void) { return (0); } void cpu_reset() { cpu_halt(); while (1); } #define CPU_SA110_CACHE_CLEAN_SIZE (0x4000 * 2) void * initarm(void *arg, void *arg2) { struct pcpu *pc; struct pv_addr kernel_l1pt; struct pv_addr md_addr; struct pv_addr md_bla; int loop; u_int kerneldatasize, symbolsize; u_int l1pagetable; vm_offset_t freemempos; vm_offset_t lastalloced; vm_size_t pt_size; int i = 0; uint32_t fake_preload[35]; uint32_t memsize = 32 * 1024 * 1024; sa1110_uart_vaddr = SACOM1_VBASE; boothowto = RB_VERBOSE | RB_SINGLE; cninit(); set_cpufuncs(); fake_preload[i++] = MODINFO_NAME; fake_preload[i++] = strlen("elf kernel") + 1; strcpy((char*)&fake_preload[i++], "elf kernel"); i += 2; fake_preload[i++] = MODINFO_TYPE; fake_preload[i++] = strlen("elf kernel") + 1; strcpy((char*)&fake_preload[i++], "elf kernel"); i += 2; fake_preload[i++] = MODINFO_ADDR; fake_preload[i++] = sizeof(vm_offset_t); fake_preload[i++] = KERNBASE; fake_preload[i++] = MODINFO_SIZE; fake_preload[i++] = sizeof(uint32_t); fake_preload[i++] = (uint32_t)&end - KERNBASE; fake_preload[i++] = MODINFO_NAME; fake_preload[i++] = strlen("md root") + 1; strcpy((char*)&fake_preload[i++], "md root"); i += 1; fake_preload[i++] = MODINFO_TYPE; fake_preload[i++] = strlen("md_image") + 1; strcpy((char*)&fake_preload[i++], "md_image"); i += 2; fake_preload[i++] = MODINFO_ADDR; fake_preload[i++] = sizeof(uint32_t); fake_preload[i++] = MDROOT_ADDR; fake_preload[i++] = MODINFO_SIZE; fake_preload[i++] = sizeof(uint32_t); fake_preload[i++] = MD_ROOT_SIZE * 1024; fake_preload[i++] = 0; fake_preload[i] = 0; preload_metadata = (void *)fake_preload; physmem = memsize / PAGE_SIZE; pc = &__pcpu; pcpu_init(pc, 0, sizeof(struct pcpu)); PCPU_SET(curthread, &thread0); physical_start = (vm_offset_t) KERNBASE; physical_end = (vm_offset_t) &end; physical_freestart = (((vm_offset_t)physical_end) + PAGE_MASK) & ~PAGE_MASK; md_addr.pv_va = md_addr.pv_pa = MDROOT_ADDR; #define KERNEL_TEXT_BASE (KERNBASE + 0x00040000) kerneldatasize = (u_int32_t)&end - (u_int32_t)KERNEL_TEXT_BASE; symbolsize = 0; freemempos = (vm_offset_t)round_page(physical_freestart); memset((void *)freemempos, 0, 256*1024); /* Define a macro to simplify memory allocation */ #define valloc_pages(var, np) \ alloc_pages((var).pv_pa, (np)); \ (var).pv_va = (var).pv_pa; #define alloc_pages(var, np) \ (var) = freemempos; \ freemempos += ((np) * PAGE_SIZE);\ memset((char *)(var), 0, ((np) * PAGE_SIZE)); while ((freemempos & (L1_TABLE_SIZE - 1)) != 0) freemempos += PAGE_SIZE; valloc_pages(kernel_l1pt, L1_TABLE_SIZE / PAGE_SIZE); valloc_pages(md_bla, L2_TABLE_SIZE / PAGE_SIZE); alloc_pages(sa1_cache_clean_addr, CPU_SA110_CACHE_CLEAN_SIZE / PAGE_SIZE); for (loop = 0; loop < NUM_KERNEL_PTS; ++loop) { if (!(loop % (PAGE_SIZE / L2_TABLE_SIZE_REAL))) { valloc_pages(kernel_pt_table[loop], L2_TABLE_SIZE / PAGE_SIZE); } else { kernel_pt_table[loop].pv_pa = freemempos + (loop % (PAGE_SIZE / L2_TABLE_SIZE_REAL)) * L2_TABLE_SIZE_REAL; kernel_pt_table[loop].pv_va = kernel_pt_table[loop].pv_pa; } } valloc_pages(systempage, 1); /* * Allocate a page for the system page mapped to V0x00000000 * This page will just contain the system vectors and can be * shared by all processes. */ pt_size = round_page(freemempos) - physical_freestart; /* Allocate stacks for all modes */ valloc_pages(irqstack, IRQ_STACK_SIZE); valloc_pages(abtstack, ABT_STACK_SIZE); valloc_pages(undstack, UND_STACK_SIZE); valloc_pages(kernelstack, KSTACK_PAGES); lastalloced = kernelstack.pv_va; /* * Allocate memory for the l1 and l2 page tables. The scheme to avoid * wasting memory by allocating the l1pt on the first 16k memory was * taken from NetBSD rpc_machdep.c. NKPT should be greater than 12 for * this to work (which is supposed to be the case). */ /* * Now we start construction of the L1 page table * We start by mapping the L2 page tables into the L1. * This means that we can replace L1 mappings later on if necessary */ l1pagetable = kernel_l1pt.pv_pa; /* Map the L2 pages tables in the L1 page table */ pmap_link_l2pt(l1pagetable, 0x00000000, &kernel_pt_table[KERNEL_PT_SYS]); pmap_link_l2pt(l1pagetable, KERNBASE, &kernel_pt_table[KERNEL_PT_KERNEL]); pmap_link_l2pt(l1pagetable, 0xd0000000, &kernel_pt_table[KERNEL_PT_IO]); pmap_link_l2pt(l1pagetable, lastalloced & ~((L1_S_SIZE * 4) - 1), &kernel_pt_table[KERNEL_PT_L1]); pmap_link_l2pt(l1pagetable, 0x90000000, &kernel_pt_table[KERNEL_PT_IRQ]); pmap_link_l2pt(l1pagetable, MDROOT_ADDR, &md_bla); for (loop = 0; loop < KERNEL_PT_VMDATA_NUM; ++loop) pmap_link_l2pt(l1pagetable, KERNEL_VM_BASE + loop * 0x00100000, &kernel_pt_table[KERNEL_PT_VMDATA + loop]); pmap_map_chunk(l1pagetable, KERNBASE, KERNBASE, ((uint32_t)&end - KERNBASE), VM_PROT_READ|VM_PROT_WRITE, PTE_CACHE); /* Map the stack pages */ pmap_map_chunk(l1pagetable, irqstack.pv_va, irqstack.pv_pa, IRQ_STACK_SIZE * PAGE_SIZE, VM_PROT_READ|VM_PROT_WRITE, PTE_CACHE); pmap_map_chunk(l1pagetable, md_addr.pv_va, md_addr.pv_pa, MD_ROOT_SIZE * 1024, VM_PROT_READ|VM_PROT_WRITE, PTE_CACHE); pmap_map_chunk(l1pagetable, abtstack.pv_va, abtstack.pv_pa, ABT_STACK_SIZE * PAGE_SIZE, VM_PROT_READ|VM_PROT_WRITE, PTE_CACHE); pmap_map_chunk(l1pagetable, undstack.pv_va, undstack.pv_pa, UND_STACK_SIZE * PAGE_SIZE, VM_PROT_READ|VM_PROT_WRITE, PTE_CACHE); pmap_map_chunk(l1pagetable, kernelstack.pv_va, kernelstack.pv_pa, KSTACK_PAGES * PAGE_SIZE, VM_PROT_READ|VM_PROT_WRITE, PTE_CACHE); pmap_map_chunk(l1pagetable, kernel_l1pt.pv_va, kernel_l1pt.pv_pa, L1_TABLE_SIZE, VM_PROT_READ|VM_PROT_WRITE, PTE_PAGETABLE); for (loop = 0; loop < NUM_KERNEL_PTS; ++loop) { pmap_map_chunk(l1pagetable, kernel_pt_table[loop].pv_va, kernel_pt_table[loop].pv_pa, L2_TABLE_SIZE, VM_PROT_READ|VM_PROT_WRITE, PTE_PAGETABLE); } pmap_map_chunk(l1pagetable, md_bla.pv_va, md_bla.pv_pa, L2_TABLE_SIZE, VM_PROT_READ|VM_PROT_WRITE, PTE_PAGETABLE); /* Map the vector page. */ pmap_map_entry(l1pagetable, vector_page, systempage.pv_pa, VM_PROT_READ|VM_PROT_WRITE, PTE_CACHE); /* Map the statically mapped devices. */ pmap_devmap_bootstrap(l1pagetable, assabet_devmap); pmap_map_chunk(l1pagetable, sa1_cache_clean_addr, 0xf0000000, CPU_SA110_CACHE_CLEAN_SIZE, VM_PROT_READ|VM_PROT_WRITE, PTE_CACHE); data_abort_handler_address = (u_int)data_abort_handler; prefetch_abort_handler_address = (u_int)prefetch_abort_handler; undefined_handler_address = (u_int)undefinedinstruction_bounce; undefined_init(); cpu_domains((DOMAIN_CLIENT << (PMAP_DOMAIN_KERNEL*2)) | DOMAIN_CLIENT); setttb(kernel_l1pt.pv_pa); cpu_tlb_flushID(); cpu_domains(DOMAIN_CLIENT << (PMAP_DOMAIN_KERNEL*2)); /* * Pages were allocated during the secondary bootstrap for the * stacks for different CPU modes. * We must now set the r13 registers in the different CPU modes to * point to these stacks. * Since the ARM stacks use STMFD etc. we must set r13 to the top end * of the stack memory. */ set_stackptr(PSR_IRQ32_MODE, irqstack.pv_va + IRQ_STACK_SIZE * PAGE_SIZE); set_stackptr(PSR_ABT32_MODE, abtstack.pv_va + ABT_STACK_SIZE * PAGE_SIZE); set_stackptr(PSR_UND32_MODE, undstack.pv_va + UND_STACK_SIZE * PAGE_SIZE); /* * We must now clean the cache again.... * Cleaning may be done by reading new data to displace any * dirty data in the cache. This will have happened in setttb() * but since we are boot strapping the addresses used for the read * may have just been remapped and thus the cache could be out * of sync. A re-clean after the switch will cure this. * After booting there are no gross reloations of the kernel thus * this problem will not occur after initarm(). */ cpu_idcache_wbinv_all(); bootverbose = 1; /* Set stack for exception handlers */ - proc_linkup(&proc0, &thread0); + proc_linkup0(&proc0, &thread0); thread0.td_kstack = kernelstack.pv_va; thread0.td_pcb = (struct pcb *) (thread0.td_kstack + KSTACK_PAGES * PAGE_SIZE) - 1; thread0.td_pcb->pcb_flags = 0; thread0.td_frame = &proc0_tf; /* Enable MMU, I-cache, D-cache, write buffer. */ cpufunc_control(0x337f, 0x107d); arm_vector_init(ARM_VECTORS_LOW, ARM_VEC_ALL); pmap_curmaxkvaddr = freemempos + KERNEL_PT_VMDATA_NUM * 0x400000; dump_avail[0] = phys_avail[0] = round_page(virtual_avail); dump_avail[1] = phys_avail[1] = 0xc0000000 + 0x02000000 - 1; dump_avail[2] = phys_avail[2] = 0; dump_avail[3] = phys_avail[3] = 0; mutex_init(); pmap_bootstrap(freemempos, 0xd0000000, &kernel_l1pt); /* Do basic tuning, hz etc */ init_param1(); init_param2(physmem); kdb_init(); return ((void *)(kernelstack.pv_va + USPACE_SVC_STACK_TOP - sizeof(struct pcb))); } Index: head/sys/arm/xscale/i80321/ep80219_machdep.c =================================================================== --- head/sys/arm/xscale/i80321/ep80219_machdep.c (revision 173360) +++ head/sys/arm/xscale/i80321/ep80219_machdep.c (revision 173361) @@ -1,517 +1,517 @@ /* $NetBSD: hpc_machdep.c,v 1.70 2003/09/16 08:18:22 agc Exp $ */ /*- * Copyright (c) 1994-1998 Mark Brinicombe. * Copyright (c) 1994 Brini. * All rights reserved. * * This code is derived from software written for Brini by Mark Brinicombe * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Brini. * 4. The name of the company nor the name of the author may be used to * endorse or promote products derived from this software without specific * prior written permission. * * THIS SOFTWARE IS PROVIDED BY BRINI ``AS IS'' AND ANY EXPRESS OR IMPLIED * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL BRINI OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * RiscBSD kernel project * * machdep.c * * Machine dependant functions for kernel setup * * This file needs a lot of work. * * Created : 17/09/94 */ #include "opt_msgbuf.h" #include "opt_ddb.h" #include __FBSDID("$FreeBSD$"); #define _ARM32_BUS_DMA_PRIVATE #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define KERNEL_PT_SYS 0 /* Page table for mapping proc0 zero page */ #define KERNEL_PT_IOPXS 1 #define KERNEL_PT_BEFOREKERN 2 #define KERNEL_PT_AFKERNEL 3 /* L2 table for mapping after kernel */ #define KERNEL_PT_AFKERNEL_NUM 9 /* this should be evenly divisable by PAGE_SIZE / L2_TABLE_SIZE_REAL (or 4) */ #define NUM_KERNEL_PTS (KERNEL_PT_AFKERNEL + KERNEL_PT_AFKERNEL_NUM) /* Define various stack sizes in pages */ #define IRQ_STACK_SIZE 1 #define ABT_STACK_SIZE 1 #ifdef IPKDB #define UND_STACK_SIZE 2 #else #define UND_STACK_SIZE 1 #endif extern u_int data_abort_handler_address; extern u_int prefetch_abort_handler_address; extern u_int undefined_handler_address; struct pv_addr kernel_pt_table[NUM_KERNEL_PTS]; extern void *_end; extern int *end; struct pcpu __pcpu; struct pcpu *pcpup = &__pcpu; /* Physical and virtual addresses for some global pages */ vm_paddr_t phys_avail[10]; vm_paddr_t dump_avail[4]; vm_offset_t physical_pages; vm_offset_t clean_sva, clean_eva; struct pv_addr systempage; struct pv_addr msgbufpv; struct pv_addr irqstack; struct pv_addr undstack; struct pv_addr abtstack; struct pv_addr kernelstack; struct pv_addr minidataclean; static struct trapframe proc0_tf; /* #define IQ80321_OBIO_BASE 0xfe800000UL */ /* #define IQ80321_OBIO_SIZE 0x00100000UL */ /* Static device mappings. */ static const struct pmap_devmap ep80219_devmap[] = { /* * Map the on-board devices VA == PA so that we can access them * with the MMU on or off. */ { IQ80321_OBIO_BASE, IQ80321_OBIO_BASE, IQ80321_OBIO_SIZE, VM_PROT_READ|VM_PROT_WRITE, PTE_NOCACHE, }, { IQ80321_IOW_VBASE, VERDE_OUT_XLATE_IO_WIN0_BASE, VERDE_OUT_XLATE_IO_WIN_SIZE, VM_PROT_READ|VM_PROT_WRITE, PTE_NOCACHE, }, { IQ80321_80321_VBASE, VERDE_PMMR_BASE, VERDE_PMMR_SIZE, VM_PROT_READ|VM_PROT_WRITE, PTE_NOCACHE, }, { 0, 0, 0, 0, 0, } }; #ifdef DDB extern vm_offset_t ksym_start, ksym_end; #endif extern vm_offset_t xscale_cache_clean_addr; void * initarm(void *arg, void *arg2) { struct pv_addr kernel_l1pt; int loop; u_int l1pagetable; vm_offset_t freemempos; vm_offset_t freemem_pt; vm_offset_t afterkern; vm_offset_t freemem_after; vm_offset_t lastaddr; #ifdef DDB vm_offset_t zstart = 0, zend = 0; #endif int i = 0; uint32_t fake_preload[35]; uint32_t memsize, memstart; i = 0; set_cpufuncs(); fake_preload[i++] = MODINFO_NAME; fake_preload[i++] = strlen("elf kernel") + 1; strcpy((char*)&fake_preload[i++], "elf kernel"); i += 2; fake_preload[i++] = MODINFO_TYPE; fake_preload[i++] = strlen("elf kernel") + 1; strcpy((char*)&fake_preload[i++], "elf kernel"); i += 2; fake_preload[i++] = MODINFO_ADDR; fake_preload[i++] = sizeof(vm_offset_t); fake_preload[i++] = KERNBASE + 0x00200000; fake_preload[i++] = MODINFO_SIZE; fake_preload[i++] = sizeof(uint32_t); fake_preload[i++] = (uint32_t)&end - KERNBASE - 0x00200000; #ifdef DDB if (*(uint32_t *)KERNVIRTADDR == MAGIC_TRAMP_NUMBER) { fake_preload[i++] = MODINFO_METADATA|MODINFOMD_SSYM; fake_preload[i++] = sizeof(vm_offset_t); fake_preload[i++] = *(uint32_t *)(KERNVIRTADDR + 4); fake_preload[i++] = MODINFO_METADATA|MODINFOMD_ESYM; fake_preload[i++] = sizeof(vm_offset_t); fake_preload[i++] = *(uint32_t *)(KERNVIRTADDR + 8); lastaddr = *(uint32_t *)(KERNVIRTADDR + 8); zend = lastaddr; zstart = *(uint32_t *)(KERNVIRTADDR + 4); ksym_start = zstart; ksym_end = zend; } else #endif lastaddr = (vm_offset_t)&end; fake_preload[i++] = 0; fake_preload[i] = 0; preload_metadata = (void *)fake_preload; pcpu_init(pcpup, 0, sizeof(struct pcpu)); PCPU_SET(curthread, &thread0); #define KERNEL_TEXT_BASE (KERNBASE + 0x00200000) freemempos = 0xa0200000; /* Define a macro to simplify memory allocation */ #define valloc_pages(var, np) \ alloc_pages((var).pv_pa, (np)); \ (var).pv_va = (var).pv_pa + 0x20000000; #define alloc_pages(var, np) \ freemempos -= (np * PAGE_SIZE); \ (var) = freemempos; \ memset((char *)(var), 0, ((np) * PAGE_SIZE)); while (((freemempos - L1_TABLE_SIZE) & (L1_TABLE_SIZE - 1)) != 0) freemempos -= PAGE_SIZE; valloc_pages(kernel_l1pt, L1_TABLE_SIZE / PAGE_SIZE); for (loop = 0; loop < NUM_KERNEL_PTS; ++loop) { if (!(loop % (PAGE_SIZE / L2_TABLE_SIZE_REAL))) { valloc_pages(kernel_pt_table[loop], L2_TABLE_SIZE / PAGE_SIZE); } else { kernel_pt_table[loop].pv_pa = freemempos + (loop % (PAGE_SIZE / L2_TABLE_SIZE_REAL)) * L2_TABLE_SIZE_REAL; kernel_pt_table[loop].pv_va = kernel_pt_table[loop].pv_pa + 0x20000000; } i++; } freemem_pt = freemempos; freemempos = 0xa0100000; /* * Allocate a page for the system page mapped to V0x00000000 * This page will just contain the system vectors and can be * shared by all processes. */ valloc_pages(systempage, 1); /* Allocate stacks for all modes */ valloc_pages(irqstack, IRQ_STACK_SIZE); valloc_pages(abtstack, ABT_STACK_SIZE); valloc_pages(undstack, UND_STACK_SIZE); valloc_pages(kernelstack, KSTACK_PAGES); alloc_pages(minidataclean.pv_pa, 1); valloc_pages(msgbufpv, round_page(MSGBUF_SIZE) / PAGE_SIZE); #ifdef ARM_USE_SMALL_ALLOC freemempos -= PAGE_SIZE; freemem_pt = trunc_page(freemem_pt); freemem_after = freemempos - ((freemem_pt - 0xa0100000) / PAGE_SIZE) * sizeof(struct arm_small_page); arm_add_smallalloc_pages((void *)(freemem_after + 0x20000000) , (void *)0xc0100000, freemem_pt - 0xa0100000, 1); freemem_after -= ((freemem_after - 0xa0001000) / PAGE_SIZE) * sizeof(struct arm_small_page); arm_add_smallalloc_pages((void *)(freemem_after + 0x20000000), (void *)0xc0001000, trunc_page(freemem_after) - 0xa0001000, 0); freemempos = trunc_page(freemem_after); freemempos -= PAGE_SIZE; #endif /* * Allocate memory for the l1 and l2 page tables. The scheme to avoid * wasting memory by allocating the l1pt on the first 16k memory was * taken from NetBSD rpc_machdep.c. NKPT should be greater than 12 for * this to work (which is supposed to be the case). */ /* * Now we start construction of the L1 page table * We start by mapping the L2 page tables into the L1. * This means that we can replace L1 mappings later on if necessary */ l1pagetable = kernel_l1pt.pv_va; /* Map the L2 pages tables in the L1 page table */ pmap_link_l2pt(l1pagetable, ARM_VECTORS_HIGH & ~(0x00100000 - 1), &kernel_pt_table[KERNEL_PT_SYS]); pmap_link_l2pt(l1pagetable, IQ80321_IOPXS_VBASE, &kernel_pt_table[KERNEL_PT_IOPXS]); pmap_link_l2pt(l1pagetable, KERNBASE, &kernel_pt_table[KERNEL_PT_BEFOREKERN]); pmap_map_chunk(l1pagetable, KERNBASE, IQ80321_SDRAM_START, 0x100000, VM_PROT_READ|VM_PROT_WRITE, PTE_CACHE); pmap_map_chunk(l1pagetable, KERNBASE + 0x100000, IQ80321_SDRAM_START + 0x100000, 0x100000, VM_PROT_READ|VM_PROT_WRITE, PTE_PAGETABLE); pmap_map_chunk(l1pagetable, KERNBASE + 0x200000, IQ80321_SDRAM_START + 0x200000, (((uint32_t)(lastaddr) - KERNBASE - 0x200000) + L1_S_SIZE) & ~(L1_S_SIZE - 1), VM_PROT_READ|VM_PROT_WRITE, PTE_CACHE); freemem_after = ((int)lastaddr + PAGE_SIZE) & ~(PAGE_SIZE - 1); afterkern = round_page(((vm_offset_t)lastaddr + L1_S_SIZE) & ~(L1_S_SIZE - 1)); for (i = 0; i < KERNEL_PT_AFKERNEL_NUM; i++) { pmap_link_l2pt(l1pagetable, afterkern + i * 0x00100000, &kernel_pt_table[KERNEL_PT_AFKERNEL + i]); } pmap_map_entry(l1pagetable, afterkern, minidataclean.pv_pa, VM_PROT_READ|VM_PROT_WRITE, PTE_CACHE); #ifdef ARM_USE_SMALL_ALLOC if ((freemem_after + 2 * PAGE_SIZE) <= afterkern) { arm_add_smallalloc_pages((void *)(freemem_after), (void*)(freemem_after + PAGE_SIZE), afterkern - (freemem_after + PAGE_SIZE), 0); } #endif /* Map the Mini-Data cache clean area. */ xscale_setup_minidata(l1pagetable, afterkern, minidataclean.pv_pa); /* Map the vector page. */ pmap_map_entry(l1pagetable, ARM_VECTORS_HIGH, systempage.pv_pa, VM_PROT_READ|VM_PROT_WRITE, PTE_CACHE); pmap_devmap_bootstrap(l1pagetable, ep80219_devmap); /* * Give the XScale global cache clean code an appropriately * sized chunk of unmapped VA space starting at 0xff000000 * (our device mappings end before this address). */ xscale_cache_clean_addr = 0xff000000U; cpu_domains((DOMAIN_CLIENT << (PMAP_DOMAIN_KERNEL*2)) | DOMAIN_CLIENT); setttb(kernel_l1pt.pv_pa); cpu_tlb_flushID(); cpu_domains(DOMAIN_CLIENT << (PMAP_DOMAIN_KERNEL*2)); /* * Pages were allocated during the secondary bootstrap for the * stacks for different CPU modes. * We must now set the r13 registers in the different CPU modes to * point to these stacks. * Since the ARM stacks use STMFD etc. we must set r13 to the top end * of the stack memory. */ set_stackptr(PSR_IRQ32_MODE, irqstack.pv_va + IRQ_STACK_SIZE * PAGE_SIZE); set_stackptr(PSR_ABT32_MODE, abtstack.pv_va + ABT_STACK_SIZE * PAGE_SIZE); set_stackptr(PSR_UND32_MODE, undstack.pv_va + UND_STACK_SIZE * PAGE_SIZE); /* * We must now clean the cache again.... * Cleaning may be done by reading new data to displace any * dirty data in the cache. This will have happened in setttb() * but since we are boot strapping the addresses used for the read * may have just been remapped and thus the cache could be out * of sync. A re-clean after the switch will cure this. * After booting there are no gross reloations of the kernel thus * this problem will not occur after initarm(). */ cpu_idcache_wbinv_all(); /* * Fetch the SDRAM start/size from the i80321 SDRAM configration * registers. */ i80321_calibrate_delay(); i80321_sdram_bounds(&obio_bs_tag, IQ80321_80321_VBASE + VERDE_MCU_BASE, &memstart, &memsize); physmem = memsize / PAGE_SIZE; cninit(); /* Set stack for exception handlers */ data_abort_handler_address = (u_int)data_abort_handler; prefetch_abort_handler_address = (u_int)prefetch_abort_handler; undefined_handler_address = (u_int)undefinedinstruction_bounce; undefined_init(); - proc_linkup(&proc0, &thread0); + proc_linkup0(&proc0, &thread0); thread0.td_kstack = kernelstack.pv_va; thread0.td_pcb = (struct pcb *) (thread0.td_kstack + KSTACK_PAGES * PAGE_SIZE) - 1; thread0.td_pcb->pcb_flags = 0; thread0.td_frame = &proc0_tf; pcpup->pc_curpcb = thread0.td_pcb; /* Enable MMU, I-cache, D-cache, write buffer. */ arm_vector_init(ARM_VECTORS_HIGH, ARM_VEC_ALL); pmap_curmaxkvaddr = afterkern + PAGE_SIZE; dump_avail[0] = 0xa0000000; dump_avail[1] = 0xa0000000 + memsize; dump_avail[2] = 0; dump_avail[3] = 0; pmap_bootstrap(pmap_curmaxkvaddr, 0xd0000000, &kernel_l1pt); msgbufp = (void*)msgbufpv.pv_va; msgbufinit(msgbufp, MSGBUF_SIZE); mutex_init(); i = 0; #ifdef ARM_USE_SMALL_ALLOC phys_avail[i++] = 0xa0000000; phys_avail[i++] = 0xa0001000; /* *XXX: Gross hack to get our * pages in the vm_page_array . */ #endif phys_avail[i++] = round_page(virtual_avail - KERNBASE + IQ80321_SDRAM_START); phys_avail[i++] = trunc_page(0xa0000000 + memsize - 1); phys_avail[i++] = 0; phys_avail[i] = 0; /* Do basic tuning, hz etc */ init_param1(); init_param2(physmem); kdb_init(); return ((void *)(kernelstack.pv_va + USPACE_SVC_STACK_TOP - sizeof(struct pcb))); } extern int machdep_pci_route_interrupt(device_t pcib, device_t dev, int pin) { int bus; int device; int func; uint32_t busno; struct i80321_pci_softc *sc = device_get_softc(pcib); bus = pci_get_bus(dev); device = pci_get_slot(dev); func = pci_get_function(dev); busno = bus_space_read_4(sc->sc_st, sc->sc_atu_sh, ATU_PCIXSR); busno = PCIXSR_BUSNO(busno); if (busno == 0xff) busno = 0; if (bus != busno) goto no_mapping; switch (device) { /* EP80219 PCI */ case 1: /* Ethernet i82555 10/100 */ printf("Device %d routed to irq %d\n", device, ICU_INT_XINT(0)); return (ICU_INT_XINT(0)); case 2: /* UART */ printf("Device %d routed to irq %d\n", device, ICU_INT_XINT(1)); return (ICU_INT_XINT(1)); case 3: /* * The S-ATA chips are behind the bridge, and all of * the S-ATA interrupts are wired together. */ printf("Device %d routed to irq %d\n", device, ICU_INT_XINT(2)); return (ICU_INT_XINT(2)); case 4: /* MINI-PIC_INT */ printf("Device %d routed to irq %d\n", device, ICU_INT_XINT(3)); return( ICU_INT_XINT(3)); default: no_mapping: printf("No mapping for %d/%d/%d/%c\n", bus, device, func, pin); } return (0); } Index: head/sys/arm/xscale/i80321/iq31244_machdep.c =================================================================== --- head/sys/arm/xscale/i80321/iq31244_machdep.c (revision 173360) +++ head/sys/arm/xscale/i80321/iq31244_machdep.c (revision 173361) @@ -1,533 +1,533 @@ /* $NetBSD: hpc_machdep.c,v 1.70 2003/09/16 08:18:22 agc Exp $ */ /*- * Copyright (c) 1994-1998 Mark Brinicombe. * Copyright (c) 1994 Brini. * All rights reserved. * * This code is derived from software written for Brini by Mark Brinicombe * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Brini. * 4. The name of the company nor the name of the author may be used to * endorse or promote products derived from this software without specific * prior written permission. * * THIS SOFTWARE IS PROVIDED BY BRINI ``AS IS'' AND ANY EXPRESS OR IMPLIED * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL BRINI OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * RiscBSD kernel project * * machdep.c * * Machine dependant functions for kernel setup * * This file needs a lot of work. * * Created : 17/09/94 */ #include "opt_msgbuf.h" #include "opt_ddb.h" #include __FBSDID("$FreeBSD$"); #define _ARM32_BUS_DMA_PRIVATE #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define KERNEL_PT_SYS 0 /* Page table for mapping proc0 zero page */ #define KERNEL_PT_IOPXS 1 #define KERNEL_PT_BEFOREKERN 2 #define KERNEL_PT_AFKERNEL 3 /* L2 table for mapping after kernel */ #define KERNEL_PT_AFKERNEL_NUM 9 /* this should be evenly divisable by PAGE_SIZE / L2_TABLE_SIZE_REAL (or 4) */ #define NUM_KERNEL_PTS (KERNEL_PT_AFKERNEL + KERNEL_PT_AFKERNEL_NUM) /* Define various stack sizes in pages */ #define IRQ_STACK_SIZE 1 #define ABT_STACK_SIZE 1 #ifdef IPKDB #define UND_STACK_SIZE 2 #else #define UND_STACK_SIZE 1 #endif extern u_int data_abort_handler_address; extern u_int prefetch_abort_handler_address; extern u_int undefined_handler_address; struct pv_addr kernel_pt_table[NUM_KERNEL_PTS]; extern void *_end; extern int *end; struct pcpu __pcpu; struct pcpu *pcpup = &__pcpu; /* Physical and virtual addresses for some global pages */ vm_paddr_t phys_avail[10]; vm_paddr_t dump_avail[4]; vm_offset_t physical_pages; vm_offset_t clean_sva, clean_eva; struct pv_addr systempage; struct pv_addr msgbufpv; struct pv_addr irqstack; struct pv_addr undstack; struct pv_addr abtstack; struct pv_addr kernelstack; struct pv_addr minidataclean; static struct trapframe proc0_tf; #define IQ80321_OBIO_BASE 0xfe800000UL #define IQ80321_OBIO_SIZE 0x00100000UL /* Static device mappings. */ static const struct pmap_devmap iq80321_devmap[] = { /* * Map the on-board devices VA == PA so that we can access them * with the MMU on or off. */ { IQ80321_OBIO_BASE, IQ80321_OBIO_BASE, IQ80321_OBIO_SIZE, VM_PROT_READ|VM_PROT_WRITE, PTE_NOCACHE, }, { IQ80321_IOW_VBASE, VERDE_OUT_XLATE_IO_WIN0_BASE, VERDE_OUT_XLATE_IO_WIN_SIZE, VM_PROT_READ|VM_PROT_WRITE, PTE_NOCACHE, }, { IQ80321_80321_VBASE, VERDE_PMMR_BASE, VERDE_PMMR_SIZE, VM_PROT_READ|VM_PROT_WRITE, PTE_NOCACHE, }, { 0, 0, 0, 0, 0, } }; #define SDRAM_START 0xa0000000 #ifdef DDB extern vm_offset_t ksym_start, ksym_end; #endif extern vm_offset_t xscale_cache_clean_addr; void * initarm(void *arg, void *arg2) { struct pv_addr kernel_l1pt; int loop; u_int l1pagetable; vm_offset_t freemempos; vm_offset_t freemem_pt; vm_offset_t afterkern; vm_offset_t freemem_after; vm_offset_t lastaddr; #ifdef DDB vm_offset_t zstart = 0, zend = 0; #endif int i; uint32_t fake_preload[35]; uint32_t memsize, memstart; i = 0; set_cpufuncs(); fake_preload[i++] = MODINFO_NAME; fake_preload[i++] = strlen("elf kernel") + 1; strcpy((char*)&fake_preload[i++], "elf kernel"); i += 2; fake_preload[i++] = MODINFO_TYPE; fake_preload[i++] = strlen("elf kernel") + 1; strcpy((char*)&fake_preload[i++], "elf kernel"); i += 2; fake_preload[i++] = MODINFO_ADDR; fake_preload[i++] = sizeof(vm_offset_t); fake_preload[i++] = KERNBASE + 0x00200000; fake_preload[i++] = MODINFO_SIZE; fake_preload[i++] = sizeof(uint32_t); fake_preload[i++] = (uint32_t)&end - KERNBASE - 0x00200000; #ifdef DDB if (*(uint32_t *)KERNVIRTADDR == MAGIC_TRAMP_NUMBER) { fake_preload[i++] = MODINFO_METADATA|MODINFOMD_SSYM; fake_preload[i++] = sizeof(vm_offset_t); fake_preload[i++] = *(uint32_t *)(KERNVIRTADDR + 4); fake_preload[i++] = MODINFO_METADATA|MODINFOMD_ESYM; fake_preload[i++] = sizeof(vm_offset_t); fake_preload[i++] = *(uint32_t *)(KERNVIRTADDR + 8); lastaddr = *(uint32_t *)(KERNVIRTADDR + 8); zend = lastaddr; zstart = *(uint32_t *)(KERNVIRTADDR + 4); ksym_start = zstart; ksym_end = zend; } else #endif lastaddr = (vm_offset_t)&end; fake_preload[i++] = 0; fake_preload[i] = 0; preload_metadata = (void *)fake_preload; pcpu_init(pcpup, 0, sizeof(struct pcpu)); PCPU_SET(curthread, &thread0); #define KERNEL_TEXT_BASE (KERNBASE + 0x00200000) freemempos = 0xa0200000; /* Define a macro to simplify memory allocation */ #define valloc_pages(var, np) \ alloc_pages((var).pv_pa, (np)); \ (var).pv_va = (var).pv_pa + 0x20000000; #define alloc_pages(var, np) \ freemempos -= (np * PAGE_SIZE); \ (var) = freemempos; \ memset((char *)(var), 0, ((np) * PAGE_SIZE)); while (((freemempos - L1_TABLE_SIZE) & (L1_TABLE_SIZE - 1)) != 0) freemempos -= PAGE_SIZE; valloc_pages(kernel_l1pt, L1_TABLE_SIZE / PAGE_SIZE); for (loop = 0; loop < NUM_KERNEL_PTS; ++loop) { if (!(loop % (PAGE_SIZE / L2_TABLE_SIZE_REAL))) { valloc_pages(kernel_pt_table[loop], L2_TABLE_SIZE / PAGE_SIZE); } else { kernel_pt_table[loop].pv_pa = freemempos + (loop % (PAGE_SIZE / L2_TABLE_SIZE_REAL)) * L2_TABLE_SIZE_REAL; kernel_pt_table[loop].pv_va = kernel_pt_table[loop].pv_pa + 0x20000000; } } freemem_pt = freemempos; freemempos = 0xa0100000; /* * Allocate a page for the system page mapped to V0x00000000 * This page will just contain the system vectors and can be * shared by all processes. */ valloc_pages(systempage, 1); /* Allocate stacks for all modes */ valloc_pages(irqstack, IRQ_STACK_SIZE); valloc_pages(abtstack, ABT_STACK_SIZE); valloc_pages(undstack, UND_STACK_SIZE); valloc_pages(kernelstack, KSTACK_PAGES); alloc_pages(minidataclean.pv_pa, 1); valloc_pages(msgbufpv, round_page(MSGBUF_SIZE) / PAGE_SIZE); #ifdef ARM_USE_SMALL_ALLOC freemempos -= PAGE_SIZE; freemem_pt = trunc_page(freemem_pt); freemem_after = freemempos - ((freemem_pt - 0xa0100000) / PAGE_SIZE) * sizeof(struct arm_small_page); arm_add_smallalloc_pages((void *)(freemem_after + 0x20000000) , (void *)0xc0100000, freemem_pt - 0xa0100000, 1); freemem_after -= ((freemem_after - 0xa0001000) / PAGE_SIZE) * sizeof(struct arm_small_page); arm_add_smallalloc_pages((void *)(freemem_after + 0x20000000) , (void *)0xc0001000, trunc_page(freemem_after) - 0xa0001000, 0); freemempos = trunc_page(freemem_after); freemempos -= PAGE_SIZE; #endif /* * Allocate memory for the l1 and l2 page tables. The scheme to avoid * wasting memory by allocating the l1pt on the first 16k memory was * taken from NetBSD rpc_machdep.c. NKPT should be greater than 12 for * this to work (which is supposed to be the case). */ /* * Now we start construction of the L1 page table * We start by mapping the L2 page tables into the L1. * This means that we can replace L1 mappings later on if necessary */ l1pagetable = kernel_l1pt.pv_va; /* Map the L2 pages tables in the L1 page table */ pmap_link_l2pt(l1pagetable, ARM_VECTORS_HIGH & ~(0x00100000 - 1), &kernel_pt_table[KERNEL_PT_SYS]); pmap_link_l2pt(l1pagetable, IQ80321_IOPXS_VBASE, &kernel_pt_table[KERNEL_PT_IOPXS]); pmap_link_l2pt(l1pagetable, KERNBASE, &kernel_pt_table[KERNEL_PT_BEFOREKERN]); pmap_map_chunk(l1pagetable, KERNBASE, SDRAM_START, 0x100000, VM_PROT_READ|VM_PROT_WRITE, PTE_CACHE); pmap_map_chunk(l1pagetable, KERNBASE + 0x100000, SDRAM_START + 0x100000, 0x100000, VM_PROT_READ|VM_PROT_WRITE, PTE_PAGETABLE); pmap_map_chunk(l1pagetable, KERNBASE + 0x200000, SDRAM_START + 0x200000, (((uint32_t)(lastaddr) - KERNBASE - 0x200000) + L1_S_SIZE) & ~(L1_S_SIZE - 1), VM_PROT_READ|VM_PROT_WRITE, PTE_CACHE); freemem_after = ((int)lastaddr + PAGE_SIZE) & ~(PAGE_SIZE - 1); afterkern = round_page(((vm_offset_t)lastaddr + L1_S_SIZE) & ~(L1_S_SIZE - 1)); for (i = 0; i < KERNEL_PT_AFKERNEL_NUM; i++) { pmap_link_l2pt(l1pagetable, afterkern + i * 0x00100000, &kernel_pt_table[KERNEL_PT_AFKERNEL + i]); } pmap_map_entry(l1pagetable, afterkern, minidataclean.pv_pa, VM_PROT_READ|VM_PROT_WRITE, PTE_CACHE); #ifdef ARM_USE_SMALL_ALLOC if ((freemem_after + 2 * PAGE_SIZE) <= afterkern) { arm_add_smallalloc_pages((void *)(freemem_after), (void*)(freemem_after + PAGE_SIZE), afterkern - (freemem_after + PAGE_SIZE), 0); } #endif /* Map the Mini-Data cache clean area. */ xscale_setup_minidata(l1pagetable, afterkern, minidataclean.pv_pa); /* Map the vector page. */ pmap_map_entry(l1pagetable, ARM_VECTORS_HIGH, systempage.pv_pa, VM_PROT_READ|VM_PROT_WRITE, PTE_CACHE); pmap_devmap_bootstrap(l1pagetable, iq80321_devmap); /* * Give the XScale global cache clean code an appropriately * sized chunk of unmapped VA space starting at 0xff000000 * (our device mappings end before this address). */ xscale_cache_clean_addr = 0xff000000U; cpu_domains((DOMAIN_CLIENT << (PMAP_DOMAIN_KERNEL*2)) | DOMAIN_CLIENT); setttb(kernel_l1pt.pv_pa); cpu_tlb_flushID(); cpu_domains(DOMAIN_CLIENT << (PMAP_DOMAIN_KERNEL*2)); /* * Pages were allocated during the secondary bootstrap for the * stacks for different CPU modes. * We must now set the r13 registers in the different CPU modes to * point to these stacks. * Since the ARM stacks use STMFD etc. we must set r13 to the top end * of the stack memory. */ set_stackptr(PSR_IRQ32_MODE, irqstack.pv_va + IRQ_STACK_SIZE * PAGE_SIZE); set_stackptr(PSR_ABT32_MODE, abtstack.pv_va + ABT_STACK_SIZE * PAGE_SIZE); set_stackptr(PSR_UND32_MODE, undstack.pv_va + UND_STACK_SIZE * PAGE_SIZE); /* * We must now clean the cache again.... * Cleaning may be done by reading new data to displace any * dirty data in the cache. This will have happened in setttb() * but since we are boot strapping the addresses used for the read * may have just been remapped and thus the cache could be out * of sync. A re-clean after the switch will cure this. * After booting there are no gross reloations of the kernel thus * this problem will not occur after initarm(). */ cpu_idcache_wbinv_all(); /* * Fetch the SDRAM start/size from the i80321 SDRAM configration * registers. */ i80321_calibrate_delay(); i80321_sdram_bounds(&obio_bs_tag, IQ80321_80321_VBASE + VERDE_MCU_BASE, &memstart, &memsize); physmem = memsize / PAGE_SIZE; cninit(); /* Set stack for exception handlers */ data_abort_handler_address = (u_int)data_abort_handler; prefetch_abort_handler_address = (u_int)prefetch_abort_handler; undefined_handler_address = (u_int)undefinedinstruction_bounce; undefined_init(); - proc_linkup(&proc0, &thread0); + proc_linkup0(&proc0, &thread0); thread0.td_kstack = kernelstack.pv_va; thread0.td_pcb = (struct pcb *) (thread0.td_kstack + KSTACK_PAGES * PAGE_SIZE) - 1; thread0.td_pcb->pcb_flags = 0; thread0.td_frame = &proc0_tf; pcpup->pc_curpcb = thread0.td_pcb; /* Enable MMU, I-cache, D-cache, write buffer. */ arm_vector_init(ARM_VECTORS_HIGH, ARM_VEC_ALL); pmap_curmaxkvaddr = afterkern + PAGE_SIZE; /* * ARM_USE_SMALL_ALLOC uses dump_avail, so it must be filled before * calling pmap_bootstrap. */ dump_avail[0] = 0xa0000000; dump_avail[1] = 0xa0000000 + memsize; dump_avail[2] = 0; dump_avail[3] = 0; pmap_bootstrap(pmap_curmaxkvaddr, 0xd0000000, &kernel_l1pt); msgbufp = (void*)msgbufpv.pv_va; msgbufinit(msgbufp, MSGBUF_SIZE); mutex_init(); i = 0; #ifdef ARM_USE_SMALL_ALLOC phys_avail[i++] = 0xa0000000; phys_avail[i++] = 0xa0001000; /* *XXX: Gross hack to get our * pages in the vm_page_array . */ #endif phys_avail[i++] = round_page(virtual_avail - KERNBASE + SDRAM_START); phys_avail[i++] = trunc_page(0xa0000000 + memsize - 1); phys_avail[i++] = 0; phys_avail[i] = 0; /* Do basic tuning, hz etc */ init_param1(); init_param2(physmem); kdb_init(); return ((void *)(kernelstack.pv_va + USPACE_SVC_STACK_TOP - sizeof(struct pcb))); } extern int machdep_pci_route_interrupt(device_t pcib, device_t dev, int pin) { int bus; int device; int func; uint32_t busno; struct i80321_pci_softc *sc = device_get_softc(pcib); bus = pci_get_bus(dev); device = pci_get_slot(dev); func = pci_get_function(dev); busno = bus_space_read_4(sc->sc_st, sc->sc_atu_sh, ATU_PCIXSR); busno = PCIXSR_BUSNO(busno); if (busno == 0xff) busno = 0; if (bus != busno) goto no_mapping; switch (device) { /* IQ31244 PCI */ case 1: /* PCIX-PCIX bridge */ /* * The S-ATA chips are behind the bridge, and all of * the S-ATA interrupts are wired together. */ return (ICU_INT_XINT(2)); case 2: /* PCI slot */ /* All pins are wired together. */ return (ICU_INT_XINT(3)); case 3: /* i82546 dual Gig-E */ if (pin == 1 || pin == 2) return (ICU_INT_XINT(0)); goto no_mapping; /* IQ80321 PCI */ case 4: /* i82544 Gig-E */ case 8: /* * Apparently you can set the device for the ethernet adapter * to 8 with a jumper, so handle that as well */ if (pin == 1) return (ICU_INT_XINT(0)); goto no_mapping; case 6: /* S-PCI-X slot */ if (pin == 1) return (ICU_INT_XINT(2)); if (pin == 2) return (ICU_INT_XINT(3)); goto no_mapping; default: no_mapping: printf("No mapping for %d/%d/%d/%c\n", bus, device, func, pin); } return (0); } Index: head/sys/arm/xscale/i8134x/crb_machdep.c =================================================================== --- head/sys/arm/xscale/i8134x/crb_machdep.c (revision 173360) +++ head/sys/arm/xscale/i8134x/crb_machdep.c (revision 173361) @@ -1,458 +1,458 @@ /* $NetBSD: hpc_machdep.c,v 1.70 2003/09/16 08:18:22 agc Exp $ */ /*- * Copyright (c) 1994-1998 Mark Brinicombe. * Copyright (c) 1994 Brini. * All rights reserved. * * This code is derived from software written for Brini by Mark Brinicombe * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Brini. * 4. The name of the company nor the name of the author may be used to * endorse or promote products derived from this software without specific * prior written permission. * * THIS SOFTWARE IS PROVIDED BY BRINI ``AS IS'' AND ANY EXPRESS OR IMPLIED * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL BRINI OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * RiscBSD kernel project * * machdep.c * * Machine dependant functions for kernel setup * * This file needs a lot of work. * * Created : 17/09/94 */ #include "opt_msgbuf.h" #include "opt_ddb.h" #include __FBSDID("$FreeBSD$"); #define _ARM32_BUS_DMA_PRIVATE #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* For i80321_calibrate_delay() */ #include #include #include #define KERNEL_PT_SYS 0 /* Page table for mapping proc0 zero page */ #define KERNEL_PT_IOPXS 1 #define KERNEL_PT_BEFOREKERN 2 #define KERNEL_PT_AFKERNEL 3 /* L2 table for mapping after kernel */ #define KERNEL_PT_AFKERNEL_NUM 9 /* this should be evenly divisable by PAGE_SIZE / L2_TABLE_SIZE_REAL (or 4) */ #define NUM_KERNEL_PTS (KERNEL_PT_AFKERNEL + KERNEL_PT_AFKERNEL_NUM) /* Define various stack sizes in pages */ #define IRQ_STACK_SIZE 1 #define ABT_STACK_SIZE 1 #ifdef IPKDB #define UND_STACK_SIZE 2 #else #define UND_STACK_SIZE 1 #endif extern u_int data_abort_handler_address; extern u_int prefetch_abort_handler_address; extern u_int undefined_handler_address; struct pv_addr kernel_pt_table[NUM_KERNEL_PTS]; extern void *_end; extern vm_offset_t sa1_cache_clean_addr; extern int *end; struct pcpu __pcpu; struct pcpu *pcpup = &__pcpu; /* Physical and virtual addresses for some global pages */ vm_paddr_t phys_avail[10]; vm_paddr_t dump_avail[4]; vm_offset_t physical_pages; vm_offset_t clean_sva, clean_eva; struct pv_addr systempage; struct pv_addr msgbufpv; struct pv_addr irqstack; struct pv_addr undstack; struct pv_addr abtstack; struct pv_addr kernelstack; static struct trapframe proc0_tf; /* Static device mappings. */ static const struct pmap_devmap iq81342_devmap[] = { { IOP34X_VADDR, IOP34X_HWADDR, IOP34X_SIZE, VM_PROT_READ|VM_PROT_WRITE, PTE_NOCACHE, }, { /* * Cheat and map a whole section, this will bring * both PCI-X and PCI-E outbound I/O */ IOP34X_PCIX_OIOBAR_VADDR &~ (0x100000 - 1), IOP34X_PCIX_OIOBAR &~ (0x100000 - 1), 0x100000, VM_PROT_READ|VM_PROT_WRITE, PTE_NOCACHE, }, { IOP34X_PCE1_VADDR, IOP34X_PCE1, IOP34X_PCE1_SIZE, VM_PROT_READ|VM_PROT_WRITE, PTE_NOCACHE, }, { 0, 0, 0, 0, 0, } }; #define SDRAM_START 0x00000000 #ifdef DDB extern vm_offset_t ksym_start, ksym_end; #endif extern vm_offset_t xscale_cache_clean_addr; void * initarm(void *arg, void *arg2) { struct pv_addr kernel_l1pt; int loop; u_int l1pagetable; vm_offset_t freemempos; vm_offset_t freemem_pt; vm_offset_t afterkern; vm_offset_t freemem_after; vm_offset_t lastaddr; #ifdef DDB vm_offset_t zstart = 0, zend = 0; #endif int i; uint32_t fake_preload[35]; uint32_t memsize, memstart; i = 0; set_cpufuncs(); fake_preload[i++] = MODINFO_NAME; fake_preload[i++] = strlen("elf kernel") + 1; strcpy((char*)&fake_preload[i++], "elf kernel"); i += 2; fake_preload[i++] = MODINFO_TYPE; fake_preload[i++] = strlen("elf kernel") + 1; strcpy((char*)&fake_preload[i++], "elf kernel"); i += 2; fake_preload[i++] = MODINFO_ADDR; fake_preload[i++] = sizeof(vm_offset_t); fake_preload[i++] = KERNBASE + 0x00200000; fake_preload[i++] = MODINFO_SIZE; fake_preload[i++] = sizeof(uint32_t); fake_preload[i++] = (uint32_t)&end - KERNBASE - 0x00200000; #ifdef DDB if (*(uint32_t *)KERNVIRTADDR == MAGIC_TRAMP_NUMBER) { fake_preload[i++] = MODINFO_METADATA|MODINFOMD_SSYM; fake_preload[i++] = sizeof(vm_offset_t); fake_preload[i++] = *(uint32_t *)(KERNVIRTADDR + 4); fake_preload[i++] = MODINFO_METADATA|MODINFOMD_ESYM; fake_preload[i++] = sizeof(vm_offset_t); fake_preload[i++] = *(uint32_t *)(KERNVIRTADDR + 8); lastaddr = *(uint32_t *)(KERNVIRTADDR + 8); zend = lastaddr; zstart = *(uint32_t *)(KERNVIRTADDR + 4); ksym_start = zstart; ksym_end = zend; } else #endif lastaddr = (vm_offset_t)&end; fake_preload[i++] = 0; fake_preload[i] = 0; preload_metadata = (void *)fake_preload; pcpu_init(pcpup, 0, sizeof(struct pcpu)); PCPU_SET(curthread, &thread0); #define KERNEL_TEXT_BASE (KERNBASE + 0x00200000) freemempos = 0x00200000; /* Define a macro to simplify memory allocation */ #define valloc_pages(var, np) \ alloc_pages((var).pv_pa, (np)); \ (var).pv_va = (var).pv_pa + 0xc0000000; #define alloc_pages(var, np) \ freemempos -= (np * PAGE_SIZE); \ (var) = freemempos; \ memset((char *)(var), 0, ((np) * PAGE_SIZE)); while (((freemempos - L1_TABLE_SIZE) & (L1_TABLE_SIZE - 1)) != 0) freemempos -= PAGE_SIZE; valloc_pages(kernel_l1pt, L1_TABLE_SIZE / PAGE_SIZE); for (loop = 0; loop < NUM_KERNEL_PTS; ++loop) { if (!(loop % (PAGE_SIZE / L2_TABLE_SIZE_REAL))) { valloc_pages(kernel_pt_table[loop], L2_TABLE_SIZE / PAGE_SIZE); } else { kernel_pt_table[loop].pv_pa = freemempos + (loop % (PAGE_SIZE / L2_TABLE_SIZE_REAL)) * L2_TABLE_SIZE_REAL; kernel_pt_table[loop].pv_va = kernel_pt_table[loop].pv_pa + 0xc0000000; } } freemem_pt = freemempos; freemempos = 0x00100000; /* * Allocate a page for the system page mapped to V0x00000000 * This page will just contain the system vectors and can be * shared by all processes. */ valloc_pages(systempage, 1); /* Allocate stacks for all modes */ valloc_pages(irqstack, IRQ_STACK_SIZE); valloc_pages(abtstack, ABT_STACK_SIZE); valloc_pages(undstack, UND_STACK_SIZE); valloc_pages(kernelstack, KSTACK_PAGES); valloc_pages(msgbufpv, round_page(MSGBUF_SIZE) / PAGE_SIZE); #ifdef ARM_USE_SMALL_ALLOC freemempos -= PAGE_SIZE; freemem_pt = trunc_page(freemem_pt); freemem_after = freemempos - ((freemem_pt - 0x00100000) / PAGE_SIZE) * sizeof(struct arm_small_page); arm_add_smallalloc_pages((void *)(freemem_after + 0xc0000000) , (void *)0xc0100000, freemem_pt - 0x00100000, 1); freemem_after -= ((freemem_after - 0x00001000) / PAGE_SIZE) * sizeof(struct arm_small_page); #if 0 arm_add_smallalloc_pages((void *)(freemem_after + 0xc0000000) , (void *)0xc0001000, trunc_page(freemem_after) - 0x00001000, 0); #endif freemempos = trunc_page(freemem_after); freemempos -= PAGE_SIZE; #endif /* * Now we start construction of the L1 page table * We start by mapping the L2 page tables into the L1. * This means that we can replace L1 mappings later on if necessary */ l1pagetable = kernel_l1pt.pv_va; /* Map the L2 pages tables in the L1 page table */ pmap_link_l2pt(l1pagetable, ARM_VECTORS_HIGH & ~(0x00100000 - 1), &kernel_pt_table[KERNEL_PT_SYS]); pmap_map_chunk(l1pagetable, KERNBASE, SDRAM_START, 0x100000, VM_PROT_READ|VM_PROT_WRITE, PTE_CACHE); pmap_map_chunk(l1pagetable, KERNBASE + 0x100000, SDRAM_START + 0x100000, 0x100000, VM_PROT_READ|VM_PROT_WRITE, PTE_PAGETABLE); pmap_map_chunk(l1pagetable, KERNBASE + 0x200000, SDRAM_START + 0x200000, (((uint32_t)(lastaddr) - KERNBASE - 0x200000) + L1_S_SIZE) & ~(L1_S_SIZE - 1), VM_PROT_READ|VM_PROT_WRITE, PTE_CACHE); freemem_after = ((int)lastaddr + PAGE_SIZE) & ~(PAGE_SIZE - 1); afterkern = round_page(((vm_offset_t)lastaddr + L1_S_SIZE) & ~(L1_S_SIZE - 1)); for (i = 0; i < KERNEL_PT_AFKERNEL_NUM; i++) { pmap_link_l2pt(l1pagetable, afterkern + i * 0x00100000, &kernel_pt_table[KERNEL_PT_AFKERNEL + i]); } #ifdef ARM_USE_SMALL_ALLOC if ((freemem_after + 2 * PAGE_SIZE) <= afterkern) { arm_add_smallalloc_pages((void *)(freemem_after), (void*)(freemem_after + PAGE_SIZE), afterkern - (freemem_after + PAGE_SIZE), 0); } #endif /* Map the vector page. */ pmap_map_entry(l1pagetable, ARM_VECTORS_HIGH, systempage.pv_pa, VM_PROT_READ|VM_PROT_WRITE, PTE_CACHE); pmap_devmap_bootstrap(l1pagetable, iq81342_devmap); /* * Give the XScale global cache clean code an appropriately * sized chunk of unmapped VA space starting at 0xff000000 * (our device mappings end before this address). */ xscale_cache_clean_addr = 0xff000000U; cpu_domains((DOMAIN_CLIENT << (PMAP_DOMAIN_KERNEL*2)) | DOMAIN_CLIENT); setttb(kernel_l1pt.pv_pa); cpu_tlb_flushID(); cpu_domains(DOMAIN_CLIENT << (PMAP_DOMAIN_KERNEL*2)); /* * Pages were allocated during the secondary bootstrap for the * stacks for different CPU modes. * We must now set the r13 registers in the different CPU modes to * point to these stacks. * Since the ARM stacks use STMFD etc. we must set r13 to the top end * of the stack memory. */ set_stackptr(PSR_IRQ32_MODE, irqstack.pv_va + IRQ_STACK_SIZE * PAGE_SIZE); set_stackptr(PSR_ABT32_MODE, abtstack.pv_va + ABT_STACK_SIZE * PAGE_SIZE); set_stackptr(PSR_UND32_MODE, undstack.pv_va + UND_STACK_SIZE * PAGE_SIZE); /* * We must now clean the cache again.... * Cleaning may be done by reading new data to displace any * dirty data in the cache. This will have happened in setttb() * but since we are boot strapping the addresses used for the read * may have just been remapped and thus the cache could be out * of sync. A re-clean after the switch will cure this. * After booting there are no gross reloations of the kernel thus * this problem will not occur after initarm(). */ cpu_idcache_wbinv_all(); i80321_calibrate_delay(); i81342_sdram_bounds(&obio_bs_tag, IOP34X_VADDR, &memstart, &memsize); physmem = memsize / PAGE_SIZE; cninit(); /* Set stack for exception handlers */ data_abort_handler_address = (u_int)data_abort_handler; prefetch_abort_handler_address = (u_int)prefetch_abort_handler; undefined_handler_address = (u_int)undefinedinstruction_bounce; undefined_init(); #ifdef KSE proc_linkup(&proc0, &ksegrp0, &thread0); #else - proc_linkup(&proc0, &thread0); + proc_linkup0(&proc0, &thread0); #endif thread0.td_kstack = kernelstack.pv_va; thread0.td_pcb = (struct pcb *) (thread0.td_kstack + KSTACK_PAGES * PAGE_SIZE) - 1; thread0.td_pcb->pcb_flags = 0; thread0.td_frame = &proc0_tf; pcpup->pc_curpcb = thread0.td_pcb; arm_vector_init(ARM_VECTORS_HIGH, ARM_VEC_ALL); pmap_curmaxkvaddr = afterkern + PAGE_SIZE; /* * ARM_USE_SMALL_ALLOC uses dump_avail, so it must be filled before * calling pmap_bootstrap. */ dump_avail[0] = 0x00000000; dump_avail[1] = 0x00000000 + memsize; dump_avail[2] = 0; dump_avail[3] = 0; pmap_bootstrap(pmap_curmaxkvaddr, 0xd0000000, &kernel_l1pt); msgbufp = (void*)msgbufpv.pv_va; msgbufinit(msgbufp, MSGBUF_SIZE); mutex_init(); i = 0; #ifdef ARM_USE_SMALL_ALLOC phys_avail[i++] = 0x00000000; phys_avail[i++] = 0x00001000; /* *XXX: Gross hack to get our * pages in the vm_page_array . */ #endif phys_avail[i++] = round_page(virtual_avail - KERNBASE + SDRAM_START); phys_avail[i++] = trunc_page(0x00000000 + memsize - 1); phys_avail[i++] = 0; phys_avail[i] = 0; /* Do basic tuning, hz etc */ init_param1(); init_param2(physmem); kdb_init(); return ((void *)(kernelstack.pv_va + USPACE_SVC_STACK_TOP - sizeof(struct pcb))); } Index: head/sys/arm/xscale/ixp425/avila_machdep.c =================================================================== --- head/sys/arm/xscale/ixp425/avila_machdep.c (revision 173360) +++ head/sys/arm/xscale/ixp425/avila_machdep.c (revision 173361) @@ -1,538 +1,538 @@ /* $NetBSD: hpc_machdep.c,v 1.70 2003/09/16 08:18:22 agc Exp $ */ /*- * Copyright (c) 1994-1998 Mark Brinicombe. * Copyright (c) 1994 Brini. * All rights reserved. * * This code is derived from software written for Brini by Mark Brinicombe * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Brini. * 4. The name of the company nor the name of the author may be used to * endorse or promote products derived from this software without specific * prior written permission. * * THIS SOFTWARE IS PROVIDED BY BRINI ``AS IS'' AND ANY EXPRESS OR IMPLIED * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL BRINI OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * RiscBSD kernel project * * machdep.c * * Machine dependant functions for kernel setup * * This file needs a lot of work. * * Created : 17/09/94 */ #include "opt_msgbuf.h" #include "opt_ddb.h" #include __FBSDID("$FreeBSD$"); #define _ARM32_BUS_DMA_PRIVATE #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define KERNEL_PT_SYS 0 /* Page table for mapping proc0 zero page */ #define KERNEL_PT_IO 1 #define KERNEL_PT_IO_NUM 3 #define KERNEL_PT_BEFOREKERN KERNEL_PT_IO + KERNEL_PT_IO_NUM #define KERNEL_PT_AFKERNEL KERNEL_PT_BEFOREKERN + 1 /* L2 table for mapping after kernel */ #define KERNEL_PT_AFKERNEL_NUM 9 /* this should be evenly divisable by PAGE_SIZE / L2_TABLE_SIZE_REAL (or 4) */ #define NUM_KERNEL_PTS (KERNEL_PT_AFKERNEL + KERNEL_PT_AFKERNEL_NUM) /* Define various stack sizes in pages */ #define IRQ_STACK_SIZE 1 #define ABT_STACK_SIZE 1 #ifdef IPKDB #define UND_STACK_SIZE 2 #else #define UND_STACK_SIZE 1 #endif extern u_int data_abort_handler_address; extern u_int prefetch_abort_handler_address; extern u_int undefined_handler_address; struct pv_addr kernel_pt_table[NUM_KERNEL_PTS]; extern void *_end; extern int *end; struct pcpu __pcpu; struct pcpu *pcpup = &__pcpu; /* Physical and virtual addresses for some global pages */ vm_paddr_t phys_avail[10]; vm_paddr_t dump_avail[4]; vm_offset_t physical_pages; vm_offset_t clean_sva, clean_eva; struct pv_addr systempage; struct pv_addr msgbufpv; struct pv_addr irqstack; struct pv_addr undstack; struct pv_addr abtstack; struct pv_addr kernelstack; struct pv_addr minidataclean; static struct trapframe proc0_tf; /* Static device mappings. */ static const struct pmap_devmap ixp425_devmap[] = { /* Physical/Virtual address for I/O space */ { IXP425_IO_VBASE, IXP425_IO_HWBASE, IXP425_IO_SIZE, VM_PROT_READ|VM_PROT_WRITE, PTE_NOCACHE, }, /* Expansion Bus */ { IXP425_EXP_VBASE, IXP425_EXP_HWBASE, IXP425_EXP_SIZE, VM_PROT_READ|VM_PROT_WRITE, PTE_NOCACHE, }, /* IXP425 PCI Configuration */ { IXP425_PCI_VBASE, IXP425_PCI_HWBASE, IXP425_PCI_SIZE, VM_PROT_READ|VM_PROT_WRITE, PTE_NOCACHE, }, /* SDRAM Controller */ { IXP425_MCU_VBASE, IXP425_MCU_HWBASE, IXP425_MCU_SIZE, VM_PROT_READ|VM_PROT_WRITE, PTE_NOCACHE, }, /* PCI Memory Space */ { IXP425_PCI_MEM_VBASE, IXP425_PCI_MEM_HWBASE, IXP425_PCI_MEM_SIZE, VM_PROT_READ|VM_PROT_WRITE, PTE_NOCACHE, }, /* NPE-A Memory Space */ { IXP425_NPE_A_VBASE, IXP425_NPE_A_HWBASE, IXP425_NPE_A_SIZE, VM_PROT_READ|VM_PROT_WRITE, PTE_NOCACHE, }, /* NPE-B Memory Space */ { IXP425_NPE_B_VBASE, IXP425_NPE_B_HWBASE, IXP425_NPE_B_SIZE, VM_PROT_READ|VM_PROT_WRITE, PTE_NOCACHE, }, /* NPE-C Memory Space */ { IXP425_NPE_C_VBASE, IXP425_NPE_C_HWBASE, IXP425_NPE_C_SIZE, VM_PROT_READ|VM_PROT_WRITE, PTE_NOCACHE, }, /* MAC-A Memory Space */ { IXP425_MAC_A_VBASE, IXP425_MAC_A_HWBASE, IXP425_MAC_A_SIZE, VM_PROT_READ|VM_PROT_WRITE, PTE_NOCACHE, }, /* MAC-B Memory Space */ { IXP425_MAC_B_VBASE, IXP425_MAC_B_HWBASE, IXP425_MAC_B_SIZE, VM_PROT_READ|VM_PROT_WRITE, PTE_NOCACHE, }, /* Q-Mgr Memory Space */ { IXP425_QMGR_VBASE, IXP425_QMGR_HWBASE, IXP425_QMGR_SIZE, VM_PROT_READ|VM_PROT_WRITE, PTE_NOCACHE, }, { 0, 0, 0, 0, 0, } }; #define SDRAM_START 0x10000000 #ifdef DDB extern vm_offset_t ksym_start, ksym_end; #endif extern vm_offset_t xscale_cache_clean_addr; void * initarm(void *arg, void *arg2) { struct pv_addr kernel_l1pt; int loop; u_int l1pagetable; vm_offset_t freemempos; vm_offset_t freemem_pt; vm_offset_t afterkern; vm_offset_t freemem_after; vm_offset_t lastaddr; #ifdef DDB vm_offset_t zstart = 0, zend = 0; #endif int i; uint32_t fake_preload[35]; uint32_t memsize; i = 0; set_cpufuncs(); fake_preload[i++] = MODINFO_NAME; fake_preload[i++] = strlen("elf kernel") + 1; strcpy((char*)&fake_preload[i++], "elf kernel"); i += 2; fake_preload[i++] = MODINFO_TYPE; fake_preload[i++] = strlen("elf kernel") + 1; strcpy((char*)&fake_preload[i++], "elf kernel"); i += 2; fake_preload[i++] = MODINFO_ADDR; fake_preload[i++] = sizeof(vm_offset_t); fake_preload[i++] = KERNBASE + 0x00200000; fake_preload[i++] = MODINFO_SIZE; fake_preload[i++] = sizeof(uint32_t); fake_preload[i++] = (uint32_t)&end - KERNBASE - 0x00200000; #ifdef DDB if (*(uint32_t *)KERNVIRTADDR == MAGIC_TRAMP_NUMBER) { fake_preload[i++] = MODINFO_METADATA|MODINFOMD_SSYM; fake_preload[i++] = sizeof(vm_offset_t); fake_preload[i++] = *(uint32_t *)(KERNVIRTADDR + 4); fake_preload[i++] = MODINFO_METADATA|MODINFOMD_ESYM; fake_preload[i++] = sizeof(vm_offset_t); fake_preload[i++] = *(uint32_t *)(KERNVIRTADDR + 8); lastaddr = *(uint32_t *)(KERNVIRTADDR + 8); zend = lastaddr; zstart = *(uint32_t *)(KERNVIRTADDR + 4); ksym_start = zstart; ksym_end = zend; } else #endif lastaddr = (vm_offset_t)&end; fake_preload[i++] = 0; fake_preload[i] = 0; preload_metadata = (void *)fake_preload; pcpu_init(pcpup, 0, sizeof(struct pcpu)); PCPU_SET(curthread, &thread0); #define KERNEL_TEXT_BASE (KERNBASE + 0x00200000) freemempos = 0x10200000; /* Define a macro to simplify memory allocation */ #define valloc_pages(var, np) \ alloc_pages((var).pv_pa, (np)); \ (var).pv_va = (var).pv_pa + 0xb0000000; #define alloc_pages(var, np) \ freemempos -= (np * PAGE_SIZE); \ (var) = freemempos; \ memset((char *)(var), 0, ((np) * PAGE_SIZE)); while (((freemempos - L1_TABLE_SIZE) & (L1_TABLE_SIZE - 1)) != 0) freemempos -= PAGE_SIZE; valloc_pages(kernel_l1pt, L1_TABLE_SIZE / PAGE_SIZE); for (loop = 0; loop < NUM_KERNEL_PTS; ++loop) { if (!(loop % (PAGE_SIZE / L2_TABLE_SIZE_REAL))) { valloc_pages(kernel_pt_table[loop], L2_TABLE_SIZE / PAGE_SIZE); } else { kernel_pt_table[loop].pv_pa = freemempos + (loop % (PAGE_SIZE / L2_TABLE_SIZE_REAL)) * L2_TABLE_SIZE_REAL; kernel_pt_table[loop].pv_va = kernel_pt_table[loop].pv_pa + 0xb0000000; } } freemem_pt = freemempos; freemempos = 0x10100000; /* * Allocate a page for the system page mapped to V0x00000000 * This page will just contain the system vectors and can be * shared by all processes. */ valloc_pages(systempage, 1); /* Allocate stacks for all modes */ valloc_pages(irqstack, IRQ_STACK_SIZE); valloc_pages(abtstack, ABT_STACK_SIZE); valloc_pages(undstack, UND_STACK_SIZE); valloc_pages(kernelstack, KSTACK_PAGES); alloc_pages(minidataclean.pv_pa, 1); valloc_pages(msgbufpv, round_page(MSGBUF_SIZE) / PAGE_SIZE); #ifdef ARM_USE_SMALL_ALLOC freemempos -= PAGE_SIZE; freemem_pt = trunc_page(freemem_pt); freemem_after = freemempos - ((freemem_pt - 0x10100000) / PAGE_SIZE) * sizeof(struct arm_small_page); arm_add_smallalloc_pages((void *)(freemem_after + 0xb0000000) , (void *)0xc0100000, freemem_pt - 0x10100000, 1); freemem_after -= ((freemem_after - 0x10001000) / PAGE_SIZE) * sizeof(struct arm_small_page); arm_add_smallalloc_pages((void *)(freemem_after + 0xb0000000) , (void *)0xc0001000, trunc_page(freemem_after) - 0x10001000, 0); freemempos = trunc_page(freemem_after); freemempos -= PAGE_SIZE; #endif /* * Allocate memory for the l1 and l2 page tables. The scheme to avoid * wasting memory by allocating the l1pt on the first 16k memory was * taken from NetBSD rpc_machdep.c. NKPT should be greater than 12 for * this to work (which is supposed to be the case). */ /* * Now we start construction of the L1 page table * We start by mapping the L2 page tables into the L1. * This means that we can replace L1 mappings later on if necessary */ l1pagetable = kernel_l1pt.pv_va; /* Map the L2 pages tables in the L1 page table */ pmap_link_l2pt(l1pagetable, ARM_VECTORS_HIGH & ~(0x00100000 - 1), &kernel_pt_table[KERNEL_PT_SYS]); pmap_link_l2pt(l1pagetable, IXP425_IO_VBASE, &kernel_pt_table[KERNEL_PT_IO]); pmap_link_l2pt(l1pagetable, IXP425_MCU_VBASE, &kernel_pt_table[KERNEL_PT_IO + 1]); pmap_link_l2pt(l1pagetable, IXP425_PCI_MEM_VBASE, &kernel_pt_table[KERNEL_PT_IO + 2]); pmap_link_l2pt(l1pagetable, KERNBASE, &kernel_pt_table[KERNEL_PT_BEFOREKERN]); pmap_map_chunk(l1pagetable, KERNBASE, SDRAM_START, 0x100000, VM_PROT_READ|VM_PROT_WRITE, PTE_CACHE); pmap_map_chunk(l1pagetable, KERNBASE + 0x100000, SDRAM_START + 0x100000, 0x100000, VM_PROT_READ|VM_PROT_WRITE, PTE_PAGETABLE); pmap_map_chunk(l1pagetable, KERNBASE + 0x200000, SDRAM_START + 0x200000, (((uint32_t)(lastaddr) - KERNBASE - 0x200000) + L1_S_SIZE) & ~(L1_S_SIZE - 1), VM_PROT_READ|VM_PROT_WRITE, PTE_CACHE); freemem_after = ((int)lastaddr + PAGE_SIZE) & ~(PAGE_SIZE - 1); afterkern = round_page(((vm_offset_t)lastaddr + L1_S_SIZE) & ~(L1_S_SIZE - 1)); for (i = 0; i < KERNEL_PT_AFKERNEL_NUM; i++) { pmap_link_l2pt(l1pagetable, afterkern + i * 0x00100000, &kernel_pt_table[KERNEL_PT_AFKERNEL + i]); } pmap_map_entry(l1pagetable, afterkern, minidataclean.pv_pa, VM_PROT_READ|VM_PROT_WRITE, PTE_CACHE); #ifdef ARM_USE_SMALL_ALLOC if ((freemem_after + 2 * PAGE_SIZE) <= afterkern) { arm_add_smallalloc_pages((void *)(freemem_after), (void*)(freemem_after + PAGE_SIZE), afterkern - (freemem_after + PAGE_SIZE), 0); } #endif /* Map the Mini-Data cache clean area. */ xscale_setup_minidata(l1pagetable, afterkern, minidataclean.pv_pa); /* Map the vector page. */ pmap_map_entry(l1pagetable, ARM_VECTORS_HIGH, systempage.pv_pa, VM_PROT_READ|VM_PROT_WRITE, PTE_CACHE); pmap_devmap_bootstrap(l1pagetable, ixp425_devmap); /* * Give the XScale global cache clean code an appropriately * sized chunk of unmapped VA space starting at 0xff000000 * (our device mappings end before this address). */ xscale_cache_clean_addr = 0xff000000U; cpu_domains((DOMAIN_CLIENT << (PMAP_DOMAIN_KERNEL*2)) | DOMAIN_CLIENT); setttb(kernel_l1pt.pv_pa); cpu_tlb_flushID(); cpu_domains(DOMAIN_CLIENT << (PMAP_DOMAIN_KERNEL*2)); /* * Pages were allocated during the secondary bootstrap for the * stacks for different CPU modes. * We must now set the r13 registers in the different CPU modes to * point to these stacks. * Since the ARM stacks use STMFD etc. we must set r13 to the top end * of the stack memory. */ set_stackptr(PSR_IRQ32_MODE, irqstack.pv_va + IRQ_STACK_SIZE * PAGE_SIZE); set_stackptr(PSR_ABT32_MODE, abtstack.pv_va + ABT_STACK_SIZE * PAGE_SIZE); set_stackptr(PSR_UND32_MODE, undstack.pv_va + UND_STACK_SIZE * PAGE_SIZE); /* * We must now clean the cache again.... * Cleaning may be done by reading new data to displace any * dirty data in the cache. This will have happened in setttb() * but since we are boot strapping the addresses used for the read * may have just been remapped and thus the cache could be out * of sync. A re-clean after the switch will cure this. * After booting there are no gross reloations of the kernel thus * this problem will not occur after initarm(). */ cpu_idcache_wbinv_all(); /* * Fetch the SDRAM start/size from the ixp425 SDRAM configration * registers. */ cninit(); memsize = ixp425_sdram_size(); physmem = memsize / PAGE_SIZE; /* Set stack for exception handlers */ data_abort_handler_address = (u_int)data_abort_handler; prefetch_abort_handler_address = (u_int)prefetch_abort_handler; undefined_handler_address = (u_int)undefinedinstruction_bounce; undefined_init(); - proc_linkup(&proc0, &thread0); + proc_linkup0(&proc0, &thread0); thread0.td_kstack = kernelstack.pv_va; thread0.td_pcb = (struct pcb *) (thread0.td_kstack + KSTACK_PAGES * PAGE_SIZE) - 1; thread0.td_pcb->pcb_flags = 0; thread0.td_frame = &proc0_tf; pcpup->pc_curpcb = thread0.td_pcb; /* Enable MMU, I-cache, D-cache, write buffer. */ arm_vector_init(ARM_VECTORS_HIGH, ARM_VEC_ALL); pmap_curmaxkvaddr = afterkern + PAGE_SIZE; dump_avail[0] = 0x10000000; dump_avail[1] = 0x10000000 + memsize; dump_avail[2] = 0; dump_avail[3] = 0; pmap_bootstrap(pmap_curmaxkvaddr, 0xd0000000, &kernel_l1pt); msgbufp = (void*)msgbufpv.pv_va; msgbufinit(msgbufp, MSGBUF_SIZE); mutex_init(); i = 0; #ifdef ARM_USE_SMALL_ALLOC phys_avail[i++] = 0x10000000; phys_avail[i++] = 0x10001000; /* *XXX: Gross hack to get our * pages in the vm_page_array . */ #endif phys_avail[i++] = round_page(virtual_avail - KERNBASE + SDRAM_START); phys_avail[i++] = trunc_page(0x10000000 + memsize - 1); phys_avail[i++] = 0; phys_avail[i] = 0; /* Do basic tuning, hz etc */ init_param1(); init_param2(physmem); kdb_init(); return ((void *)(kernelstack.pv_va + USPACE_SVC_STACK_TOP - sizeof(struct pcb))); } Index: head/sys/compat/pecoff/imgact_pecoff.c =================================================================== --- head/sys/compat/pecoff/imgact_pecoff.c (revision 173360) +++ head/sys/compat/pecoff/imgact_pecoff.c (revision 173361) @@ -1,602 +1,606 @@ /*- * Copyright (c) 2000 Masaru OKI * Copyright (c) 1994, 1995, 1998 Scott Bartram * Copyright (c) 1994 Adam Glass * Copyright (c) 1993, 1994 Christopher G. Demetriou * * originally from NetBSD kern/exec_ecoff.c * * Copyright (c) 2000 Takanori Watanabe * Copyright (c) 2000 KUROSAWA Takahiro * Copyright (c) 1995-1996 Sen Schmidt * Copyright (c) 1996 Peter Wemm * All rights reserved. * * originally from FreeBSD kern/imgact_elf.c * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Masaru OKI. * 4. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "opt_pecoff.h" #define PECOFF_PE_SIGNATURE "PE\0\0" static int pecoff_fixup(register_t **, struct image_params *); #ifndef PECOFF_DEBUG #define DPRINTF(a) #else #define DPRINTF(a) printf a #endif static struct sysentvec pecoff_sysvec = { SYS_MAXSYSCALL, sysent, 0, 0, NULL, 0, NULL, NULL, pecoff_fixup, sendsig, sigcode, &szsigcode, 0, "FreeBSD PECoff", NULL, NULL, MINSIGSTKSZ, PAGE_SIZE, VM_MIN_ADDRESS, VM_MAXUSER_ADDRESS, USRSTACK, PS_STRINGS, VM_PROT_ALL, exec_copyout_strings, exec_setregs, NULL }; static const char signature[] = PECOFF_PE_SIGNATURE; static int exec_pecoff_coff_prep_omagic(struct image_params *, struct coff_filehdr *, struct coff_aouthdr *, int peoffs); static int exec_pecoff_coff_prep_nmagic(struct image_params *, struct coff_filehdr *, struct coff_aouthdr *, int peoffs); static int exec_pecoff_coff_prep_zmagic(struct image_params *, struct coff_filehdr *, struct coff_aouthdr *, int peoffs); static int exec_pecoff_coff_makecmds(struct image_params *, struct coff_filehdr *, int); static int pecoff_signature(struct thread *, struct vnode *, const struct pecoff_dos_filehdr *); static int pecoff_read_from(struct thread *, struct vnode *, int, caddr_t, int); static int pecoff_load_section(struct thread * td, struct vmspace * vmspace, struct vnode * vp, vm_offset_t offset, caddr_t vmaddr, size_t memsz, size_t filsz, vm_prot_t prot); static int pecoff_fixup(register_t ** stack_base, struct image_params * imgp) { int len = sizeof(struct pecoff_args); struct pecoff_imghdr *ap; register_t *pos; pos = *stack_base + (imgp->args->argc + imgp->args->envc + 2); ap = (struct pecoff_imghdr *) imgp->auxargs; if (copyout(ap, pos, len)) { return 0; } free(ap, M_TEMP); imgp->auxargs = NULL; (*stack_base)--; suword(*stack_base, (long) imgp->args->argc); return 0; } static int pecoff_load_section(struct thread * td, struct vmspace * vmspace, struct vnode * vp, vm_offset_t offset, caddr_t vmaddr, size_t memsz, size_t filsz, vm_prot_t prot) { size_t map_len; vm_offset_t map_addr; int error, rv; size_t copy_len; size_t copy_map_len; size_t copy_start; vm_object_t object; vm_offset_t copy_map_offset; vm_offset_t file_addr; vm_offset_t data_buf = 0; object = vp->v_object; error = 0; map_addr = trunc_page((vm_offset_t) vmaddr); file_addr = trunc_page(offset); DPRINTF(("SECARG:%x %p %x %x\n", offset, vmaddr, memsz, filsz)); if (file_addr != offset) { /* * The section is not on page boundary. We can't use * vm_map_insert(). Use copyin instead. */ map_len = round_page(memsz); copy_len = filsz; copy_map_offset = file_addr; copy_map_len = round_page(offset + filsz) - file_addr; copy_start = offset - file_addr; DPRINTF(("offset=%x vmaddr=%lx filsz=%x memsz=%x\n", offset, (long)vmaddr, filsz, memsz)); DPRINTF(("map_len=%x copy_len=%x copy_map_offset=%x" " copy_map_len=%x copy_start=%x\n", map_len, copy_len, copy_map_offset, copy_map_len, copy_start)); } else { map_len = trunc_page(filsz); if (map_len != 0) { vm_object_reference(object); vm_map_lock(&vmspace->vm_map); rv = vm_map_insert(&vmspace->vm_map, object, file_addr, /* file offset */ map_addr, /* virtual start */ map_addr + map_len, /* virtual end */ prot, VM_PROT_ALL, MAP_COPY_ON_WRITE | MAP_PREFAULT); vm_map_unlock(&vmspace->vm_map); if (rv != KERN_SUCCESS) { vm_object_deallocate(object); return EINVAL; } /* we can stop now if we've covered it all */ if (memsz == filsz) return 0; } copy_map_offset = trunc_page(offset + filsz); copy_map_len = PAGE_SIZE; copy_start = 0; copy_len = (offset + filsz) - trunc_page(offset + filsz); map_addr = trunc_page((vm_offset_t) vmaddr + filsz); map_len = round_page((vm_offset_t) vmaddr + memsz) - map_addr; } if (map_len != 0) { vm_map_lock(&vmspace->vm_map); rv = vm_map_insert(&vmspace->vm_map, NULL, 0, map_addr, map_addr + map_len, VM_PROT_ALL, VM_PROT_ALL, 0); vm_map_unlock(&vmspace->vm_map); DPRINTF(("EMP-rv:%d,%x %x\n", rv, map_addr, map_addr + map_len)); if (rv != KERN_SUCCESS) { return EINVAL; } } DPRINTF(("COPYARG %x %x\n", map_addr, copy_len)); if (copy_len != 0) { vm_object_reference(object); rv = vm_map_find(exec_map, object, copy_map_offset, &data_buf, copy_map_len, TRUE, VM_PROT_READ, VM_PROT_ALL, MAP_COPY_ON_WRITE | MAP_PREFAULT_PARTIAL); if (rv != KERN_SUCCESS) { vm_object_deallocate(object); return EINVAL; } /* send the page fragment to user space */ error = copyout((caddr_t) data_buf + copy_start, (caddr_t) map_addr, copy_len); vm_map_remove(exec_map, data_buf, data_buf + copy_map_len); DPRINTF(("%d\n", error)); if (error) return (error); } /* * set it to the specified protection */ vm_map_protect(&vmspace->vm_map, map_addr, map_addr + map_len, prot, FALSE); return error; } static int pecoff_load_file(struct thread * td, const char *file, u_long * addr, u_long * entry, u_long * ldexport) { struct nameidata nd; struct pecoff_dos_filehdr dh; struct coff_filehdr *fp = 0; struct coff_aouthdr *ap; struct pecoff_opthdr *wp; struct coff_scnhdr *sh = 0; struct vmspace *vmspace = td->td_proc->p_vmspace; struct vattr attr; struct image_params image_params, *imgp; int peofs; int error, i, scnsiz; imgp = &image_params; /* * Initialize part of the common data */ imgp->proc = td->td_proc; imgp->execlabel = NULL; imgp->attr = &attr; imgp->firstpage = NULL; NDINIT(&nd, LOOKUP, LOCKLEAF | FOLLOW, UIO_SYSSPACE, file, td); if ((error = namei(&nd)) != 0) { nd.ni_vp = NULL; goto fail; } NDFREE(&nd, NDF_ONLY_PNBUF); imgp->vp = nd.ni_vp; /* * Check permissions, modes, uid, etc on the file, and "open" it. */ error = exec_check_permissions(imgp); VOP_UNLOCK(nd.ni_vp, 0, td); if (error) goto fail; if ((error = pecoff_read_from(td, imgp->vp, 0, (caddr_t) & dh, sizeof(dh))) != 0) goto fail; if ((error = pecoff_signature(td, imgp->vp, &dh) != 0)) goto fail; fp = malloc(PECOFF_HDR_SIZE, M_TEMP, M_WAITOK); peofs = dh.d_peofs + sizeof(signature) - 1; if ((error = pecoff_read_from(td, imgp->vp, peofs, (caddr_t) fp, PECOFF_HDR_SIZE) != 0)) goto fail; if (COFF_BADMAG(fp)) { error = ENOEXEC; goto fail; } ap = (void *) ((char *) fp + sizeof(struct coff_filehdr)); wp = (void *) ((char *) ap + sizeof(struct coff_aouthdr)); /* read section header */ scnsiz = sizeof(struct coff_scnhdr) * fp->f_nscns; sh = malloc(scnsiz, M_TEMP, M_WAITOK); if ((error = pecoff_read_from(td, imgp->vp, peofs + PECOFF_HDR_SIZE, (caddr_t) sh, scnsiz)) != 0) goto fail; /* * Read Section infomation and map sections. */ for (i = 0; i < fp->f_nscns; i++) { int prot = 0; if (sh[i].s_flags & COFF_STYP_DISCARD) continue; /* XXX ? */ if ((sh[i].s_flags & COFF_STYP_TEXT) && (sh[i].s_flags & COFF_STYP_EXEC) == 0) continue; if ((sh[i].s_flags & (COFF_STYP_TEXT | COFF_STYP_DATA | COFF_STYP_BSS)) == 0) continue; prot |= (sh[i].s_flags & COFF_STYP_READ) ? VM_PROT_READ : 0; prot |= (sh[i].s_flags & COFF_STYP_WRITE) ? VM_PROT_WRITE : 0; prot |= (sh[i].s_flags & COFF_STYP_EXEC) ? VM_PROT_EXECUTE : 0; sh[i].s_vaddr += wp->w_base; /* RVA --> VA */ if ((error = pecoff_load_section(td, vmspace, imgp->vp, sh[i].s_scnptr ,(caddr_t) sh[i].s_vaddr, sh[i].s_paddr, sh[i].s_size ,prot)) != 0) goto fail; } *entry = wp->w_base + ap->a_entry; *addr = wp->w_base; *ldexport = wp->w_imghdr[0].i_vaddr + wp->w_base; fail: if (fp) free(fp, M_TEMP); if (sh) free(sh, M_TEMP); if (nd.ni_vp) vrele(nd.ni_vp); return error; } static int exec_pecoff_coff_prep_omagic(struct image_params * imgp, struct coff_filehdr * fp, struct coff_aouthdr * ap, int peofs) { return ENOEXEC; } static int exec_pecoff_coff_prep_nmagic(struct image_params * imgp, struct coff_filehdr * fp, struct coff_aouthdr * ap, int peofs) { return ENOEXEC; } static int exec_pecoff_coff_prep_zmagic(struct image_params * imgp, struct coff_filehdr * fp, struct coff_aouthdr * ap, int peofs) { int scnsiz = sizeof(struct coff_scnhdr) * fp->f_nscns; int error = ENOEXEC, i; int prot; u_long text_size = 0, data_size = 0, dsize; u_long text_addr = 0, data_addr = VM_MAXUSER_ADDRESS; u_long ldexport = 0, ldbase = 0; struct pecoff_opthdr *wp; struct coff_scnhdr *sh; struct vmspace *vmspace; struct pecoff_args *argp = NULL; sh = malloc(scnsiz, M_TEMP, M_WAITOK); wp = (void *) ((char *) ap + sizeof(struct coff_aouthdr)); error = pecoff_read_from(FIRST_THREAD_IN_PROC(imgp->proc), imgp->vp, peofs + PECOFF_HDR_SIZE, (caddr_t) sh, scnsiz); - exec_new_vmspace(imgp, &pecoff_sysvec); + if (error) + return (error); + error = exec_new_vmspace(imgp, &pecoff_sysvec); + if (error) + return (error); vmspace = imgp->proc->p_vmspace; for (i = 0; i < fp->f_nscns; i++) { prot = VM_PROT_WRITE; /* XXX for relocation? */ prot |= (sh[i].s_flags & COFF_STYP_READ) ? VM_PROT_READ : 0; prot |= (sh[i].s_flags & COFF_STYP_WRITE) ? VM_PROT_WRITE : 0; prot |= (sh[i].s_flags & COFF_STYP_EXEC) ? VM_PROT_EXECUTE : 0; sh[i].s_vaddr += wp->w_base; if (sh[i].s_flags & COFF_STYP_DISCARD) continue; if ((sh[i].s_flags & COFF_STYP_TEXT) != 0) { error = pecoff_load_section( FIRST_THREAD_IN_PROC(imgp->proc), vmspace, imgp->vp, sh[i].s_scnptr, (caddr_t) sh[i].s_vaddr, sh[i].s_paddr, sh[i].s_size ,prot); DPRINTF(("ERROR%d\n", error)); if (error) goto fail; text_addr = trunc_page(sh[i].s_vaddr); text_size = trunc_page(sh[i].s_size + sh[i].s_vaddr - text_addr); } if ((sh[i].s_flags & (COFF_STYP_DATA|COFF_STYP_BSS)) != 0) { if (pecoff_load_section( FIRST_THREAD_IN_PROC(imgp->proc), vmspace, imgp->vp, sh[i].s_scnptr, (caddr_t) sh[i].s_vaddr, sh[i].s_paddr, sh[i].s_size, prot) != 0) goto fail; data_addr = min(trunc_page(sh[i].s_vaddr), data_addr); dsize = round_page(sh[i].s_vaddr + sh[i].s_paddr) - data_addr; data_size = max(dsize, data_size); } } vmspace->vm_tsize = text_size >> PAGE_SHIFT; vmspace->vm_taddr = (caddr_t) (uintptr_t) text_addr; vmspace->vm_dsize = data_size >> PAGE_SHIFT; vmspace->vm_daddr = (caddr_t) (uintptr_t) data_addr; argp = malloc(sizeof(struct pecoff_args), M_TEMP, M_WAITOK); if (argp == NULL) { error = ENOMEM; goto fail; } argp->a_base = wp->w_base; argp->a_entry = wp->w_base + ap->a_entry; argp->a_end = data_addr + data_size; argp->a_subsystem = wp->w_subvers; error = pecoff_load_file(FIRST_THREAD_IN_PROC(imgp->proc), "/usr/libexec/ld.so.dll", &ldbase, &imgp->entry_addr, &ldexport); if (error) goto fail; argp->a_ldbase = ldbase; argp->a_ldexport = ldexport; memcpy(argp->a_imghdr, wp->w_imghdr, sizeof(struct pecoff_imghdr) * 16); for (i = 0; i < 16; i++) { argp->a_imghdr[i].i_vaddr += wp->w_base; } imgp->proc->p_sysent = &pecoff_sysvec; imgp->auxargs = argp; imgp->auxarg_size = sizeof(struct pecoff_args); imgp->interpreted = 0; if (sh != NULL) free(sh, M_TEMP); return 0; fail: error = (error) ? error : ENOEXEC; if (sh != NULL) free(sh, M_TEMP); if (argp != NULL) free(argp, M_TEMP); return error; } int exec_pecoff_coff_makecmds(struct image_params * imgp, struct coff_filehdr * fp, int peofs) { struct coff_aouthdr *ap; int error; if (COFF_BADMAG(fp)) { return ENOEXEC; } ap = (void *) ((char *) fp + sizeof(struct coff_filehdr)); switch (ap->a_magic) { case COFF_OMAGIC: error = exec_pecoff_coff_prep_omagic(imgp, fp, ap, peofs); break; case COFF_NMAGIC: error = exec_pecoff_coff_prep_nmagic(imgp, fp, ap, peofs); break; case COFF_ZMAGIC: error = exec_pecoff_coff_prep_zmagic(imgp, fp, ap, peofs); break; default: return ENOEXEC; } return error; } static int pecoff_signature(td, vp, dp) struct thread *td; struct vnode *vp; const struct pecoff_dos_filehdr *dp; { int error; char buf[512]; char *pesig; if (DOS_BADMAG(dp)) { return ENOEXEC; } error = pecoff_read_from(td, vp, dp->d_peofs, buf, sizeof(buf)); if (error) { return error; } pesig = buf; if (memcmp(pesig, signature, sizeof(signature) - 1) == 0) { return 0; } return EFTYPE; } int pecoff_read_from(td, vp, pos, buf, siz) struct thread *td; struct vnode *vp; int pos; caddr_t buf; int siz; { int error; size_t resid; error = vn_rdwr(UIO_READ, vp, buf, siz, pos, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, &resid, td); if (error) return error; if (resid != 0) { return ENOEXEC; } return 0; } static int imgact_pecoff(struct image_params * imgp) { const struct pecoff_dos_filehdr *dp = (const struct pecoff_dos_filehdr *) imgp->image_header; struct coff_filehdr *fp; int error, peofs; struct thread *td = curthread; error = pecoff_signature(FIRST_THREAD_IN_PROC(imgp->proc), imgp->vp, dp); if (error) { return -1; } VOP_UNLOCK(imgp->vp, 0, td); peofs = dp->d_peofs + sizeof(signature) - 1; fp = malloc(PECOFF_HDR_SIZE, M_TEMP, M_WAITOK); error = pecoff_read_from(FIRST_THREAD_IN_PROC(imgp->proc), imgp->vp, peofs, (caddr_t) fp, PECOFF_HDR_SIZE); if (error) goto fail; error = exec_pecoff_coff_makecmds(imgp, fp, peofs); fail: free(fp, M_TEMP); vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY, td); return error; } static struct execsw pecoff_execsw = {imgact_pecoff, "FreeBSD PEcoff"}; EXEC_SET(pecoff, pecoff_execsw); Index: head/sys/compat/svr4/imgact_svr4.c =================================================================== --- head/sys/compat/svr4/imgact_svr4.c (revision 173360) +++ head/sys/compat/svr4/imgact_svr4.c (revision 173361) @@ -1,241 +1,243 @@ /*- * Copyright (c) 1998 Mark Newton * Copyright (c) 1994-1996 Søren Schmidt * All rights reserved. * * Based heavily on /sys/kern/imgact_aout.c which is: * Copyright (c) 1993, David Greenman * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer * in this position and unchanged. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int exec_svr4_imgact(struct image_params *iparams); static int exec_svr4_imgact(imgp) struct image_params *imgp; { const struct exec *a_out = (const struct exec *) imgp->image_header; struct vmspace *vmspace; vm_offset_t vmaddr; unsigned long virtual_offset, file_offset; vm_offset_t buffer; unsigned long bss_size; int error; struct thread *td = curthread; if (((a_out->a_magic >> 16) & 0xff) != 0x64) return -1; /* * Set file/virtual offset based on a.out variant. */ switch ((int)(a_out->a_magic & 0xffff)) { case 0413: virtual_offset = 0; file_offset = 1024; break; case 0314: virtual_offset = 4096; file_offset = 0; break; default: return (-1); } bss_size = round_page(a_out->a_bss); #ifdef DEBUG printf("imgact: text: %08lx, data: %08lx, bss: %08lx\n", (u_long)a_out->a_text, (u_long)a_out->a_data, bss_size); #endif /* * Check various fields in header for validity/bounds. */ if (a_out->a_entry < virtual_offset || a_out->a_entry >= virtual_offset + a_out->a_text || a_out->a_text & PAGE_MASK || a_out->a_data & PAGE_MASK) return (-1); /* text + data can't exceed file size */ if (a_out->a_data + a_out->a_text > imgp->attr->va_size) return (EFAULT); /* * text/data/bss must not exceed limits */ PROC_LOCK(imgp->proc); if (a_out->a_text > maxtsiz || a_out->a_data + bss_size > lim_cur(imgp->proc, RLIMIT_DATA)) { PROC_UNLOCK(imgp->proc); return (ENOMEM); } PROC_UNLOCK(imgp->proc); VOP_UNLOCK(imgp->vp, 0, td); /* * Destroy old process VM and create a new one (with a new stack) */ - exec_new_vmspace(imgp, &svr4_sysvec); + error = exec_new_vmspace(imgp, &svr4_sysvec); + if (error) + goto fail; vmspace = imgp->proc->p_vmspace; /* * Check if file_offset page aligned,. * Currently we cannot handle misalinged file offsets, * and so we read in the entire image (what a waste). */ if (file_offset & PAGE_MASK) { #ifdef DEBUG printf("imgact: Non page aligned binary %lu\n", file_offset); #endif /* * Map text+data+bss read/write/execute */ vmaddr = virtual_offset; error = vm_map_find(&vmspace->vm_map, NULL, 0, &vmaddr, a_out->a_text + a_out->a_data + bss_size, FALSE, VM_PROT_ALL, VM_PROT_ALL, 0); if (error) goto fail; error = vm_mmap(kernel_map, &buffer, round_page(a_out->a_text + a_out->a_data + file_offset), VM_PROT_READ, VM_PROT_READ, 0, OBJT_VNODE, imgp->vp, trunc_page(file_offset)); if (error) goto fail; error = copyout((caddr_t)(buffer + file_offset), (caddr_t)vmaddr, a_out->a_text + a_out->a_data); vm_map_remove(kernel_map, buffer, buffer + round_page(a_out->a_text + a_out->a_data + file_offset)); if (error) goto fail; /* * remove write enable on the 'text' part */ error = vm_map_protect(&vmspace->vm_map, vmaddr, vmaddr + a_out->a_text, VM_PROT_EXECUTE|VM_PROT_READ, TRUE); if (error) goto fail; } else { #ifdef DEBUG printf("imgact: Page aligned binary %lu\n", file_offset); #endif /* * Map text+data read/execute */ vmaddr = virtual_offset; error = vm_mmap(&vmspace->vm_map, &vmaddr, a_out->a_text + a_out->a_data, VM_PROT_READ | VM_PROT_EXECUTE, VM_PROT_ALL, MAP_PRIVATE | MAP_FIXED, OBJT_VNODE, imgp->vp, file_offset); if (error) goto fail; #ifdef DEBUG printf("imgact: startaddr=%08lx, length=%08lx\n", (u_long)vmaddr, (u_long)a_out->a_text + a_out->a_data); #endif /* * allow read/write of data */ error = vm_map_protect(&vmspace->vm_map, vmaddr + a_out->a_text, vmaddr + a_out->a_text + a_out->a_data, VM_PROT_ALL, FALSE); if (error) goto fail; /* * Allocate anon demand-zeroed area for uninitialized data */ if (bss_size != 0) { vmaddr = virtual_offset + a_out->a_text + a_out->a_data; error = vm_map_find(&vmspace->vm_map, NULL, 0, &vmaddr, bss_size, FALSE, VM_PROT_ALL, VM_PROT_ALL, 0); if (error) goto fail; #ifdef DEBUG printf("imgact: bssaddr=%08lx, length=%08lx\n", (u_long)vmaddr, bss_size); #endif } } /* Fill in process VM information */ vmspace->vm_tsize = round_page(a_out->a_text) >> PAGE_SHIFT; vmspace->vm_dsize = round_page(a_out->a_data + bss_size) >> PAGE_SHIFT; vmspace->vm_taddr = (caddr_t)virtual_offset; vmspace->vm_daddr = (caddr_t)virtual_offset + a_out->a_text; /* Fill in image_params */ imgp->interpreted = 0; imgp->entry_addr = a_out->a_entry; imgp->proc->p_sysent = &svr4_sysvec; fail: vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY, td); return (error); } /* * Tell kern_execve.c about it, with a little help from the linker. */ struct execsw svr4_execsw = { exec_svr4_imgact, "svr4 ELF" }; EXEC_SET(execsw_set, svr4_execsw); Index: head/sys/i386/i386/machdep.c =================================================================== --- head/sys/i386/i386/machdep.c (revision 173360) +++ head/sys/i386/i386/machdep.c (revision 173361) @@ -1,3094 +1,3094 @@ /*- * Copyright (c) 1992 Terrence R. Lambert. * Copyright (c) 1982, 1987, 1990 The Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91 */ #include __FBSDID("$FreeBSD$"); #include "opt_apic.h" #include "opt_atalk.h" #include "opt_compat.h" #include "opt_cpu.h" #include "opt_ddb.h" #include "opt_inet.h" #include "opt_ipx.h" #include "opt_isa.h" #include "opt_kstack_pages.h" #include "opt_maxmem.h" #include "opt_msgbuf.h" #include "opt_npx.h" #include "opt_perfmon.h" #include "opt_xbox.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #ifndef KDB #error KDB must be enabled in order for DDB to work! #endif #include #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef PERFMON #include #endif #ifdef SMP #include #include #endif #ifdef DEV_ISA #include #endif #ifdef XBOX #include int arch_i386_is_xbox = 0; uint32_t arch_i386_xbox_memsize = 0; #endif /* Sanity check for __curthread() */ CTASSERT(offsetof(struct pcpu, pc_curthread) == 0); extern void init386(int first); extern void dblfault_handler(void); extern void printcpuinfo(void); /* XXX header file */ extern void finishidentcpu(void); extern void panicifcpuunsupported(void); extern void initializecpu(void); #define CS_SECURE(cs) (ISPL(cs) == SEL_UPL) #define EFL_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0) #if !defined(CPU_DISABLE_SSE) && defined(I686_CPU) #define CPU_ENABLE_SSE #endif static void cpu_startup(void *); static void fpstate_drop(struct thread *td); static void get_fpcontext(struct thread *td, mcontext_t *mcp); static int set_fpcontext(struct thread *td, const mcontext_t *mcp); #ifdef CPU_ENABLE_SSE static void set_fpregs_xmm(struct save87 *, struct savexmm *); static void fill_fpregs_xmm(struct savexmm *, struct save87 *); #endif /* CPU_ENABLE_SSE */ SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL) #ifdef DDB extern vm_offset_t ksym_start, ksym_end; #endif int _udatasel, _ucodesel; u_int basemem; int cold = 1; #ifdef COMPAT_43 static void osendsig(sig_t catcher, ksiginfo_t *, sigset_t *mask); #endif #ifdef COMPAT_FREEBSD4 static void freebsd4_sendsig(sig_t catcher, ksiginfo_t *, sigset_t *mask); #endif long Maxmem = 0; long realmem = 0; /* * The number of PHYSMAP entries must be one less than the number of * PHYSSEG entries because the PHYSMAP entry that spans the largest * physical address that is accessible by ISA DMA is split into two * PHYSSEG entries. */ #define PHYSMAP_SIZE (2 * (VM_PHYSSEG_MAX - 1)) vm_paddr_t phys_avail[PHYSMAP_SIZE + 2]; vm_paddr_t dump_avail[PHYSMAP_SIZE + 2]; /* must be 2 less so 0 0 can signal end of chunks */ #define PHYS_AVAIL_ARRAY_END ((sizeof(phys_avail) / sizeof(phys_avail[0])) - 2) #define DUMP_AVAIL_ARRAY_END ((sizeof(dump_avail) / sizeof(dump_avail[0])) - 2) struct kva_md_info kmi; static struct trapframe proc0_tf; #ifndef SMP static struct pcpu __pcpu; #endif struct mtx icu_lock; struct mem_range_softc mem_range_softc; static void cpu_startup(dummy) void *dummy; { /* * Good {morning,afternoon,evening,night}. */ startrtclock(); printcpuinfo(); panicifcpuunsupported(); #ifdef PERFMON perfmon_init(); #endif printf("real memory = %ju (%ju MB)\n", ptoa((uintmax_t)Maxmem), ptoa((uintmax_t)Maxmem) / 1048576); realmem = Maxmem; /* * Display any holes after the first chunk of extended memory. */ if (bootverbose) { int indx; printf("Physical memory chunk(s):\n"); for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) { vm_paddr_t size; size = phys_avail[indx + 1] - phys_avail[indx]; printf( "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n", (uintmax_t)phys_avail[indx], (uintmax_t)phys_avail[indx + 1] - 1, (uintmax_t)size, (uintmax_t)size / PAGE_SIZE); } } vm_ksubmap_init(&kmi); printf("avail memory = %ju (%ju MB)\n", ptoa((uintmax_t)cnt.v_free_count), ptoa((uintmax_t)cnt.v_free_count) / 1048576); /* * Set up buffers, so they can be used to read disk labels. */ bufinit(); vm_pager_bufferinit(); cpu_setregs(); } /* * Send an interrupt to process. * * Stack is set up to allow sigcode stored * at top to call routine, followed by kcall * to sigreturn routine below. After sigreturn * resets the signal mask, the stack, and the * frame pointer, it returns to the user * specified pc, psl. */ #ifdef COMPAT_43 static void osendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) { struct osigframe sf, *fp; struct proc *p; struct thread *td; struct sigacts *psp; struct trapframe *regs; int sig; int oonstack; td = curthread; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); sig = ksi->ksi_signo; psp = p->p_sigacts; mtx_assert(&psp->ps_mtx, MA_OWNED); regs = td->td_frame; oonstack = sigonstack(regs->tf_esp); /* Allocate space for the signal handler context. */ if ((td->td_pflags & TDP_ALTSTACK) && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { fp = (struct osigframe *)(td->td_sigstk.ss_sp + td->td_sigstk.ss_size - sizeof(struct osigframe)); #if defined(COMPAT_43) td->td_sigstk.ss_flags |= SS_ONSTACK; #endif } else fp = (struct osigframe *)regs->tf_esp - 1; /* Translate the signal if appropriate. */ if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize) sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)]; /* Build the argument list for the signal handler. */ sf.sf_signum = sig; sf.sf_scp = (register_t)&fp->sf_siginfo.si_sc; if (SIGISMEMBER(psp->ps_siginfo, sig)) { /* Signal handler installed with SA_SIGINFO. */ sf.sf_arg2 = (register_t)&fp->sf_siginfo; sf.sf_siginfo.si_signo = sig; sf.sf_siginfo.si_code = ksi->ksi_code; sf.sf_ahu.sf_action = (__osiginfohandler_t *)catcher; } else { /* Old FreeBSD-style arguments. */ sf.sf_arg2 = ksi->ksi_code; sf.sf_addr = (register_t)ksi->ksi_addr; sf.sf_ahu.sf_handler = catcher; } mtx_unlock(&psp->ps_mtx); PROC_UNLOCK(p); /* Save most if not all of trap frame. */ sf.sf_siginfo.si_sc.sc_eax = regs->tf_eax; sf.sf_siginfo.si_sc.sc_ebx = regs->tf_ebx; sf.sf_siginfo.si_sc.sc_ecx = regs->tf_ecx; sf.sf_siginfo.si_sc.sc_edx = regs->tf_edx; sf.sf_siginfo.si_sc.sc_esi = regs->tf_esi; sf.sf_siginfo.si_sc.sc_edi = regs->tf_edi; sf.sf_siginfo.si_sc.sc_cs = regs->tf_cs; sf.sf_siginfo.si_sc.sc_ds = regs->tf_ds; sf.sf_siginfo.si_sc.sc_ss = regs->tf_ss; sf.sf_siginfo.si_sc.sc_es = regs->tf_es; sf.sf_siginfo.si_sc.sc_fs = regs->tf_fs; sf.sf_siginfo.si_sc.sc_gs = rgs(); sf.sf_siginfo.si_sc.sc_isp = regs->tf_isp; /* Build the signal context to be used by osigreturn(). */ sf.sf_siginfo.si_sc.sc_onstack = (oonstack) ? 1 : 0; SIG2OSIG(*mask, sf.sf_siginfo.si_sc.sc_mask); sf.sf_siginfo.si_sc.sc_sp = regs->tf_esp; sf.sf_siginfo.si_sc.sc_fp = regs->tf_ebp; sf.sf_siginfo.si_sc.sc_pc = regs->tf_eip; sf.sf_siginfo.si_sc.sc_ps = regs->tf_eflags; sf.sf_siginfo.si_sc.sc_trapno = regs->tf_trapno; sf.sf_siginfo.si_sc.sc_err = regs->tf_err; /* * If we're a vm86 process, we want to save the segment registers. * We also change eflags to be our emulated eflags, not the actual * eflags. */ if (regs->tf_eflags & PSL_VM) { /* XXX confusing names: `tf' isn't a trapframe; `regs' is. */ struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; struct vm86_kernel *vm86 = &td->td_pcb->pcb_ext->ext_vm86; sf.sf_siginfo.si_sc.sc_gs = tf->tf_vm86_gs; sf.sf_siginfo.si_sc.sc_fs = tf->tf_vm86_fs; sf.sf_siginfo.si_sc.sc_es = tf->tf_vm86_es; sf.sf_siginfo.si_sc.sc_ds = tf->tf_vm86_ds; if (vm86->vm86_has_vme == 0) sf.sf_siginfo.si_sc.sc_ps = (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) | (vm86->vm86_eflags & (PSL_VIF | PSL_VIP)); /* See sendsig() for comments. */ tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP); } /* * Copy the sigframe out to the user's stack. */ if (copyout(&sf, fp, sizeof(*fp)) != 0) { #ifdef DEBUG printf("process %ld has trashed its stack\n", (long)p->p_pid); #endif PROC_LOCK(p); sigexit(td, SIGILL); } regs->tf_esp = (int)fp; regs->tf_eip = PS_STRINGS - szosigcode; regs->tf_eflags &= ~PSL_T; regs->tf_cs = _ucodesel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _udatasel; load_gs(_udatasel); regs->tf_ss = _udatasel; PROC_LOCK(p); mtx_lock(&psp->ps_mtx); } #endif /* COMPAT_43 */ #ifdef COMPAT_FREEBSD4 static void freebsd4_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) { struct sigframe4 sf, *sfp; struct proc *p; struct thread *td; struct sigacts *psp; struct trapframe *regs; int sig; int oonstack; td = curthread; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); sig = ksi->ksi_signo; psp = p->p_sigacts; mtx_assert(&psp->ps_mtx, MA_OWNED); regs = td->td_frame; oonstack = sigonstack(regs->tf_esp); /* Save user context. */ bzero(&sf, sizeof(sf)); sf.sf_uc.uc_sigmask = *mask; sf.sf_uc.uc_stack = td->td_sigstk; sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE; sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0; sf.sf_uc.uc_mcontext.mc_gs = rgs(); bcopy(regs, &sf.sf_uc.uc_mcontext.mc_fs, sizeof(*regs)); /* Allocate space for the signal handler context. */ if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { sfp = (struct sigframe4 *)(td->td_sigstk.ss_sp + td->td_sigstk.ss_size - sizeof(struct sigframe4)); #if defined(COMPAT_43) td->td_sigstk.ss_flags |= SS_ONSTACK; #endif } else sfp = (struct sigframe4 *)regs->tf_esp - 1; /* Translate the signal if appropriate. */ if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize) sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)]; /* Build the argument list for the signal handler. */ sf.sf_signum = sig; sf.sf_ucontext = (register_t)&sfp->sf_uc; if (SIGISMEMBER(psp->ps_siginfo, sig)) { /* Signal handler installed with SA_SIGINFO. */ sf.sf_siginfo = (register_t)&sfp->sf_si; sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher; /* Fill in POSIX parts */ sf.sf_si.si_signo = sig; sf.sf_si.si_code = ksi->ksi_code; sf.sf_si.si_addr = ksi->ksi_addr; } else { /* Old FreeBSD-style arguments. */ sf.sf_siginfo = ksi->ksi_code; sf.sf_addr = (register_t)ksi->ksi_addr; sf.sf_ahu.sf_handler = catcher; } mtx_unlock(&psp->ps_mtx); PROC_UNLOCK(p); /* * If we're a vm86 process, we want to save the segment registers. * We also change eflags to be our emulated eflags, not the actual * eflags. */ if (regs->tf_eflags & PSL_VM) { struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; struct vm86_kernel *vm86 = &td->td_pcb->pcb_ext->ext_vm86; sf.sf_uc.uc_mcontext.mc_gs = tf->tf_vm86_gs; sf.sf_uc.uc_mcontext.mc_fs = tf->tf_vm86_fs; sf.sf_uc.uc_mcontext.mc_es = tf->tf_vm86_es; sf.sf_uc.uc_mcontext.mc_ds = tf->tf_vm86_ds; if (vm86->vm86_has_vme == 0) sf.sf_uc.uc_mcontext.mc_eflags = (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) | (vm86->vm86_eflags & (PSL_VIF | PSL_VIP)); /* * Clear PSL_NT to inhibit T_TSSFLT faults on return from * syscalls made by the signal handler. This just avoids * wasting time for our lazy fixup of such faults. PSL_NT * does nothing in vm86 mode, but vm86 programs can set it * almost legitimately in probes for old cpu types. */ tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP); } /* * Copy the sigframe out to the user's stack. */ if (copyout(&sf, sfp, sizeof(*sfp)) != 0) { #ifdef DEBUG printf("process %ld has trashed its stack\n", (long)p->p_pid); #endif PROC_LOCK(p); sigexit(td, SIGILL); } regs->tf_esp = (int)sfp; regs->tf_eip = PS_STRINGS - szfreebsd4_sigcode; regs->tf_eflags &= ~PSL_T; regs->tf_cs = _ucodesel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _udatasel; regs->tf_ss = _udatasel; PROC_LOCK(p); mtx_lock(&psp->ps_mtx); } #endif /* COMPAT_FREEBSD4 */ void sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) { struct sigframe sf, *sfp; struct proc *p; struct thread *td; struct sigacts *psp; char *sp; struct trapframe *regs; int sig; int oonstack; td = curthread; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); sig = ksi->ksi_signo; psp = p->p_sigacts; mtx_assert(&psp->ps_mtx, MA_OWNED); #ifdef COMPAT_FREEBSD4 if (SIGISMEMBER(psp->ps_freebsd4, sig)) { freebsd4_sendsig(catcher, ksi, mask); return; } #endif #ifdef COMPAT_43 if (SIGISMEMBER(psp->ps_osigset, sig)) { osendsig(catcher, ksi, mask); return; } #endif regs = td->td_frame; oonstack = sigonstack(regs->tf_esp); /* Save user context. */ bzero(&sf, sizeof(sf)); sf.sf_uc.uc_sigmask = *mask; sf.sf_uc.uc_stack = td->td_sigstk; sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE; sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0; sf.sf_uc.uc_mcontext.mc_gs = rgs(); bcopy(regs, &sf.sf_uc.uc_mcontext.mc_fs, sizeof(*regs)); sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); /* magic */ get_fpcontext(td, &sf.sf_uc.uc_mcontext); fpstate_drop(td); /* Allocate space for the signal handler context. */ if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { sp = td->td_sigstk.ss_sp + td->td_sigstk.ss_size - sizeof(struct sigframe); #if defined(COMPAT_43) td->td_sigstk.ss_flags |= SS_ONSTACK; #endif } else sp = (char *)regs->tf_esp - sizeof(struct sigframe); /* Align to 16 bytes. */ sfp = (struct sigframe *)((unsigned int)sp & ~0xF); /* Translate the signal if appropriate. */ if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize) sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)]; /* Build the argument list for the signal handler. */ sf.sf_signum = sig; sf.sf_ucontext = (register_t)&sfp->sf_uc; if (SIGISMEMBER(psp->ps_siginfo, sig)) { /* Signal handler installed with SA_SIGINFO. */ sf.sf_siginfo = (register_t)&sfp->sf_si; sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher; /* Fill in POSIX parts */ sf.sf_si = ksi->ksi_info; sf.sf_si.si_signo = sig; /* maybe a translated signal */ } else { /* Old FreeBSD-style arguments. */ sf.sf_siginfo = ksi->ksi_code; sf.sf_addr = (register_t)ksi->ksi_addr; sf.sf_ahu.sf_handler = catcher; } mtx_unlock(&psp->ps_mtx); PROC_UNLOCK(p); /* * If we're a vm86 process, we want to save the segment registers. * We also change eflags to be our emulated eflags, not the actual * eflags. */ if (regs->tf_eflags & PSL_VM) { struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; struct vm86_kernel *vm86 = &td->td_pcb->pcb_ext->ext_vm86; sf.sf_uc.uc_mcontext.mc_gs = tf->tf_vm86_gs; sf.sf_uc.uc_mcontext.mc_fs = tf->tf_vm86_fs; sf.sf_uc.uc_mcontext.mc_es = tf->tf_vm86_es; sf.sf_uc.uc_mcontext.mc_ds = tf->tf_vm86_ds; if (vm86->vm86_has_vme == 0) sf.sf_uc.uc_mcontext.mc_eflags = (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) | (vm86->vm86_eflags & (PSL_VIF | PSL_VIP)); /* * Clear PSL_NT to inhibit T_TSSFLT faults on return from * syscalls made by the signal handler. This just avoids * wasting time for our lazy fixup of such faults. PSL_NT * does nothing in vm86 mode, but vm86 programs can set it * almost legitimately in probes for old cpu types. */ tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP); } /* * Copy the sigframe out to the user's stack. */ if (copyout(&sf, sfp, sizeof(*sfp)) != 0) { #ifdef DEBUG printf("process %ld has trashed its stack\n", (long)p->p_pid); #endif PROC_LOCK(p); sigexit(td, SIGILL); } regs->tf_esp = (int)sfp; regs->tf_eip = PS_STRINGS - *(p->p_sysent->sv_szsigcode); regs->tf_eflags &= ~PSL_T; regs->tf_cs = _ucodesel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _udatasel; regs->tf_ss = _udatasel; PROC_LOCK(p); mtx_lock(&psp->ps_mtx); } /* * System call to cleanup state after a signal * has been taken. Reset signal mask and * stack state from context left by sendsig (above). * Return to previous pc and psl as specified by * context left by sendsig. Check carefully to * make sure that the user has not modified the * state to gain improper privileges. * * MPSAFE */ #ifdef COMPAT_43 int osigreturn(td, uap) struct thread *td; struct osigreturn_args /* { struct osigcontext *sigcntxp; } */ *uap; { struct osigcontext sc; struct trapframe *regs; struct osigcontext *scp; struct proc *p = td->td_proc; int eflags, error; ksiginfo_t ksi; regs = td->td_frame; error = copyin(uap->sigcntxp, &sc, sizeof(sc)); if (error != 0) return (error); scp = ≻ eflags = scp->sc_ps; if (eflags & PSL_VM) { struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; struct vm86_kernel *vm86; /* * if pcb_ext == 0 or vm86_inited == 0, the user hasn't * set up the vm86 area, and we can't enter vm86 mode. */ if (td->td_pcb->pcb_ext == 0) return (EINVAL); vm86 = &td->td_pcb->pcb_ext->ext_vm86; if (vm86->vm86_inited == 0) return (EINVAL); /* Go back to user mode if both flags are set. */ if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) { ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; ksi.ksi_addr = (void *)regs->tf_eip; trapsignal(td, &ksi); } if (vm86->vm86_has_vme) { eflags = (tf->tf_eflags & ~VME_USERCHANGE) | (eflags & VME_USERCHANGE) | PSL_VM; } else { vm86->vm86_eflags = eflags; /* save VIF, VIP */ eflags = (tf->tf_eflags & ~VM_USERCHANGE) | (eflags & VM_USERCHANGE) | PSL_VM; } tf->tf_vm86_ds = scp->sc_ds; tf->tf_vm86_es = scp->sc_es; tf->tf_vm86_fs = scp->sc_fs; tf->tf_vm86_gs = scp->sc_gs; tf->tf_ds = _udatasel; tf->tf_es = _udatasel; tf->tf_fs = _udatasel; } else { /* * Don't allow users to change privileged or reserved flags. */ /* * XXX do allow users to change the privileged flag PSL_RF. * The cpu sets PSL_RF in tf_eflags for faults. Debuggers * should sometimes set it there too. tf_eflags is kept in * the signal context during signal handling and there is no * other place to remember it, so the PSL_RF bit may be * corrupted by the signal handler without us knowing. * Corruption of the PSL_RF bit at worst causes one more or * one less debugger trap, so allowing it is fairly harmless. */ if (!EFL_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF)) { return (EINVAL); } /* * Don't allow users to load a valid privileged %cs. Let the * hardware check for invalid selectors, excess privilege in * other selectors, invalid %eip's and invalid %esp's. */ if (!CS_SECURE(scp->sc_cs)) { ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; ksi.ksi_trapno = T_PROTFLT; ksi.ksi_addr = (void *)regs->tf_eip; trapsignal(td, &ksi); return (EINVAL); } regs->tf_ds = scp->sc_ds; regs->tf_es = scp->sc_es; regs->tf_fs = scp->sc_fs; } /* Restore remaining registers. */ regs->tf_eax = scp->sc_eax; regs->tf_ebx = scp->sc_ebx; regs->tf_ecx = scp->sc_ecx; regs->tf_edx = scp->sc_edx; regs->tf_esi = scp->sc_esi; regs->tf_edi = scp->sc_edi; regs->tf_cs = scp->sc_cs; regs->tf_ss = scp->sc_ss; regs->tf_isp = scp->sc_isp; regs->tf_ebp = scp->sc_fp; regs->tf_esp = scp->sc_sp; regs->tf_eip = scp->sc_pc; regs->tf_eflags = eflags; PROC_LOCK(p); #if defined(COMPAT_43) if (scp->sc_onstack & 1) td->td_sigstk.ss_flags |= SS_ONSTACK; else td->td_sigstk.ss_flags &= ~SS_ONSTACK; #endif SIGSETOLD(td->td_sigmask, scp->sc_mask); SIG_CANTMASK(td->td_sigmask); signotify(td); PROC_UNLOCK(p); return (EJUSTRETURN); } #endif /* COMPAT_43 */ #ifdef COMPAT_FREEBSD4 /* * MPSAFE */ int freebsd4_sigreturn(td, uap) struct thread *td; struct freebsd4_sigreturn_args /* { const ucontext4 *sigcntxp; } */ *uap; { struct ucontext4 uc; struct proc *p = td->td_proc; struct trapframe *regs; const struct ucontext4 *ucp; int cs, eflags, error; ksiginfo_t ksi; error = copyin(uap->sigcntxp, &uc, sizeof(uc)); if (error != 0) return (error); ucp = &uc; regs = td->td_frame; eflags = ucp->uc_mcontext.mc_eflags; if (eflags & PSL_VM) { struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; struct vm86_kernel *vm86; /* * if pcb_ext == 0 or vm86_inited == 0, the user hasn't * set up the vm86 area, and we can't enter vm86 mode. */ if (td->td_pcb->pcb_ext == 0) return (EINVAL); vm86 = &td->td_pcb->pcb_ext->ext_vm86; if (vm86->vm86_inited == 0) return (EINVAL); /* Go back to user mode if both flags are set. */ if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) { ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; ksi.ksi_addr = (void *)regs->tf_eip; trapsignal(td, &ksi); } if (vm86->vm86_has_vme) { eflags = (tf->tf_eflags & ~VME_USERCHANGE) | (eflags & VME_USERCHANGE) | PSL_VM; } else { vm86->vm86_eflags = eflags; /* save VIF, VIP */ eflags = (tf->tf_eflags & ~VM_USERCHANGE) | (eflags & VM_USERCHANGE) | PSL_VM; } bcopy(&ucp->uc_mcontext.mc_fs, tf, sizeof(struct trapframe)); tf->tf_eflags = eflags; tf->tf_vm86_ds = tf->tf_ds; tf->tf_vm86_es = tf->tf_es; tf->tf_vm86_fs = tf->tf_fs; tf->tf_vm86_gs = ucp->uc_mcontext.mc_gs; tf->tf_ds = _udatasel; tf->tf_es = _udatasel; tf->tf_fs = _udatasel; } else { /* * Don't allow users to change privileged or reserved flags. */ /* * XXX do allow users to change the privileged flag PSL_RF. * The cpu sets PSL_RF in tf_eflags for faults. Debuggers * should sometimes set it there too. tf_eflags is kept in * the signal context during signal handling and there is no * other place to remember it, so the PSL_RF bit may be * corrupted by the signal handler without us knowing. * Corruption of the PSL_RF bit at worst causes one more or * one less debugger trap, so allowing it is fairly harmless. */ if (!EFL_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF)) { printf("freebsd4_sigreturn: eflags = 0x%x\n", eflags); return (EINVAL); } /* * Don't allow users to load a valid privileged %cs. Let the * hardware check for invalid selectors, excess privilege in * other selectors, invalid %eip's and invalid %esp's. */ cs = ucp->uc_mcontext.mc_cs; if (!CS_SECURE(cs)) { printf("freebsd4_sigreturn: cs = 0x%x\n", cs); ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; ksi.ksi_trapno = T_PROTFLT; ksi.ksi_addr = (void *)regs->tf_eip; trapsignal(td, &ksi); return (EINVAL); } bcopy(&ucp->uc_mcontext.mc_fs, regs, sizeof(*regs)); } PROC_LOCK(p); #if defined(COMPAT_43) if (ucp->uc_mcontext.mc_onstack & 1) td->td_sigstk.ss_flags |= SS_ONSTACK; else td->td_sigstk.ss_flags &= ~SS_ONSTACK; #endif td->td_sigmask = ucp->uc_sigmask; SIG_CANTMASK(td->td_sigmask); signotify(td); PROC_UNLOCK(p); return (EJUSTRETURN); } #endif /* COMPAT_FREEBSD4 */ /* * MPSAFE */ int sigreturn(td, uap) struct thread *td; struct sigreturn_args /* { const struct __ucontext *sigcntxp; } */ *uap; { ucontext_t uc; struct proc *p = td->td_proc; struct trapframe *regs; const ucontext_t *ucp; int cs, eflags, error, ret; ksiginfo_t ksi; error = copyin(uap->sigcntxp, &uc, sizeof(uc)); if (error != 0) return (error); ucp = &uc; regs = td->td_frame; eflags = ucp->uc_mcontext.mc_eflags; if (eflags & PSL_VM) { struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; struct vm86_kernel *vm86; /* * if pcb_ext == 0 or vm86_inited == 0, the user hasn't * set up the vm86 area, and we can't enter vm86 mode. */ if (td->td_pcb->pcb_ext == 0) return (EINVAL); vm86 = &td->td_pcb->pcb_ext->ext_vm86; if (vm86->vm86_inited == 0) return (EINVAL); /* Go back to user mode if both flags are set. */ if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) { ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; ksi.ksi_addr = (void *)regs->tf_eip; trapsignal(td, &ksi); } if (vm86->vm86_has_vme) { eflags = (tf->tf_eflags & ~VME_USERCHANGE) | (eflags & VME_USERCHANGE) | PSL_VM; } else { vm86->vm86_eflags = eflags; /* save VIF, VIP */ eflags = (tf->tf_eflags & ~VM_USERCHANGE) | (eflags & VM_USERCHANGE) | PSL_VM; } bcopy(&ucp->uc_mcontext.mc_fs, tf, sizeof(struct trapframe)); tf->tf_eflags = eflags; tf->tf_vm86_ds = tf->tf_ds; tf->tf_vm86_es = tf->tf_es; tf->tf_vm86_fs = tf->tf_fs; tf->tf_vm86_gs = ucp->uc_mcontext.mc_gs; tf->tf_ds = _udatasel; tf->tf_es = _udatasel; tf->tf_fs = _udatasel; } else { /* * Don't allow users to change privileged or reserved flags. */ /* * XXX do allow users to change the privileged flag PSL_RF. * The cpu sets PSL_RF in tf_eflags for faults. Debuggers * should sometimes set it there too. tf_eflags is kept in * the signal context during signal handling and there is no * other place to remember it, so the PSL_RF bit may be * corrupted by the signal handler without us knowing. * Corruption of the PSL_RF bit at worst causes one more or * one less debugger trap, so allowing it is fairly harmless. */ if (!EFL_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF)) { printf("sigreturn: eflags = 0x%x\n", eflags); return (EINVAL); } /* * Don't allow users to load a valid privileged %cs. Let the * hardware check for invalid selectors, excess privilege in * other selectors, invalid %eip's and invalid %esp's. */ cs = ucp->uc_mcontext.mc_cs; if (!CS_SECURE(cs)) { printf("sigreturn: cs = 0x%x\n", cs); ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; ksi.ksi_trapno = T_PROTFLT; ksi.ksi_addr = (void *)regs->tf_eip; trapsignal(td, &ksi); return (EINVAL); } ret = set_fpcontext(td, &ucp->uc_mcontext); if (ret != 0) return (ret); bcopy(&ucp->uc_mcontext.mc_fs, regs, sizeof(*regs)); } PROC_LOCK(p); #if defined(COMPAT_43) if (ucp->uc_mcontext.mc_onstack & 1) td->td_sigstk.ss_flags |= SS_ONSTACK; else td->td_sigstk.ss_flags &= ~SS_ONSTACK; #endif td->td_sigmask = ucp->uc_sigmask; SIG_CANTMASK(td->td_sigmask); signotify(td); PROC_UNLOCK(p); return (EJUSTRETURN); } /* * Machine dependent boot() routine * * I haven't seen anything to put here yet * Possibly some stuff might be grafted back here from boot() */ void cpu_boot(int howto) { } /* Get current clock frequency for the given cpu id. */ int cpu_est_clockrate(int cpu_id, uint64_t *rate) { register_t reg; uint64_t tsc1, tsc2; if (pcpu_find(cpu_id) == NULL || rate == NULL) return (EINVAL); if (!tsc_present) return (EOPNOTSUPP); /* If we're booting, trust the rate calibrated moments ago. */ if (cold) { *rate = tsc_freq; return (0); } #ifdef SMP /* Schedule ourselves on the indicated cpu. */ thread_lock(curthread); sched_bind(curthread, cpu_id); thread_unlock(curthread); #endif /* Calibrate by measuring a short delay. */ reg = intr_disable(); tsc1 = rdtsc(); DELAY(1000); tsc2 = rdtsc(); intr_restore(reg); #ifdef SMP thread_lock(curthread); sched_unbind(curthread); thread_unlock(curthread); #endif /* * Calculate the difference in readings, convert to Mhz, and * subtract 0.5% of the total. Empirical testing has shown that * overhead in DELAY() works out to approximately this value. */ tsc2 -= tsc1; *rate = tsc2 * 1000 - tsc2 * 5; return (0); } /* * Shutdown the CPU as much as possible */ void cpu_halt(void) { for (;;) __asm__ ("hlt"); } /* * Hook to idle the CPU when possible. In the SMP case we default to * off because a halted cpu will not currently pick up a new thread in the * run queue until the next timer tick. If turned on this will result in * approximately a 4.2% loss in real time performance in buildworld tests * (but improves user and sys times oddly enough), and saves approximately * 5% in power consumption on an idle machine (tests w/2xCPU 1.1GHz P3). * * XXX we need to have a cpu mask of idle cpus and generate an IPI or * otherwise generate some sort of interrupt to wake up cpus sitting in HLT. * Then we can have our cake and eat it too. * * XXX I'm turning it on for SMP as well by default for now. It seems to * help lock contention somewhat, and this is critical for HTT. -Peter */ static int cpu_idle_hlt = 1; TUNABLE_INT("machdep.cpu_idle_hlt", &cpu_idle_hlt); SYSCTL_INT(_machdep, OID_AUTO, cpu_idle_hlt, CTLFLAG_RW, &cpu_idle_hlt, 0, "Idle loop HLT enable"); static void cpu_idle_default(void) { /* * we must absolutely guarentee that hlt is the * absolute next instruction after sti or we * introduce a timing window. */ __asm __volatile("sti; hlt"); } /* * Note that we have to be careful here to avoid a race between checking * sched_runnable() and actually halting. If we don't do this, we may waste * the time between calling hlt and the next interrupt even though there * is a runnable process. */ void cpu_idle(void) { #ifdef SMP if (mp_grab_cpu_hlt()) return; #endif if (cpu_idle_hlt) { disable_intr(); if (sched_runnable()) enable_intr(); else (*cpu_idle_hook)(); } } /* Other subsystems (e.g., ACPI) can hook this later. */ void (*cpu_idle_hook)(void) = cpu_idle_default; /* * Clear registers on exec */ void exec_setregs(td, entry, stack, ps_strings) struct thread *td; u_long entry; u_long stack; u_long ps_strings; { struct trapframe *regs = td->td_frame; struct pcb *pcb = td->td_pcb; /* Reset pc->pcb_gs and %gs before possibly invalidating it. */ pcb->pcb_gs = _udatasel; load_gs(_udatasel); mtx_lock_spin(&dt_lock); if (td->td_proc->p_md.md_ldt) user_ldt_free(td); else mtx_unlock_spin(&dt_lock); bzero((char *)regs, sizeof(struct trapframe)); regs->tf_eip = entry; regs->tf_esp = stack; regs->tf_eflags = PSL_USER | (regs->tf_eflags & PSL_T); regs->tf_ss = _udatasel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _udatasel; regs->tf_cs = _ucodesel; /* PS_STRINGS value for BSD/OS binaries. It is 0 for non-BSD/OS. */ regs->tf_ebx = ps_strings; /* * Reset the hardware debug registers if they were in use. * They won't have any meaning for the newly exec'd process. */ if (pcb->pcb_flags & PCB_DBREGS) { pcb->pcb_dr0 = 0; pcb->pcb_dr1 = 0; pcb->pcb_dr2 = 0; pcb->pcb_dr3 = 0; pcb->pcb_dr6 = 0; pcb->pcb_dr7 = 0; if (pcb == PCPU_GET(curpcb)) { /* * Clear the debug registers on the running * CPU, otherwise they will end up affecting * the next process we switch to. */ reset_dbregs(); } pcb->pcb_flags &= ~PCB_DBREGS; } /* * Initialize the math emulator (if any) for the current process. * Actually, just clear the bit that says that the emulator has * been initialized. Initialization is delayed until the process * traps to the emulator (if it is done at all) mainly because * emulators don't provide an entry point for initialization. */ td->td_pcb->pcb_flags &= ~FP_SOFTFP; /* * Drop the FP state if we hold it, so that the process gets a * clean FP state if it uses the FPU again. */ fpstate_drop(td); /* * XXX - Linux emulator * Make sure sure edx is 0x0 on entry. Linux binaries depend * on it. */ td->td_retval[1] = 0; } void cpu_setregs(void) { unsigned int cr0; cr0 = rcr0(); /* * CR0_MP, CR0_NE and CR0_TS are set for NPX (FPU) support: * * Prepare to trap all ESC (i.e., NPX) instructions and all WAIT * instructions. We must set the CR0_MP bit and use the CR0_TS * bit to control the trap, because setting the CR0_EM bit does * not cause WAIT instructions to trap. It's important to trap * WAIT instructions - otherwise the "wait" variants of no-wait * control instructions would degenerate to the "no-wait" variants * after FP context switches but work correctly otherwise. It's * particularly important to trap WAITs when there is no NPX - * otherwise the "wait" variants would always degenerate. * * Try setting CR0_NE to get correct error reporting on 486DX's. * Setting it should fail or do nothing on lesser processors. */ cr0 |= CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM; load_cr0(cr0); load_gs(_udatasel); } u_long bootdev; /* not a struct cdev *- encoding is different */ SYSCTL_ULONG(_machdep, OID_AUTO, guessed_bootdev, CTLFLAG_RD, &bootdev, 0, "Maybe the Boot device (not in struct cdev *format)"); /* * Initialize 386 and configure to run kernel */ /* * Initialize segments & interrupt table */ int _default_ldt; union descriptor gdt[NGDT * MAXCPU]; /* global descriptor table */ static struct gate_descriptor idt0[NIDT]; struct gate_descriptor *idt = &idt0[0]; /* interrupt descriptor table */ union descriptor ldt[NLDT]; /* local descriptor table */ struct region_descriptor r_gdt, r_idt; /* table descriptors */ struct mtx dt_lock; /* lock for GDT and LDT */ #if defined(I586_CPU) && !defined(NO_F00F_HACK) extern int has_f00f_bug; #endif static struct i386tss dblfault_tss; static char dblfault_stack[PAGE_SIZE]; extern vm_offset_t proc0kstack; /* * software prototypes -- in more palatable form. * * GCODE_SEL through GUDATA_SEL must be in this order for syscall/sysret * GUFS_SEL and GUGS_SEL must be in this order (swtch.s knows it) */ struct soft_segment_descriptor gdt_segs[] = { /* GNULL_SEL 0 Null Descriptor */ { 0x0, /* segment base address */ 0x0, /* length */ 0, /* segment type */ 0, /* segment descriptor priority level */ 0, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GPRIV_SEL 1 SMP Per-Processor Private Data Descriptor */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMRWA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GUFS_SEL 2 %fs Descriptor for user */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMRWA, /* segment type */ SEL_UPL, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GUGS_SEL 3 %gs Descriptor for user */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMRWA, /* segment type */ SEL_UPL, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GCODE_SEL 4 Code Descriptor for kernel */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMERA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GDATA_SEL 5 Data Descriptor for kernel */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMRWA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GUCODE_SEL 6 Code Descriptor for user */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMERA, /* segment type */ SEL_UPL, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GUDATA_SEL 7 Data Descriptor for user */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMRWA, /* segment type */ SEL_UPL, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GBIOSLOWMEM_SEL 8 BIOS access to realmode segment 0x40, must be #8 in GDT */ { 0x400, /* segment base address */ 0xfffff, /* length */ SDT_MEMRWA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GPROC0_SEL 9 Proc 0 Tss Descriptor */ { 0x0, /* segment base address */ sizeof(struct i386tss)-1,/* length */ SDT_SYS386TSS, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* unused - default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GLDT_SEL 10 LDT Descriptor */ { (int) ldt, /* segment base address */ sizeof(ldt)-1, /* length - all address space */ SDT_SYSLDT, /* segment type */ SEL_UPL, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* unused - default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GUSERLDT_SEL 11 User LDT Descriptor per process */ { (int) ldt, /* segment base address */ (512 * sizeof(union descriptor)-1), /* length */ SDT_SYSLDT, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* unused - default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GPANIC_SEL 12 Panic Tss Descriptor */ { (int) &dblfault_tss, /* segment base address */ sizeof(struct i386tss)-1,/* length - all address space */ SDT_SYS386TSS, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* unused - default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GBIOSCODE32_SEL 13 BIOS 32-bit interface (32bit Code) */ { 0, /* segment base address (overwritten) */ 0xfffff, /* length */ SDT_MEMERA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GBIOSCODE16_SEL 14 BIOS 32-bit interface (16bit Code) */ { 0, /* segment base address (overwritten) */ 0xfffff, /* length */ SDT_MEMERA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GBIOSDATA_SEL 15 BIOS 32-bit interface (Data) */ { 0, /* segment base address (overwritten) */ 0xfffff, /* length */ SDT_MEMRWA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GBIOSUTIL_SEL 16 BIOS 16-bit interface (Utility) */ { 0, /* segment base address (overwritten) */ 0xfffff, /* length */ SDT_MEMRWA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GBIOSARGS_SEL 17 BIOS 16-bit interface (Arguments) */ { 0, /* segment base address (overwritten) */ 0xfffff, /* length */ SDT_MEMRWA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GNDIS_SEL 18 NDIS Descriptor */ { 0x0, /* segment base address */ 0x0, /* length */ 0, /* segment type */ 0, /* segment descriptor priority level */ 0, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, }; static struct soft_segment_descriptor ldt_segs[] = { /* Null Descriptor - overwritten by call gate */ { 0x0, /* segment base address */ 0x0, /* length - all address space */ 0, /* segment type */ 0, /* segment descriptor priority level */ 0, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* Null Descriptor - overwritten by call gate */ { 0x0, /* segment base address */ 0x0, /* length - all address space */ 0, /* segment type */ 0, /* segment descriptor priority level */ 0, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* Null Descriptor - overwritten by call gate */ { 0x0, /* segment base address */ 0x0, /* length - all address space */ 0, /* segment type */ 0, /* segment descriptor priority level */ 0, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* Code Descriptor for user */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMERA, /* segment type */ SEL_UPL, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* Null Descriptor - overwritten by call gate */ { 0x0, /* segment base address */ 0x0, /* length - all address space */ 0, /* segment type */ 0, /* segment descriptor priority level */ 0, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* Data Descriptor for user */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMRWA, /* segment type */ SEL_UPL, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, }; void setidt(idx, func, typ, dpl, selec) int idx; inthand_t *func; int typ; int dpl; int selec; { struct gate_descriptor *ip; ip = idt + idx; ip->gd_looffset = (int)func; ip->gd_selector = selec; ip->gd_stkcpy = 0; ip->gd_xx = 0; ip->gd_type = typ; ip->gd_dpl = dpl; ip->gd_p = 1; ip->gd_hioffset = ((int)func)>>16 ; } extern inthand_t IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl), IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm), IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot), IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align), IDTVEC(xmm), IDTVEC(lcall_syscall), IDTVEC(int0x80_syscall); #ifdef DDB /* * Display the index and function name of any IDT entries that don't use * the default 'rsvd' entry point. */ DB_SHOW_COMMAND(idt, db_show_idt) { struct gate_descriptor *ip; int idx; uintptr_t func; ip = idt; for (idx = 0; idx < NIDT && !db_pager_quit; idx++) { func = (ip->gd_hioffset << 16 | ip->gd_looffset); if (func != (uintptr_t)&IDTVEC(rsvd)) { db_printf("%3d\t", idx); db_printsym(func, DB_STGY_PROC); db_printf("\n"); } ip++; } } /* Show privileged registers. */ DB_SHOW_COMMAND(sysregs, db_show_sysregs) { uint64_t idtr, gdtr; idtr = ridt(); db_printf("idtr\t0x%08x/%04x\n", (u_int)(idtr >> 16), (u_int)idtr & 0xffff); gdtr = rgdt(); db_printf("gdtr\t0x%08x/%04x\n", (u_int)(gdtr >> 16), (u_int)gdtr & 0xffff); db_printf("ldtr\t0x%04x\n", rldt()); db_printf("tr\t0x%04x\n", rtr()); db_printf("cr0\t0x%08x\n", rcr0()); db_printf("cr2\t0x%08x\n", rcr2()); db_printf("cr3\t0x%08x\n", rcr3()); db_printf("cr4\t0x%08x\n", rcr4()); } #endif void sdtossd(sd, ssd) struct segment_descriptor *sd; struct soft_segment_descriptor *ssd; { ssd->ssd_base = (sd->sd_hibase << 24) | sd->sd_lobase; ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit; ssd->ssd_type = sd->sd_type; ssd->ssd_dpl = sd->sd_dpl; ssd->ssd_p = sd->sd_p; ssd->ssd_def32 = sd->sd_def32; ssd->ssd_gran = sd->sd_gran; } /* * Populate the (physmap) array with base/bound pairs describing the * available physical memory in the system, then test this memory and * build the phys_avail array describing the actually-available memory. * * If we cannot accurately determine the physical memory map, then use * value from the 0xE801 call, and failing that, the RTC. * * Total memory size may be set by the kernel environment variable * hw.physmem or the compile-time define MAXMEM. * * XXX first should be vm_paddr_t. */ static void getmemsize(int first) { int i, off, physmap_idx, pa_indx, da_indx; int hasbrokenint12, has_smap; u_long physmem_tunable; u_int extmem; struct vm86frame vmf; struct vm86context vmc; vm_paddr_t pa, physmap[PHYSMAP_SIZE]; pt_entry_t *pte; struct bios_smap *smap; quad_t dcons_addr, dcons_size; has_smap = 0; #ifdef XBOX if (arch_i386_is_xbox) { /* * We queried the memory size before, so chop off 4MB for * the framebuffer and inform the OS of this. */ physmap[0] = 0; physmap[1] = (arch_i386_xbox_memsize * 1024 * 1024) - XBOX_FB_SIZE; physmap_idx = 0; goto physmap_done; } #endif hasbrokenint12 = 0; TUNABLE_INT_FETCH("hw.hasbrokenint12", &hasbrokenint12); bzero(&vmf, sizeof(vmf)); bzero(physmap, sizeof(physmap)); basemem = 0; /* * Some newer BIOSes has broken INT 12H implementation which cause * kernel panic immediately. In this case, we need to scan SMAP * with INT 15:E820 first, then determine base memory size. */ if (hasbrokenint12) { goto int15e820; } /* * Perform "base memory" related probes & setup */ vm86_intcall(0x12, &vmf); basemem = vmf.vmf_ax; if (basemem > 640) { printf("Preposterous BIOS basemem of %uK, truncating to 640K\n", basemem); basemem = 640; } /* * XXX if biosbasemem is now < 640, there is a `hole' * between the end of base memory and the start of * ISA memory. The hole may be empty or it may * contain BIOS code or data. Map it read/write so * that the BIOS can write to it. (Memory from 0 to * the physical end of the kernel is mapped read-only * to begin with and then parts of it are remapped. * The parts that aren't remapped form holes that * remain read-only and are unused by the kernel. * The base memory area is below the physical end of * the kernel and right now forms a read-only hole. * The part of it from PAGE_SIZE to * (trunc_page(biosbasemem * 1024) - 1) will be * remapped and used by the kernel later.) * * This code is similar to the code used in * pmap_mapdev, but since no memory needs to be * allocated we simply change the mapping. */ for (pa = trunc_page(basemem * 1024); pa < ISA_HOLE_START; pa += PAGE_SIZE) pmap_kenter(KERNBASE + pa, pa); /* * Map pages between basemem and ISA_HOLE_START, if any, r/w into * the vm86 page table so that vm86 can scribble on them using * the vm86 map too. XXX: why 2 ways for this and only 1 way for * page 0, at least as initialized here? */ pte = (pt_entry_t *)vm86paddr; for (i = basemem / 4; i < 160; i++) pte[i] = (i << PAGE_SHIFT) | PG_V | PG_RW | PG_U; int15e820: /* * map page 1 R/W into the kernel page table so we can use it * as a buffer. The kernel will unmap this page later. */ pmap_kenter(KERNBASE + (1 << PAGE_SHIFT), 1 << PAGE_SHIFT); /* * get memory map with INT 15:E820 */ vmc.npages = 0; smap = (void *)vm86_addpage(&vmc, 1, KERNBASE + (1 << PAGE_SHIFT)); vm86_getptr(&vmc, (vm_offset_t)smap, &vmf.vmf_es, &vmf.vmf_di); physmap_idx = 0; vmf.vmf_ebx = 0; do { vmf.vmf_eax = 0xE820; vmf.vmf_edx = SMAP_SIG; vmf.vmf_ecx = sizeof(struct bios_smap); i = vm86_datacall(0x15, &vmf, &vmc); if (i || vmf.vmf_eax != SMAP_SIG) break; if (boothowto & RB_VERBOSE) printf("SMAP type=%02x base=%016llx len=%016llx\n", smap->type, smap->base, smap->length); has_smap = 1; if (smap->type != SMAP_TYPE_MEMORY) continue; if (smap->length == 0) continue; #ifndef PAE if (smap->base >= 0xffffffff) { printf("%uK of memory above 4GB ignored\n", (u_int)(smap->length / 1024)); continue; } #endif for (i = 0; i <= physmap_idx; i += 2) { if (smap->base < physmap[i + 1]) { if (boothowto & RB_VERBOSE) printf( "Overlapping or non-monotonic memory region, ignoring second region\n"); continue; } } if (smap->base == physmap[physmap_idx + 1]) { physmap[physmap_idx + 1] += smap->length; continue; } physmap_idx += 2; if (physmap_idx == PHYSMAP_SIZE) { printf( "Too many segments in the physical address map, giving up\n"); break; } physmap[physmap_idx] = smap->base; physmap[physmap_idx + 1] = smap->base + smap->length; } while (vmf.vmf_ebx != 0); /* * Perform "base memory" related probes & setup based on SMAP */ if (basemem == 0) { for (i = 0; i <= physmap_idx; i += 2) { if (physmap[i] == 0x00000000) { basemem = physmap[i + 1] / 1024; break; } } /* * XXX this function is horribly organized and has to the same * things that it does above here. */ if (basemem == 0) basemem = 640; if (basemem > 640) { printf( "Preposterous BIOS basemem of %uK, truncating to 640K\n", basemem); basemem = 640; } /* * Let vm86 scribble on pages between basemem and * ISA_HOLE_START, as above. */ for (pa = trunc_page(basemem * 1024); pa < ISA_HOLE_START; pa += PAGE_SIZE) pmap_kenter(KERNBASE + pa, pa); pte = (pt_entry_t *)vm86paddr; for (i = basemem / 4; i < 160; i++) pte[i] = (i << PAGE_SHIFT) | PG_V | PG_RW | PG_U; } if (physmap[1] != 0) goto physmap_done; /* * If we failed above, try memory map with INT 15:E801 */ vmf.vmf_ax = 0xE801; if (vm86_intcall(0x15, &vmf) == 0) { extmem = vmf.vmf_cx + vmf.vmf_dx * 64; } else { #if 0 vmf.vmf_ah = 0x88; vm86_intcall(0x15, &vmf); extmem = vmf.vmf_ax; #else /* * Prefer the RTC value for extended memory. */ extmem = rtcin(RTC_EXTLO) + (rtcin(RTC_EXTHI) << 8); #endif } /* * Special hack for chipsets that still remap the 384k hole when * there's 16MB of memory - this really confuses people that * are trying to use bus mastering ISA controllers with the * "16MB limit"; they only have 16MB, but the remapping puts * them beyond the limit. * * If extended memory is between 15-16MB (16-17MB phys address range), * chop it to 15MB. */ if ((extmem > 15 * 1024) && (extmem < 16 * 1024)) extmem = 15 * 1024; physmap[0] = 0; physmap[1] = basemem * 1024; physmap_idx = 2; physmap[physmap_idx] = 0x100000; physmap[physmap_idx + 1] = physmap[physmap_idx] + extmem * 1024; physmap_done: /* * Now, physmap contains a map of physical memory. */ #ifdef SMP /* make hole for AP bootstrap code */ physmap[1] = mp_bootaddress(physmap[1]); #endif /* * Maxmem isn't the "maximum memory", it's one larger than the * highest page of the physical address space. It should be * called something like "Maxphyspage". We may adjust this * based on ``hw.physmem'' and the results of the memory test. */ Maxmem = atop(physmap[physmap_idx + 1]); #ifdef MAXMEM Maxmem = MAXMEM / 4; #endif if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable)) Maxmem = atop(physmem_tunable); /* * If we have an SMAP, don't allow MAXMEM or hw.physmem to extend * the amount of memory in the system. */ if (has_smap && Maxmem > atop(physmap[physmap_idx + 1])) Maxmem = atop(physmap[physmap_idx + 1]); if (atop(physmap[physmap_idx + 1]) != Maxmem && (boothowto & RB_VERBOSE)) printf("Physical memory use set to %ldK\n", Maxmem * 4); /* * If Maxmem has been increased beyond what the system has detected, * extend the last memory segment to the new limit. */ if (atop(physmap[physmap_idx + 1]) < Maxmem) physmap[physmap_idx + 1] = ptoa((vm_paddr_t)Maxmem); /* call pmap initialization to make new kernel address space */ pmap_bootstrap(first); /* * Size up each available chunk of physical memory. */ physmap[0] = PAGE_SIZE; /* mask off page 0 */ pa_indx = 0; da_indx = 1; phys_avail[pa_indx++] = physmap[0]; phys_avail[pa_indx] = physmap[0]; dump_avail[da_indx] = physmap[0]; pte = CMAP1; /* * Get dcons buffer address */ if (getenv_quad("dcons.addr", &dcons_addr) == 0 || getenv_quad("dcons.size", &dcons_size) == 0) dcons_addr = 0; /* * physmap is in bytes, so when converting to page boundaries, * round up the start address and round down the end address. */ for (i = 0; i <= physmap_idx; i += 2) { vm_paddr_t end; end = ptoa((vm_paddr_t)Maxmem); if (physmap[i + 1] < end) end = trunc_page(physmap[i + 1]); for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) { int tmp, page_bad, full; int *ptr = (int *)CADDR1; full = FALSE; /* * block out kernel memory as not available. */ if (pa >= KERNLOAD && pa < first) goto do_dump_avail; /* * block out dcons buffer */ if (dcons_addr > 0 && pa >= trunc_page(dcons_addr) && pa < dcons_addr + dcons_size) goto do_dump_avail; page_bad = FALSE; /* * map page into kernel: valid, read/write,non-cacheable */ *pte = pa | PG_V | PG_RW | PG_N; invltlb(); tmp = *(int *)ptr; /* * Test for alternating 1's and 0's */ *(volatile int *)ptr = 0xaaaaaaaa; if (*(volatile int *)ptr != 0xaaaaaaaa) page_bad = TRUE; /* * Test for alternating 0's and 1's */ *(volatile int *)ptr = 0x55555555; if (*(volatile int *)ptr != 0x55555555) page_bad = TRUE; /* * Test for all 1's */ *(volatile int *)ptr = 0xffffffff; if (*(volatile int *)ptr != 0xffffffff) page_bad = TRUE; /* * Test for all 0's */ *(volatile int *)ptr = 0x0; if (*(volatile int *)ptr != 0x0) page_bad = TRUE; /* * Restore original value. */ *(int *)ptr = tmp; /* * Adjust array of valid/good pages. */ if (page_bad == TRUE) continue; /* * If this good page is a continuation of the * previous set of good pages, then just increase * the end pointer. Otherwise start a new chunk. * Note that "end" points one higher than end, * making the range >= start and < end. * If we're also doing a speculative memory * test and we at or past the end, bump up Maxmem * so that we keep going. The first bad page * will terminate the loop. */ if (phys_avail[pa_indx] == pa) { phys_avail[pa_indx] += PAGE_SIZE; } else { pa_indx++; if (pa_indx == PHYS_AVAIL_ARRAY_END) { printf( "Too many holes in the physical address space, giving up\n"); pa_indx--; full = TRUE; goto do_dump_avail; } phys_avail[pa_indx++] = pa; /* start */ phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */ } physmem++; do_dump_avail: if (dump_avail[da_indx] == pa) { dump_avail[da_indx] += PAGE_SIZE; } else { da_indx++; if (da_indx == DUMP_AVAIL_ARRAY_END) { da_indx--; goto do_next; } dump_avail[da_indx++] = pa; /* start */ dump_avail[da_indx] = pa + PAGE_SIZE; /* end */ } do_next: if (full) break; } } *pte = 0; invltlb(); /* * XXX * The last chunk must contain at least one page plus the message * buffer to avoid complicating other code (message buffer address * calculation, etc.). */ while (phys_avail[pa_indx - 1] + PAGE_SIZE + round_page(MSGBUF_SIZE) >= phys_avail[pa_indx]) { physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]); phys_avail[pa_indx--] = 0; phys_avail[pa_indx--] = 0; } Maxmem = atop(phys_avail[pa_indx]); /* Trim off space for the message buffer. */ phys_avail[pa_indx] -= round_page(MSGBUF_SIZE); /* Map the message buffer. */ for (off = 0; off < round_page(MSGBUF_SIZE); off += PAGE_SIZE) pmap_kenter((vm_offset_t)msgbufp + off, phys_avail[pa_indx] + off); } void init386(first) int first; { struct gate_descriptor *gdp; int gsel_tss, metadata_missing, x; struct pcpu *pc; thread0.td_kstack = proc0kstack; thread0.td_pcb = (struct pcb *) (thread0.td_kstack + KSTACK_PAGES * PAGE_SIZE) - 1; /* * This may be done better later if it gets more high level * components in it. If so just link td->td_proc here. */ - proc_linkup(&proc0, &thread0); + proc_linkup0(&proc0, &thread0); metadata_missing = 0; if (bootinfo.bi_modulep) { preload_metadata = (caddr_t)bootinfo.bi_modulep + KERNBASE; preload_bootstrap_relocate(KERNBASE); } else { metadata_missing = 1; } if (envmode == 1) kern_envp = static_env; else if (bootinfo.bi_envp) kern_envp = (caddr_t)bootinfo.bi_envp + KERNBASE; /* Init basic tunables, hz etc */ init_param1(); /* * Make gdt memory segments. All segments cover the full 4GB * of address space and permissions are enforced at page level. */ gdt_segs[GCODE_SEL].ssd_limit = atop(0 - 1); gdt_segs[GDATA_SEL].ssd_limit = atop(0 - 1); gdt_segs[GUCODE_SEL].ssd_limit = atop(0 - 1); gdt_segs[GUDATA_SEL].ssd_limit = atop(0 - 1); gdt_segs[GUFS_SEL].ssd_limit = atop(0 - 1); gdt_segs[GUGS_SEL].ssd_limit = atop(0 - 1); #ifdef SMP pc = &SMP_prvspace[0].pcpu; #else pc = &__pcpu; #endif gdt_segs[GPRIV_SEL].ssd_limit = atop(0 - 1); gdt_segs[GPRIV_SEL].ssd_base = (int) pc; gdt_segs[GPROC0_SEL].ssd_base = (int) &pc->pc_common_tss; for (x = 0; x < NGDT; x++) ssdtosd(&gdt_segs[x], &gdt[x].sd); r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1; r_gdt.rd_base = (int) gdt; mtx_init(&dt_lock, "descriptor tables", NULL, MTX_SPIN); lgdt(&r_gdt); pcpu_init(pc, 0, sizeof(struct pcpu)); PCPU_SET(prvspace, pc); PCPU_SET(curthread, &thread0); PCPU_SET(curpcb, thread0.td_pcb); /* * Initialize mutexes. * * icu_lock: in order to allow an interrupt to occur in a critical * section, to set pcpu->ipending (etc...) properly, we * must be able to get the icu lock, so it can't be * under witness. */ mutex_init(); mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS | MTX_NOPROFILE); /* make ldt memory segments */ ldt_segs[LUCODE_SEL].ssd_limit = atop(0 - 1); ldt_segs[LUDATA_SEL].ssd_limit = atop(0 - 1); for (x = 0; x < sizeof ldt_segs / sizeof ldt_segs[0]; x++) ssdtosd(&ldt_segs[x], &ldt[x].sd); _default_ldt = GSEL(GLDT_SEL, SEL_KPL); lldt(_default_ldt); PCPU_SET(currentldt, _default_ldt); /* exceptions */ for (x = 0; x < NIDT; x++) setidt(x, &IDTVEC(rsvd), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_DE, &IDTVEC(div), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_DB, &IDTVEC(dbg), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_NMI, &IDTVEC(nmi), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_BP, &IDTVEC(bpt), SDT_SYS386IGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_OF, &IDTVEC(ofl), SDT_SYS386TGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_BR, &IDTVEC(bnd), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_UD, &IDTVEC(ill), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_NM, &IDTVEC(dna), SDT_SYS386TGT, SEL_KPL , GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_DF, 0, SDT_SYSTASKGT, SEL_KPL, GSEL(GPANIC_SEL, SEL_KPL)); setidt(IDT_FPUGP, &IDTVEC(fpusegm), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_TS, &IDTVEC(tss), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_NP, &IDTVEC(missing), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_SS, &IDTVEC(stk), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_GP, &IDTVEC(prot), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_PF, &IDTVEC(page), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_MF, &IDTVEC(fpu), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_AC, &IDTVEC(align), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_MC, &IDTVEC(mchk), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_XF, &IDTVEC(xmm), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_SYSCALL, &IDTVEC(int0x80_syscall), SDT_SYS386TGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL)); r_idt.rd_limit = sizeof(idt0) - 1; r_idt.rd_base = (int) idt; lidt(&r_idt); #ifdef XBOX /* * The following code queries the PCI ID of 0:0:0. For the XBOX, * This should be 0x10de / 0x02a5. * * This is exactly what Linux does. */ outl(0xcf8, 0x80000000); if (inl(0xcfc) == 0x02a510de) { arch_i386_is_xbox = 1; pic16l_setled(XBOX_LED_GREEN); /* * We are an XBOX, but we may have either 64MB or 128MB of * memory. The PCI host bridge should be programmed for this, * so we just query it. */ outl(0xcf8, 0x80000084); arch_i386_xbox_memsize = (inl(0xcfc) == 0x7FFFFFF) ? 128 : 64; } #endif /* XBOX */ /* * Initialize the i8254 before the console so that console * initialization can use DELAY(). */ i8254_init(); /* * Initialize the console before we print anything out. */ cninit(); if (metadata_missing) printf("WARNING: loader(8) metadata is missing!\n"); #ifdef DEV_ISA elcr_probe(); atpic_startup(); #endif #ifdef DDB ksym_start = bootinfo.bi_symtab; ksym_end = bootinfo.bi_esymtab; #endif kdb_init(); #ifdef KDB if (boothowto & RB_KDB) kdb_enter("Boot flags requested debugger"); #endif finishidentcpu(); /* Final stage of CPU initialization */ setidt(IDT_UD, &IDTVEC(ill), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_GP, &IDTVEC(prot), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); initializecpu(); /* Initialize CPU registers */ /* make an initial tss so cpu can get interrupt stack on syscall! */ /* Note: -16 is so we can grow the trapframe if we came from vm86 */ PCPU_SET(common_tss.tss_esp0, thread0.td_kstack + KSTACK_PAGES * PAGE_SIZE - sizeof(struct pcb) - 16); PCPU_SET(common_tss.tss_ss0, GSEL(GDATA_SEL, SEL_KPL)); gsel_tss = GSEL(GPROC0_SEL, SEL_KPL); PCPU_SET(tss_gdt, &gdt[GPROC0_SEL].sd); PCPU_SET(common_tssd, *PCPU_GET(tss_gdt)); PCPU_SET(common_tss.tss_ioopt, (sizeof (struct i386tss)) << 16); ltr(gsel_tss); /* pointer to selector slot for %fs/%gs */ PCPU_SET(fsgs_gdt, &gdt[GUFS_SEL].sd); dblfault_tss.tss_esp = dblfault_tss.tss_esp0 = dblfault_tss.tss_esp1 = dblfault_tss.tss_esp2 = (int)&dblfault_stack[sizeof(dblfault_stack)]; dblfault_tss.tss_ss = dblfault_tss.tss_ss0 = dblfault_tss.tss_ss1 = dblfault_tss.tss_ss2 = GSEL(GDATA_SEL, SEL_KPL); #ifdef PAE dblfault_tss.tss_cr3 = (int)IdlePDPT; #else dblfault_tss.tss_cr3 = (int)IdlePTD; #endif dblfault_tss.tss_eip = (int)dblfault_handler; dblfault_tss.tss_eflags = PSL_KERNEL; dblfault_tss.tss_ds = dblfault_tss.tss_es = dblfault_tss.tss_gs = GSEL(GDATA_SEL, SEL_KPL); dblfault_tss.tss_fs = GSEL(GPRIV_SEL, SEL_KPL); dblfault_tss.tss_cs = GSEL(GCODE_SEL, SEL_KPL); dblfault_tss.tss_ldt = GSEL(GLDT_SEL, SEL_KPL); vm86_initialize(); getmemsize(first); init_param2(physmem); /* now running on new page tables, configured,and u/iom is accessible */ msgbufinit(msgbufp, MSGBUF_SIZE); /* make a call gate to reenter kernel with */ gdp = &ldt[LSYS5CALLS_SEL].gd; x = (int) &IDTVEC(lcall_syscall); gdp->gd_looffset = x; gdp->gd_selector = GSEL(GCODE_SEL,SEL_KPL); gdp->gd_stkcpy = 1; gdp->gd_type = SDT_SYS386CGT; gdp->gd_dpl = SEL_UPL; gdp->gd_p = 1; gdp->gd_hioffset = x >> 16; /* XXX does this work? */ /* XXX yes! */ ldt[LBSDICALLS_SEL] = ldt[LSYS5CALLS_SEL]; ldt[LSOL26CALLS_SEL] = ldt[LSYS5CALLS_SEL]; /* transfer to user mode */ _ucodesel = GSEL(GUCODE_SEL, SEL_UPL); _udatasel = GSEL(GUDATA_SEL, SEL_UPL); /* setup proc 0's pcb */ thread0.td_pcb->pcb_flags = 0; #ifdef PAE thread0.td_pcb->pcb_cr3 = (int)IdlePDPT; #else thread0.td_pcb->pcb_cr3 = (int)IdlePTD; #endif thread0.td_pcb->pcb_ext = 0; thread0.td_frame = &proc0_tf; } void cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size) { pcpu->pc_acpi_id = 0xffffffff; } void spinlock_enter(void) { struct thread *td; td = curthread; if (td->td_md.md_spinlock_count == 0) td->td_md.md_saved_flags = intr_disable(); td->td_md.md_spinlock_count++; critical_enter(); } void spinlock_exit(void) { struct thread *td; td = curthread; critical_exit(); td->td_md.md_spinlock_count--; if (td->td_md.md_spinlock_count == 0) intr_restore(td->td_md.md_saved_flags); } #if defined(I586_CPU) && !defined(NO_F00F_HACK) static void f00f_hack(void *unused); SYSINIT(f00f_hack, SI_SUB_INTRINSIC, SI_ORDER_FIRST, f00f_hack, NULL) static void f00f_hack(void *unused) { struct gate_descriptor *new_idt; vm_offset_t tmp; if (!has_f00f_bug) return; GIANT_REQUIRED; printf("Intel Pentium detected, installing workaround for F00F bug\n"); tmp = kmem_alloc(kernel_map, PAGE_SIZE * 2); if (tmp == 0) panic("kmem_alloc returned 0"); /* Put the problematic entry (#6) at the end of the lower page. */ new_idt = (struct gate_descriptor*) (tmp + PAGE_SIZE - 7 * sizeof(struct gate_descriptor)); bcopy(idt, new_idt, sizeof(idt0)); r_idt.rd_base = (u_int)new_idt; lidt(&r_idt); idt = new_idt; if (vm_map_protect(kernel_map, tmp, tmp + PAGE_SIZE, VM_PROT_READ, FALSE) != KERN_SUCCESS) panic("vm_map_protect failed"); } #endif /* defined(I586_CPU) && !NO_F00F_HACK */ /* * Construct a PCB from a trapframe. This is called from kdb_trap() where * we want to start a backtrace from the function that caused us to enter * the debugger. We have the context in the trapframe, but base the trace * on the PCB. The PCB doesn't have to be perfect, as long as it contains * enough for a backtrace. */ void makectx(struct trapframe *tf, struct pcb *pcb) { pcb->pcb_edi = tf->tf_edi; pcb->pcb_esi = tf->tf_esi; pcb->pcb_ebp = tf->tf_ebp; pcb->pcb_ebx = tf->tf_ebx; pcb->pcb_eip = tf->tf_eip; pcb->pcb_esp = (ISPL(tf->tf_cs)) ? tf->tf_esp : (int)(tf + 1) - 8; } int ptrace_set_pc(struct thread *td, u_long addr) { td->td_frame->tf_eip = addr; return (0); } int ptrace_single_step(struct thread *td) { td->td_frame->tf_eflags |= PSL_T; return (0); } int ptrace_clear_single_step(struct thread *td) { td->td_frame->tf_eflags &= ~PSL_T; return (0); } int fill_regs(struct thread *td, struct reg *regs) { struct pcb *pcb; struct trapframe *tp; tp = td->td_frame; pcb = td->td_pcb; regs->r_fs = tp->tf_fs; regs->r_es = tp->tf_es; regs->r_ds = tp->tf_ds; regs->r_edi = tp->tf_edi; regs->r_esi = tp->tf_esi; regs->r_ebp = tp->tf_ebp; regs->r_ebx = tp->tf_ebx; regs->r_edx = tp->tf_edx; regs->r_ecx = tp->tf_ecx; regs->r_eax = tp->tf_eax; regs->r_eip = tp->tf_eip; regs->r_cs = tp->tf_cs; regs->r_eflags = tp->tf_eflags; regs->r_esp = tp->tf_esp; regs->r_ss = tp->tf_ss; regs->r_gs = pcb->pcb_gs; return (0); } int set_regs(struct thread *td, struct reg *regs) { struct pcb *pcb; struct trapframe *tp; tp = td->td_frame; if (!EFL_SECURE(regs->r_eflags, tp->tf_eflags) || !CS_SECURE(regs->r_cs)) return (EINVAL); pcb = td->td_pcb; tp->tf_fs = regs->r_fs; tp->tf_es = regs->r_es; tp->tf_ds = regs->r_ds; tp->tf_edi = regs->r_edi; tp->tf_esi = regs->r_esi; tp->tf_ebp = regs->r_ebp; tp->tf_ebx = regs->r_ebx; tp->tf_edx = regs->r_edx; tp->tf_ecx = regs->r_ecx; tp->tf_eax = regs->r_eax; tp->tf_eip = regs->r_eip; tp->tf_cs = regs->r_cs; tp->tf_eflags = regs->r_eflags; tp->tf_esp = regs->r_esp; tp->tf_ss = regs->r_ss; pcb->pcb_gs = regs->r_gs; return (0); } #ifdef CPU_ENABLE_SSE static void fill_fpregs_xmm(sv_xmm, sv_87) struct savexmm *sv_xmm; struct save87 *sv_87; { register struct env87 *penv_87 = &sv_87->sv_env; register struct envxmm *penv_xmm = &sv_xmm->sv_env; int i; bzero(sv_87, sizeof(*sv_87)); /* FPU control/status */ penv_87->en_cw = penv_xmm->en_cw; penv_87->en_sw = penv_xmm->en_sw; penv_87->en_tw = penv_xmm->en_tw; penv_87->en_fip = penv_xmm->en_fip; penv_87->en_fcs = penv_xmm->en_fcs; penv_87->en_opcode = penv_xmm->en_opcode; penv_87->en_foo = penv_xmm->en_foo; penv_87->en_fos = penv_xmm->en_fos; /* FPU registers */ for (i = 0; i < 8; ++i) sv_87->sv_ac[i] = sv_xmm->sv_fp[i].fp_acc; } static void set_fpregs_xmm(sv_87, sv_xmm) struct save87 *sv_87; struct savexmm *sv_xmm; { register struct env87 *penv_87 = &sv_87->sv_env; register struct envxmm *penv_xmm = &sv_xmm->sv_env; int i; /* FPU control/status */ penv_xmm->en_cw = penv_87->en_cw; penv_xmm->en_sw = penv_87->en_sw; penv_xmm->en_tw = penv_87->en_tw; penv_xmm->en_fip = penv_87->en_fip; penv_xmm->en_fcs = penv_87->en_fcs; penv_xmm->en_opcode = penv_87->en_opcode; penv_xmm->en_foo = penv_87->en_foo; penv_xmm->en_fos = penv_87->en_fos; /* FPU registers */ for (i = 0; i < 8; ++i) sv_xmm->sv_fp[i].fp_acc = sv_87->sv_ac[i]; } #endif /* CPU_ENABLE_SSE */ int fill_fpregs(struct thread *td, struct fpreg *fpregs) { #ifdef CPU_ENABLE_SSE if (cpu_fxsr) { fill_fpregs_xmm(&td->td_pcb->pcb_save.sv_xmm, (struct save87 *)fpregs); return (0); } #endif /* CPU_ENABLE_SSE */ bcopy(&td->td_pcb->pcb_save.sv_87, fpregs, sizeof *fpregs); return (0); } int set_fpregs(struct thread *td, struct fpreg *fpregs) { #ifdef CPU_ENABLE_SSE if (cpu_fxsr) { set_fpregs_xmm((struct save87 *)fpregs, &td->td_pcb->pcb_save.sv_xmm); return (0); } #endif /* CPU_ENABLE_SSE */ bcopy(fpregs, &td->td_pcb->pcb_save.sv_87, sizeof *fpregs); return (0); } /* * Get machine context. */ int get_mcontext(struct thread *td, mcontext_t *mcp, int flags) { struct trapframe *tp; tp = td->td_frame; PROC_LOCK(curthread->td_proc); mcp->mc_onstack = sigonstack(tp->tf_esp); PROC_UNLOCK(curthread->td_proc); mcp->mc_gs = td->td_pcb->pcb_gs; mcp->mc_fs = tp->tf_fs; mcp->mc_es = tp->tf_es; mcp->mc_ds = tp->tf_ds; mcp->mc_edi = tp->tf_edi; mcp->mc_esi = tp->tf_esi; mcp->mc_ebp = tp->tf_ebp; mcp->mc_isp = tp->tf_isp; mcp->mc_eflags = tp->tf_eflags; if (flags & GET_MC_CLEAR_RET) { mcp->mc_eax = 0; mcp->mc_edx = 0; mcp->mc_eflags &= ~PSL_C; } else { mcp->mc_eax = tp->tf_eax; mcp->mc_edx = tp->tf_edx; } mcp->mc_ebx = tp->tf_ebx; mcp->mc_ecx = tp->tf_ecx; mcp->mc_eip = tp->tf_eip; mcp->mc_cs = tp->tf_cs; mcp->mc_esp = tp->tf_esp; mcp->mc_ss = tp->tf_ss; mcp->mc_len = sizeof(*mcp); get_fpcontext(td, mcp); return (0); } /* * Set machine context. * * However, we don't set any but the user modifiable flags, and we won't * touch the cs selector. */ int set_mcontext(struct thread *td, const mcontext_t *mcp) { struct trapframe *tp; int eflags, ret; tp = td->td_frame; if (mcp->mc_len != sizeof(*mcp)) return (EINVAL); eflags = (mcp->mc_eflags & PSL_USERCHANGE) | (tp->tf_eflags & ~PSL_USERCHANGE); if ((ret = set_fpcontext(td, mcp)) == 0) { tp->tf_fs = mcp->mc_fs; tp->tf_es = mcp->mc_es; tp->tf_ds = mcp->mc_ds; tp->tf_edi = mcp->mc_edi; tp->tf_esi = mcp->mc_esi; tp->tf_ebp = mcp->mc_ebp; tp->tf_ebx = mcp->mc_ebx; tp->tf_edx = mcp->mc_edx; tp->tf_ecx = mcp->mc_ecx; tp->tf_eax = mcp->mc_eax; tp->tf_eip = mcp->mc_eip; tp->tf_eflags = eflags; tp->tf_esp = mcp->mc_esp; tp->tf_ss = mcp->mc_ss; td->td_pcb->pcb_gs = mcp->mc_gs; ret = 0; } return (ret); } static void get_fpcontext(struct thread *td, mcontext_t *mcp) { #ifndef DEV_NPX mcp->mc_fpformat = _MC_FPFMT_NODEV; mcp->mc_ownedfp = _MC_FPOWNED_NONE; #else union savefpu *addr; /* * XXX mc_fpstate might be misaligned, since its declaration is not * unportabilized using __attribute__((aligned(16))) like the * declaration of struct savemm, and anyway, alignment doesn't work * for auto variables since we don't use gcc's pessimal stack * alignment. Work around this by abusing the spare fields after * mcp->mc_fpstate. * * XXX unpessimize most cases by only aligning when fxsave might be * called, although this requires knowing too much about * npxgetregs()'s internals. */ addr = (union savefpu *)&mcp->mc_fpstate; if (td == PCPU_GET(fpcurthread) && #ifdef CPU_ENABLE_SSE cpu_fxsr && #endif ((uintptr_t)(void *)addr & 0xF)) { do addr = (void *)((char *)addr + 4); while ((uintptr_t)(void *)addr & 0xF); } mcp->mc_ownedfp = npxgetregs(td, addr); if (addr != (union savefpu *)&mcp->mc_fpstate) { bcopy(addr, &mcp->mc_fpstate, sizeof(mcp->mc_fpstate)); bzero(&mcp->mc_spare2, sizeof(mcp->mc_spare2)); } mcp->mc_fpformat = npxformat(); #endif } static int set_fpcontext(struct thread *td, const mcontext_t *mcp) { union savefpu *addr; if (mcp->mc_fpformat == _MC_FPFMT_NODEV) return (0); else if (mcp->mc_fpformat != _MC_FPFMT_387 && mcp->mc_fpformat != _MC_FPFMT_XMM) return (EINVAL); else if (mcp->mc_ownedfp == _MC_FPOWNED_NONE) /* We don't care what state is left in the FPU or PCB. */ fpstate_drop(td); else if (mcp->mc_ownedfp == _MC_FPOWNED_FPU || mcp->mc_ownedfp == _MC_FPOWNED_PCB) { /* XXX align as above. */ addr = (union savefpu *)&mcp->mc_fpstate; if (td == PCPU_GET(fpcurthread) && #ifdef CPU_ENABLE_SSE cpu_fxsr && #endif ((uintptr_t)(void *)addr & 0xF)) { do addr = (void *)((char *)addr + 4); while ((uintptr_t)(void *)addr & 0xF); bcopy(&mcp->mc_fpstate, addr, sizeof(mcp->mc_fpstate)); } #ifdef DEV_NPX #ifdef CPU_ENABLE_SSE if (cpu_fxsr) addr->sv_xmm.sv_env.en_mxcsr &= cpu_mxcsr_mask; #endif /* * XXX we violate the dubious requirement that npxsetregs() * be called with interrupts disabled. */ npxsetregs(td, addr); #endif /* * Don't bother putting things back where they were in the * misaligned case, since we know that the caller won't use * them again. */ } else return (EINVAL); return (0); } static void fpstate_drop(struct thread *td) { register_t s; s = intr_disable(); #ifdef DEV_NPX if (PCPU_GET(fpcurthread) == td) npxdrop(); #endif /* * XXX force a full drop of the npx. The above only drops it if we * owned it. npxgetregs() has the same bug in the !cpu_fxsr case. * * XXX I don't much like npxgetregs()'s semantics of doing a full * drop. Dropping only to the pcb matches fnsave's behaviour. * We only need to drop to !PCB_INITDONE in sendsig(). But * sendsig() is the only caller of npxgetregs()... perhaps we just * have too many layers. */ curthread->td_pcb->pcb_flags &= ~PCB_NPXINITDONE; intr_restore(s); } int fill_dbregs(struct thread *td, struct dbreg *dbregs) { struct pcb *pcb; if (td == NULL) { dbregs->dr[0] = rdr0(); dbregs->dr[1] = rdr1(); dbregs->dr[2] = rdr2(); dbregs->dr[3] = rdr3(); dbregs->dr[4] = rdr4(); dbregs->dr[5] = rdr5(); dbregs->dr[6] = rdr6(); dbregs->dr[7] = rdr7(); } else { pcb = td->td_pcb; dbregs->dr[0] = pcb->pcb_dr0; dbregs->dr[1] = pcb->pcb_dr1; dbregs->dr[2] = pcb->pcb_dr2; dbregs->dr[3] = pcb->pcb_dr3; dbregs->dr[4] = 0; dbregs->dr[5] = 0; dbregs->dr[6] = pcb->pcb_dr6; dbregs->dr[7] = pcb->pcb_dr7; } return (0); } int set_dbregs(struct thread *td, struct dbreg *dbregs) { struct pcb *pcb; int i; if (td == NULL) { load_dr0(dbregs->dr[0]); load_dr1(dbregs->dr[1]); load_dr2(dbregs->dr[2]); load_dr3(dbregs->dr[3]); load_dr4(dbregs->dr[4]); load_dr5(dbregs->dr[5]); load_dr6(dbregs->dr[6]); load_dr7(dbregs->dr[7]); } else { /* * Don't let an illegal value for dr7 get set. Specifically, * check for undefined settings. Setting these bit patterns * result in undefined behaviour and can lead to an unexpected * TRCTRAP. */ for (i = 0; i < 4; i++) { if (DBREG_DR7_ACCESS(dbregs->dr[7], i) == 0x02) return (EINVAL); if (DBREG_DR7_LEN(dbregs->dr[7], i) == 0x02) return (EINVAL); } pcb = td->td_pcb; /* * Don't let a process set a breakpoint that is not within the * process's address space. If a process could do this, it * could halt the system by setting a breakpoint in the kernel * (if ddb was enabled). Thus, we need to check to make sure * that no breakpoints are being enabled for addresses outside * process's address space. * * XXX - what about when the watched area of the user's * address space is written into from within the kernel * ... wouldn't that still cause a breakpoint to be generated * from within kernel mode? */ if (DBREG_DR7_ENABLED(dbregs->dr[7], 0)) { /* dr0 is enabled */ if (dbregs->dr[0] >= VM_MAXUSER_ADDRESS) return (EINVAL); } if (DBREG_DR7_ENABLED(dbregs->dr[7], 1)) { /* dr1 is enabled */ if (dbregs->dr[1] >= VM_MAXUSER_ADDRESS) return (EINVAL); } if (DBREG_DR7_ENABLED(dbregs->dr[7], 2)) { /* dr2 is enabled */ if (dbregs->dr[2] >= VM_MAXUSER_ADDRESS) return (EINVAL); } if (DBREG_DR7_ENABLED(dbregs->dr[7], 3)) { /* dr3 is enabled */ if (dbregs->dr[3] >= VM_MAXUSER_ADDRESS) return (EINVAL); } pcb->pcb_dr0 = dbregs->dr[0]; pcb->pcb_dr1 = dbregs->dr[1]; pcb->pcb_dr2 = dbregs->dr[2]; pcb->pcb_dr3 = dbregs->dr[3]; pcb->pcb_dr6 = dbregs->dr[6]; pcb->pcb_dr7 = dbregs->dr[7]; pcb->pcb_flags |= PCB_DBREGS; } return (0); } /* * Return > 0 if a hardware breakpoint has been hit, and the * breakpoint was in user space. Return 0, otherwise. */ int user_dbreg_trap(void) { u_int32_t dr7, dr6; /* debug registers dr6 and dr7 */ u_int32_t bp; /* breakpoint bits extracted from dr6 */ int nbp; /* number of breakpoints that triggered */ caddr_t addr[4]; /* breakpoint addresses */ int i; dr7 = rdr7(); if ((dr7 & 0x000000ff) == 0) { /* * all GE and LE bits in the dr7 register are zero, * thus the trap couldn't have been caused by the * hardware debug registers */ return 0; } nbp = 0; dr6 = rdr6(); bp = dr6 & 0x0000000f; if (!bp) { /* * None of the breakpoint bits are set meaning this * trap was not caused by any of the debug registers */ return 0; } /* * at least one of the breakpoints were hit, check to see * which ones and if any of them are user space addresses */ if (bp & 0x01) { addr[nbp++] = (caddr_t)rdr0(); } if (bp & 0x02) { addr[nbp++] = (caddr_t)rdr1(); } if (bp & 0x04) { addr[nbp++] = (caddr_t)rdr2(); } if (bp & 0x08) { addr[nbp++] = (caddr_t)rdr3(); } for (i = 0; i < nbp; i++) { if (addr[i] < (caddr_t)VM_MAXUSER_ADDRESS) { /* * addr[i] is in user space */ return nbp; } } /* * None of the breakpoints are in user space. */ return 0; } #ifndef DEV_APIC #include /* * Provide stub functions so that the MADT APIC enumerator in the acpi * kernel module will link against a kernel without 'device apic'. * * XXX - This is a gross hack. */ void apic_register_enumerator(struct apic_enumerator *enumerator) { } void * ioapic_create(vm_paddr_t addr, int32_t apic_id, int intbase) { return (NULL); } int ioapic_disable_pin(void *cookie, u_int pin) { return (ENXIO); } int ioapic_get_vector(void *cookie, u_int pin) { return (-1); } void ioapic_register(void *cookie) { } int ioapic_remap_vector(void *cookie, u_int pin, int vector) { return (ENXIO); } int ioapic_set_extint(void *cookie, u_int pin) { return (ENXIO); } int ioapic_set_nmi(void *cookie, u_int pin) { return (ENXIO); } int ioapic_set_polarity(void *cookie, u_int pin, enum intr_polarity pol) { return (ENXIO); } int ioapic_set_triggermode(void *cookie, u_int pin, enum intr_trigger trigger) { return (ENXIO); } void lapic_create(u_int apic_id, int boot_cpu) { } void lapic_init(vm_paddr_t addr) { } int lapic_set_lvt_mode(u_int apic_id, u_int lvt, u_int32_t mode) { return (ENXIO); } int lapic_set_lvt_polarity(u_int apic_id, u_int lvt, enum intr_polarity pol) { return (ENXIO); } int lapic_set_lvt_triggermode(u_int apic_id, u_int lvt, enum intr_trigger trigger) { return (ENXIO); } #endif #ifdef KDB /* * Provide inb() and outb() as functions. They are normally only * available as macros calling inlined functions, thus cannot be * called from the debugger. * * The actual code is stolen from , and de-inlined. */ #undef inb #undef outb /* silence compiler warnings */ u_char inb(u_int); void outb(u_int, u_char); u_char inb(u_int port) { u_char data; /* * We use %%dx and not %1 here because i/o is done at %dx and not at * %edx, while gcc generates inferior code (movw instead of movl) * if we tell it to load (u_short) port. */ __asm __volatile("inb %%dx,%0" : "=a" (data) : "d" (port)); return (data); } void outb(u_int port, u_char data) { u_char al; /* * Use an unnecessary assignment to help gcc's register allocator. * This make a large difference for gcc-1.40 and a tiny difference * for gcc-2.6.0. For gcc-1.40, al had to be ``asm("ax")'' for * best results. gcc-2.6.0 can't handle this. */ al = data; __asm __volatile("outb %0,%%dx" : : "a" (al), "d" (port)); } #endif /* KDB */ Index: head/sys/i386/i386/pmap.c =================================================================== --- head/sys/i386/i386/pmap.c (revision 173360) +++ head/sys/i386/i386/pmap.c (revision 173361) @@ -1,3670 +1,3677 @@ /*- * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. * Copyright (c) 1994 David Greenman * All rights reserved. * Copyright (c) 2005 Alan L. Cox * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department and William Jolitz of UUNET Technologies Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 */ /*- * Copyright (c) 2003 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Jake Burkholder, * Safeport Network Services, and Network Associates Laboratories, the * Security Research Division of Network Associates, Inc. under * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA * CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * Manages physical address maps. * * In addition to hardware address maps, this * module is called upon to provide software-use-only * maps which may or may not be stored in the same * form as hardware maps. These pseudo-maps are * used to store intermediate results from copy * operations to and from address spaces. * * Since the information managed by this module is * also stored by the logical address mapping module, * this module may throw away valid virtual-to-physical * mappings at almost any time. However, invalidations * of virtual-to-physical mappings must be done as * requested. * * In order to cope with hardware architectures which * make virtual-to-physical map invalidates expensive, * this module may delay invalidate or reduced protection * operations until such time as they are actually * necessary. This module is given full information as * to which processors are currently using which maps, * and to when physical maps must be made correct. */ #include "opt_cpu.h" #include "opt_pmap.h" #include "opt_msgbuf.h" #include "opt_smp.h" #include "opt_xbox.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SMP #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SMP #include #endif #ifdef XBOX #include #endif #if !defined(CPU_DISABLE_SSE) && defined(I686_CPU) #define CPU_ENABLE_SSE #endif #ifndef PMAP_SHPGPERPROC #define PMAP_SHPGPERPROC 200 #endif #if defined(DIAGNOSTIC) #define PMAP_DIAGNOSTIC #endif #if !defined(PMAP_DIAGNOSTIC) #define PMAP_INLINE __inline #else #define PMAP_INLINE #endif #define PV_STATS #ifdef PV_STATS #define PV_STAT(x) do { x ; } while (0) #else #define PV_STAT(x) do { } while (0) #endif /* * Get PDEs and PTEs for user/kernel address space */ #define pmap_pde(m, v) (&((m)->pm_pdir[(vm_offset_t)(v) >> PDRSHIFT])) #define pdir_pde(m, v) (m[(vm_offset_t)(v) >> PDRSHIFT]) #define pmap_pde_v(pte) ((*(int *)pte & PG_V) != 0) #define pmap_pte_w(pte) ((*(int *)pte & PG_W) != 0) #define pmap_pte_m(pte) ((*(int *)pte & PG_M) != 0) #define pmap_pte_u(pte) ((*(int *)pte & PG_A) != 0) #define pmap_pte_v(pte) ((*(int *)pte & PG_V) != 0) #define pmap_pte_set_w(pte, v) ((v) ? atomic_set_int((u_int *)(pte), PG_W) : \ atomic_clear_int((u_int *)(pte), PG_W)) #define pmap_pte_set_prot(pte, v) ((*(int *)pte &= ~PG_PROT), (*(int *)pte |= (v))) struct pmap kernel_pmap_store; LIST_HEAD(pmaplist, pmap); static struct pmaplist allpmaps; static struct mtx allpmaps_lock; vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ int pgeflag = 0; /* PG_G or-in */ int pseflag = 0; /* PG_PS or-in */ static int nkpt; vm_offset_t kernel_vm_end; extern u_int32_t KERNend; #ifdef PAE pt_entry_t pg_nx; static uma_zone_t pdptzone; #endif /* * Data for the pv entry allocation mechanism */ static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0; static int shpgperproc = PMAP_SHPGPERPROC; struct pv_chunk *pv_chunkbase; /* KVA block for pv_chunks */ int pv_maxchunks; /* How many chunks we have KVA for */ vm_offset_t pv_vafree; /* freelist stored in the PTE */ /* * All those kernel PT submaps that BSD is so fond of */ struct sysmaps { struct mtx lock; pt_entry_t *CMAP1; pt_entry_t *CMAP2; caddr_t CADDR1; caddr_t CADDR2; }; static struct sysmaps sysmaps_pcpu[MAXCPU]; pt_entry_t *CMAP1 = 0; static pt_entry_t *CMAP3; caddr_t CADDR1 = 0, ptvmmap = 0; static caddr_t CADDR3; struct msgbuf *msgbufp = 0; /* * Crashdump maps. */ static caddr_t crashdumpmap; #ifdef SMP extern pt_entry_t *SMPpt; #endif static pt_entry_t *PMAP1 = 0, *PMAP2; static pt_entry_t *PADDR1 = 0, *PADDR2; #ifdef SMP static int PMAP1cpu; static int PMAP1changedcpu; SYSCTL_INT(_debug, OID_AUTO, PMAP1changedcpu, CTLFLAG_RD, &PMAP1changedcpu, 0, "Number of times pmap_pte_quick changed CPU with same PMAP1"); #endif static int PMAP1changed; SYSCTL_INT(_debug, OID_AUTO, PMAP1changed, CTLFLAG_RD, &PMAP1changed, 0, "Number of times pmap_pte_quick changed PMAP1"); static int PMAP1unchanged; SYSCTL_INT(_debug, OID_AUTO, PMAP1unchanged, CTLFLAG_RD, &PMAP1unchanged, 0, "Number of times pmap_pte_quick didn't change PMAP1"); static struct mtx PMAP2mutex; static void free_pv_entry(pmap_t pmap, pv_entry_t pv); static pv_entry_t get_pv_entry(pmap_t locked_pmap, int try); static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, vm_page_t mpte); static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva, vm_page_t *free); static void pmap_remove_page(struct pmap *pmap, vm_offset_t va, vm_page_t *free); static void pmap_remove_entry(struct pmap *pmap, vm_page_t m, vm_offset_t va); static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m); static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m); static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags); static vm_page_t _pmap_allocpte(pmap_t pmap, unsigned ptepindex, int flags); static int _pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m, vm_page_t *free); static pt_entry_t *pmap_pte_quick(pmap_t pmap, vm_offset_t va); static void pmap_pte_release(pt_entry_t *pte); static int pmap_unuse_pt(pmap_t, vm_offset_t, vm_page_t *); static vm_offset_t pmap_kmem_choose(vm_offset_t addr); #ifdef PAE static void *pmap_pdpt_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait); #endif CTASSERT(1 << PDESHIFT == sizeof(pd_entry_t)); CTASSERT(1 << PTESHIFT == sizeof(pt_entry_t)); /* * Move the kernel virtual free pointer to the next * 4MB. This is used to help improve performance * by using a large (4MB) page for much of the kernel * (.text, .data, .bss) */ static vm_offset_t pmap_kmem_choose(vm_offset_t addr) { vm_offset_t newaddr = addr; #ifndef DISABLE_PSE if (cpu_feature & CPUID_PSE) newaddr = (addr + PDRMASK) & ~PDRMASK; #endif return newaddr; } /* * Bootstrap the system enough to run with virtual memory. * * On the i386 this is called after mapping has already been enabled * and just syncs the pmap module with what has already been done. * [We can't call it easily with mapping off since the kernel is not * mapped with PA == VA, hence we would have to relocate every address * from the linked base (virtual) address "KERNBASE" to the actual * (physical) address starting relative to 0] */ void pmap_bootstrap(vm_paddr_t firstaddr) { vm_offset_t va; pt_entry_t *pte, *unused; struct sysmaps *sysmaps; int i; /* * XXX The calculation of virtual_avail is wrong. It's NKPT*PAGE_SIZE too * large. It should instead be correctly calculated in locore.s and * not based on 'first' (which is a physical address, not a virtual * address, for the start of unused physical memory). The kernel * page tables are NOT double mapped and thus should not be included * in this calculation. */ virtual_avail = (vm_offset_t) KERNBASE + firstaddr; virtual_avail = pmap_kmem_choose(virtual_avail); virtual_end = VM_MAX_KERNEL_ADDRESS; /* * Initialize the kernel pmap (which is statically allocated). */ PMAP_LOCK_INIT(kernel_pmap); kernel_pmap->pm_pdir = (pd_entry_t *) (KERNBASE + (u_int)IdlePTD); #ifdef PAE kernel_pmap->pm_pdpt = (pdpt_entry_t *) (KERNBASE + (u_int)IdlePDPT); #endif kernel_pmap->pm_active = -1; /* don't allow deactivation */ TAILQ_INIT(&kernel_pmap->pm_pvchunk); LIST_INIT(&allpmaps); mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_SPIN); mtx_lock_spin(&allpmaps_lock); LIST_INSERT_HEAD(&allpmaps, kernel_pmap, pm_list); mtx_unlock_spin(&allpmaps_lock); nkpt = NKPT; /* * Reserve some special page table entries/VA space for temporary * mapping of pages. */ #define SYSMAP(c, p, v, n) \ v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n); va = virtual_avail; pte = vtopte(va); /* * CMAP1/CMAP2 are used for zeroing and copying pages. * CMAP3 is used for the idle process page zeroing. */ for (i = 0; i < MAXCPU; i++) { sysmaps = &sysmaps_pcpu[i]; mtx_init(&sysmaps->lock, "SYSMAPS", NULL, MTX_DEF); SYSMAP(caddr_t, sysmaps->CMAP1, sysmaps->CADDR1, 1) SYSMAP(caddr_t, sysmaps->CMAP2, sysmaps->CADDR2, 1) } SYSMAP(caddr_t, CMAP1, CADDR1, 1) SYSMAP(caddr_t, CMAP3, CADDR3, 1) *CMAP3 = 0; /* * Crashdump maps. */ SYSMAP(caddr_t, unused, crashdumpmap, MAXDUMPPGS) /* * ptvmmap is used for reading arbitrary physical pages via /dev/mem. */ SYSMAP(caddr_t, unused, ptvmmap, 1) /* * msgbufp is used to map the system message buffer. */ SYSMAP(struct msgbuf *, unused, msgbufp, atop(round_page(MSGBUF_SIZE))) /* * ptemap is used for pmap_pte_quick */ SYSMAP(pt_entry_t *, PMAP1, PADDR1, 1); SYSMAP(pt_entry_t *, PMAP2, PADDR2, 1); mtx_init(&PMAP2mutex, "PMAP2", NULL, MTX_DEF); virtual_avail = va; *CMAP1 = 0; /* * Leave in place an identity mapping (virt == phys) for the low 1 MB * physical memory region that is used by the ACPI wakeup code. This * mapping must not have PG_G set. */ #ifdef XBOX /* FIXME: This is gross, but needed for the XBOX. Since we are in such * an early stadium, we cannot yet neatly map video memory ... :-( * Better fixes are very welcome! */ if (!arch_i386_is_xbox) #endif for (i = 1; i < NKPT; i++) PTD[i] = 0; /* Initialize the PAT MSR if present. */ pmap_init_pat(); /* Turn on PG_G on kernel page(s) */ pmap_set_pg(); } /* * Setup the PAT MSR. */ void pmap_init_pat(void) { uint64_t pat_msr; /* Bail if this CPU doesn't implement PAT. */ if (!(cpu_feature & CPUID_PAT)) return; #ifdef PAT_WORKS /* * Leave the indices 0-3 at the default of WB, WT, UC, and UC-. * Program 4 and 5 as WP and WC. * Leave 6 and 7 as UC and UC-. */ pat_msr = rdmsr(MSR_PAT); pat_msr &= ~(PAT_MASK(4) | PAT_MASK(5)); pat_msr |= PAT_VALUE(4, PAT_WRITE_PROTECTED) | PAT_VALUE(5, PAT_WRITE_COMBINING); #else /* * Due to some Intel errata, we can only safely use the lower 4 * PAT entries. Thus, just replace PAT Index 2 with WC instead * of UC-. * * Intel Pentium III Processor Specification Update * Errata E.27 (Upper Four PAT Entries Not Usable With Mode B * or Mode C Paging) * * Intel Pentium IV Processor Specification Update * Errata N46 (PAT Index MSB May Be Calculated Incorrectly) */ pat_msr = rdmsr(MSR_PAT); pat_msr &= ~PAT_MASK(2); pat_msr |= PAT_VALUE(2, PAT_WRITE_COMBINING); #endif wrmsr(MSR_PAT, pat_msr); } /* * Set PG_G on kernel pages. Only the BSP calls this when SMP is turned on. */ void pmap_set_pg(void) { pd_entry_t pdir; pt_entry_t *pte; vm_offset_t va, endva; int i; if (pgeflag == 0) return; i = KERNLOAD/NBPDR; endva = KERNBASE + KERNend; if (pseflag) { va = KERNBASE + KERNLOAD; while (va < endva) { pdir = kernel_pmap->pm_pdir[KPTDI+i]; pdir |= pgeflag; kernel_pmap->pm_pdir[KPTDI+i] = PTD[KPTDI+i] = pdir; invltlb(); /* Play it safe, invltlb() every time */ i++; va += NBPDR; } } else { va = (vm_offset_t)btext; while (va < endva) { pte = vtopte(va); if (*pte) *pte |= pgeflag; invltlb(); /* Play it safe, invltlb() every time */ va += PAGE_SIZE; } } } /* * Initialize a vm_page's machine-dependent fields. */ void pmap_page_init(vm_page_t m) { TAILQ_INIT(&m->md.pv_list); m->md.pv_list_count = 0; } #ifdef PAE static MALLOC_DEFINE(M_PMAPPDPT, "pmap", "pmap pdpt"); static void * pmap_pdpt_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait) { *flags = UMA_SLAB_PRIV; return (contigmalloc(PAGE_SIZE, M_PMAPPDPT, 0, 0x0ULL, 0xffffffffULL, 1, 0)); } #endif /* * ABuse the pte nodes for unmapped kva to thread a kva freelist through. * Requirements: * - Must deal with pages in order to ensure that none of the PG_* bits * are ever set, PG_V in particular. * - Assumes we can write to ptes without pte_store() atomic ops, even * on PAE systems. This should be ok. * - Assumes nothing will ever test these addresses for 0 to indicate * no mapping instead of correctly checking PG_V. * - Assumes a vm_offset_t will fit in a pte (true for i386). * Because PG_V is never set, there can be no mappings to invalidate. */ static vm_offset_t pmap_ptelist_alloc(vm_offset_t *head) { pt_entry_t *pte; vm_offset_t va; va = *head; if (va == 0) return (va); /* Out of memory */ pte = vtopte(va); *head = *pte; if (*head & PG_V) panic("pmap_ptelist_alloc: va with PG_V set!"); *pte = 0; return (va); } static void pmap_ptelist_free(vm_offset_t *head, vm_offset_t va) { pt_entry_t *pte; if (va & PG_V) panic("pmap_ptelist_free: freeing va with PG_V set!"); pte = vtopte(va); *pte = *head; /* virtual! PG_V is 0 though */ *head = va; } static void pmap_ptelist_init(vm_offset_t *head, void *base, int npages) { int i; vm_offset_t va; *head = 0; for (i = npages - 1; i >= 0; i--) { va = (vm_offset_t)base + i * PAGE_SIZE; pmap_ptelist_free(head, va); } } /* * Initialize the pmap module. * Called by vm_init, to initialize any structures that the pmap * system needs to map virtual memory. */ void pmap_init(void) { /* * Initialize the address space (zone) for the pv entries. Set a * high water mark so that the system can recover from excessive * numbers of pv entries. */ TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc); pv_entry_max = shpgperproc * maxproc + cnt.v_page_count; TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max); pv_entry_max = roundup(pv_entry_max, _NPCPV); pv_entry_high_water = 9 * (pv_entry_max / 10); pv_maxchunks = MAX(pv_entry_max / _NPCPV, maxproc); pv_chunkbase = (struct pv_chunk *)kmem_alloc_nofault(kernel_map, PAGE_SIZE * pv_maxchunks); if (pv_chunkbase == NULL) panic("pmap_init: not enough kvm for pv chunks"); pmap_ptelist_init(&pv_vafree, pv_chunkbase, pv_maxchunks); #ifdef PAE pdptzone = uma_zcreate("PDPT", NPGPTD * sizeof(pdpt_entry_t), NULL, NULL, NULL, NULL, (NPGPTD * sizeof(pdpt_entry_t)) - 1, UMA_ZONE_VM | UMA_ZONE_NOFREE); uma_zone_set_allocf(pdptzone, pmap_pdpt_allocf); #endif } SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters"); SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_max, CTLFLAG_RD, &pv_entry_max, 0, "Max number of PV entries"); SYSCTL_INT(_vm_pmap, OID_AUTO, shpgperproc, CTLFLAG_RD, &shpgperproc, 0, "Page share factor per proc"); /*************************************************** * Low level helper routines..... ***************************************************/ /* * Determine the appropriate bits to set in a PTE or PDE for a specified * caching mode. */ static int pmap_cache_bits(int mode, boolean_t is_pde) { int pat_flag, pat_index, cache_bits; /* The PAT bit is different for PTE's and PDE's. */ pat_flag = is_pde ? PG_PDE_PAT : PG_PTE_PAT; /* If we don't support PAT, map extended modes to older ones. */ if (!(cpu_feature & CPUID_PAT)) { switch (mode) { case PAT_UNCACHEABLE: case PAT_WRITE_THROUGH: case PAT_WRITE_BACK: break; case PAT_UNCACHED: case PAT_WRITE_COMBINING: case PAT_WRITE_PROTECTED: mode = PAT_UNCACHEABLE; break; } } /* Map the caching mode to a PAT index. */ switch (mode) { #ifdef PAT_WORKS case PAT_UNCACHEABLE: pat_index = 3; break; case PAT_WRITE_THROUGH: pat_index = 1; break; case PAT_WRITE_BACK: pat_index = 0; break; case PAT_UNCACHED: pat_index = 2; break; case PAT_WRITE_COMBINING: pat_index = 5; break; case PAT_WRITE_PROTECTED: pat_index = 4; break; #else case PAT_UNCACHED: case PAT_UNCACHEABLE: case PAT_WRITE_PROTECTED: pat_index = 3; break; case PAT_WRITE_THROUGH: pat_index = 1; break; case PAT_WRITE_BACK: pat_index = 0; break; case PAT_WRITE_COMBINING: pat_index = 2; break; #endif default: panic("Unknown caching mode %d\n", mode); } /* Map the 3-bit index value into the PAT, PCD, and PWT bits. */ cache_bits = 0; if (pat_index & 0x4) cache_bits |= pat_flag; if (pat_index & 0x2) cache_bits |= PG_NC_PCD; if (pat_index & 0x1) cache_bits |= PG_NC_PWT; return (cache_bits); } #ifdef SMP /* * For SMP, these functions have to use the IPI mechanism for coherence. */ void pmap_invalidate_page(pmap_t pmap, vm_offset_t va) { u_int cpumask; u_int other_cpus; sched_pin(); if (pmap == kernel_pmap || pmap->pm_active == all_cpus) { invlpg(va); smp_invlpg(va); } else { cpumask = PCPU_GET(cpumask); other_cpus = PCPU_GET(other_cpus); if (pmap->pm_active & cpumask) invlpg(va); if (pmap->pm_active & other_cpus) smp_masked_invlpg(pmap->pm_active & other_cpus, va); } sched_unpin(); } void pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { u_int cpumask; u_int other_cpus; vm_offset_t addr; sched_pin(); if (pmap == kernel_pmap || pmap->pm_active == all_cpus) { for (addr = sva; addr < eva; addr += PAGE_SIZE) invlpg(addr); smp_invlpg_range(sva, eva); } else { cpumask = PCPU_GET(cpumask); other_cpus = PCPU_GET(other_cpus); if (pmap->pm_active & cpumask) for (addr = sva; addr < eva; addr += PAGE_SIZE) invlpg(addr); if (pmap->pm_active & other_cpus) smp_masked_invlpg_range(pmap->pm_active & other_cpus, sva, eva); } sched_unpin(); } void pmap_invalidate_all(pmap_t pmap) { u_int cpumask; u_int other_cpus; sched_pin(); if (pmap == kernel_pmap || pmap->pm_active == all_cpus) { invltlb(); smp_invltlb(); } else { cpumask = PCPU_GET(cpumask); other_cpus = PCPU_GET(other_cpus); if (pmap->pm_active & cpumask) invltlb(); if (pmap->pm_active & other_cpus) smp_masked_invltlb(pmap->pm_active & other_cpus); } sched_unpin(); } void pmap_invalidate_cache(void) { sched_pin(); wbinvd(); smp_cache_flush(); sched_unpin(); } #else /* !SMP */ /* * Normal, non-SMP, 486+ invalidation functions. * We inline these within pmap.c for speed. */ PMAP_INLINE void pmap_invalidate_page(pmap_t pmap, vm_offset_t va) { if (pmap == kernel_pmap || pmap->pm_active) invlpg(va); } PMAP_INLINE void pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t addr; if (pmap == kernel_pmap || pmap->pm_active) for (addr = sva; addr < eva; addr += PAGE_SIZE) invlpg(addr); } PMAP_INLINE void pmap_invalidate_all(pmap_t pmap) { if (pmap == kernel_pmap || pmap->pm_active) invltlb(); } PMAP_INLINE void pmap_invalidate_cache(void) { wbinvd(); } #endif /* !SMP */ /* * Are we current address space or kernel? N.B. We return FALSE when * a pmap's page table is in use because a kernel thread is borrowing * it. The borrowed page table can change spontaneously, making any * dependence on its continued use subject to a race condition. */ static __inline int pmap_is_current(pmap_t pmap) { return (pmap == kernel_pmap || (pmap == vmspace_pmap(curthread->td_proc->p_vmspace) && (pmap->pm_pdir[PTDPTDI] & PG_FRAME) == (PTDpde[0] & PG_FRAME))); } /* * If the given pmap is not the current or kernel pmap, the returned pte must * be released by passing it to pmap_pte_release(). */ pt_entry_t * pmap_pte(pmap_t pmap, vm_offset_t va) { pd_entry_t newpf; pd_entry_t *pde; pde = pmap_pde(pmap, va); if (*pde & PG_PS) return (pde); if (*pde != 0) { /* are we current address space or kernel? */ if (pmap_is_current(pmap)) return (vtopte(va)); mtx_lock(&PMAP2mutex); newpf = *pde & PG_FRAME; if ((*PMAP2 & PG_FRAME) != newpf) { *PMAP2 = newpf | PG_RW | PG_V | PG_A | PG_M; pmap_invalidate_page(kernel_pmap, (vm_offset_t)PADDR2); } return (PADDR2 + (i386_btop(va) & (NPTEPG - 1))); } return (0); } /* * Releases a pte that was obtained from pmap_pte(). Be prepared for the pte * being NULL. */ static __inline void pmap_pte_release(pt_entry_t *pte) { if ((pt_entry_t *)((vm_offset_t)pte & ~PAGE_MASK) == PADDR2) mtx_unlock(&PMAP2mutex); } static __inline void invlcaddr(void *caddr) { invlpg((u_int)caddr); } /* * Super fast pmap_pte routine best used when scanning * the pv lists. This eliminates many coarse-grained * invltlb calls. Note that many of the pv list * scans are across different pmaps. It is very wasteful * to do an entire invltlb for checking a single mapping. * * If the given pmap is not the current pmap, vm_page_queue_mtx * must be held and curthread pinned to a CPU. */ static pt_entry_t * pmap_pte_quick(pmap_t pmap, vm_offset_t va) { pd_entry_t newpf; pd_entry_t *pde; pde = pmap_pde(pmap, va); if (*pde & PG_PS) return (pde); if (*pde != 0) { /* are we current address space or kernel? */ if (pmap_is_current(pmap)) return (vtopte(va)); mtx_assert(&vm_page_queue_mtx, MA_OWNED); KASSERT(curthread->td_pinned > 0, ("curthread not pinned")); newpf = *pde & PG_FRAME; if ((*PMAP1 & PG_FRAME) != newpf) { *PMAP1 = newpf | PG_RW | PG_V | PG_A | PG_M; #ifdef SMP PMAP1cpu = PCPU_GET(cpuid); #endif invlcaddr(PADDR1); PMAP1changed++; } else #ifdef SMP if (PMAP1cpu != PCPU_GET(cpuid)) { PMAP1cpu = PCPU_GET(cpuid); invlcaddr(PADDR1); PMAP1changedcpu++; } else #endif PMAP1unchanged++; return (PADDR1 + (i386_btop(va) & (NPTEPG - 1))); } return (0); } /* * Routine: pmap_extract * Function: * Extract the physical page address associated * with the given map/virtual_address pair. */ vm_paddr_t pmap_extract(pmap_t pmap, vm_offset_t va) { vm_paddr_t rtval; pt_entry_t *pte; pd_entry_t pde; rtval = 0; PMAP_LOCK(pmap); pde = pmap->pm_pdir[va >> PDRSHIFT]; if (pde != 0) { if ((pde & PG_PS) != 0) { rtval = (pde & PG_PS_FRAME) | (va & PDRMASK); PMAP_UNLOCK(pmap); return rtval; } pte = pmap_pte(pmap, va); rtval = (*pte & PG_FRAME) | (va & PAGE_MASK); pmap_pte_release(pte); } PMAP_UNLOCK(pmap); return (rtval); } /* * Routine: pmap_extract_and_hold * Function: * Atomically extract and hold the physical page * with the given pmap and virtual address pair * if that mapping permits the given protection. */ vm_page_t pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) { pd_entry_t pde; pt_entry_t pte; vm_page_t m; m = NULL; vm_page_lock_queues(); PMAP_LOCK(pmap); pde = *pmap_pde(pmap, va); if (pde != 0) { if (pde & PG_PS) { if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) { m = PHYS_TO_VM_PAGE((pde & PG_PS_FRAME) | (va & PDRMASK)); vm_page_hold(m); } } else { sched_pin(); pte = *pmap_pte_quick(pmap, va); if (pte != 0 && ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) { m = PHYS_TO_VM_PAGE(pte & PG_FRAME); vm_page_hold(m); } sched_unpin(); } } vm_page_unlock_queues(); PMAP_UNLOCK(pmap); return (m); } /*************************************************** * Low level mapping routines..... ***************************************************/ /* * Add a wired page to the kva. * Note: not SMP coherent. */ PMAP_INLINE void pmap_kenter(vm_offset_t va, vm_paddr_t pa) { pt_entry_t *pte; pte = vtopte(va); pte_store(pte, pa | PG_RW | PG_V | pgeflag); } PMAP_INLINE void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode) { pt_entry_t *pte; pte = vtopte(va); pte_store(pte, pa | PG_RW | PG_V | pgeflag | pmap_cache_bits(mode, 0)); } /* * Remove a page from the kernel pagetables. * Note: not SMP coherent. */ PMAP_INLINE void pmap_kremove(vm_offset_t va) { pt_entry_t *pte; pte = vtopte(va); pte_clear(pte); } /* * Used to map a range of physical addresses into kernel * virtual address space. * * The value passed in '*virt' is a suggested virtual address for * the mapping. Architectures which can support a direct-mapped * physical to virtual region can return the appropriate address * within that region, leaving '*virt' unchanged. Other * architectures should map the pages starting at '*virt' and * update '*virt' with the first usable address after the mapped * region. */ vm_offset_t pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) { vm_offset_t va, sva; va = sva = *virt; while (start < end) { pmap_kenter(va, start); va += PAGE_SIZE; start += PAGE_SIZE; } pmap_invalidate_range(kernel_pmap, sva, va); *virt = va; return (sva); } /* * Add a list of wired pages to the kva * this routine is only used for temporary * kernel mappings that do not need to have * page modification or references recorded. * Note that old mappings are simply written * over. The page *must* be wired. * Note: SMP coherent. Uses a ranged shootdown IPI. */ void pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count) { pt_entry_t *endpte, oldpte, *pte; oldpte = 0; pte = vtopte(sva); endpte = pte + count; while (pte < endpte) { oldpte |= *pte; pte_store(pte, VM_PAGE_TO_PHYS(*ma) | pgeflag | PG_RW | PG_V); pte++; ma++; } if ((oldpte & PG_V) != 0) pmap_invalidate_range(kernel_pmap, sva, sva + count * PAGE_SIZE); } /* * This routine tears out page mappings from the * kernel -- it is meant only for temporary mappings. * Note: SMP coherent. Uses a ranged shootdown IPI. */ void pmap_qremove(vm_offset_t sva, int count) { vm_offset_t va; va = sva; while (count-- > 0) { pmap_kremove(va); va += PAGE_SIZE; } pmap_invalidate_range(kernel_pmap, sva, va); } /*************************************************** * Page table page management routines..... ***************************************************/ static PMAP_INLINE void pmap_free_zero_pages(vm_page_t free) { vm_page_t m; while (free != NULL) { m = free; free = m->right; vm_page_free_zero(m); } } /* * This routine unholds page table pages, and if the hold count * drops to zero, then it decrements the wire count. */ static PMAP_INLINE int pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m, vm_page_t *free) { --m->wire_count; if (m->wire_count == 0) return _pmap_unwire_pte_hold(pmap, m, free); else return 0; } static int _pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m, vm_page_t *free) { vm_offset_t pteva; /* * unmap the page table page */ pmap->pm_pdir[m->pindex] = 0; --pmap->pm_stats.resident_count; atomic_subtract_int(&cnt.v_wire_count, 1); /* * Do an invltlb to make the invalidated mapping * take effect immediately. */ pteva = VM_MAXUSER_ADDRESS + i386_ptob(m->pindex); pmap_invalidate_page(pmap, pteva); /* * Put page on a list so that it is released after * *ALL* TLB shootdown is done */ m->right = *free; *free = m; return 1; } /* * After removing a page table entry, this routine is used to * conditionally free the page, and manage the hold/wire counts. */ static int pmap_unuse_pt(pmap_t pmap, vm_offset_t va, vm_page_t *free) { pd_entry_t ptepde; vm_page_t mpte; if (va >= VM_MAXUSER_ADDRESS) return 0; ptepde = *pmap_pde(pmap, va); mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME); return pmap_unwire_pte_hold(pmap, mpte, free); } void pmap_pinit0(pmap_t pmap) { PMAP_LOCK_INIT(pmap); pmap->pm_pdir = (pd_entry_t *)(KERNBASE + (vm_offset_t)IdlePTD); #ifdef PAE pmap->pm_pdpt = (pdpt_entry_t *)(KERNBASE + (vm_offset_t)IdlePDPT); #endif pmap->pm_active = 0; PCPU_SET(curpmap, pmap); TAILQ_INIT(&pmap->pm_pvchunk); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); mtx_lock_spin(&allpmaps_lock); LIST_INSERT_HEAD(&allpmaps, pmap, pm_list); mtx_unlock_spin(&allpmaps_lock); } /* * Initialize a preallocated and zeroed pmap structure, * such as one in a vmspace structure. */ -void +int pmap_pinit(pmap_t pmap) { vm_page_t m, ptdpg[NPGPTD]; vm_paddr_t pa; static int color; int i; PMAP_LOCK_INIT(pmap); /* * No need to allocate page table space yet but we do need a valid * page directory table. */ if (pmap->pm_pdir == NULL) { pmap->pm_pdir = (pd_entry_t *)kmem_alloc_nofault(kernel_map, NBPTD); + + if (pmap->pm_pdir == NULL) { + PMAP_LOCK_DESTROY(pmap); + return (0); + } #ifdef PAE pmap->pm_pdpt = uma_zalloc(pdptzone, M_WAITOK | M_ZERO); KASSERT(((vm_offset_t)pmap->pm_pdpt & ((NPGPTD * sizeof(pdpt_entry_t)) - 1)) == 0, ("pmap_pinit: pdpt misaligned")); KASSERT(pmap_kextract((vm_offset_t)pmap->pm_pdpt) < (4ULL<<30), ("pmap_pinit: pdpt above 4g")); #endif } /* * allocate the page directory page(s) */ for (i = 0; i < NPGPTD;) { m = vm_page_alloc(NULL, color++, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); if (m == NULL) VM_WAIT; else { ptdpg[i++] = m; } } pmap_qenter((vm_offset_t)pmap->pm_pdir, ptdpg, NPGPTD); for (i = 0; i < NPGPTD; i++) { if ((ptdpg[i]->flags & PG_ZERO) == 0) bzero(pmap->pm_pdir + (i * NPDEPG), PAGE_SIZE); } mtx_lock_spin(&allpmaps_lock); LIST_INSERT_HEAD(&allpmaps, pmap, pm_list); mtx_unlock_spin(&allpmaps_lock); /* Wire in kernel global address entries. */ /* XXX copies current process, does not fill in MPPTDI */ bcopy(PTD + KPTDI, pmap->pm_pdir + KPTDI, nkpt * sizeof(pd_entry_t)); #ifdef SMP pmap->pm_pdir[MPPTDI] = PTD[MPPTDI]; #endif /* install self-referential address mapping entry(s) */ for (i = 0; i < NPGPTD; i++) { pa = VM_PAGE_TO_PHYS(ptdpg[i]); pmap->pm_pdir[PTDPTDI + i] = pa | PG_V | PG_RW | PG_A | PG_M; #ifdef PAE pmap->pm_pdpt[i] = pa | PG_V; #endif } pmap->pm_active = 0; TAILQ_INIT(&pmap->pm_pvchunk); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); + + return (1); } /* * this routine is called if the page table page is not * mapped correctly. */ static vm_page_t _pmap_allocpte(pmap_t pmap, unsigned ptepindex, int flags) { vm_paddr_t ptepa; vm_page_t m; KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT || (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK, ("_pmap_allocpte: flags is neither M_NOWAIT nor M_WAITOK")); /* * Allocate a page table page. */ if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) { if (flags & M_WAITOK) { PMAP_UNLOCK(pmap); vm_page_unlock_queues(); VM_WAIT; vm_page_lock_queues(); PMAP_LOCK(pmap); } /* * Indicate the need to retry. While waiting, the page table * page may have been allocated. */ return (NULL); } if ((m->flags & PG_ZERO) == 0) pmap_zero_page(m); /* * Map the pagetable page into the process address space, if * it isn't already there. */ pmap->pm_stats.resident_count++; ptepa = VM_PAGE_TO_PHYS(m); pmap->pm_pdir[ptepindex] = (pd_entry_t) (ptepa | PG_U | PG_RW | PG_V | PG_A | PG_M); return m; } static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags) { unsigned ptepindex; pd_entry_t ptepa; vm_page_t m; KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT || (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK, ("pmap_allocpte: flags is neither M_NOWAIT nor M_WAITOK")); /* * Calculate pagetable page index */ ptepindex = va >> PDRSHIFT; retry: /* * Get the page directory entry */ ptepa = pmap->pm_pdir[ptepindex]; /* * This supports switching from a 4MB page to a * normal 4K page. */ if (ptepa & PG_PS) { pmap->pm_pdir[ptepindex] = 0; ptepa = 0; pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; pmap_invalidate_all(kernel_pmap); } /* * If the page table page is mapped, we just increment the * hold count, and activate it. */ if (ptepa) { m = PHYS_TO_VM_PAGE(ptepa & PG_FRAME); m->wire_count++; } else { /* * Here if the pte page isn't mapped, or if it has * been deallocated. */ m = _pmap_allocpte(pmap, ptepindex, flags); if (m == NULL && (flags & M_WAITOK)) goto retry; } return (m); } /*************************************************** * Pmap allocation/deallocation routines. ***************************************************/ #ifdef SMP /* * Deal with a SMP shootdown of other users of the pmap that we are * trying to dispose of. This can be a bit hairy. */ static u_int *lazymask; static u_int lazyptd; static volatile u_int lazywait; void pmap_lazyfix_action(void); void pmap_lazyfix_action(void) { u_int mymask = PCPU_GET(cpumask); #ifdef COUNT_IPIS *ipi_lazypmap_counts[PCPU_GET(cpuid)]++; #endif if (rcr3() == lazyptd) load_cr3(PCPU_GET(curpcb)->pcb_cr3); atomic_clear_int(lazymask, mymask); atomic_store_rel_int(&lazywait, 1); } static void pmap_lazyfix_self(u_int mymask) { if (rcr3() == lazyptd) load_cr3(PCPU_GET(curpcb)->pcb_cr3); atomic_clear_int(lazymask, mymask); } static void pmap_lazyfix(pmap_t pmap) { u_int mymask; u_int mask; u_int spins; while ((mask = pmap->pm_active) != 0) { spins = 50000000; mask = mask & -mask; /* Find least significant set bit */ mtx_lock_spin(&smp_ipi_mtx); #ifdef PAE lazyptd = vtophys(pmap->pm_pdpt); #else lazyptd = vtophys(pmap->pm_pdir); #endif mymask = PCPU_GET(cpumask); if (mask == mymask) { lazymask = &pmap->pm_active; pmap_lazyfix_self(mymask); } else { atomic_store_rel_int((u_int *)&lazymask, (u_int)&pmap->pm_active); atomic_store_rel_int(&lazywait, 0); ipi_selected(mask, IPI_LAZYPMAP); while (lazywait == 0) { ia32_pause(); if (--spins == 0) break; } } mtx_unlock_spin(&smp_ipi_mtx); if (spins == 0) printf("pmap_lazyfix: spun for 50000000\n"); } } #else /* SMP */ /* * Cleaning up on uniprocessor is easy. For various reasons, we're * unlikely to have to even execute this code, including the fact * that the cleanup is deferred until the parent does a wait(2), which * means that another userland process has run. */ static void pmap_lazyfix(pmap_t pmap) { u_int cr3; cr3 = vtophys(pmap->pm_pdir); if (cr3 == rcr3()) { load_cr3(PCPU_GET(curpcb)->pcb_cr3); pmap->pm_active &= ~(PCPU_GET(cpumask)); } } #endif /* SMP */ /* * Release any resources held by the given physical map. * Called when a pmap initialized by pmap_pinit is being released. * Should only be called if the map contains no valid mappings. */ void pmap_release(pmap_t pmap) { vm_page_t m, ptdpg[NPGPTD]; int i; KASSERT(pmap->pm_stats.resident_count == 0, ("pmap_release: pmap resident count %ld != 0", pmap->pm_stats.resident_count)); pmap_lazyfix(pmap); mtx_lock_spin(&allpmaps_lock); LIST_REMOVE(pmap, pm_list); mtx_unlock_spin(&allpmaps_lock); for (i = 0; i < NPGPTD; i++) ptdpg[i] = PHYS_TO_VM_PAGE(pmap->pm_pdir[PTDPTDI + i] & PG_FRAME); bzero(pmap->pm_pdir + PTDPTDI, (nkpt + NPGPTD) * sizeof(*pmap->pm_pdir)); #ifdef SMP pmap->pm_pdir[MPPTDI] = 0; #endif pmap_qremove((vm_offset_t)pmap->pm_pdir, NPGPTD); for (i = 0; i < NPGPTD; i++) { m = ptdpg[i]; #ifdef PAE KASSERT(VM_PAGE_TO_PHYS(m) == (pmap->pm_pdpt[i] & PG_FRAME), ("pmap_release: got wrong ptd page")); #endif m->wire_count--; atomic_subtract_int(&cnt.v_wire_count, 1); vm_page_free_zero(m); } PMAP_LOCK_DESTROY(pmap); } static int kvm_size(SYSCTL_HANDLER_ARGS) { unsigned long ksize = VM_MAX_KERNEL_ADDRESS - KERNBASE; return sysctl_handle_long(oidp, &ksize, 0, req); } SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD, 0, 0, kvm_size, "IU", "Size of KVM"); static int kvm_free(SYSCTL_HANDLER_ARGS) { unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end; return sysctl_handle_long(oidp, &kfree, 0, req); } SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD, 0, 0, kvm_free, "IU", "Amount of KVM free"); /* * grow the number of kernel page table entries, if needed */ void pmap_growkernel(vm_offset_t addr) { struct pmap *pmap; vm_paddr_t ptppaddr; vm_page_t nkpg; pd_entry_t newpdir; pt_entry_t *pde; mtx_assert(&kernel_map->system_mtx, MA_OWNED); if (kernel_vm_end == 0) { kernel_vm_end = KERNBASE; nkpt = 0; while (pdir_pde(PTD, kernel_vm_end)) { kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); nkpt++; if (kernel_vm_end - 1 >= kernel_map->max_offset) { kernel_vm_end = kernel_map->max_offset; break; } } } addr = roundup2(addr, PAGE_SIZE * NPTEPG); if (addr - 1 >= kernel_map->max_offset) addr = kernel_map->max_offset; while (kernel_vm_end < addr) { if (pdir_pde(PTD, kernel_vm_end)) { kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); if (kernel_vm_end - 1 >= kernel_map->max_offset) { kernel_vm_end = kernel_map->max_offset; break; } continue; } /* * This index is bogus, but out of the way */ nkpg = vm_page_alloc(NULL, nkpt, VM_ALLOC_NOOBJ | VM_ALLOC_SYSTEM | VM_ALLOC_WIRED); if (!nkpg) panic("pmap_growkernel: no memory to grow kernel"); nkpt++; pmap_zero_page(nkpg); ptppaddr = VM_PAGE_TO_PHYS(nkpg); newpdir = (pd_entry_t) (ptppaddr | PG_V | PG_RW | PG_A | PG_M); pdir_pde(PTD, kernel_vm_end) = newpdir; mtx_lock_spin(&allpmaps_lock); LIST_FOREACH(pmap, &allpmaps, pm_list) { pde = pmap_pde(pmap, kernel_vm_end); pde_store(pde, newpdir); } mtx_unlock_spin(&allpmaps_lock); kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); if (kernel_vm_end - 1 >= kernel_map->max_offset) { kernel_vm_end = kernel_map->max_offset; break; } } } /*************************************************** * page management routines. ***************************************************/ CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE); CTASSERT(_NPCM == 11); static __inline struct pv_chunk * pv_to_chunk(pv_entry_t pv) { return (struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK); } #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap) #define PC_FREE0_9 0xfffffffful /* Free values for index 0 through 9 */ #define PC_FREE10 0x0000fffful /* Free values for index 10 */ static uint32_t pc_freemask[11] = { PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE10 }; SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0, "Current number of pv entries"); #ifdef PV_STATS static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail; SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0, "Current number of pv entry chunks"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0, "Current number of pv entry chunks allocated"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0, "Current number of pv entry chunks frees"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0, "Number of times tried to get a chunk page but failed."); static long pv_entry_frees, pv_entry_allocs; static int pv_entry_spare; SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0, "Current number of pv entry frees"); SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0, "Current number of pv entry allocs"); SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0, "Current number of spare pv entries"); static int pmap_collect_inactive, pmap_collect_active; SYSCTL_INT(_vm_pmap, OID_AUTO, pmap_collect_inactive, CTLFLAG_RD, &pmap_collect_inactive, 0, "Current number times pmap_collect called on inactive queue"); SYSCTL_INT(_vm_pmap, OID_AUTO, pmap_collect_active, CTLFLAG_RD, &pmap_collect_active, 0, "Current number times pmap_collect called on active queue"); #endif /* * We are in a serious low memory condition. Resort to * drastic measures to free some pages so we can allocate * another pv entry chunk. This is normally called to * unmap inactive pages, and if necessary, active pages. */ static void pmap_collect(pmap_t locked_pmap, struct vpgqueues *vpq) { pmap_t pmap; pt_entry_t *pte, tpte; pv_entry_t next_pv, pv; vm_offset_t va; vm_page_t m, free; sched_pin(); TAILQ_FOREACH(m, &vpq->pl, pageq) { if (m->hold_count || m->busy) continue; TAILQ_FOREACH_SAFE(pv, &m->md.pv_list, pv_list, next_pv) { va = pv->pv_va; pmap = PV_PMAP(pv); /* Avoid deadlock and lock recursion. */ if (pmap > locked_pmap) PMAP_LOCK(pmap); else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap)) continue; pmap->pm_stats.resident_count--; pte = pmap_pte_quick(pmap, va); tpte = pte_load_clear(pte); KASSERT((tpte & PG_W) == 0, ("pmap_collect: wired pte %#jx", (uintmax_t)tpte)); if (tpte & PG_A) vm_page_flag_set(m, PG_REFERENCED); if (tpte & PG_M) { KASSERT((tpte & PG_RW), ("pmap_collect: modified page not writable: va: %#x, pte: %#jx", va, (uintmax_t)tpte)); vm_page_dirty(m); } free = NULL; pmap_unuse_pt(pmap, va, &free); pmap_invalidate_page(pmap, va); pmap_free_zero_pages(free); TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); if (TAILQ_EMPTY(&m->md.pv_list)) vm_page_flag_clear(m, PG_WRITEABLE); m->md.pv_list_count--; free_pv_entry(pmap, pv); if (pmap != locked_pmap) PMAP_UNLOCK(pmap); } } sched_unpin(); } /* * free the pv_entry back to the free list */ static void free_pv_entry(pmap_t pmap, pv_entry_t pv) { vm_page_t m; struct pv_chunk *pc; int idx, field, bit; mtx_assert(&vm_page_queue_mtx, MA_OWNED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); PV_STAT(pv_entry_frees++); PV_STAT(pv_entry_spare++); pv_entry_count--; pc = pv_to_chunk(pv); idx = pv - &pc->pc_pventry[0]; field = idx / 32; bit = idx % 32; pc->pc_map[field] |= 1ul << bit; /* move to head of list */ TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); for (idx = 0; idx < _NPCM; idx++) if (pc->pc_map[idx] != pc_freemask[idx]) return; PV_STAT(pv_entry_spare -= _NPCPV); PV_STAT(pc_chunk_count--); PV_STAT(pc_chunk_frees++); /* entire chunk is free, return it */ TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); m = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc)); pmap_qremove((vm_offset_t)pc, 1); vm_page_unwire(m, 0); vm_page_free(m); pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc); } /* * get a new pv_entry, allocating a block from the system * when needed. */ static pv_entry_t get_pv_entry(pmap_t pmap, int try) { static const struct timeval printinterval = { 60, 0 }; static struct timeval lastprint; static vm_pindex_t colour; int bit, field; pv_entry_t pv; struct pv_chunk *pc; vm_page_t m; PMAP_LOCK_ASSERT(pmap, MA_OWNED); mtx_assert(&vm_page_queue_mtx, MA_OWNED); PV_STAT(pv_entry_allocs++); pv_entry_count++; if (pv_entry_count > pv_entry_high_water) if (ratecheck(&lastprint, &printinterval)) printf("Approaching the limit on PV entries, consider " "increasing either the vm.pmap.shpgperproc or the " "vm.pmap.pv_entry_max tunable.\n"); pc = TAILQ_FIRST(&pmap->pm_pvchunk); if (pc != NULL) { for (field = 0; field < _NPCM; field++) { if (pc->pc_map[field]) { bit = bsfl(pc->pc_map[field]); break; } } if (field < _NPCM) { pv = &pc->pc_pventry[field * 32 + bit]; pc->pc_map[field] &= ~(1ul << bit); /* If this was the last item, move it to tail */ for (field = 0; field < _NPCM; field++) if (pc->pc_map[field] != 0) { PV_STAT(pv_entry_spare--); return (pv); /* not full, return */ } TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); PV_STAT(pv_entry_spare--); return (pv); } } pc = (struct pv_chunk *)pmap_ptelist_alloc(&pv_vafree); m = vm_page_alloc(NULL, colour, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED); if (m == NULL || pc == NULL) { if (try) { pv_entry_count--; PV_STAT(pc_chunk_tryfail++); if (m) { vm_page_lock_queues(); vm_page_unwire(m, 0); vm_page_free(m); vm_page_unlock_queues(); } if (pc) pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc); return (NULL); } /* * Reclaim pv entries: At first, destroy mappings to * inactive pages. After that, if a pv chunk entry * is still needed, destroy mappings to active pages. */ PV_STAT(pmap_collect_inactive++); pmap_collect(pmap, &vm_page_queues[PQ_INACTIVE]); if (m == NULL) m = vm_page_alloc(NULL, colour, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED); if (pc == NULL) pc = (struct pv_chunk *)pmap_ptelist_alloc(&pv_vafree); if (m == NULL || pc == NULL) { PV_STAT(pmap_collect_active++); pmap_collect(pmap, &vm_page_queues[PQ_ACTIVE]); if (m == NULL) m = vm_page_alloc(NULL, colour, VM_ALLOC_SYSTEM | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED); if (pc == NULL) pc = (struct pv_chunk *) pmap_ptelist_alloc(&pv_vafree); if (m == NULL || pc == NULL) panic("get_pv_entry: increase vm.pmap.shpgperproc"); } } PV_STAT(pc_chunk_count++); PV_STAT(pc_chunk_allocs++); colour++; pmap_qenter((vm_offset_t)pc, &m, 1); pc->pc_pmap = pmap; pc->pc_map[0] = pc_freemask[0] & ~1ul; /* preallocated bit 0 */ for (field = 1; field < _NPCM; field++) pc->pc_map[field] = pc_freemask[field]; pv = &pc->pc_pventry[0]; TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); PV_STAT(pv_entry_spare += _NPCPV - 1); return (pv); } static void pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va) { pv_entry_t pv; PMAP_LOCK_ASSERT(pmap, MA_OWNED); mtx_assert(&vm_page_queue_mtx, MA_OWNED); TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { if (pmap == PV_PMAP(pv) && va == pv->pv_va) break; } KASSERT(pv != NULL, ("pmap_remove_entry: pv not found")); TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); m->md.pv_list_count--; if (TAILQ_EMPTY(&m->md.pv_list)) vm_page_flag_clear(m, PG_WRITEABLE); free_pv_entry(pmap, pv); } /* * Create a pv entry for page at pa for * (pmap, va). */ static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m) { pv_entry_t pv; PMAP_LOCK_ASSERT(pmap, MA_OWNED); mtx_assert(&vm_page_queue_mtx, MA_OWNED); pv = get_pv_entry(pmap, FALSE); pv->pv_va = va; TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); m->md.pv_list_count++; } /* * Conditionally create a pv entry. */ static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m) { pv_entry_t pv; PMAP_LOCK_ASSERT(pmap, MA_OWNED); mtx_assert(&vm_page_queue_mtx, MA_OWNED); if (pv_entry_count < pv_entry_high_water && (pv = get_pv_entry(pmap, TRUE)) != NULL) { pv->pv_va = va; TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); m->md.pv_list_count++; return (TRUE); } else return (FALSE); } /* * pmap_remove_pte: do the things to unmap a page in a process */ static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va, vm_page_t *free) { pt_entry_t oldpte; vm_page_t m; mtx_assert(&vm_page_queue_mtx, MA_OWNED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); oldpte = pte_load_clear(ptq); if (oldpte & PG_W) pmap->pm_stats.wired_count -= 1; /* * Machines that don't support invlpg, also don't support * PG_G. */ if (oldpte & PG_G) pmap_invalidate_page(kernel_pmap, va); pmap->pm_stats.resident_count -= 1; if (oldpte & PG_MANAGED) { m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME); if (oldpte & PG_M) { KASSERT((oldpte & PG_RW), ("pmap_remove_pte: modified page not writable: va: %#x, pte: %#jx", va, (uintmax_t)oldpte)); vm_page_dirty(m); } if (oldpte & PG_A) vm_page_flag_set(m, PG_REFERENCED); pmap_remove_entry(pmap, m, va); } return (pmap_unuse_pt(pmap, va, free)); } /* * Remove a single page from a process address space */ static void pmap_remove_page(pmap_t pmap, vm_offset_t va, vm_page_t *free) { pt_entry_t *pte; mtx_assert(&vm_page_queue_mtx, MA_OWNED); KASSERT(curthread->td_pinned > 0, ("curthread not pinned")); PMAP_LOCK_ASSERT(pmap, MA_OWNED); if ((pte = pmap_pte_quick(pmap, va)) == NULL || *pte == 0) return; pmap_remove_pte(pmap, pte, va, free); pmap_invalidate_page(pmap, va); } /* * Remove the given range of addresses from the specified map. * * It is assumed that the start and end are properly * rounded to the page size. */ void pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t pdnxt; pd_entry_t ptpaddr; pt_entry_t *pte; vm_page_t free = NULL; int anyvalid; /* * Perform an unsynchronized read. This is, however, safe. */ if (pmap->pm_stats.resident_count == 0) return; anyvalid = 0; vm_page_lock_queues(); sched_pin(); PMAP_LOCK(pmap); /* * special handling of removing one page. a very * common operation and easy to short circuit some * code. */ if ((sva + PAGE_SIZE == eva) && ((pmap->pm_pdir[(sva >> PDRSHIFT)] & PG_PS) == 0)) { pmap_remove_page(pmap, sva, &free); goto out; } for (; sva < eva; sva = pdnxt) { unsigned pdirindex; /* * Calculate index for next page table. */ pdnxt = (sva + NBPDR) & ~PDRMASK; if (pmap->pm_stats.resident_count == 0) break; pdirindex = sva >> PDRSHIFT; ptpaddr = pmap->pm_pdir[pdirindex]; /* * Weed out invalid mappings. Note: we assume that the page * directory table is always allocated, and in kernel virtual. */ if (ptpaddr == 0) continue; /* * Check for large page. */ if ((ptpaddr & PG_PS) != 0) { pmap->pm_pdir[pdirindex] = 0; pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; anyvalid = 1; continue; } /* * Limit our scan to either the end of the va represented * by the current page table page, or to the end of the * range being removed. */ if (pdnxt > eva) pdnxt = eva; for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++, sva += PAGE_SIZE) { if (*pte == 0) continue; /* * The TLB entry for a PG_G mapping is invalidated * by pmap_remove_pte(). */ if ((*pte & PG_G) == 0) anyvalid = 1; if (pmap_remove_pte(pmap, pte, sva, &free)) break; } } out: sched_unpin(); if (anyvalid) pmap_invalidate_all(pmap); vm_page_unlock_queues(); PMAP_UNLOCK(pmap); pmap_free_zero_pages(free); } /* * Routine: pmap_remove_all * Function: * Removes this physical page from * all physical maps in which it resides. * Reflects back modify bits to the pager. * * Notes: * Original versions of this routine were very * inefficient because they iteratively called * pmap_remove (slow...) */ void pmap_remove_all(vm_page_t m) { pv_entry_t pv; pmap_t pmap; pt_entry_t *pte, tpte; vm_page_t free; #if defined(PMAP_DIAGNOSTIC) /* * XXX This makes pmap_remove_all() illegal for non-managed pages! */ if (m->flags & PG_FICTITIOUS) { panic("pmap_remove_all: illegal for unmanaged page, va: 0x%x", VM_PAGE_TO_PHYS(m)); } #endif mtx_assert(&vm_page_queue_mtx, MA_OWNED); sched_pin(); while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pmap->pm_stats.resident_count--; pte = pmap_pte_quick(pmap, pv->pv_va); tpte = pte_load_clear(pte); if (tpte & PG_W) pmap->pm_stats.wired_count--; if (tpte & PG_A) vm_page_flag_set(m, PG_REFERENCED); /* * Update the vm_page_t clean and reference bits. */ if (tpte & PG_M) { KASSERT((tpte & PG_RW), ("pmap_remove_all: modified page not writable: va: %#x, pte: %#jx", pv->pv_va, (uintmax_t)tpte)); vm_page_dirty(m); } free = NULL; pmap_unuse_pt(pmap, pv->pv_va, &free); pmap_invalidate_page(pmap, pv->pv_va); pmap_free_zero_pages(free); TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); m->md.pv_list_count--; free_pv_entry(pmap, pv); PMAP_UNLOCK(pmap); } vm_page_flag_clear(m, PG_WRITEABLE); sched_unpin(); } /* * Set the physical protection on the * specified range of this map as requested. */ void pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) { vm_offset_t pdnxt; pd_entry_t ptpaddr; pt_entry_t *pte; int anychanged; if ((prot & VM_PROT_READ) == VM_PROT_NONE) { pmap_remove(pmap, sva, eva); return; } #ifdef PAE if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) == (VM_PROT_WRITE|VM_PROT_EXECUTE)) return; #else if (prot & VM_PROT_WRITE) return; #endif anychanged = 0; vm_page_lock_queues(); sched_pin(); PMAP_LOCK(pmap); for (; sva < eva; sva = pdnxt) { pt_entry_t obits, pbits; unsigned pdirindex; pdnxt = (sva + NBPDR) & ~PDRMASK; pdirindex = sva >> PDRSHIFT; ptpaddr = pmap->pm_pdir[pdirindex]; /* * Weed out invalid mappings. Note: we assume that the page * directory table is always allocated, and in kernel virtual. */ if (ptpaddr == 0) continue; /* * Check for large page. */ if ((ptpaddr & PG_PS) != 0) { if ((prot & VM_PROT_WRITE) == 0) pmap->pm_pdir[pdirindex] &= ~(PG_M|PG_RW); #ifdef PAE if ((prot & VM_PROT_EXECUTE) == 0) pmap->pm_pdir[pdirindex] |= pg_nx; #endif anychanged = 1; continue; } if (pdnxt > eva) pdnxt = eva; for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++, sva += PAGE_SIZE) { vm_page_t m; retry: /* * Regardless of whether a pte is 32 or 64 bits in * size, PG_RW, PG_A, and PG_M are among the least * significant 32 bits. */ obits = pbits = *pte; if ((pbits & PG_V) == 0) continue; if (pbits & PG_MANAGED) { m = NULL; if (pbits & PG_A) { m = PHYS_TO_VM_PAGE(pbits & PG_FRAME); vm_page_flag_set(m, PG_REFERENCED); pbits &= ~PG_A; } if ((pbits & PG_M) != 0) { if (m == NULL) m = PHYS_TO_VM_PAGE(pbits & PG_FRAME); vm_page_dirty(m); } } if ((prot & VM_PROT_WRITE) == 0) pbits &= ~(PG_RW | PG_M); #ifdef PAE if ((prot & VM_PROT_EXECUTE) == 0) pbits |= pg_nx; #endif if (pbits != obits) { #ifdef PAE if (!atomic_cmpset_64(pte, obits, pbits)) goto retry; #else if (!atomic_cmpset_int((u_int *)pte, obits, pbits)) goto retry; #endif if (obits & PG_G) pmap_invalidate_page(pmap, sva); else anychanged = 1; } } } sched_unpin(); if (anychanged) pmap_invalidate_all(pmap); vm_page_unlock_queues(); PMAP_UNLOCK(pmap); } /* * Insert the given physical page (p) at * the specified virtual address (v) in the * target physical map with the protection requested. * * If specified, the page will be wired down, meaning * that the related pte can not be reclaimed. * * NB: This is the only routine which MAY NOT lazy-evaluate * or lose information. That is, this routine must actually * insert this page into the given map NOW. */ void pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, boolean_t wired) { vm_paddr_t pa; pd_entry_t *pde; pt_entry_t *pte; vm_paddr_t opa; pt_entry_t origpte, newpte; vm_page_t mpte, om; boolean_t invlva; va = trunc_page(va); #ifdef PMAP_DIAGNOSTIC if (va > VM_MAX_KERNEL_ADDRESS) panic("pmap_enter: toobig"); if ((va >= UPT_MIN_ADDRESS) && (va < UPT_MAX_ADDRESS)) panic("pmap_enter: invalid to pmap_enter page table pages (va: 0x%x)", va); #endif mpte = NULL; vm_page_lock_queues(); PMAP_LOCK(pmap); sched_pin(); /* * In the case that a page table page is not * resident, we are creating it here. */ if (va < VM_MAXUSER_ADDRESS) { mpte = pmap_allocpte(pmap, va, M_WAITOK); } #if 0 && defined(PMAP_DIAGNOSTIC) else { pd_entry_t *pdeaddr = pmap_pde(pmap, va); origpte = *pdeaddr; if ((origpte & PG_V) == 0) { panic("pmap_enter: invalid kernel page table page, pdir=%p, pde=%p, va=%p\n", pmap->pm_pdir[PTDPTDI], origpte, va); } } #endif pde = pmap_pde(pmap, va); if ((*pde & PG_PS) != 0) panic("pmap_enter: attempted pmap_enter on 4MB page"); pte = pmap_pte_quick(pmap, va); /* * Page Directory table entry not valid, we need a new PT page */ if (pte == NULL) { panic("pmap_enter: invalid page directory pdir=%#jx, va=%#x\n", (uintmax_t)pmap->pm_pdir[PTDPTDI], va); } pa = VM_PAGE_TO_PHYS(m); om = NULL; origpte = *pte; opa = origpte & PG_FRAME; /* * Mapping has not changed, must be protection or wiring change. */ if (origpte && (opa == pa)) { /* * Wiring change, just update stats. We don't worry about * wiring PT pages as they remain resident as long as there * are valid mappings in them. Hence, if a user page is wired, * the PT page will be also. */ if (wired && ((origpte & PG_W) == 0)) pmap->pm_stats.wired_count++; else if (!wired && (origpte & PG_W)) pmap->pm_stats.wired_count--; /* * Remove extra pte reference */ if (mpte) mpte->wire_count--; /* * We might be turning off write access to the page, * so we go ahead and sense modify status. */ if (origpte & PG_MANAGED) { om = m; pa |= PG_MANAGED; } goto validate; } /* * Mapping has changed, invalidate old range and fall through to * handle validating new mapping. */ if (opa) { if (origpte & PG_W) pmap->pm_stats.wired_count--; if (origpte & PG_MANAGED) { om = PHYS_TO_VM_PAGE(opa); pmap_remove_entry(pmap, om, va); } if (mpte != NULL) { mpte->wire_count--; KASSERT(mpte->wire_count > 0, ("pmap_enter: missing reference to page table page," " va: 0x%x", va)); } } else pmap->pm_stats.resident_count++; /* * Enter on the PV list if part of our managed memory. */ if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0) { KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva, ("pmap_enter: managed mapping within the clean submap")); pmap_insert_entry(pmap, va, m); pa |= PG_MANAGED; } /* * Increment counters */ if (wired) pmap->pm_stats.wired_count++; validate: /* * Now validate mapping with desired protection/wiring. */ newpte = (pt_entry_t)(pa | PG_V); if ((prot & VM_PROT_WRITE) != 0) { newpte |= PG_RW; vm_page_flag_set(m, PG_WRITEABLE); } #ifdef PAE if ((prot & VM_PROT_EXECUTE) == 0) newpte |= pg_nx; #endif if (wired) newpte |= PG_W; if (va < VM_MAXUSER_ADDRESS) newpte |= PG_U; if (pmap == kernel_pmap) newpte |= pgeflag; /* * if the mapping or permission bits are different, we need * to update the pte. */ if ((origpte & ~(PG_M|PG_A)) != newpte) { if (origpte & PG_V) { invlva = FALSE; origpte = pte_load_store(pte, newpte | PG_A); if (origpte & PG_A) { if (origpte & PG_MANAGED) vm_page_flag_set(om, PG_REFERENCED); if (opa != VM_PAGE_TO_PHYS(m)) invlva = TRUE; #ifdef PAE if ((origpte & PG_NX) == 0 && (newpte & PG_NX) != 0) invlva = TRUE; #endif } if (origpte & PG_M) { KASSERT((origpte & PG_RW), ("pmap_enter: modified page not writable: va: %#x, pte: %#jx", va, (uintmax_t)origpte)); if ((origpte & PG_MANAGED) != 0) vm_page_dirty(om); if ((prot & VM_PROT_WRITE) == 0) invlva = TRUE; } if (invlva) pmap_invalidate_page(pmap, va); } else pte_store(pte, newpte | PG_A); } sched_unpin(); vm_page_unlock_queues(); PMAP_UNLOCK(pmap); } /* * Maps a sequence of resident pages belonging to the same object. * The sequence begins with the given page m_start. This page is * mapped at the given virtual address start. Each subsequent page is * mapped at a virtual address that is offset from start by the same * amount as the page is offset from m_start within the object. The * last page in the sequence is the page with the largest offset from * m_start that can be mapped at a virtual address less than the given * virtual address end. Not every virtual page between start and end * is mapped; only those for which a resident page exists with the * corresponding offset from m_start are mapped. */ void pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, vm_page_t m_start, vm_prot_t prot) { vm_page_t m, mpte; vm_pindex_t diff, psize; VM_OBJECT_LOCK_ASSERT(m_start->object, MA_OWNED); psize = atop(end - start); mpte = NULL; m = m_start; PMAP_LOCK(pmap); while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { mpte = pmap_enter_quick_locked(pmap, start + ptoa(diff), m, prot, mpte); m = TAILQ_NEXT(m, listq); } PMAP_UNLOCK(pmap); } /* * this code makes some *MAJOR* assumptions: * 1. Current pmap & pmap exists. * 2. Not wired. * 3. Read access. * 4. No page table pages. * but is *MUCH* faster than pmap_enter... */ void pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) { PMAP_LOCK(pmap); (void) pmap_enter_quick_locked(pmap, va, m, prot, NULL); PMAP_UNLOCK(pmap); } static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, vm_page_t mpte) { pt_entry_t *pte; vm_paddr_t pa; vm_page_t free; KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva || (m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) != 0, ("pmap_enter_quick_locked: managed mapping within the clean submap")); mtx_assert(&vm_page_queue_mtx, MA_OWNED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * In the case that a page table page is not * resident, we are creating it here. */ if (va < VM_MAXUSER_ADDRESS) { unsigned ptepindex; pd_entry_t ptepa; /* * Calculate pagetable page index */ ptepindex = va >> PDRSHIFT; if (mpte && (mpte->pindex == ptepindex)) { mpte->wire_count++; } else { /* * Get the page directory entry */ ptepa = pmap->pm_pdir[ptepindex]; /* * If the page table page is mapped, we just increment * the hold count, and activate it. */ if (ptepa) { if (ptepa & PG_PS) panic("pmap_enter_quick: unexpected mapping into 4MB page"); mpte = PHYS_TO_VM_PAGE(ptepa & PG_FRAME); mpte->wire_count++; } else { mpte = _pmap_allocpte(pmap, ptepindex, M_NOWAIT); if (mpte == NULL) return (mpte); } } } else { mpte = NULL; } /* * This call to vtopte makes the assumption that we are * entering the page into the current pmap. In order to support * quick entry into any pmap, one would likely use pmap_pte_quick. * But that isn't as quick as vtopte. */ pte = vtopte(va); if (*pte) { if (mpte != NULL) { mpte->wire_count--; mpte = NULL; } return (mpte); } /* * Enter on the PV list if part of our managed memory. */ if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0 && !pmap_try_insert_pv_entry(pmap, va, m)) { if (mpte != NULL) { free = NULL; if (pmap_unwire_pte_hold(pmap, mpte, &free)) { pmap_invalidate_page(pmap, va); pmap_free_zero_pages(free); } mpte = NULL; } return (mpte); } /* * Increment counters */ pmap->pm_stats.resident_count++; pa = VM_PAGE_TO_PHYS(m); #ifdef PAE if ((prot & VM_PROT_EXECUTE) == 0) pa |= pg_nx; #endif /* * Now validate mapping with RO protection */ if (m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) pte_store(pte, pa | PG_V | PG_U); else pte_store(pte, pa | PG_V | PG_U | PG_MANAGED); return mpte; } /* * Make a temporary mapping for a physical address. This is only intended * to be used for panic dumps. */ void * pmap_kenter_temporary(vm_paddr_t pa, int i) { vm_offset_t va; va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE); pmap_kenter(va, pa); invlpg(va); return ((void *)crashdumpmap); } /* * This code maps large physical mmap regions into the * processor address space. Note that some shortcuts * are taken, but the code works. */ void pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, vm_pindex_t pindex, vm_size_t size) { vm_page_t p; VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); KASSERT(object->type == OBJT_DEVICE, ("pmap_object_init_pt: non-device object")); if (pseflag && ((addr & (NBPDR - 1)) == 0) && ((size & (NBPDR - 1)) == 0)) { int i; vm_page_t m[1]; unsigned int ptepindex; int npdes; pd_entry_t ptepa; PMAP_LOCK(pmap); if (pmap->pm_pdir[ptepindex = (addr >> PDRSHIFT)]) goto out; PMAP_UNLOCK(pmap); retry: p = vm_page_lookup(object, pindex); if (p != NULL) { if (vm_page_sleep_if_busy(p, FALSE, "init4p")) goto retry; } else { p = vm_page_alloc(object, pindex, VM_ALLOC_NORMAL); if (p == NULL) return; m[0] = p; if (vm_pager_get_pages(object, m, 1, 0) != VM_PAGER_OK) { vm_page_lock_queues(); vm_page_free(p); vm_page_unlock_queues(); return; } p = vm_page_lookup(object, pindex); vm_page_lock_queues(); vm_page_wakeup(p); vm_page_unlock_queues(); } ptepa = VM_PAGE_TO_PHYS(p); if (ptepa & (NBPDR - 1)) return; p->valid = VM_PAGE_BITS_ALL; PMAP_LOCK(pmap); pmap->pm_stats.resident_count += size >> PAGE_SHIFT; npdes = size >> PDRSHIFT; for(i = 0; i < npdes; i++) { pde_store(&pmap->pm_pdir[ptepindex], ptepa | PG_U | PG_RW | PG_V | PG_PS); ptepa += NBPDR; ptepindex += 1; } pmap_invalidate_all(pmap); out: PMAP_UNLOCK(pmap); } } /* * Routine: pmap_change_wiring * Function: Change the wiring attribute for a map/virtual-address * pair. * In/out conditions: * The mapping must already exist in the pmap. */ void pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired) { pt_entry_t *pte; PMAP_LOCK(pmap); pte = pmap_pte(pmap, va); if (wired && !pmap_pte_w(pte)) pmap->pm_stats.wired_count++; else if (!wired && pmap_pte_w(pte)) pmap->pm_stats.wired_count--; /* * Wiring is not a hardware characteristic so there is no need to * invalidate TLB. */ pmap_pte_set_w(pte, wired); pmap_pte_release(pte); PMAP_UNLOCK(pmap); } /* * Copy the range specified by src_addr/len * from the source map to the range dst_addr/len * in the destination map. * * This routine is only advisory and need not do anything. */ void pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, vm_offset_t src_addr) { vm_page_t free; vm_offset_t addr; vm_offset_t end_addr = src_addr + len; vm_offset_t pdnxt; if (dst_addr != src_addr) return; if (!pmap_is_current(src_pmap)) return; vm_page_lock_queues(); if (dst_pmap < src_pmap) { PMAP_LOCK(dst_pmap); PMAP_LOCK(src_pmap); } else { PMAP_LOCK(src_pmap); PMAP_LOCK(dst_pmap); } sched_pin(); for (addr = src_addr; addr < end_addr; addr = pdnxt) { pt_entry_t *src_pte, *dst_pte; vm_page_t dstmpte, srcmpte; pd_entry_t srcptepaddr; unsigned ptepindex; if (addr >= UPT_MIN_ADDRESS) panic("pmap_copy: invalid to pmap_copy page tables"); pdnxt = (addr + NBPDR) & ~PDRMASK; ptepindex = addr >> PDRSHIFT; srcptepaddr = src_pmap->pm_pdir[ptepindex]; if (srcptepaddr == 0) continue; if (srcptepaddr & PG_PS) { if (dst_pmap->pm_pdir[ptepindex] == 0) { dst_pmap->pm_pdir[ptepindex] = srcptepaddr & ~PG_W; dst_pmap->pm_stats.resident_count += NBPDR / PAGE_SIZE; } continue; } srcmpte = PHYS_TO_VM_PAGE(srcptepaddr & PG_FRAME); if (srcmpte->wire_count == 0) panic("pmap_copy: source page table page is unused"); if (pdnxt > end_addr) pdnxt = end_addr; src_pte = vtopte(addr); while (addr < pdnxt) { pt_entry_t ptetemp; ptetemp = *src_pte; /* * we only virtual copy managed pages */ if ((ptetemp & PG_MANAGED) != 0) { dstmpte = pmap_allocpte(dst_pmap, addr, M_NOWAIT); if (dstmpte == NULL) break; dst_pte = pmap_pte_quick(dst_pmap, addr); if (*dst_pte == 0 && pmap_try_insert_pv_entry(dst_pmap, addr, PHYS_TO_VM_PAGE(ptetemp & PG_FRAME))) { /* * Clear the wired, modified, and * accessed (referenced) bits * during the copy. */ *dst_pte = ptetemp & ~(PG_W | PG_M | PG_A); dst_pmap->pm_stats.resident_count++; } else { free = NULL; if (pmap_unwire_pte_hold( dst_pmap, dstmpte, &free)) { pmap_invalidate_page(dst_pmap, addr); pmap_free_zero_pages(free); } } if (dstmpte->wire_count >= srcmpte->wire_count) break; } addr += PAGE_SIZE; src_pte++; } } sched_unpin(); vm_page_unlock_queues(); PMAP_UNLOCK(src_pmap); PMAP_UNLOCK(dst_pmap); } static __inline void pagezero(void *page) { #if defined(I686_CPU) if (cpu_class == CPUCLASS_686) { #if defined(CPU_ENABLE_SSE) if (cpu_feature & CPUID_SSE2) sse2_pagezero(page); else #endif i686_pagezero(page); } else #endif bzero(page, PAGE_SIZE); } /* * pmap_zero_page zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. */ void pmap_zero_page(vm_page_t m) { struct sysmaps *sysmaps; sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)]; mtx_lock(&sysmaps->lock); if (*sysmaps->CMAP2) panic("pmap_zero_page: CMAP2 busy"); sched_pin(); *sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M; invlcaddr(sysmaps->CADDR2); pagezero(sysmaps->CADDR2); *sysmaps->CMAP2 = 0; sched_unpin(); mtx_unlock(&sysmaps->lock); } /* * pmap_zero_page_area zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. * * off and size may not cover an area beyond a single hardware page. */ void pmap_zero_page_area(vm_page_t m, int off, int size) { struct sysmaps *sysmaps; sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)]; mtx_lock(&sysmaps->lock); if (*sysmaps->CMAP2) panic("pmap_zero_page: CMAP2 busy"); sched_pin(); *sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M; invlcaddr(sysmaps->CADDR2); if (off == 0 && size == PAGE_SIZE) pagezero(sysmaps->CADDR2); else bzero((char *)sysmaps->CADDR2 + off, size); *sysmaps->CMAP2 = 0; sched_unpin(); mtx_unlock(&sysmaps->lock); } /* * pmap_zero_page_idle zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. This * is intended to be called from the vm_pagezero process only and * outside of Giant. */ void pmap_zero_page_idle(vm_page_t m) { if (*CMAP3) panic("pmap_zero_page: CMAP3 busy"); sched_pin(); *CMAP3 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M; invlcaddr(CADDR3); pagezero(CADDR3); *CMAP3 = 0; sched_unpin(); } /* * pmap_copy_page copies the specified (machine independent) * page by mapping the page into virtual memory and using * bcopy to copy the page, one machine dependent page at a * time. */ void pmap_copy_page(vm_page_t src, vm_page_t dst) { struct sysmaps *sysmaps; sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)]; mtx_lock(&sysmaps->lock); if (*sysmaps->CMAP1) panic("pmap_copy_page: CMAP1 busy"); if (*sysmaps->CMAP2) panic("pmap_copy_page: CMAP2 busy"); sched_pin(); invlpg((u_int)sysmaps->CADDR1); invlpg((u_int)sysmaps->CADDR2); *sysmaps->CMAP1 = PG_V | VM_PAGE_TO_PHYS(src) | PG_A; *sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(dst) | PG_A | PG_M; bcopy(sysmaps->CADDR1, sysmaps->CADDR2, PAGE_SIZE); *sysmaps->CMAP1 = 0; *sysmaps->CMAP2 = 0; sched_unpin(); mtx_unlock(&sysmaps->lock); } /* * Returns true if the pmap's pv is one of the first * 16 pvs linked to from this page. This count may * be changed upwards or downwards in the future; it * is only necessary that true be returned for a small * subset of pmaps for proper page aging. */ boolean_t pmap_page_exists_quick(pmap_t pmap, vm_page_t m) { pv_entry_t pv; int loops = 0; if (m->flags & PG_FICTITIOUS) return FALSE; mtx_assert(&vm_page_queue_mtx, MA_OWNED); TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { if (PV_PMAP(pv) == pmap) { return TRUE; } loops++; if (loops >= 16) break; } return (FALSE); } /* * Remove all pages from specified address space * this aids process exit speeds. Also, this code * is special cased for current process only, but * can have the more generic (and slightly slower) * mode enabled. This is much faster than pmap_remove * in the case of running down an entire address space. */ void pmap_remove_pages(pmap_t pmap) { pt_entry_t *pte, tpte; vm_page_t m, free = NULL; pv_entry_t pv; struct pv_chunk *pc, *npc; int field, idx; int32_t bit; uint32_t inuse, bitmask; int allfree; if (pmap != vmspace_pmap(curthread->td_proc->p_vmspace)) { printf("warning: pmap_remove_pages called with non-current pmap\n"); return; } vm_page_lock_queues(); PMAP_LOCK(pmap); sched_pin(); TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { allfree = 1; for (field = 0; field < _NPCM; field++) { inuse = (~(pc->pc_map[field])) & pc_freemask[field]; while (inuse != 0) { bit = bsfl(inuse); bitmask = 1UL << bit; idx = field * 32 + bit; pv = &pc->pc_pventry[idx]; inuse &= ~bitmask; pte = vtopte(pv->pv_va); tpte = *pte; if (tpte == 0) { printf( "TPTE at %p IS ZERO @ VA %08x\n", pte, pv->pv_va); panic("bad pte"); } /* * We cannot remove wired pages from a process' mapping at this time */ if (tpte & PG_W) { allfree = 0; continue; } m = PHYS_TO_VM_PAGE(tpte & PG_FRAME); KASSERT(m->phys_addr == (tpte & PG_FRAME), ("vm_page_t %p phys_addr mismatch %016jx %016jx", m, (uintmax_t)m->phys_addr, (uintmax_t)tpte)); KASSERT(m < &vm_page_array[vm_page_array_size], ("pmap_remove_pages: bad tpte %#jx", (uintmax_t)tpte)); pmap->pm_stats.resident_count--; pte_clear(pte); /* * Update the vm_page_t clean/reference bits. */ if (tpte & PG_M) vm_page_dirty(m); /* Mark free */ PV_STAT(pv_entry_frees++); PV_STAT(pv_entry_spare++); pv_entry_count--; pc->pc_map[field] |= bitmask; m->md.pv_list_count--; TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); if (TAILQ_EMPTY(&m->md.pv_list)) vm_page_flag_clear(m, PG_WRITEABLE); pmap_unuse_pt(pmap, pv->pv_va, &free); } } if (allfree) { PV_STAT(pv_entry_spare -= _NPCPV); PV_STAT(pc_chunk_count--); PV_STAT(pc_chunk_frees++); TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); m = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc)); pmap_qremove((vm_offset_t)pc, 1); vm_page_unwire(m, 0); vm_page_free(m); pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc); } } sched_unpin(); pmap_invalidate_all(pmap); vm_page_unlock_queues(); PMAP_UNLOCK(pmap); pmap_free_zero_pages(free); } /* * pmap_is_modified: * * Return whether or not the specified physical page was modified * in any physical maps. */ boolean_t pmap_is_modified(vm_page_t m) { pv_entry_t pv; pt_entry_t *pte; pmap_t pmap; boolean_t rv; rv = FALSE; if (m->flags & PG_FICTITIOUS) return (rv); sched_pin(); mtx_assert(&vm_page_queue_mtx, MA_OWNED); TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte = pmap_pte_quick(pmap, pv->pv_va); rv = (*pte & PG_M) != 0; PMAP_UNLOCK(pmap); if (rv) break; } sched_unpin(); return (rv); } /* * pmap_is_prefaultable: * * Return whether or not the specified virtual address is elgible * for prefault. */ boolean_t pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) { pt_entry_t *pte; boolean_t rv; rv = FALSE; PMAP_LOCK(pmap); if (*pmap_pde(pmap, addr)) { pte = vtopte(addr); rv = *pte == 0; } PMAP_UNLOCK(pmap); return (rv); } /* * Clear the write and modified bits in each of the given page's mappings. */ void pmap_remove_write(vm_page_t m) { pv_entry_t pv; pmap_t pmap; pt_entry_t oldpte, *pte; mtx_assert(&vm_page_queue_mtx, MA_OWNED); if ((m->flags & PG_FICTITIOUS) != 0 || (m->flags & PG_WRITEABLE) == 0) return; sched_pin(); TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte = pmap_pte_quick(pmap, pv->pv_va); retry: oldpte = *pte; if ((oldpte & PG_RW) != 0) { /* * Regardless of whether a pte is 32 or 64 bits * in size, PG_RW and PG_M are among the least * significant 32 bits. */ if (!atomic_cmpset_int((u_int *)pte, oldpte, oldpte & ~(PG_RW | PG_M))) goto retry; if ((oldpte & PG_M) != 0) vm_page_dirty(m); pmap_invalidate_page(pmap, pv->pv_va); } PMAP_UNLOCK(pmap); } vm_page_flag_clear(m, PG_WRITEABLE); sched_unpin(); } /* * pmap_ts_referenced: * * Return a count of reference bits for a page, clearing those bits. * It is not necessary for every reference bit to be cleared, but it * is necessary that 0 only be returned when there are truly no * reference bits set. * * XXX: The exact number of bits to check and clear is a matter that * should be tested and standardized at some point in the future for * optimal aging of shared pages. */ int pmap_ts_referenced(vm_page_t m) { pv_entry_t pv, pvf, pvn; pmap_t pmap; pt_entry_t *pte; int rtval = 0; if (m->flags & PG_FICTITIOUS) return (rtval); sched_pin(); mtx_assert(&vm_page_queue_mtx, MA_OWNED); if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { pvf = pv; do { pvn = TAILQ_NEXT(pv, pv_list); TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte = pmap_pte_quick(pmap, pv->pv_va); if ((*pte & PG_A) != 0) { atomic_clear_int((u_int *)pte, PG_A); pmap_invalidate_page(pmap, pv->pv_va); rtval++; if (rtval > 4) pvn = NULL; } PMAP_UNLOCK(pmap); } while ((pv = pvn) != NULL && pv != pvf); } sched_unpin(); return (rtval); } /* * Clear the modify bits on the specified physical page. */ void pmap_clear_modify(vm_page_t m) { pv_entry_t pv; pmap_t pmap; pt_entry_t *pte; mtx_assert(&vm_page_queue_mtx, MA_OWNED); if ((m->flags & PG_FICTITIOUS) != 0) return; sched_pin(); TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte = pmap_pte_quick(pmap, pv->pv_va); if ((*pte & PG_M) != 0) { /* * Regardless of whether a pte is 32 or 64 bits * in size, PG_M is among the least significant * 32 bits. */ atomic_clear_int((u_int *)pte, PG_M); pmap_invalidate_page(pmap, pv->pv_va); } PMAP_UNLOCK(pmap); } sched_unpin(); } /* * pmap_clear_reference: * * Clear the reference bit on the specified physical page. */ void pmap_clear_reference(vm_page_t m) { pv_entry_t pv; pmap_t pmap; pt_entry_t *pte; mtx_assert(&vm_page_queue_mtx, MA_OWNED); if ((m->flags & PG_FICTITIOUS) != 0) return; sched_pin(); TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte = pmap_pte_quick(pmap, pv->pv_va); if ((*pte & PG_A) != 0) { /* * Regardless of whether a pte is 32 or 64 bits * in size, PG_A is among the least significant * 32 bits. */ atomic_clear_int((u_int *)pte, PG_A); pmap_invalidate_page(pmap, pv->pv_va); } PMAP_UNLOCK(pmap); } sched_unpin(); } /* * Miscellaneous support routines follow */ /* * Map a set of physical memory pages into the kernel virtual * address space. Return a pointer to where it is mapped. This * routine is intended to be used for mapping device memory, * NOT real memory. */ void * pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode) { vm_offset_t va, tmpva, offset; offset = pa & PAGE_MASK; size = roundup(offset + size, PAGE_SIZE); pa = pa & PG_FRAME; if (pa < KERNLOAD && pa + size <= KERNLOAD) va = KERNBASE + pa; else va = kmem_alloc_nofault(kernel_map, size); if (!va) panic("pmap_mapdev: Couldn't alloc kernel virtual memory"); for (tmpva = va; size > 0; ) { pmap_kenter_attr(tmpva, pa, mode); size -= PAGE_SIZE; tmpva += PAGE_SIZE; pa += PAGE_SIZE; } pmap_invalidate_range(kernel_pmap, va, tmpva); pmap_invalidate_cache(); return ((void *)(va + offset)); } void * pmap_mapdev(vm_paddr_t pa, vm_size_t size) { return (pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE)); } void * pmap_mapbios(vm_paddr_t pa, vm_size_t size) { return (pmap_mapdev_attr(pa, size, PAT_WRITE_BACK)); } void pmap_unmapdev(vm_offset_t va, vm_size_t size) { vm_offset_t base, offset, tmpva; if (va >= KERNBASE && va + size <= KERNBASE + KERNLOAD) return; base = trunc_page(va); offset = va & PAGE_MASK; size = roundup(offset + size, PAGE_SIZE); for (tmpva = base; tmpva < (base + size); tmpva += PAGE_SIZE) pmap_kremove(tmpva); pmap_invalidate_range(kernel_pmap, va, tmpva); kmem_free(kernel_map, base, size); } int pmap_change_attr(va, size, mode) vm_offset_t va; vm_size_t size; int mode; { vm_offset_t base, offset, tmpva; pt_entry_t *pte; u_int opte, npte; pd_entry_t *pde; base = trunc_page(va); offset = va & PAGE_MASK; size = roundup(offset + size, PAGE_SIZE); /* Only supported on kernel virtual addresses. */ if (base <= VM_MAXUSER_ADDRESS) return (EINVAL); /* 4MB pages and pages that aren't mapped aren't supported. */ for (tmpva = base; tmpva < (base + size); tmpva += PAGE_SIZE) { pde = pmap_pde(kernel_pmap, tmpva); if (*pde & PG_PS) return (EINVAL); if (*pde == 0) return (EINVAL); pte = vtopte(va); if (*pte == 0) return (EINVAL); } /* * Ok, all the pages exist and are 4k, so run through them updating * their cache mode. */ for (tmpva = base; size > 0; ) { pte = vtopte(tmpva); /* * The cache mode bits are all in the low 32-bits of the * PTE, so we can just spin on updating the low 32-bits. */ do { opte = *(u_int *)pte; npte = opte & ~(PG_PTE_PAT | PG_NC_PCD | PG_NC_PWT); npte |= pmap_cache_bits(mode, 0); } while (npte != opte && !atomic_cmpset_int((u_int *)pte, opte, npte)); tmpva += PAGE_SIZE; size -= PAGE_SIZE; } /* * Flush CPU caches to make sure any data isn't cached that shouldn't * be, etc. */ pmap_invalidate_range(kernel_pmap, base, tmpva); pmap_invalidate_cache(); return (0); } /* * perform the pmap work for mincore */ int pmap_mincore(pmap_t pmap, vm_offset_t addr) { pt_entry_t *ptep, pte; vm_page_t m; int val = 0; PMAP_LOCK(pmap); ptep = pmap_pte(pmap, addr); pte = (ptep != NULL) ? *ptep : 0; pmap_pte_release(ptep); PMAP_UNLOCK(pmap); if (pte != 0) { vm_paddr_t pa; val = MINCORE_INCORE; if ((pte & PG_MANAGED) == 0) return val; pa = pte & PG_FRAME; m = PHYS_TO_VM_PAGE(pa); /* * Modified by us */ if (pte & PG_M) val |= MINCORE_MODIFIED|MINCORE_MODIFIED_OTHER; else { /* * Modified by someone else */ vm_page_lock_queues(); if (m->dirty || pmap_is_modified(m)) val |= MINCORE_MODIFIED_OTHER; vm_page_unlock_queues(); } /* * Referenced by us */ if (pte & PG_A) val |= MINCORE_REFERENCED|MINCORE_REFERENCED_OTHER; else { /* * Referenced by someone else */ vm_page_lock_queues(); if ((m->flags & PG_REFERENCED) || pmap_ts_referenced(m)) { val |= MINCORE_REFERENCED_OTHER; vm_page_flag_set(m, PG_REFERENCED); } vm_page_unlock_queues(); } } return val; } void pmap_activate(struct thread *td) { pmap_t pmap, oldpmap; u_int32_t cr3; critical_enter(); pmap = vmspace_pmap(td->td_proc->p_vmspace); oldpmap = PCPU_GET(curpmap); #if defined(SMP) atomic_clear_int(&oldpmap->pm_active, PCPU_GET(cpumask)); atomic_set_int(&pmap->pm_active, PCPU_GET(cpumask)); #else oldpmap->pm_active &= ~1; pmap->pm_active |= 1; #endif #ifdef PAE cr3 = vtophys(pmap->pm_pdpt); #else cr3 = vtophys(pmap->pm_pdir); #endif /* * pmap_activate is for the current thread on the current cpu */ td->td_pcb->pcb_cr3 = cr3; load_cr3(cr3); PCPU_SET(curpmap, pmap); critical_exit(); } vm_offset_t pmap_addr_hint(vm_object_t obj, vm_offset_t addr, vm_size_t size) { if ((obj == NULL) || (size < NBPDR) || (obj->type != OBJT_DEVICE)) { return addr; } addr = (addr + PDRMASK) & ~PDRMASK; return addr; } #if defined(PMAP_DEBUG) pmap_pid_dump(int pid) { pmap_t pmap; struct proc *p; int npte = 0; int index; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { if (p->p_pid != pid) continue; if (p->p_vmspace) { int i,j; index = 0; pmap = vmspace_pmap(p->p_vmspace); for (i = 0; i < NPDEPTD; i++) { pd_entry_t *pde; pt_entry_t *pte; vm_offset_t base = i << PDRSHIFT; pde = &pmap->pm_pdir[i]; if (pde && pmap_pde_v(pde)) { for (j = 0; j < NPTEPG; j++) { vm_offset_t va = base + (j << PAGE_SHIFT); if (va >= (vm_offset_t) VM_MIN_KERNEL_ADDRESS) { if (index) { index = 0; printf("\n"); } sx_sunlock(&allproc_lock); return npte; } pte = pmap_pte(pmap, va); if (pte && pmap_pte_v(pte)) { pt_entry_t pa; vm_page_t m; pa = *pte; m = PHYS_TO_VM_PAGE(pa & PG_FRAME); printf("va: 0x%x, pt: 0x%x, h: %d, w: %d, f: 0x%x", va, pa, m->hold_count, m->wire_count, m->flags); npte++; index++; if (index >= 2) { index = 0; printf("\n"); } else { printf(" "); } } } } } } } sx_sunlock(&allproc_lock); return npte; } #endif #if defined(DEBUG) static void pads(pmap_t pm); void pmap_pvdump(vm_offset_t pa); /* print address space of pmap*/ static void pads(pmap_t pm) { int i, j; vm_paddr_t va; pt_entry_t *ptep; if (pm == kernel_pmap) return; for (i = 0; i < NPDEPTD; i++) if (pm->pm_pdir[i]) for (j = 0; j < NPTEPG; j++) { va = (i << PDRSHIFT) + (j << PAGE_SHIFT); if (pm == kernel_pmap && va < KERNBASE) continue; if (pm != kernel_pmap && va > UPT_MAX_ADDRESS) continue; ptep = pmap_pte(pm, va); if (pmap_pte_v(ptep)) printf("%x:%x ", va, *ptep); }; } void pmap_pvdump(vm_paddr_t pa) { pv_entry_t pv; pmap_t pmap; vm_page_t m; printf("pa %x", pa); m = PHYS_TO_VM_PAGE(pa); TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { pmap = PV_PMAP(pv); printf(" -> pmap %p, va %x", (void *)pmap, pv->pv_va); pads(pmap); } printf(" "); } #endif Index: head/sys/i386/ibcs2/imgact_coff.c =================================================================== --- head/sys/i386/ibcs2/imgact_coff.c (revision 173360) +++ head/sys/i386/ibcs2/imgact_coff.c (revision 173361) @@ -1,495 +1,497 @@ /*- * Copyright (c) 1994 Sean Eric Fagan * Copyright (c) 1994 Søren Schmidt * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer * in this position and unchanged. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include MODULE_DEPEND(coff, ibcs2, 1, 1, 1); extern struct sysentvec ibcs2_svr3_sysvec; static int coff_load_file(struct thread *td, char *name); static int exec_coff_imgact(struct image_params *imgp); static int load_coff_section(struct vmspace *vmspace, struct vnode *vp, vm_offset_t offset, caddr_t vmaddr, size_t memsz, size_t filsz, vm_prot_t prot); static int load_coff_section(struct vmspace *vmspace, struct vnode *vp, vm_offset_t offset, caddr_t vmaddr, size_t memsz, size_t filsz, vm_prot_t prot) { size_t map_len; vm_offset_t map_offset; vm_offset_t map_addr; int error; unsigned char *data_buf = 0; size_t copy_len; map_offset = trunc_page(offset); map_addr = trunc_page((vm_offset_t)vmaddr); if (memsz > filsz) { /* * We have the stupid situation that * the section is longer than it is on file, * which means it has zero-filled areas, and * we have to work for it. Stupid iBCS! */ map_len = trunc_page(offset + filsz) - trunc_page(map_offset); } else { /* * The only stuff we care about is on disk, and we * don't care if we map in more than is really there. */ map_len = round_page(offset + filsz) - trunc_page(map_offset); } DPRINTF(("%s(%d): vm_mmap(&vmspace->vm_map, &0x%08lx, 0x%x, 0x%x, " "VM_PROT_ALL, MAP_PRIVATE | MAP_FIXED, OBJT_VNODE, vp, 0x%x)\n", __FILE__, __LINE__, map_addr, map_len, prot, map_offset)); if ((error = vm_mmap(&vmspace->vm_map, &map_addr, map_len, prot, VM_PROT_ALL, MAP_PRIVATE | MAP_FIXED, OBJT_VNODE, vp, map_offset)) != 0) return error; if (memsz == filsz) { /* We're done! */ return 0; } /* * Now we have screwball stuff, to accomodate stupid COFF. * We have to map the remaining bit of the file into the kernel's * memory map, allocate some anonymous memory, copy that last * bit into it, and then we're done. *sigh* * For clean-up reasons, we actally map in the file last. */ copy_len = (offset + filsz) - trunc_page(offset + filsz); map_addr = trunc_page((vm_offset_t)vmaddr + filsz); map_len = round_page((vm_offset_t)vmaddr + memsz) - map_addr; DPRINTF(("%s(%d): vm_map_find(&vmspace->vm_map, NULL, 0, &0x%08lx,0x%x, FALSE, VM_PROT_ALL, VM_PROT_ALL, 0)\n", __FILE__, __LINE__, map_addr, map_len)); if (map_len != 0) { error = vm_map_find(&vmspace->vm_map, NULL, 0, &map_addr, map_len, FALSE, VM_PROT_ALL, VM_PROT_ALL, 0); if (error) return error; } if ((error = vm_mmap(kernel_map, (vm_offset_t *) &data_buf, PAGE_SIZE, VM_PROT_READ, VM_PROT_READ, 0, OBJT_VNODE, vp, trunc_page(offset + filsz))) != 0) return error; error = copyout(data_buf, (caddr_t) map_addr, copy_len); if (vm_map_remove(kernel_map, (vm_offset_t) data_buf, (vm_offset_t) data_buf + PAGE_SIZE)) panic("load_coff_section vm_map_remove failed"); return error; } static int coff_load_file(struct thread *td, char *name) { struct proc *p = td->td_proc; struct vmspace *vmspace = p->p_vmspace; int error; struct nameidata nd; struct vnode *vp; struct vattr attr; struct filehdr *fhdr; struct aouthdr *ahdr; struct scnhdr *scns; char *ptr = 0; int nscns; unsigned long text_offset = 0, text_address = 0, text_size = 0; unsigned long data_offset = 0, data_address = 0, data_size = 0; unsigned long bss_size = 0; int i; NDINIT(&nd, LOOKUP, ISOPEN | LOCKLEAF | FOLLOW | SAVENAME, UIO_SYSSPACE, name, td); error = namei(&nd); if (error) return error; vp = nd.ni_vp; if (vp == NULL) return ENOEXEC; if (vp->v_writecount) { error = ETXTBSY; goto fail; } if ((error = VOP_GETATTR(vp, &attr, td->td_ucred, td)) != 0) goto fail; if ((vp->v_mount->mnt_flag & MNT_NOEXEC) || ((attr.va_mode & 0111) == 0) || (attr.va_type != VREG)) goto fail; if (attr.va_size == 0) { error = ENOEXEC; goto fail; } if ((error = VOP_ACCESS(vp, VEXEC, td->td_ucred, td)) != 0) goto fail; if ((error = VOP_OPEN(vp, FREAD, td->td_ucred, td, NULL)) != 0) goto fail; /* * Lose the lock on the vnode. It's no longer needed, and must not * exist for the pagefault paging to work below. */ VOP_UNLOCK(vp, 0, td); if ((error = vm_mmap(kernel_map, (vm_offset_t *) &ptr, PAGE_SIZE, VM_PROT_READ, VM_PROT_READ, 0, OBJT_VNODE, vp, 0)) != 0) goto unlocked_fail; fhdr = (struct filehdr *)ptr; if (fhdr->f_magic != I386_COFF) { error = ENOEXEC; goto dealloc_and_fail; } nscns = fhdr->f_nscns; if ((nscns * sizeof(struct scnhdr)) > PAGE_SIZE) { /* * XXX -- just fail. I'm so lazy. */ error = ENOEXEC; goto dealloc_and_fail; } ahdr = (struct aouthdr*)(ptr + sizeof(struct filehdr)); scns = (struct scnhdr*)(ptr + sizeof(struct filehdr) + sizeof(struct aouthdr)); for (i = 0; i < nscns; i++) { if (scns[i].s_flags & STYP_NOLOAD) continue; else if (scns[i].s_flags & STYP_TEXT) { text_address = scns[i].s_vaddr; text_size = scns[i].s_size; text_offset = scns[i].s_scnptr; } else if (scns[i].s_flags & STYP_DATA) { data_address = scns[i].s_vaddr; data_size = scns[i].s_size; data_offset = scns[i].s_scnptr; } else if (scns[i].s_flags & STYP_BSS) { bss_size = scns[i].s_size; } } if ((error = load_coff_section(vmspace, vp, text_offset, (caddr_t)(void *)(uintptr_t)text_address, text_size, text_size, VM_PROT_READ | VM_PROT_EXECUTE)) != 0) { goto dealloc_and_fail; } if ((error = load_coff_section(vmspace, vp, data_offset, (caddr_t)(void *)(uintptr_t)data_address, data_size + bss_size, data_size, VM_PROT_ALL)) != 0) { goto dealloc_and_fail; } error = 0; dealloc_and_fail: if (vm_map_remove(kernel_map, (vm_offset_t) ptr, (vm_offset_t) ptr + PAGE_SIZE)) panic("%s vm_map_remove failed", __func__); fail: VOP_UNLOCK(vp, 0, td); unlocked_fail: NDFREE(&nd, NDF_ONLY_PNBUF); vrele(nd.ni_vp); return error; } static int exec_coff_imgact(imgp) struct image_params *imgp; { const struct filehdr *fhdr = (const struct filehdr*)imgp->image_header; const struct aouthdr *ahdr; const struct scnhdr *scns; int i; struct vmspace *vmspace; int nscns; int error; unsigned long text_offset = 0, text_address = 0, text_size = 0; unsigned long data_offset = 0, data_address = 0, data_size = 0; unsigned long bss_size = 0; caddr_t hole; struct thread *td = curthread; if (fhdr->f_magic != I386_COFF || !(fhdr->f_flags & F_EXEC)) { DPRINTF(("%s(%d): return -1\n", __FILE__, __LINE__)); return -1; } nscns = fhdr->f_nscns; if ((nscns * sizeof(struct scnhdr)) > PAGE_SIZE) { /* * For now, return an error -- need to be able to * read in all of the section structures. */ DPRINTF(("%s(%d): return -1\n", __FILE__, __LINE__)); return -1; } ahdr = (const struct aouthdr*) ((const char*)(imgp->image_header) + sizeof(struct filehdr)); imgp->entry_addr = ahdr->entry; scns = (const struct scnhdr*) ((const char*)(imgp->image_header) + sizeof(struct filehdr) + sizeof(struct aouthdr)); VOP_UNLOCK(imgp->vp, 0, td); - exec_new_vmspace(imgp, &ibcs2_svr3_sysvec); + error = exec_new_vmspace(imgp, &ibcs2_svr3_sysvec); + if (error) + goto fail; vmspace = imgp->proc->p_vmspace; for (i = 0; i < nscns; i++) { DPRINTF(("i = %d, scns[i].s_name = %s, scns[i].s_vaddr = %08lx, " "scns[i].s_scnptr = %d\n", i, scns[i].s_name, scns[i].s_vaddr, scns[i].s_scnptr)); if (scns[i].s_flags & STYP_NOLOAD) { /* * A section that is not loaded, for whatever * reason. It takes precedance over other flag * bits... */ continue; } else if (scns[i].s_flags & STYP_TEXT) { text_address = scns[i].s_vaddr; text_size = scns[i].s_size; text_offset = scns[i].s_scnptr; } else if (scns[i].s_flags & STYP_DATA) { /* .data section */ data_address = scns[i].s_vaddr; data_size = scns[i].s_size; data_offset = scns[i].s_scnptr; } else if (scns[i].s_flags & STYP_BSS) { /* .bss section */ bss_size = scns[i].s_size; } else if (scns[i].s_flags & STYP_LIB) { char *buf = 0; int foff = trunc_page(scns[i].s_scnptr); int off = scns[i].s_scnptr - foff; int len = round_page(scns[i].s_size + PAGE_SIZE); int j; if ((error = vm_mmap(kernel_map, (vm_offset_t *) &buf, len, VM_PROT_READ, VM_PROT_READ, 0, OBJT_VNODE, imgp->vp, foff)) != 0) { error = ENOEXEC; goto fail; } if(scns[i].s_size) { char *libbuf; int emul_path_len = strlen(ibcs2_emul_path); libbuf = malloc(MAXPATHLEN + emul_path_len, M_TEMP, M_WAITOK); strcpy(libbuf, ibcs2_emul_path); for (j = off; j < scns[i].s_size + off;) { long stroff, nextoff; char *libname; nextoff = 4 * *(long *)(buf + j); stroff = 4 * *(long *)(buf + j + sizeof(long)); libname = buf + j + stroff; j += nextoff; DPRINTF(("%s(%d): shared library %s\n", __FILE__, __LINE__, libname)); strlcpy(&libbuf[emul_path_len], libname, MAXPATHLEN); /* XXXKSE only 1:1 in coff */ error = coff_load_file( FIRST_THREAD_IN_PROC(imgp->proc), libbuf); if (error) error = coff_load_file( FIRST_THREAD_IN_PROC(imgp->proc), libname); if (error) break; } free(libbuf, M_TEMP); } if (vm_map_remove(kernel_map, (vm_offset_t) buf, (vm_offset_t) buf + len)) panic("exec_coff_imgact vm_map_remove failed"); if (error) goto fail; } } /* * Map in .text now */ DPRINTF(("%s(%d): load_coff_section(vmspace, " "imgp->vp, %08lx, %08lx, 0x%x, 0x%x, 0x%x)\n", __FILE__, __LINE__, text_offset, text_address, text_size, text_size, VM_PROT_READ | VM_PROT_EXECUTE)); if ((error = load_coff_section(vmspace, imgp->vp, text_offset, (caddr_t)(void *)(uintptr_t)text_address, text_size, text_size, VM_PROT_READ | VM_PROT_EXECUTE)) != 0) { DPRINTF(("%s(%d): error = %d\n", __FILE__, __LINE__, error)); goto fail; } /* * Map in .data and .bss now */ DPRINTF(("%s(%d): load_coff_section(vmspace, " "imgp->vp, 0x%08lx, 0x%08lx, 0x%x, 0x%x, 0x%x)\n", __FILE__, __LINE__, data_offset, data_address, data_size + bss_size, data_size, VM_PROT_ALL)); if ((error = load_coff_section(vmspace, imgp->vp, data_offset, (caddr_t)(void *)(uintptr_t)data_address, data_size + bss_size, data_size, VM_PROT_ALL)) != 0) { DPRINTF(("%s(%d): error = %d\n", __FILE__, __LINE__, error)); goto fail; } imgp->interpreted = 0; imgp->proc->p_sysent = &ibcs2_svr3_sysvec; vmspace->vm_tsize = round_page(text_size) >> PAGE_SHIFT; vmspace->vm_dsize = round_page(data_size + bss_size) >> PAGE_SHIFT; vmspace->vm_taddr = (caddr_t)(void *)(uintptr_t)text_address; vmspace->vm_daddr = (caddr_t)(void *)(uintptr_t)data_address; hole = (caddr_t)trunc_page((vm_offset_t)vmspace->vm_daddr) + ctob(vmspace->vm_dsize); DPRINTF(("%s(%d): vm_map_find(&vmspace->vm_map, NULL, 0, &0x%08lx, PAGE_SIZE, FALSE, VM_PROT_ALL, VM_PROT_ALL, 0)\n", __FILE__, __LINE__, hole)); DPRINTF(("imgact: error = %d\n", error)); error = vm_map_find(&vmspace->vm_map, NULL, 0, (vm_offset_t *) &hole, PAGE_SIZE, FALSE, VM_PROT_ALL, VM_PROT_ALL, 0); DPRINTF(("IBCS2: start vm_dsize = 0x%x, vm_daddr = 0x%x end = 0x%x\n", ctob(vmspace->vm_dsize), vmspace->vm_daddr, ctob(vmspace->vm_dsize) + vmspace->vm_daddr )); DPRINTF(("%s(%d): returning successfully!\n", __FILE__, __LINE__)); fail: vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY, td); return error; } /* * Tell kern_execve.c about it, with a little help from the linker. */ static struct execsw coff_execsw = { exec_coff_imgact, "coff" }; EXEC_SET(coff, coff_execsw); Index: head/sys/i386/linux/imgact_linux.c =================================================================== --- head/sys/i386/linux/imgact_linux.c (revision 173360) +++ head/sys/i386/linux/imgact_linux.c (revision 173361) @@ -1,245 +1,247 @@ /*- * Copyright (c) 1994-1996 Søren Schmidt * All rights reserved. * * Based heavily on /sys/kern/imgact_aout.c which is: * Copyright (c) 1993, David Greenman * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer * in this position and unchanged. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int exec_linux_imgact(struct image_params *iparams); static int exec_linux_imgact(struct image_params *imgp) { const struct exec *a_out = (const struct exec *) imgp->image_header; struct vmspace *vmspace; vm_offset_t vmaddr; unsigned long virtual_offset, file_offset; vm_offset_t buffer; unsigned long bss_size; struct thread *td = curthread; int error; if (((a_out->a_magic >> 16) & 0xff) != 0x64) return -1; /* * Set file/virtual offset based on a.out variant. */ switch ((int)(a_out->a_magic & 0xffff)) { case 0413: virtual_offset = 0; file_offset = 1024; break; case 0314: virtual_offset = 4096; file_offset = 0; break; default: return (-1); } bss_size = round_page(a_out->a_bss); #ifdef DEBUG printf("imgact: text: %08lx, data: %08lx, bss: %08lx\n", (u_long)a_out->a_text, (u_long)a_out->a_data, bss_size); #endif /* * Check various fields in header for validity/bounds. */ if (a_out->a_entry < virtual_offset || a_out->a_entry >= virtual_offset + a_out->a_text || a_out->a_text & PAGE_MASK || a_out->a_data & PAGE_MASK) return (-1); /* text + data can't exceed file size */ if (a_out->a_data + a_out->a_text > imgp->attr->va_size) return (EFAULT); /* * text/data/bss must not exceed limits */ PROC_LOCK(imgp->proc); if (a_out->a_text > maxtsiz || a_out->a_data + bss_size > lim_cur(imgp->proc, RLIMIT_DATA)) { PROC_UNLOCK(imgp->proc); return (ENOMEM); } PROC_UNLOCK(imgp->proc); VOP_UNLOCK(imgp->vp, 0, td); /* * Destroy old process VM and create a new one (with a new stack) */ - exec_new_vmspace(imgp, &linux_sysvec); + error = exec_new_vmspace(imgp, &linux_sysvec); + if (error) + goto fail; vmspace = imgp->proc->p_vmspace; /* * Check if file_offset page aligned,. * Currently we cannot handle misaligned file offsets, * and so we read in the entire image (what a waste). */ if (file_offset & PAGE_MASK) { #ifdef DEBUG printf("imgact: Non page aligned binary %lu\n", file_offset); #endif /* * Map text+data+bss read/write/execute */ vmaddr = virtual_offset; error = vm_map_find(&vmspace->vm_map, NULL, 0, &vmaddr, a_out->a_text + a_out->a_data + bss_size, FALSE, VM_PROT_ALL, VM_PROT_ALL, 0); if (error) goto fail; error = vm_mmap(kernel_map, &buffer, round_page(a_out->a_text + a_out->a_data + file_offset), VM_PROT_READ, VM_PROT_READ, 0, OBJT_VNODE, imgp->vp, trunc_page(file_offset)); if (error) goto fail; error = copyout((void *)(uintptr_t)(buffer + file_offset), (void *)vmaddr, a_out->a_text + a_out->a_data); vm_map_remove(kernel_map, buffer, buffer + round_page(a_out->a_text + a_out->a_data + file_offset)); if (error) goto fail; /* * remove write enable on the 'text' part */ error = vm_map_protect(&vmspace->vm_map, vmaddr, vmaddr + a_out->a_text, VM_PROT_EXECUTE|VM_PROT_READ, TRUE); if (error) goto fail; } else { #ifdef DEBUG printf("imgact: Page aligned binary %lu\n", file_offset); #endif /* * Map text+data read/execute */ vmaddr = virtual_offset; error = vm_mmap(&vmspace->vm_map, &vmaddr, a_out->a_text + a_out->a_data, VM_PROT_READ | VM_PROT_EXECUTE, VM_PROT_ALL, MAP_PRIVATE | MAP_FIXED, OBJT_VNODE, imgp->vp, file_offset); if (error) goto fail; #ifdef DEBUG printf("imgact: startaddr=%08lx, length=%08lx\n", (u_long)vmaddr, (u_long)a_out->a_text + (u_long)a_out->a_data); #endif /* * allow read/write of data */ error = vm_map_protect(&vmspace->vm_map, vmaddr + a_out->a_text, vmaddr + a_out->a_text + a_out->a_data, VM_PROT_ALL, FALSE); if (error) goto fail; /* * Allocate anon demand-zeroed area for uninitialized data */ if (bss_size != 0) { vmaddr = virtual_offset + a_out->a_text + a_out->a_data; error = vm_map_find(&vmspace->vm_map, NULL, 0, &vmaddr, bss_size, FALSE, VM_PROT_ALL, VM_PROT_ALL, 0); if (error) goto fail; #ifdef DEBUG printf("imgact: bssaddr=%08lx, length=%08lx\n", (u_long)vmaddr, bss_size); #endif } /* Indicate that this file should not be modified */ mp_fixme("Unlocked v_flag access"); imgp->vp->v_vflag |= VV_TEXT; } /* Fill in process VM information */ vmspace->vm_tsize = round_page(a_out->a_text) >> PAGE_SHIFT; vmspace->vm_dsize = round_page(a_out->a_data + bss_size) >> PAGE_SHIFT; vmspace->vm_taddr = (caddr_t)(void *)(uintptr_t)virtual_offset; vmspace->vm_daddr = (caddr_t)(void *)(uintptr_t) (virtual_offset + a_out->a_text); /* Fill in image_params */ imgp->interpreted = 0; imgp->entry_addr = a_out->a_entry; imgp->proc->p_sysent = &linux_sysvec; fail: vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY, td); return (error); } /* * Tell kern_execve.c about it, with a little help from the linker. */ static struct execsw linux_execsw = { exec_linux_imgact, "linux a.out" }; EXEC_SET(linuxaout, linux_execsw); Index: head/sys/ia64/ia64/machdep.c =================================================================== --- head/sys/ia64/ia64/machdep.c (revision 173360) +++ head/sys/ia64/ia64/machdep.c (revision 173361) @@ -1,1532 +1,1532 @@ /*- * Copyright (c) 2003,2004 Marcel Moolenaar * Copyright (c) 2000,2001 Doug Rabson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_ddb.h" #include "opt_kstack_pages.h" #include "opt_msgbuf.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SMP #include #endif #include #include #include u_int64_t processor_frequency; u_int64_t bus_frequency; u_int64_t itc_frequency; int cold = 1; u_int64_t pa_bootinfo; struct bootinfo bootinfo; struct pcpu pcpu0; extern char kstack[]; vm_offset_t proc0kstack; extern u_int64_t kernel_text[], _end[]; extern u_int64_t ia64_gateway_page[]; extern u_int64_t break_sigtramp[]; extern u_int64_t epc_sigtramp[]; struct fpswa_iface *fpswa_iface; u_int64_t ia64_pal_base; u_int64_t ia64_port_base; char machine[] = MACHINE; SYSCTL_STRING(_hw, HW_MACHINE, machine, CTLFLAG_RD, machine, 0, ""); static char cpu_model[64]; SYSCTL_STRING(_hw, HW_MODEL, model, CTLFLAG_RD, cpu_model, 0, "The CPU model name"); static char cpu_family[64]; SYSCTL_STRING(_hw, OID_AUTO, family, CTLFLAG_RD, cpu_family, 0, "The CPU family name"); #ifdef DDB extern vm_offset_t ksym_start, ksym_end; #endif static void cpu_startup(void *); SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL) struct msgbuf *msgbufp=0; long Maxmem = 0; long realmem = 0; #define PHYSMAP_SIZE (2 * VM_PHYSSEG_MAX) vm_paddr_t phys_avail[PHYSMAP_SIZE + 2]; /* must be 2 less so 0 0 can signal end of chunks */ #define PHYS_AVAIL_ARRAY_END ((sizeof(phys_avail) / sizeof(vm_offset_t)) - 2) void mi_startup(void); /* XXX should be in a MI header */ struct kva_md_info kmi; #define Mhz 1000000L #define Ghz (1000L*Mhz) void setPQL2(int *const size, int *const ways); void setPQL2(int *const size, int *const ways) { return; } static void identifycpu(void) { char vendor[17]; char *family_name, *model_name; u_int64_t features, tmp; int number, revision, model, family, archrev; /* * Assumes little-endian. */ *(u_int64_t *) &vendor[0] = ia64_get_cpuid(0); *(u_int64_t *) &vendor[8] = ia64_get_cpuid(1); vendor[16] = '\0'; tmp = ia64_get_cpuid(3); number = (tmp >> 0) & 0xff; revision = (tmp >> 8) & 0xff; model = (tmp >> 16) & 0xff; family = (tmp >> 24) & 0xff; archrev = (tmp >> 32) & 0xff; family_name = model_name = "unknown"; switch (family) { case 0x07: family_name = "Itanium"; model_name = "Merced"; break; case 0x1f: family_name = "Itanium 2"; switch (model) { case 0x00: model_name = "McKinley"; break; case 0x01: /* * Deerfield is a low-voltage variant based on the * Madison core. We need circumstantial evidence * (i.e. the clock frequency) to identify those. * Allow for roughly 1% error margin. */ tmp = processor_frequency >> 7; if ((processor_frequency - tmp) < 1*Ghz && (processor_frequency + tmp) >= 1*Ghz) model_name = "Deerfield"; else model_name = "Madison"; break; case 0x02: model_name = "Madison II"; break; } break; case 0x20: family_name = "Itanium 2"; switch (model) { case 0x00: model_name = "Montecito"; break; } break; } snprintf(cpu_family, sizeof(cpu_family), "%s", family_name); snprintf(cpu_model, sizeof(cpu_model), "%s", model_name); features = ia64_get_cpuid(4); printf("CPU: %s (", model_name); if (processor_frequency) { printf("%ld.%02ld-Mhz ", (processor_frequency + 4999) / Mhz, ((processor_frequency + 4999) / (Mhz/100)) % 100); } printf("%s)\n", family_name); printf(" Origin = \"%s\" Revision = %d\n", vendor, revision); printf(" Features = 0x%b\n", (u_int32_t) features, "\020" "\001LB" /* long branch (brl) instruction. */ "\002SD" /* Spontaneous deferral. */ "\003AO" /* 16-byte atomic operations (ld, st, cmpxchg). */ ); } static void cpu_startup(dummy) void *dummy; { /* * Good {morning,afternoon,evening,night}. */ identifycpu(); /* startrtclock(); */ #ifdef PERFMON perfmon_init(); #endif printf("real memory = %ld (%ld MB)\n", ia64_ptob(Maxmem), ia64_ptob(Maxmem) / 1048576); realmem = Maxmem; /* * Display any holes after the first chunk of extended memory. */ if (bootverbose) { int indx; printf("Physical memory chunk(s):\n"); for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) { long size1 = phys_avail[indx + 1] - phys_avail[indx]; printf("0x%08lx - 0x%08lx, %ld bytes (%ld pages)\n", phys_avail[indx], phys_avail[indx + 1] - 1, size1, size1 >> PAGE_SHIFT); } } vm_ksubmap_init(&kmi); printf("avail memory = %ld (%ld MB)\n", ptoa(cnt.v_free_count), ptoa(cnt.v_free_count) / 1048576); if (fpswa_iface == NULL) printf("Warning: no FPSWA package supplied\n"); else printf("FPSWA Revision = 0x%lx, Entry = %p\n", (long)fpswa_iface->if_rev, (void *)fpswa_iface->if_fpswa); /* * Set up buffers, so they can be used to read disk labels. */ bufinit(); vm_pager_bufferinit(); /* * Traverse the MADT to discover IOSAPIC and Local SAPIC * information. */ ia64_probe_sapics(); ia64_mca_init(); } void cpu_boot(int howto) { efi_reset_system(); } /* Get current clock frequency for the given cpu id. */ int cpu_est_clockrate(int cpu_id, uint64_t *rate) { if (pcpu_find(cpu_id) == NULL || rate == NULL) return (EINVAL); *rate = processor_frequency; return (0); } void cpu_halt() { efi_reset_system(); } static void cpu_idle_default(void) { struct ia64_pal_result res; res = ia64_call_pal_static(PAL_HALT_LIGHT, 0, 0, 0); } void cpu_idle() { (*cpu_idle_hook)(); } /* Other subsystems (e.g., ACPI) can hook this later. */ void (*cpu_idle_hook)(void) = cpu_idle_default; void cpu_reset() { cpu_boot(0); } void cpu_switch(struct thread *old, struct thread *new, struct mtx *mtx) { struct pcb *oldpcb, *newpcb; oldpcb = old->td_pcb; #ifdef COMPAT_IA32 ia32_savectx(oldpcb); #endif if (PCPU_GET(fpcurthread) == old) old->td_frame->tf_special.psr |= IA64_PSR_DFH; if (!savectx(oldpcb)) { newpcb = new->td_pcb; oldpcb->pcb_current_pmap = pmap_switch(newpcb->pcb_current_pmap); PCPU_SET(curthread, new); #ifdef COMPAT_IA32 ia32_restorectx(newpcb); #endif if (PCPU_GET(fpcurthread) == new) new->td_frame->tf_special.psr &= ~IA64_PSR_DFH; restorectx(newpcb); /* We should not get here. */ panic("cpu_switch: restorectx() returned"); /* NOTREACHED */ } } void cpu_throw(struct thread *old __unused, struct thread *new) { struct pcb *newpcb; newpcb = new->td_pcb; (void)pmap_switch(newpcb->pcb_current_pmap); PCPU_SET(curthread, new); #ifdef COMPAT_IA32 ia32_restorectx(newpcb); #endif restorectx(newpcb); /* We should not get here. */ panic("cpu_throw: restorectx() returned"); /* NOTREACHED */ } void cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size) { pcpu->pc_acpi_id = cpuid; } void spinlock_enter(void) { struct thread *td; td = curthread; if (td->td_md.md_spinlock_count == 0) td->td_md.md_saved_intr = intr_disable(); td->td_md.md_spinlock_count++; critical_enter(); } void spinlock_exit(void) { struct thread *td; td = curthread; critical_exit(); td->td_md.md_spinlock_count--; if (td->td_md.md_spinlock_count == 0) intr_restore(td->td_md.md_saved_intr); } void map_vhpt(uintptr_t vhpt) { pt_entry_t pte; uint64_t psr; pte = PTE_PRESENT | PTE_MA_WB | PTE_ACCESSED | PTE_DIRTY | PTE_PL_KERN | PTE_AR_RW; pte |= vhpt & PTE_PPN_MASK; __asm __volatile("ptr.d %0,%1" :: "r"(vhpt), "r"(IA64_ID_PAGE_SHIFT<<2)); __asm __volatile("mov %0=psr" : "=r"(psr)); __asm __volatile("rsm psr.ic|psr.i"); ia64_srlz_i(); ia64_set_ifa(vhpt); ia64_set_itir(IA64_ID_PAGE_SHIFT << 2); ia64_srlz_d(); __asm __volatile("itr.d dtr[%0]=%1" :: "r"(2), "r"(pte)); __asm __volatile("mov psr.l=%0" :: "r" (psr)); ia64_srlz_i(); } void map_pal_code(void) { pt_entry_t pte; uint64_t psr; if (ia64_pal_base == 0) return; pte = PTE_PRESENT | PTE_MA_WB | PTE_ACCESSED | PTE_DIRTY | PTE_PL_KERN | PTE_AR_RWX; pte |= ia64_pal_base & PTE_PPN_MASK; __asm __volatile("ptr.d %0,%1; ptr.i %0,%1" :: "r"(IA64_PHYS_TO_RR7(ia64_pal_base)), "r"(IA64_ID_PAGE_SHIFT<<2)); __asm __volatile("mov %0=psr" : "=r"(psr)); __asm __volatile("rsm psr.ic|psr.i"); ia64_srlz_i(); ia64_set_ifa(IA64_PHYS_TO_RR7(ia64_pal_base)); ia64_set_itir(IA64_ID_PAGE_SHIFT << 2); ia64_srlz_d(); __asm __volatile("itr.d dtr[%0]=%1" :: "r"(1), "r"(pte)); ia64_srlz_d(); __asm __volatile("itr.i itr[%0]=%1" :: "r"(1), "r"(pte)); __asm __volatile("mov psr.l=%0" :: "r" (psr)); ia64_srlz_i(); } void map_gateway_page(void) { pt_entry_t pte; uint64_t psr; pte = PTE_PRESENT | PTE_MA_WB | PTE_ACCESSED | PTE_DIRTY | PTE_PL_KERN | PTE_AR_X_RX; pte |= (uint64_t)ia64_gateway_page & PTE_PPN_MASK; __asm __volatile("ptr.d %0,%1; ptr.i %0,%1" :: "r"(VM_MAX_ADDRESS), "r"(PAGE_SHIFT << 2)); __asm __volatile("mov %0=psr" : "=r"(psr)); __asm __volatile("rsm psr.ic|psr.i"); ia64_srlz_i(); ia64_set_ifa(VM_MAX_ADDRESS); ia64_set_itir(PAGE_SHIFT << 2); ia64_srlz_d(); __asm __volatile("itr.d dtr[%0]=%1" :: "r"(3), "r"(pte)); ia64_srlz_d(); __asm __volatile("itr.i itr[%0]=%1" :: "r"(3), "r"(pte)); __asm __volatile("mov psr.l=%0" :: "r" (psr)); ia64_srlz_i(); /* Expose the mapping to userland in ar.k5 */ ia64_set_k5(VM_MAX_ADDRESS); } static void calculate_frequencies(void) { struct ia64_sal_result sal; struct ia64_pal_result pal; sal = ia64_sal_entry(SAL_FREQ_BASE, 0, 0, 0, 0, 0, 0, 0); pal = ia64_call_pal_static(PAL_FREQ_RATIOS, 0, 0, 0); if (sal.sal_status == 0 && pal.pal_status == 0) { if (bootverbose) { printf("Platform clock frequency %ld Hz\n", sal.sal_result[0]); printf("Processor ratio %ld/%ld, Bus ratio %ld/%ld, " "ITC ratio %ld/%ld\n", pal.pal_result[0] >> 32, pal.pal_result[0] & ((1L << 32) - 1), pal.pal_result[1] >> 32, pal.pal_result[1] & ((1L << 32) - 1), pal.pal_result[2] >> 32, pal.pal_result[2] & ((1L << 32) - 1)); } processor_frequency = sal.sal_result[0] * (pal.pal_result[0] >> 32) / (pal.pal_result[0] & ((1L << 32) - 1)); bus_frequency = sal.sal_result[0] * (pal.pal_result[1] >> 32) / (pal.pal_result[1] & ((1L << 32) - 1)); itc_frequency = sal.sal_result[0] * (pal.pal_result[2] >> 32) / (pal.pal_result[2] & ((1L << 32) - 1)); } } void ia64_init(void) { int phys_avail_cnt; vm_offset_t kernstart, kernend; vm_offset_t kernstartpfn, kernendpfn, pfn0, pfn1; char *p; struct efi_md *md; int metadata_missing; /* NO OUTPUT ALLOWED UNTIL FURTHER NOTICE */ /* * TODO: Disable interrupts, floating point etc. * Maybe flush cache and tlb */ ia64_set_fpsr(IA64_FPSR_DEFAULT); /* * TODO: Get critical system information (if possible, from the * information provided by the boot program). */ /* * pa_bootinfo is the physical address of the bootinfo block as * passed to us by the loader and set in locore.s. */ bootinfo = *(struct bootinfo *)(IA64_PHYS_TO_RR7(pa_bootinfo)); if (bootinfo.bi_magic != BOOTINFO_MAGIC || bootinfo.bi_version != 1) { bzero(&bootinfo, sizeof(bootinfo)); bootinfo.bi_kernend = (vm_offset_t) round_page(_end); } /* * Look for the I/O ports first - we need them for console * probing. */ for (md = efi_md_first(); md != NULL; md = efi_md_next(md)) { switch (md->md_type) { case EFI_MD_TYPE_IOPORT: ia64_port_base = IA64_PHYS_TO_RR6(md->md_phys); break; case EFI_MD_TYPE_PALCODE: ia64_pal_base = md->md_phys; break; } } metadata_missing = 0; if (bootinfo.bi_modulep) preload_metadata = (caddr_t)bootinfo.bi_modulep; else metadata_missing = 1; if (envmode == 0 && bootinfo.bi_envp) kern_envp = (caddr_t)bootinfo.bi_envp; else kern_envp = static_env; /* * Look at arguments passed to us and compute boothowto. */ boothowto = bootinfo.bi_boothowto; /* * Catch case of boot_verbose set in environment. */ if ((p = getenv("boot_verbose")) != NULL) { if (strcmp(p, "yes") == 0 || strcmp(p, "YES") == 0) { boothowto |= RB_VERBOSE; } freeenv(p); } if (boothowto & RB_VERBOSE) bootverbose = 1; /* * Setup the PCPU data for the bootstrap processor. It is needed * by printf(). Also, since printf() has critical sections, we * need to initialize at least pc_curthread. */ pcpup = &pcpu0; ia64_set_k4((u_int64_t)pcpup); pcpu_init(pcpup, 0, sizeof(pcpu0)); PCPU_SET(curthread, &thread0); /* * Initialize the console before we print anything out. */ cninit(); /* OUTPUT NOW ALLOWED */ if (ia64_pal_base != 0) { ia64_pal_base &= ~IA64_ID_PAGE_MASK; /* * We use a TR to map the first 256M of memory - this might * cover the palcode too. */ if (ia64_pal_base == 0) printf("PAL code mapped by the kernel's TR\n"); } else printf("PAL code not found\n"); /* * Wire things up so we can call the firmware. */ map_pal_code(); efi_boot_minimal(bootinfo.bi_systab); ia64_sal_init(); calculate_frequencies(); /* * Find the beginning and end of the kernel. */ kernstart = trunc_page(kernel_text); #ifdef DDB ksym_start = bootinfo.bi_symtab; ksym_end = bootinfo.bi_esymtab; kernend = (vm_offset_t)round_page(ksym_end); #else kernend = (vm_offset_t)round_page(_end); #endif /* But if the bootstrap tells us otherwise, believe it! */ if (bootinfo.bi_kernend) kernend = round_page(bootinfo.bi_kernend); if (metadata_missing) printf("WARNING: loader(8) metadata is missing!\n"); /* Get FPSWA interface */ fpswa_iface = (bootinfo.bi_fpswa == 0) ? NULL : (struct fpswa_iface *)IA64_PHYS_TO_RR7(bootinfo.bi_fpswa); /* Init basic tunables, including hz */ init_param1(); p = getenv("kernelname"); if (p) { strncpy(kernelname, p, sizeof(kernelname) - 1); freeenv(p); } kernstartpfn = atop(IA64_RR_MASK(kernstart)); kernendpfn = atop(IA64_RR_MASK(kernend)); /* * Size the memory regions and load phys_avail[] with the results. */ /* * Find out how much memory is available, by looking at * the memory descriptors. */ #ifdef DEBUG_MD printf("Memory descriptor count: %d\n", mdcount); #endif phys_avail_cnt = 0; for (md = efi_md_first(); md != NULL; md = efi_md_next(md)) { #ifdef DEBUG_MD printf("MD %p: type %d pa 0x%lx cnt 0x%lx\n", md, md->md_type, md->md_phys, md->md_pages); #endif pfn0 = ia64_btop(round_page(md->md_phys)); pfn1 = ia64_btop(trunc_page(md->md_phys + md->md_pages * 4096)); if (pfn1 <= pfn0) continue; if (md->md_type != EFI_MD_TYPE_FREE) continue; /* * We have a memory descriptor that describes conventional * memory that is for general use. We must determine if the * loader has put the kernel in this region. */ physmem += (pfn1 - pfn0); if (pfn0 <= kernendpfn && kernstartpfn <= pfn1) { /* * Must compute the location of the kernel * within the segment. */ #ifdef DEBUG_MD printf("Descriptor %p contains kernel\n", mp); #endif if (pfn0 < kernstartpfn) { /* * There is a chunk before the kernel. */ #ifdef DEBUG_MD printf("Loading chunk before kernel: " "0x%lx / 0x%lx\n", pfn0, kernstartpfn); #endif phys_avail[phys_avail_cnt] = ia64_ptob(pfn0); phys_avail[phys_avail_cnt+1] = ia64_ptob(kernstartpfn); phys_avail_cnt += 2; } if (kernendpfn < pfn1) { /* * There is a chunk after the kernel. */ #ifdef DEBUG_MD printf("Loading chunk after kernel: " "0x%lx / 0x%lx\n", kernendpfn, pfn1); #endif phys_avail[phys_avail_cnt] = ia64_ptob(kernendpfn); phys_avail[phys_avail_cnt+1] = ia64_ptob(pfn1); phys_avail_cnt += 2; } } else { /* * Just load this cluster as one chunk. */ #ifdef DEBUG_MD printf("Loading descriptor %d: 0x%lx / 0x%lx\n", i, pfn0, pfn1); #endif phys_avail[phys_avail_cnt] = ia64_ptob(pfn0); phys_avail[phys_avail_cnt+1] = ia64_ptob(pfn1); phys_avail_cnt += 2; } } phys_avail[phys_avail_cnt] = 0; Maxmem = physmem; init_param2(physmem); /* * Initialize error message buffer (at end of core). */ msgbufp = (struct msgbuf *)pmap_steal_memory(MSGBUF_SIZE); msgbufinit(msgbufp, MSGBUF_SIZE); - proc_linkup(&proc0, &thread0); + proc_linkup0(&proc0, &thread0); /* * Init mapping for kernel stack for proc 0 */ proc0kstack = (vm_offset_t)kstack; thread0.td_kstack = proc0kstack; thread0.td_kstack_pages = KSTACK_PAGES; mutex_init(); /* * Initialize the rest of proc 0's PCB. * * Set the kernel sp, reserving space for an (empty) trapframe, * and make proc0's trapframe pointer point to it for sanity. * Initialise proc0's backing store to start after u area. */ cpu_thread_setup(&thread0); thread0.td_frame->tf_flags = FRAME_SYSCALL; thread0.td_pcb->pcb_special.sp = (u_int64_t)thread0.td_frame - 16; thread0.td_pcb->pcb_special.bspstore = thread0.td_kstack; /* * Initialize the virtual memory system. */ pmap_bootstrap(); /* * Initialize debuggers, and break into them if appropriate. */ kdb_init(); #ifdef KDB if (boothowto & RB_KDB) kdb_enter("Boot flags requested debugger\n"); #endif ia64_set_tpr(0); ia64_srlz_d(); /* * Save our current context so that we have a known (maybe even * sane) context as the initial context for new threads that are * forked from us. If any of those threads (including thread0) * does something wrong, we may be lucky and return here where * we're ready for them with a nice panic. */ if (!savectx(thread0.td_pcb)) mi_startup(); /* We should not get here. */ panic("ia64_init: Whooaa there!"); /* NOTREACHED */ } __volatile void * ia64_ioport_address(u_int port) { uint64_t addr; addr = (port > 0xffff) ? IA64_PHYS_TO_RR6((uint64_t)port) : ia64_port_base | ((port & 0xfffc) << 10) | (port & 0xFFF); return ((__volatile void *)addr); } uint64_t ia64_get_hcdp(void) { return (bootinfo.bi_hcdp); } void bzero(void *buf, size_t len) { caddr_t p = buf; while (((vm_offset_t) p & (sizeof(u_long) - 1)) && len) { *p++ = 0; len--; } while (len >= sizeof(u_long) * 8) { *(u_long*) p = 0; *((u_long*) p + 1) = 0; *((u_long*) p + 2) = 0; *((u_long*) p + 3) = 0; len -= sizeof(u_long) * 8; *((u_long*) p + 4) = 0; *((u_long*) p + 5) = 0; *((u_long*) p + 6) = 0; *((u_long*) p + 7) = 0; p += sizeof(u_long) * 8; } while (len >= sizeof(u_long)) { *(u_long*) p = 0; len -= sizeof(u_long); p += sizeof(u_long); } while (len) { *p++ = 0; len--; } } void DELAY(int n) { u_int64_t start, end, now; start = ia64_get_itc(); end = start + (itc_frequency * n) / 1000000; /* printf("DELAY from 0x%lx to 0x%lx\n", start, end); */ do { now = ia64_get_itc(); } while (now < end || (now > start && end < start)); } /* * Send an interrupt (signal) to a process. */ void sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) { struct proc *p; struct thread *td; struct trapframe *tf; struct sigacts *psp; struct sigframe sf, *sfp; u_int64_t sbs, sp; int oonstack; int sig; u_long code; td = curthread; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); sig = ksi->ksi_signo; code = ksi->ksi_code; psp = p->p_sigacts; mtx_assert(&psp->ps_mtx, MA_OWNED); tf = td->td_frame; sp = tf->tf_special.sp; oonstack = sigonstack(sp); sbs = 0; /* save user context */ bzero(&sf, sizeof(struct sigframe)); sf.sf_uc.uc_sigmask = *mask; sf.sf_uc.uc_stack = td->td_sigstk; sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE; /* * Allocate and validate space for the signal handler * context. Note that if the stack is in P0 space, the * call to grow() is a nop, and the useracc() check * will fail if the process has not already allocated * the space with a `brk'. */ if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { sbs = (u_int64_t)td->td_sigstk.ss_sp; sbs = (sbs + 15) & ~15; sfp = (struct sigframe *)(sbs + td->td_sigstk.ss_size); #if defined(COMPAT_43) td->td_sigstk.ss_flags |= SS_ONSTACK; #endif } else sfp = (struct sigframe *)sp; sfp = (struct sigframe *)((u_int64_t)(sfp - 1) & ~15); /* Fill in the siginfo structure for POSIX handlers. */ if (SIGISMEMBER(psp->ps_siginfo, sig)) { sf.sf_si = ksi->ksi_info; sf.sf_si.si_signo = sig; /* * XXX this shouldn't be here after code in trap.c * is fixed */ sf.sf_si.si_addr = (void*)tf->tf_special.ifa; code = (u_int64_t)&sfp->sf_si; } mtx_unlock(&psp->ps_mtx); PROC_UNLOCK(p); get_mcontext(td, &sf.sf_uc.uc_mcontext, 0); /* Copy the frame out to userland. */ if (copyout(&sf, sfp, sizeof(sf)) != 0) { /* * Process has trashed its stack; give it an illegal * instruction to halt it in its tracks. */ PROC_LOCK(p); sigexit(td, SIGILL); return; } if ((tf->tf_flags & FRAME_SYSCALL) == 0) { tf->tf_special.psr &= ~IA64_PSR_RI; tf->tf_special.iip = ia64_get_k5() + ((uint64_t)break_sigtramp - (uint64_t)ia64_gateway_page); } else tf->tf_special.iip = ia64_get_k5() + ((uint64_t)epc_sigtramp - (uint64_t)ia64_gateway_page); /* * Setup the trapframe to return to the signal trampoline. We pass * information to the trampoline in the following registers: * * gp new backing store or NULL * r8 signal number * r9 signal code or siginfo pointer * r10 signal handler (function descriptor) */ tf->tf_special.sp = (u_int64_t)sfp - 16; tf->tf_special.gp = sbs; tf->tf_special.bspstore = sf.sf_uc.uc_mcontext.mc_special.bspstore; tf->tf_special.ndirty = 0; tf->tf_special.rnat = sf.sf_uc.uc_mcontext.mc_special.rnat; tf->tf_scratch.gr8 = sig; tf->tf_scratch.gr9 = code; tf->tf_scratch.gr10 = (u_int64_t)catcher; PROC_LOCK(p); mtx_lock(&psp->ps_mtx); } /* * System call to cleanup state after a signal * has been taken. Reset signal mask and * stack state from context left by sendsig (above). * Return to previous pc and psl as specified by * context left by sendsig. Check carefully to * make sure that the user has not modified the * state to gain improper privileges. * * MPSAFE */ int sigreturn(struct thread *td, struct sigreturn_args /* { ucontext_t *sigcntxp; } */ *uap) { ucontext_t uc; struct trapframe *tf; struct proc *p; struct pcb *pcb; tf = td->td_frame; p = td->td_proc; pcb = td->td_pcb; /* * Fetch the entire context structure at once for speed. * We don't use a normal argument to simplify RSE handling. */ if (copyin(uap->sigcntxp, (caddr_t)&uc, sizeof(uc))) return (EFAULT); set_mcontext(td, &uc.uc_mcontext); PROC_LOCK(p); #if defined(COMPAT_43) if (sigonstack(tf->tf_special.sp)) td->td_sigstk.ss_flags |= SS_ONSTACK; else td->td_sigstk.ss_flags &= ~SS_ONSTACK; #endif td->td_sigmask = uc.uc_sigmask; SIG_CANTMASK(td->td_sigmask); signotify(td); PROC_UNLOCK(p); return (EJUSTRETURN); } #ifdef COMPAT_FREEBSD4 int freebsd4_sigreturn(struct thread *td, struct freebsd4_sigreturn_args *uap) { return sigreturn(td, (struct sigreturn_args *)uap); } #endif /* * Construct a PCB from a trapframe. This is called from kdb_trap() where * we want to start a backtrace from the function that caused us to enter * the debugger. We have the context in the trapframe, but base the trace * on the PCB. The PCB doesn't have to be perfect, as long as it contains * enough for a backtrace. */ void makectx(struct trapframe *tf, struct pcb *pcb) { pcb->pcb_special = tf->tf_special; pcb->pcb_special.__spare = ~0UL; /* XXX see unwind.c */ save_callee_saved(&pcb->pcb_preserved); save_callee_saved_fp(&pcb->pcb_preserved_fp); } int ia64_flush_dirty(struct thread *td, struct _special *r) { struct iovec iov; struct uio uio; uint64_t bspst, kstk, rnat; int error; if (r->ndirty == 0) return (0); kstk = td->td_kstack + (r->bspstore & 0x1ffUL); if (td == curthread) { __asm __volatile("mov ar.rsc=0;;"); __asm __volatile("mov %0=ar.bspstore" : "=r"(bspst)); /* Make sure we have all the user registers written out. */ if (bspst - kstk < r->ndirty) { __asm __volatile("flushrs;;"); __asm __volatile("mov %0=ar.bspstore" : "=r"(bspst)); } __asm __volatile("mov %0=ar.rnat;;" : "=r"(rnat)); __asm __volatile("mov ar.rsc=3"); error = copyout((void*)kstk, (void*)r->bspstore, r->ndirty); kstk += r->ndirty; r->rnat = (bspst > kstk && (bspst & 0x1ffL) < (kstk & 0x1ffL)) ? *(uint64_t*)(kstk | 0x1f8L) : rnat; } else { PHOLD(td->td_proc); iov.iov_base = (void*)(uintptr_t)kstk; iov.iov_len = r->ndirty; uio.uio_iov = &iov; uio.uio_iovcnt = 1; uio.uio_offset = r->bspstore; uio.uio_resid = r->ndirty; uio.uio_segflg = UIO_SYSSPACE; uio.uio_rw = UIO_WRITE; uio.uio_td = td; error = proc_rwmem(td->td_proc, &uio); /* * XXX proc_rwmem() doesn't currently return ENOSPC, * so I think it can bogusly return 0. Neither do * we allow short writes. */ if (uio.uio_resid != 0 && error == 0) error = ENOSPC; PRELE(td->td_proc); } r->bspstore += r->ndirty; r->ndirty = 0; return (error); } int get_mcontext(struct thread *td, mcontext_t *mc, int flags) { struct trapframe *tf; int error; tf = td->td_frame; bzero(mc, sizeof(*mc)); mc->mc_special = tf->tf_special; error = ia64_flush_dirty(td, &mc->mc_special); if (tf->tf_flags & FRAME_SYSCALL) { mc->mc_flags |= _MC_FLAGS_SYSCALL_CONTEXT; mc->mc_scratch = tf->tf_scratch; if (flags & GET_MC_CLEAR_RET) { mc->mc_scratch.gr8 = 0; mc->mc_scratch.gr9 = 0; mc->mc_scratch.gr10 = 0; mc->mc_scratch.gr11 = 0; } } else { mc->mc_flags |= _MC_FLAGS_ASYNC_CONTEXT; mc->mc_scratch = tf->tf_scratch; mc->mc_scratch_fp = tf->tf_scratch_fp; /* * XXX If the thread never used the high FP registers, we * probably shouldn't waste time saving them. */ ia64_highfp_save(td); mc->mc_flags |= _MC_FLAGS_HIGHFP_VALID; mc->mc_high_fp = td->td_pcb->pcb_high_fp; } save_callee_saved(&mc->mc_preserved); save_callee_saved_fp(&mc->mc_preserved_fp); return (error); } int set_mcontext(struct thread *td, const mcontext_t *mc) { struct _special s; struct trapframe *tf; uint64_t psrmask; tf = td->td_frame; KASSERT((tf->tf_special.ndirty & ~PAGE_MASK) == 0, ("Whoa there! We have more than 8KB of dirty registers!")); s = mc->mc_special; /* * Only copy the user mask and the restart instruction bit from * the new context. */ psrmask = IA64_PSR_BE | IA64_PSR_UP | IA64_PSR_AC | IA64_PSR_MFL | IA64_PSR_MFH | IA64_PSR_RI; s.psr = (tf->tf_special.psr & ~psrmask) | (s.psr & psrmask); /* We don't have any dirty registers of the new context. */ s.ndirty = 0; if (mc->mc_flags & _MC_FLAGS_ASYNC_CONTEXT) { /* * We can get an async context passed to us while we * entered the kernel through a syscall: sigreturn(2) * and kse_switchin(2) both take contexts that could * previously be the result of a trap or interrupt. * Hence, we cannot assert that the trapframe is not * a syscall frame, but we can assert that it's at * least an expected syscall. */ if (tf->tf_flags & FRAME_SYSCALL) { KASSERT(tf->tf_scratch.gr15 == SYS_sigreturn || tf->tf_scratch.gr15 == SYS_kse_switchin, ("foo")); tf->tf_flags &= ~FRAME_SYSCALL; } tf->tf_scratch = mc->mc_scratch; tf->tf_scratch_fp = mc->mc_scratch_fp; if (mc->mc_flags & _MC_FLAGS_HIGHFP_VALID) td->td_pcb->pcb_high_fp = mc->mc_high_fp; } else { KASSERT((tf->tf_flags & FRAME_SYSCALL) != 0, ("foo")); if ((mc->mc_flags & _MC_FLAGS_SYSCALL_CONTEXT) == 0) { s.cfm = tf->tf_special.cfm; s.iip = tf->tf_special.iip; tf->tf_scratch.gr15 = 0; /* Clear syscall nr. */ } else tf->tf_scratch = mc->mc_scratch; } tf->tf_special = s; restore_callee_saved(&mc->mc_preserved); restore_callee_saved_fp(&mc->mc_preserved_fp); if (mc->mc_flags & _MC_FLAGS_KSE_SET_MBOX) suword((caddr_t)mc->mc_special.ifa, mc->mc_special.isr); return (0); } /* * Clear registers on exec. */ void exec_setregs(struct thread *td, u_long entry, u_long stack, u_long ps_strings) { struct trapframe *tf; uint64_t *ksttop, *kst; tf = td->td_frame; ksttop = (uint64_t*)(td->td_kstack + tf->tf_special.ndirty + (tf->tf_special.bspstore & 0x1ffUL)); /* * We can ignore up to 8KB of dirty registers by masking off the * lower 13 bits in exception_restore() or epc_syscall(). This * should be enough for a couple of years, but if there are more * than 8KB of dirty registers, we lose track of the bottom of * the kernel stack. The solution is to copy the active part of * the kernel stack down 1 page (or 2, but not more than that) * so that we always have less than 8KB of dirty registers. */ KASSERT((tf->tf_special.ndirty & ~PAGE_MASK) == 0, ("Whoa there! We have more than 8KB of dirty registers!")); bzero(&tf->tf_special, sizeof(tf->tf_special)); if ((tf->tf_flags & FRAME_SYSCALL) == 0) { /* break syscalls. */ bzero(&tf->tf_scratch, sizeof(tf->tf_scratch)); bzero(&tf->tf_scratch_fp, sizeof(tf->tf_scratch_fp)); tf->tf_special.cfm = (1UL<<63) | (3UL<<7) | 3UL; tf->tf_special.bspstore = IA64_BACKINGSTORE; /* * Copy the arguments onto the kernel register stack so that * they get loaded by the loadrs instruction. Skip over the * NaT collection points. */ kst = ksttop - 1; if (((uintptr_t)kst & 0x1ff) == 0x1f8) *kst-- = 0; *kst-- = 0; if (((uintptr_t)kst & 0x1ff) == 0x1f8) *kst-- = 0; *kst-- = ps_strings; if (((uintptr_t)kst & 0x1ff) == 0x1f8) *kst-- = 0; *kst = stack; tf->tf_special.ndirty = (ksttop - kst) << 3; } else { /* epc syscalls (default). */ tf->tf_special.cfm = (3UL<<62) | (3UL<<7) | 3UL; tf->tf_special.bspstore = IA64_BACKINGSTORE + 24; /* * Write values for out0, out1 and out2 to the user's backing * store and arrange for them to be restored into the user's * initial register frame. * Assumes that (bspstore & 0x1f8) < 0x1e0. */ suword((caddr_t)tf->tf_special.bspstore - 24, stack); suword((caddr_t)tf->tf_special.bspstore - 16, ps_strings); suword((caddr_t)tf->tf_special.bspstore - 8, 0); } tf->tf_special.iip = entry; tf->tf_special.sp = (stack & ~15) - 16; tf->tf_special.rsc = 0xf; tf->tf_special.fpsr = IA64_FPSR_DEFAULT; tf->tf_special.psr = IA64_PSR_IC | IA64_PSR_I | IA64_PSR_IT | IA64_PSR_DT | IA64_PSR_RT | IA64_PSR_DFH | IA64_PSR_BN | IA64_PSR_CPL_USER; } int ptrace_set_pc(struct thread *td, unsigned long addr) { uint64_t slot; switch (addr & 0xFUL) { case 0: slot = IA64_PSR_RI_0; break; case 1: /* XXX we need to deal with MLX bundles here */ slot = IA64_PSR_RI_1; break; case 2: slot = IA64_PSR_RI_2; break; default: return (EINVAL); } td->td_frame->tf_special.iip = addr & ~0x0FULL; td->td_frame->tf_special.psr = (td->td_frame->tf_special.psr & ~IA64_PSR_RI) | slot; return (0); } int ptrace_single_step(struct thread *td) { struct trapframe *tf; /* * There's no way to set single stepping when we're leaving the * kernel through the EPC syscall path. The way we solve this is * by enabling the lower-privilege trap so that we re-enter the * kernel as soon as the privilege level changes. See trap.c for * how we proceed from there. */ tf = td->td_frame; if (tf->tf_flags & FRAME_SYSCALL) tf->tf_special.psr |= IA64_PSR_LP; else tf->tf_special.psr |= IA64_PSR_SS; return (0); } int ptrace_clear_single_step(struct thread *td) { struct trapframe *tf; /* * Clear any and all status bits we may use to implement single * stepping. */ tf = td->td_frame; tf->tf_special.psr &= ~IA64_PSR_SS; tf->tf_special.psr &= ~IA64_PSR_LP; tf->tf_special.psr &= ~IA64_PSR_TB; return (0); } int fill_regs(struct thread *td, struct reg *regs) { struct trapframe *tf; tf = td->td_frame; regs->r_special = tf->tf_special; regs->r_scratch = tf->tf_scratch; save_callee_saved(®s->r_preserved); return (0); } int set_regs(struct thread *td, struct reg *regs) { struct trapframe *tf; int error; tf = td->td_frame; error = ia64_flush_dirty(td, &tf->tf_special); if (!error) { tf->tf_special = regs->r_special; tf->tf_special.bspstore += tf->tf_special.ndirty; tf->tf_special.ndirty = 0; tf->tf_scratch = regs->r_scratch; restore_callee_saved(®s->r_preserved); } return (error); } int fill_dbregs(struct thread *td, struct dbreg *dbregs) { return (ENOSYS); } int set_dbregs(struct thread *td, struct dbreg *dbregs) { return (ENOSYS); } int fill_fpregs(struct thread *td, struct fpreg *fpregs) { struct trapframe *frame = td->td_frame; struct pcb *pcb = td->td_pcb; /* Save the high FP registers. */ ia64_highfp_save(td); fpregs->fpr_scratch = frame->tf_scratch_fp; save_callee_saved_fp(&fpregs->fpr_preserved); fpregs->fpr_high = pcb->pcb_high_fp; return (0); } int set_fpregs(struct thread *td, struct fpreg *fpregs) { struct trapframe *frame = td->td_frame; struct pcb *pcb = td->td_pcb; /* Throw away the high FP registers (should be redundant). */ ia64_highfp_drop(td); frame->tf_scratch_fp = fpregs->fpr_scratch; restore_callee_saved_fp(&fpregs->fpr_preserved); pcb->pcb_high_fp = fpregs->fpr_high; return (0); } /* * High FP register functions. */ int ia64_highfp_drop(struct thread *td) { struct pcb *pcb; struct pcpu *cpu; struct thread *thr; mtx_lock_spin(&td->td_md.md_highfp_mtx); pcb = td->td_pcb; cpu = pcb->pcb_fpcpu; if (cpu == NULL) { mtx_unlock_spin(&td->td_md.md_highfp_mtx); return (0); } pcb->pcb_fpcpu = NULL; thr = cpu->pc_fpcurthread; cpu->pc_fpcurthread = NULL; mtx_unlock_spin(&td->td_md.md_highfp_mtx); /* Post-mortem sanity checking. */ KASSERT(thr == td, ("Inconsistent high FP state")); return (1); } int ia64_highfp_save(struct thread *td) { struct pcb *pcb; struct pcpu *cpu; struct thread *thr; /* Don't save if the high FP registers weren't modified. */ if ((td->td_frame->tf_special.psr & IA64_PSR_MFH) == 0) return (ia64_highfp_drop(td)); mtx_lock_spin(&td->td_md.md_highfp_mtx); pcb = td->td_pcb; cpu = pcb->pcb_fpcpu; if (cpu == NULL) { mtx_unlock_spin(&td->td_md.md_highfp_mtx); return (0); } #ifdef SMP if (td == curthread) sched_pin(); if (cpu != pcpup) { mtx_unlock_spin(&td->td_md.md_highfp_mtx); ipi_send(cpu, IPI_HIGH_FP); if (td == curthread) sched_unpin(); while (pcb->pcb_fpcpu == cpu) DELAY(100); return (1); } else { save_high_fp(&pcb->pcb_high_fp); if (td == curthread) sched_unpin(); } #else save_high_fp(&pcb->pcb_high_fp); #endif pcb->pcb_fpcpu = NULL; thr = cpu->pc_fpcurthread; cpu->pc_fpcurthread = NULL; mtx_unlock_spin(&td->td_md.md_highfp_mtx); /* Post-mortem sanity cxhecking. */ KASSERT(thr == td, ("Inconsistent high FP state")); return (1); } int sysbeep(int pitch, int period) { return (ENODEV); } Index: head/sys/ia64/ia64/pmap.c =================================================================== --- head/sys/ia64/ia64/pmap.c (revision 173360) +++ head/sys/ia64/ia64/pmap.c (revision 173361) @@ -1,2408 +1,2409 @@ /*- * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. * Copyright (c) 1994 David Greenman * All rights reserved. * Copyright (c) 1998,2000 Doug Rabson * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department and William Jolitz of UUNET Technologies Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 * from: i386 Id: pmap.c,v 1.193 1998/04/19 15:22:48 bde Exp * with some ideas from NetBSD's alpha pmap */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Manages physical address maps. * * In addition to hardware address maps, this * module is called upon to provide software-use-only * maps which may or may not be stored in the same * form as hardware maps. These pseudo-maps are * used to store intermediate results from copy * operations to and from address spaces. * * Since the information managed by this module is * also stored by the logical address mapping module, * this module may throw away valid virtual-to-physical * mappings at almost any time. However, invalidations * of virtual-to-physical mappings must be done as * requested. * * In order to cope with hardware architectures which * make virtual-to-physical map invalidates expensive, * this module may delay invalidate or reduced protection * operations until such time as they are actually * necessary. This module is given full information as * to which processors are currently using which maps, * and to when physical maps must be made correct. */ /* * Following the Linux model, region IDs are allocated in groups of * eight so that a single region ID can be used for as many RRs as we * want by encoding the RR number into the low bits of the ID. * * We reserve region ID 0 for the kernel and allocate the remaining * IDs for user pmaps. * * Region 0..4 * User virtually mapped * * Region 5 * Kernel virtually mapped * * Region 6 * Kernel physically mapped uncacheable * * Region 7 * Kernel physically mapped cacheable */ /* XXX move to a header. */ extern uint64_t ia64_gateway_page[]; MALLOC_DEFINE(M_PMAP, "PMAP", "PMAP Structures"); #ifndef PMAP_SHPGPERPROC #define PMAP_SHPGPERPROC 200 #endif #if !defined(DIAGNOSTIC) #define PMAP_INLINE __inline #else #define PMAP_INLINE #endif #define pmap_accessed(lpte) ((lpte)->pte & PTE_ACCESSED) #define pmap_dirty(lpte) ((lpte)->pte & PTE_DIRTY) #define pmap_managed(lpte) ((lpte)->pte & PTE_MANAGED) #define pmap_ppn(lpte) ((lpte)->pte & PTE_PPN_MASK) #define pmap_present(lpte) ((lpte)->pte & PTE_PRESENT) #define pmap_prot(lpte) (((lpte)->pte & PTE_PROT_MASK) >> 56) #define pmap_wired(lpte) ((lpte)->pte & PTE_WIRED) #define pmap_clear_accessed(lpte) (lpte)->pte &= ~PTE_ACCESSED #define pmap_clear_dirty(lpte) (lpte)->pte &= ~PTE_DIRTY #define pmap_clear_present(lpte) (lpte)->pte &= ~PTE_PRESENT #define pmap_clear_wired(lpte) (lpte)->pte &= ~PTE_WIRED #define pmap_set_wired(lpte) (lpte)->pte |= PTE_WIRED /* * The VHPT bucket head structure. */ struct ia64_bucket { uint64_t chain; struct mtx mutex; u_int length; }; /* * Statically allocated kernel pmap */ struct pmap kernel_pmap_store; vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ /* * Kernel virtual memory management. */ static int nkpt; struct ia64_lpte ***ia64_kptdir; #define KPTE_DIR0_INDEX(va) \ (((va) >> (3*PAGE_SHIFT-8)) & ((1<<(PAGE_SHIFT-3))-1)) #define KPTE_DIR1_INDEX(va) \ (((va) >> (2*PAGE_SHIFT-5)) & ((1<<(PAGE_SHIFT-3))-1)) #define KPTE_PTE_INDEX(va) \ (((va) >> PAGE_SHIFT) & ((1<<(PAGE_SHIFT-5))-1)) #define NKPTEPG (PAGE_SIZE / sizeof(struct ia64_lpte)) vm_offset_t kernel_vm_end; /* Values for ptc.e. XXX values for SKI. */ static uint64_t pmap_ptc_e_base = 0x100000000; static uint64_t pmap_ptc_e_count1 = 3; static uint64_t pmap_ptc_e_count2 = 2; static uint64_t pmap_ptc_e_stride1 = 0x2000; static uint64_t pmap_ptc_e_stride2 = 0x100000000; struct mtx pmap_ptcmutex; /* * Data for the RID allocator */ static int pmap_ridcount; static int pmap_rididx; static int pmap_ridmapsz; static int pmap_ridmax; static uint64_t *pmap_ridmap; struct mtx pmap_ridmutex; /* * Data for the pv entry allocation mechanism */ static uma_zone_t pvzone; static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0; /* * Data for allocating PTEs for user processes. */ static uma_zone_t ptezone; /* * Virtual Hash Page Table (VHPT) data. */ /* SYSCTL_DECL(_machdep); */ SYSCTL_NODE(_machdep, OID_AUTO, vhpt, CTLFLAG_RD, 0, ""); struct ia64_bucket *pmap_vhpt_bucket; int pmap_vhpt_nbuckets; SYSCTL_INT(_machdep_vhpt, OID_AUTO, nbuckets, CTLFLAG_RD, &pmap_vhpt_nbuckets, 0, ""); uint64_t pmap_vhpt_base[MAXCPU]; int pmap_vhpt_log2size = 0; TUNABLE_INT("machdep.vhpt.log2size", &pmap_vhpt_log2size); SYSCTL_INT(_machdep_vhpt, OID_AUTO, log2size, CTLFLAG_RD, &pmap_vhpt_log2size, 0, ""); static int pmap_vhpt_inserts; SYSCTL_INT(_machdep_vhpt, OID_AUTO, inserts, CTLFLAG_RD, &pmap_vhpt_inserts, 0, ""); static int pmap_vhpt_population(SYSCTL_HANDLER_ARGS); SYSCTL_PROC(_machdep_vhpt, OID_AUTO, population, CTLTYPE_INT | CTLFLAG_RD, NULL, 0, pmap_vhpt_population, "I", ""); static struct ia64_lpte *pmap_find_vhpt(vm_offset_t va); static PMAP_INLINE void free_pv_entry(pv_entry_t pv); static pv_entry_t get_pv_entry(pmap_t locked_pmap); static void pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot); static void pmap_invalidate_all(pmap_t pmap); static int pmap_remove_pte(pmap_t pmap, struct ia64_lpte *pte, vm_offset_t va, pv_entry_t pv, int freepte); static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m); vm_offset_t pmap_steal_memory(vm_size_t size) { vm_size_t bank_size; vm_offset_t pa, va; size = round_page(size); bank_size = phys_avail[1] - phys_avail[0]; while (size > bank_size) { int i; for (i = 0; phys_avail[i+2]; i+= 2) { phys_avail[i] = phys_avail[i+2]; phys_avail[i+1] = phys_avail[i+3]; } phys_avail[i] = 0; phys_avail[i+1] = 0; if (!phys_avail[0]) panic("pmap_steal_memory: out of memory"); bank_size = phys_avail[1] - phys_avail[0]; } pa = phys_avail[0]; phys_avail[0] += size; va = IA64_PHYS_TO_RR7(pa); bzero((caddr_t) va, size); return va; } /* * Bootstrap the system enough to run with virtual memory. */ void pmap_bootstrap() { struct ia64_pal_result res; struct ia64_lpte *pte; vm_offset_t base, limit; size_t size; int i, j, count, ridbits; /* * Query the PAL Code to find the loop parameters for the * ptc.e instruction. */ res = ia64_call_pal_static(PAL_PTCE_INFO, 0, 0, 0); if (res.pal_status != 0) panic("Can't configure ptc.e parameters"); pmap_ptc_e_base = res.pal_result[0]; pmap_ptc_e_count1 = res.pal_result[1] >> 32; pmap_ptc_e_count2 = res.pal_result[1] & ((1L<<32) - 1); pmap_ptc_e_stride1 = res.pal_result[2] >> 32; pmap_ptc_e_stride2 = res.pal_result[2] & ((1L<<32) - 1); if (bootverbose) printf("ptc.e base=0x%lx, count1=%ld, count2=%ld, " "stride1=0x%lx, stride2=0x%lx\n", pmap_ptc_e_base, pmap_ptc_e_count1, pmap_ptc_e_count2, pmap_ptc_e_stride1, pmap_ptc_e_stride2); mtx_init(&pmap_ptcmutex, "Global PTC lock", NULL, MTX_SPIN); /* * Setup RIDs. RIDs 0..7 are reserved for the kernel. * * We currently need at least 19 bits in the RID because PID_MAX * can only be encoded in 17 bits and we need RIDs for 5 regions * per process. With PID_MAX equalling 99999 this means that we * need to be able to encode 499995 (=5*PID_MAX). * The Itanium processor only has 18 bits and the architected * minimum is exactly that. So, we cannot use a PID based scheme * in those cases. Enter pmap_ridmap... * We should avoid the map when running on a processor that has * implemented enough bits. This means that we should pass the * process/thread ID to pmap. This we currently don't do, so we * use the map anyway. However, we don't want to allocate a map * that is large enough to cover the range dictated by the number * of bits in the RID, because that may result in a RID map of * 2MB in size for a 24-bit RID. A 64KB map is enough. * The bottomline: we create a 32KB map when the processor only * implements 18 bits (or when we can't figure it out). Otherwise * we create a 64KB map. */ res = ia64_call_pal_static(PAL_VM_SUMMARY, 0, 0, 0); if (res.pal_status != 0) { if (bootverbose) printf("Can't read VM Summary - assuming 18 Region ID bits\n"); ridbits = 18; /* guaranteed minimum */ } else { ridbits = (res.pal_result[1] >> 8) & 0xff; if (bootverbose) printf("Processor supports %d Region ID bits\n", ridbits); } if (ridbits > 19) ridbits = 19; pmap_ridmax = (1 << ridbits); pmap_ridmapsz = pmap_ridmax / 64; pmap_ridmap = (uint64_t *)pmap_steal_memory(pmap_ridmax / 8); pmap_ridmap[0] |= 0xff; pmap_rididx = 0; pmap_ridcount = 8; mtx_init(&pmap_ridmutex, "RID allocator lock", NULL, MTX_DEF); /* * Allocate some memory for initial kernel 'page tables'. */ ia64_kptdir = (void *)pmap_steal_memory(PAGE_SIZE); nkpt = 0; kernel_vm_end = VM_MIN_KERNEL_ADDRESS - VM_GATEWAY_SIZE; for (i = 0; phys_avail[i+2]; i+= 2) ; count = i+2; /* * Figure out a useful size for the VHPT, based on the size of * physical memory and try to locate a region which is large * enough to contain the VHPT (which must be a power of two in * size and aligned to a natural boundary). * We silently bump up the VHPT size to the minimum size if the * user has set the tunable too small. Likewise, the VHPT size * is silently capped to the maximum allowed. */ TUNABLE_INT_FETCH("machdep.vhpt.log2size", &pmap_vhpt_log2size); if (pmap_vhpt_log2size == 0) { pmap_vhpt_log2size = 15; size = 1UL << pmap_vhpt_log2size; while (size < Maxmem * 32) { pmap_vhpt_log2size++; size <<= 1; } } else if (pmap_vhpt_log2size < 15) pmap_vhpt_log2size = 15; if (pmap_vhpt_log2size > 61) pmap_vhpt_log2size = 61; pmap_vhpt_base[0] = 0; base = limit = 0; size = 1UL << pmap_vhpt_log2size; while (pmap_vhpt_base[0] == 0) { if (bootverbose) printf("Trying VHPT size 0x%lx\n", size); for (i = 0; i < count; i += 2) { base = (phys_avail[i] + size - 1) & ~(size - 1); limit = base + MAXCPU * size; if (limit <= phys_avail[i+1]) /* * VHPT can fit in this region */ break; } if (!phys_avail[i]) { /* Can't fit, try next smaller size. */ pmap_vhpt_log2size--; size >>= 1; } else pmap_vhpt_base[0] = IA64_PHYS_TO_RR7(base); } if (pmap_vhpt_log2size < 15) panic("Can't find space for VHPT"); if (bootverbose) printf("Putting VHPT at 0x%lx\n", base); if (base != phys_avail[i]) { /* Split this region. */ if (bootverbose) printf("Splitting [%p-%p]\n", (void *)phys_avail[i], (void *)phys_avail[i+1]); for (j = count; j > i; j -= 2) { phys_avail[j] = phys_avail[j-2]; phys_avail[j+1] = phys_avail[j-2+1]; } phys_avail[i+1] = base; phys_avail[i+2] = limit; } else phys_avail[i] = limit; pmap_vhpt_nbuckets = size / sizeof(struct ia64_lpte); pmap_vhpt_bucket = (void *)pmap_steal_memory(pmap_vhpt_nbuckets * sizeof(struct ia64_bucket)); pte = (struct ia64_lpte *)pmap_vhpt_base[0]; for (i = 0; i < pmap_vhpt_nbuckets; i++) { pte[i].pte = 0; pte[i].itir = 0; pte[i].tag = 1UL << 63; /* Invalid tag */ pte[i].chain = (uintptr_t)(pmap_vhpt_bucket + i); /* Stolen memory is zeroed! */ mtx_init(&pmap_vhpt_bucket[i].mutex, "VHPT bucket lock", NULL, MTX_SPIN); } for (i = 1; i < MAXCPU; i++) { pmap_vhpt_base[i] = pmap_vhpt_base[i - 1] + size; bcopy((void *)pmap_vhpt_base[i - 1], (void *)pmap_vhpt_base[i], size); } map_vhpt(pmap_vhpt_base[0]); ia64_set_pta(pmap_vhpt_base[0] + (1 << 8) + (pmap_vhpt_log2size << 2) + 1); ia64_srlz_i(); virtual_avail = VM_MIN_KERNEL_ADDRESS; virtual_end = VM_MAX_KERNEL_ADDRESS; /* * Initialize the kernel pmap (which is statically allocated). */ PMAP_LOCK_INIT(kernel_pmap); for (i = 0; i < 5; i++) kernel_pmap->pm_rid[i] = 0; kernel_pmap->pm_active = 1; TAILQ_INIT(&kernel_pmap->pm_pvlist); PCPU_SET(current_pmap, kernel_pmap); /* * Region 5 is mapped via the vhpt. */ ia64_set_rr(IA64_RR_BASE(5), (5 << 8) | (PAGE_SHIFT << 2) | 1); /* * Region 6 is direct mapped UC and region 7 is direct mapped * WC. The details of this is controlled by the Alt {I,D}TLB * handlers. Here we just make sure that they have the largest * possible page size to minimise TLB usage. */ ia64_set_rr(IA64_RR_BASE(6), (6 << 8) | (IA64_ID_PAGE_SHIFT << 2)); ia64_set_rr(IA64_RR_BASE(7), (7 << 8) | (IA64_ID_PAGE_SHIFT << 2)); ia64_srlz_d(); /* * Clear out any random TLB entries left over from booting. */ pmap_invalidate_all(kernel_pmap); map_gateway_page(); } static int pmap_vhpt_population(SYSCTL_HANDLER_ARGS) { int count, error, i; count = 0; for (i = 0; i < pmap_vhpt_nbuckets; i++) count += pmap_vhpt_bucket[i].length; error = SYSCTL_OUT(req, &count, sizeof(count)); return (error); } /* * Initialize a vm_page's machine-dependent fields. */ void pmap_page_init(vm_page_t m) { TAILQ_INIT(&m->md.pv_list); m->md.pv_list_count = 0; } /* * Initialize the pmap module. * Called by vm_init, to initialize any structures that the pmap * system needs to map virtual memory. */ void pmap_init(void) { int shpgperproc = PMAP_SHPGPERPROC; /* * Initialize the address space (zone) for the pv entries. Set a * high water mark so that the system can recover from excessive * numbers of pv entries. */ pvzone = uma_zcreate("PV ENTRY", sizeof(struct pv_entry), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM | UMA_ZONE_NOFREE); TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc); pv_entry_max = shpgperproc * maxproc + cnt.v_page_count; TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max); pv_entry_high_water = 9 * (pv_entry_max / 10); ptezone = uma_zcreate("PT ENTRY", sizeof (struct ia64_lpte), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM|UMA_ZONE_NOFREE); } /*************************************************** * Manipulate TLBs for a pmap ***************************************************/ #if 0 static __inline void pmap_invalidate_page_locally(void *arg) { vm_offset_t va = (uintptr_t)arg; struct ia64_lpte *pte; pte = (struct ia64_lpte *)ia64_thash(va); if (pte->tag == ia64_ttag(va)) pte->tag = 1UL << 63; ia64_ptc_l(va, PAGE_SHIFT << 2); } #ifdef SMP static void pmap_invalidate_page_1(void *arg) { void **args = arg; pmap_t oldpmap; critical_enter(); oldpmap = pmap_switch(args[0]); pmap_invalidate_page_locally(args[1]); pmap_switch(oldpmap); critical_exit(); } #endif static void pmap_invalidate_page(pmap_t pmap, vm_offset_t va) { KASSERT((pmap == kernel_pmap || pmap == PCPU_GET(current_pmap)), ("invalidating TLB for non-current pmap")); #ifdef SMP if (mp_ncpus > 1) { void *args[2]; args[0] = pmap; args[1] = (void *)va; smp_rendezvous(NULL, pmap_invalidate_page_1, NULL, args); } else #endif pmap_invalidate_page_locally((void *)va); } #endif /* 0 */ static void pmap_invalidate_page(pmap_t pmap, vm_offset_t va) { struct ia64_lpte *pte; int i, vhpt_ofs; KASSERT((pmap == kernel_pmap || pmap == PCPU_GET(current_pmap)), ("invalidating TLB for non-current pmap")); vhpt_ofs = ia64_thash(va) - pmap_vhpt_base[PCPU_GET(cpuid)]; critical_enter(); for (i = 0; i < MAXCPU; i++) { pte = (struct ia64_lpte *)(pmap_vhpt_base[i] + vhpt_ofs); if (pte->tag == ia64_ttag(va)) pte->tag = 1UL << 63; } critical_exit(); mtx_lock_spin(&pmap_ptcmutex); ia64_ptc_ga(va, PAGE_SHIFT << 2); mtx_unlock_spin(&pmap_ptcmutex); } static void pmap_invalidate_all_1(void *arg) { uint64_t addr; int i, j; critical_enter(); addr = pmap_ptc_e_base; for (i = 0; i < pmap_ptc_e_count1; i++) { for (j = 0; j < pmap_ptc_e_count2; j++) { ia64_ptc_e(addr); addr += pmap_ptc_e_stride2; } addr += pmap_ptc_e_stride1; } critical_exit(); } static void pmap_invalidate_all(pmap_t pmap) { KASSERT((pmap == kernel_pmap || pmap == PCPU_GET(current_pmap)), ("invalidating TLB for non-current pmap")); #ifdef SMP if (mp_ncpus > 1) smp_rendezvous(NULL, pmap_invalidate_all_1, NULL, NULL); else #endif pmap_invalidate_all_1(NULL); } static uint32_t pmap_allocate_rid(void) { uint64_t bit, bits; int rid; mtx_lock(&pmap_ridmutex); if (pmap_ridcount == pmap_ridmax) panic("pmap_allocate_rid: All Region IDs used"); /* Find an index with a free bit. */ while ((bits = pmap_ridmap[pmap_rididx]) == ~0UL) { pmap_rididx++; if (pmap_rididx == pmap_ridmapsz) pmap_rididx = 0; } rid = pmap_rididx * 64; /* Find a free bit. */ bit = 1UL; while (bits & bit) { rid++; bit <<= 1; } pmap_ridmap[pmap_rididx] |= bit; pmap_ridcount++; mtx_unlock(&pmap_ridmutex); return rid; } static void pmap_free_rid(uint32_t rid) { uint64_t bit; int idx; idx = rid / 64; bit = ~(1UL << (rid & 63)); mtx_lock(&pmap_ridmutex); pmap_ridmap[idx] &= bit; pmap_ridcount--; mtx_unlock(&pmap_ridmutex); } /*************************************************** * Page table page management routines..... ***************************************************/ void pmap_pinit0(struct pmap *pmap) { /* kernel_pmap is the same as any other pmap. */ pmap_pinit(pmap); } /* * Initialize a preallocated and zeroed pmap structure, * such as one in a vmspace structure. */ -void +int pmap_pinit(struct pmap *pmap) { int i; PMAP_LOCK_INIT(pmap); for (i = 0; i < 5; i++) pmap->pm_rid[i] = pmap_allocate_rid(); pmap->pm_active = 0; TAILQ_INIT(&pmap->pm_pvlist); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); + return (1); } /*************************************************** * Pmap allocation/deallocation routines. ***************************************************/ /* * Release any resources held by the given physical map. * Called when a pmap initialized by pmap_pinit is being released. * Should only be called if the map contains no valid mappings. */ void pmap_release(pmap_t pmap) { int i; for (i = 0; i < 5; i++) if (pmap->pm_rid[i]) pmap_free_rid(pmap->pm_rid[i]); PMAP_LOCK_DESTROY(pmap); } /* * grow the number of kernel page table entries, if needed */ void pmap_growkernel(vm_offset_t addr) { struct ia64_lpte **dir1; struct ia64_lpte *leaf; vm_page_t nkpg; while (kernel_vm_end <= addr) { if (nkpt == PAGE_SIZE/8 + PAGE_SIZE*PAGE_SIZE/64) panic("%s: out of kernel address space", __func__); dir1 = ia64_kptdir[KPTE_DIR0_INDEX(kernel_vm_end)]; if (dir1 == NULL) { nkpg = vm_page_alloc(NULL, nkpt++, VM_ALLOC_NOOBJ|VM_ALLOC_INTERRUPT|VM_ALLOC_WIRED); if (!nkpg) panic("%s: cannot add dir. page", __func__); dir1 = (struct ia64_lpte **) IA64_PHYS_TO_RR7(VM_PAGE_TO_PHYS(nkpg)); bzero(dir1, PAGE_SIZE); ia64_kptdir[KPTE_DIR0_INDEX(kernel_vm_end)] = dir1; } nkpg = vm_page_alloc(NULL, nkpt++, VM_ALLOC_NOOBJ|VM_ALLOC_INTERRUPT|VM_ALLOC_WIRED); if (!nkpg) panic("%s: cannot add PTE page", __func__); leaf = (struct ia64_lpte *) IA64_PHYS_TO_RR7(VM_PAGE_TO_PHYS(nkpg)); bzero(leaf, PAGE_SIZE); dir1[KPTE_DIR1_INDEX(kernel_vm_end)] = leaf; kernel_vm_end += PAGE_SIZE * NKPTEPG; } } /*************************************************** * page management routines. ***************************************************/ /* * free the pv_entry back to the free list */ static PMAP_INLINE void free_pv_entry(pv_entry_t pv) { pv_entry_count--; uma_zfree(pvzone, pv); } /* * get a new pv_entry, allocating a block from the system * when needed. */ static pv_entry_t get_pv_entry(pmap_t locked_pmap) { static const struct timeval printinterval = { 60, 0 }; static struct timeval lastprint; struct vpgqueues *vpq; struct ia64_lpte *pte; pmap_t oldpmap, pmap; pv_entry_t allocated_pv, next_pv, pv; vm_offset_t va; vm_page_t m; PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED); mtx_assert(&vm_page_queue_mtx, MA_OWNED); allocated_pv = uma_zalloc(pvzone, M_NOWAIT); if (allocated_pv != NULL) { pv_entry_count++; if (pv_entry_count > pv_entry_high_water) pagedaemon_wakeup(); else return (allocated_pv); } /* * Reclaim pv entries: At first, destroy mappings to inactive * pages. After that, if a pv entry is still needed, destroy * mappings to active pages. */ if (ratecheck(&lastprint, &printinterval)) printf("Approaching the limit on PV entries, " "increase the vm.pmap.shpgperproc tunable.\n"); vpq = &vm_page_queues[PQ_INACTIVE]; retry: TAILQ_FOREACH(m, &vpq->pl, pageq) { if (m->hold_count || m->busy) continue; TAILQ_FOREACH_SAFE(pv, &m->md.pv_list, pv_list, next_pv) { va = pv->pv_va; pmap = pv->pv_pmap; /* Avoid deadlock and lock recursion. */ if (pmap > locked_pmap) PMAP_LOCK(pmap); else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap)) continue; oldpmap = pmap_switch(pmap); pte = pmap_find_vhpt(va); KASSERT(pte != NULL, ("pte")); pmap_remove_pte(pmap, pte, va, pv, 1); pmap_switch(oldpmap); if (pmap != locked_pmap) PMAP_UNLOCK(pmap); if (allocated_pv == NULL) allocated_pv = pv; else free_pv_entry(pv); } } if (allocated_pv == NULL) { if (vpq == &vm_page_queues[PQ_INACTIVE]) { vpq = &vm_page_queues[PQ_ACTIVE]; goto retry; } panic("get_pv_entry: increase the vm.pmap.shpgperproc tunable"); } return (allocated_pv); } /* * Conditionally create a pv entry. */ static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m) { pv_entry_t pv; PMAP_LOCK_ASSERT(pmap, MA_OWNED); mtx_assert(&vm_page_queue_mtx, MA_OWNED); if (pv_entry_count < pv_entry_high_water && (pv = uma_zalloc(pvzone, M_NOWAIT)) != NULL) { pv_entry_count++; pv->pv_va = va; pv->pv_pmap = pmap; TAILQ_INSERT_TAIL(&pmap->pm_pvlist, pv, pv_plist); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); m->md.pv_list_count++; return (TRUE); } else return (FALSE); } /* * Add an ia64_lpte to the VHPT. */ static void pmap_enter_vhpt(struct ia64_lpte *pte, vm_offset_t va) { struct ia64_bucket *bckt; struct ia64_lpte *vhpte; uint64_t pte_pa; /* Can fault, so get it out of the way. */ pte_pa = ia64_tpa((vm_offset_t)pte); vhpte = (struct ia64_lpte *)ia64_thash(va); bckt = (struct ia64_bucket *)vhpte->chain; mtx_lock_spin(&bckt->mutex); pte->chain = bckt->chain; ia64_mf(); bckt->chain = pte_pa; pmap_vhpt_inserts++; bckt->length++; mtx_unlock_spin(&bckt->mutex); } /* * Remove the ia64_lpte matching va from the VHPT. Return zero if it * worked or an appropriate error code otherwise. */ static int pmap_remove_vhpt(vm_offset_t va) { struct ia64_bucket *bckt; struct ia64_lpte *pte; struct ia64_lpte *lpte; struct ia64_lpte *vhpte; uint64_t chain, tag; tag = ia64_ttag(va); vhpte = (struct ia64_lpte *)ia64_thash(va); bckt = (struct ia64_bucket *)vhpte->chain; lpte = NULL; mtx_lock_spin(&bckt->mutex); chain = bckt->chain; pte = (struct ia64_lpte *)IA64_PHYS_TO_RR7(chain); while (chain != 0 && pte->tag != tag) { lpte = pte; chain = pte->chain; pte = (struct ia64_lpte *)IA64_PHYS_TO_RR7(chain); } if (chain == 0) { mtx_unlock_spin(&bckt->mutex); return (ENOENT); } /* Snip this pv_entry out of the collision chain. */ if (lpte == NULL) bckt->chain = pte->chain; else lpte->chain = pte->chain; ia64_mf(); bckt->length--; mtx_unlock_spin(&bckt->mutex); return (0); } /* * Find the ia64_lpte for the given va, if any. */ static struct ia64_lpte * pmap_find_vhpt(vm_offset_t va) { struct ia64_bucket *bckt; struct ia64_lpte *pte; uint64_t chain, tag; tag = ia64_ttag(va); pte = (struct ia64_lpte *)ia64_thash(va); bckt = (struct ia64_bucket *)pte->chain; mtx_lock_spin(&bckt->mutex); chain = bckt->chain; pte = (struct ia64_lpte *)IA64_PHYS_TO_RR7(chain); while (chain != 0 && pte->tag != tag) { chain = pte->chain; pte = (struct ia64_lpte *)IA64_PHYS_TO_RR7(chain); } mtx_unlock_spin(&bckt->mutex); return ((chain != 0) ? pte : NULL); } /* * Remove an entry from the list of managed mappings. */ static int pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va, pv_entry_t pv) { if (!pv) { if (m->md.pv_list_count < pmap->pm_stats.resident_count) { TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { if (pmap == pv->pv_pmap && va == pv->pv_va) break; } } else { TAILQ_FOREACH(pv, &pmap->pm_pvlist, pv_plist) { if (va == pv->pv_va) break; } } } if (pv) { TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); m->md.pv_list_count--; if (TAILQ_FIRST(&m->md.pv_list) == NULL) vm_page_flag_clear(m, PG_WRITEABLE); TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist); free_pv_entry(pv); return 0; } else { return ENOENT; } } /* * Create a pv entry for page at pa for * (pmap, va). */ static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m) { pv_entry_t pv; pv = get_pv_entry(pmap); pv->pv_pmap = pmap; pv->pv_va = va; PMAP_LOCK_ASSERT(pmap, MA_OWNED); mtx_assert(&vm_page_queue_mtx, MA_OWNED); TAILQ_INSERT_TAIL(&pmap->pm_pvlist, pv, pv_plist); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); m->md.pv_list_count++; } /* * Routine: pmap_extract * Function: * Extract the physical page address associated * with the given map/virtual_address pair. */ vm_paddr_t pmap_extract(pmap_t pmap, vm_offset_t va) { struct ia64_lpte *pte; pmap_t oldpmap; vm_paddr_t pa; pa = 0; PMAP_LOCK(pmap); oldpmap = pmap_switch(pmap); pte = pmap_find_vhpt(va); if (pte != NULL && pmap_present(pte)) pa = pmap_ppn(pte); pmap_switch(oldpmap); PMAP_UNLOCK(pmap); return (pa); } /* * Routine: pmap_extract_and_hold * Function: * Atomically extract and hold the physical page * with the given pmap and virtual address pair * if that mapping permits the given protection. */ vm_page_t pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) { struct ia64_lpte *pte; pmap_t oldpmap; vm_page_t m; m = NULL; vm_page_lock_queues(); PMAP_LOCK(pmap); oldpmap = pmap_switch(pmap); pte = pmap_find_vhpt(va); if (pte != NULL && pmap_present(pte) && (pmap_prot(pte) & prot) == prot) { m = PHYS_TO_VM_PAGE(pmap_ppn(pte)); vm_page_hold(m); } vm_page_unlock_queues(); pmap_switch(oldpmap); PMAP_UNLOCK(pmap); return (m); } /*************************************************** * Low level mapping routines..... ***************************************************/ /* * Find the kernel lpte for mapping the given virtual address, which * must be in the part of region 5 which we can cover with our kernel * 'page tables'. */ static struct ia64_lpte * pmap_find_kpte(vm_offset_t va) { struct ia64_lpte **dir1; struct ia64_lpte *leaf; KASSERT((va >> 61) == 5, ("kernel mapping 0x%lx not in region 5", va)); KASSERT(va < kernel_vm_end, ("kernel mapping 0x%lx out of range", va)); dir1 = ia64_kptdir[KPTE_DIR0_INDEX(va)]; leaf = dir1[KPTE_DIR1_INDEX(va)]; return (&leaf[KPTE_PTE_INDEX(va)]); } /* * Find a pte suitable for mapping a user-space address. If one exists * in the VHPT, that one will be returned, otherwise a new pte is * allocated. */ static struct ia64_lpte * pmap_find_pte(vm_offset_t va) { struct ia64_lpte *pte; if (va >= VM_MAXUSER_ADDRESS) return pmap_find_kpte(va); pte = pmap_find_vhpt(va); if (pte == NULL) { pte = uma_zalloc(ptezone, M_NOWAIT | M_ZERO); pte->tag = 1UL << 63; } return (pte); } /* * Free a pte which is now unused. This simply returns it to the zone * allocator if it is a user mapping. For kernel mappings, clear the * valid bit to make it clear that the mapping is not currently used. */ static void pmap_free_pte(struct ia64_lpte *pte, vm_offset_t va) { if (va < VM_MAXUSER_ADDRESS) uma_zfree(ptezone, pte); else pmap_clear_present(pte); } static PMAP_INLINE void pmap_pte_prot(pmap_t pm, struct ia64_lpte *pte, vm_prot_t prot) { static long prot2ar[4] = { PTE_AR_R, /* VM_PROT_NONE */ PTE_AR_RW, /* VM_PROT_WRITE */ PTE_AR_RX|PTE_ED, /* VM_PROT_EXECUTE */ PTE_AR_RWX|PTE_ED /* VM_PROT_WRITE|VM_PROT_EXECUTE */ }; pte->pte &= ~(PTE_PROT_MASK | PTE_PL_MASK | PTE_AR_MASK | PTE_ED); pte->pte |= (uint64_t)(prot & VM_PROT_ALL) << 56; pte->pte |= (prot == VM_PROT_NONE || pm == kernel_pmap) ? PTE_PL_KERN : PTE_PL_USER; pte->pte |= prot2ar[(prot & VM_PROT_ALL) >> 1]; } /* * Set a pte to contain a valid mapping and enter it in the VHPT. If * the pte was orginally valid, then its assumed to already be in the * VHPT. * This functions does not set the protection bits. It's expected * that those have been set correctly prior to calling this function. */ static void pmap_set_pte(struct ia64_lpte *pte, vm_offset_t va, vm_offset_t pa, boolean_t wired, boolean_t managed) { pte->pte &= PTE_PROT_MASK | PTE_PL_MASK | PTE_AR_MASK | PTE_ED; pte->pte |= PTE_PRESENT | PTE_MA_WB; pte->pte |= (managed) ? PTE_MANAGED : (PTE_DIRTY | PTE_ACCESSED); pte->pte |= (wired) ? PTE_WIRED : 0; pte->pte |= pa & PTE_PPN_MASK; pte->itir = PAGE_SHIFT << 2; pte->tag = ia64_ttag(va); } /* * Remove the (possibly managed) mapping represented by pte from the * given pmap. */ static int pmap_remove_pte(pmap_t pmap, struct ia64_lpte *pte, vm_offset_t va, pv_entry_t pv, int freepte) { int error; vm_page_t m; KASSERT((pmap == kernel_pmap || pmap == PCPU_GET(current_pmap)), ("removing pte for non-current pmap")); /* * First remove from the VHPT. */ error = pmap_remove_vhpt(va); if (error) return (error); pmap_invalidate_page(pmap, va); if (pmap_wired(pte)) pmap->pm_stats.wired_count -= 1; pmap->pm_stats.resident_count -= 1; if (pmap_managed(pte)) { m = PHYS_TO_VM_PAGE(pmap_ppn(pte)); if (pmap_dirty(pte)) vm_page_dirty(m); if (pmap_accessed(pte)) vm_page_flag_set(m, PG_REFERENCED); error = pmap_remove_entry(pmap, m, va, pv); } if (freepte) pmap_free_pte(pte, va); return (error); } /* * Extract the physical page address associated with a kernel * virtual address. */ vm_paddr_t pmap_kextract(vm_offset_t va) { struct ia64_lpte *pte; vm_offset_t gwpage; KASSERT(va >= IA64_RR_BASE(5), ("Must be kernel VA")); /* Regions 6 and 7 are direct mapped. */ if (va >= IA64_RR_BASE(6)) return (IA64_RR_MASK(va)); /* EPC gateway page? */ gwpage = (vm_offset_t)ia64_get_k5(); if (va >= gwpage && va < gwpage + VM_GATEWAY_SIZE) return (IA64_RR_MASK((vm_offset_t)ia64_gateway_page)); /* Bail out if the virtual address is beyond our limits. */ if (va >= kernel_vm_end) return (0); pte = pmap_find_kpte(va); if (!pmap_present(pte)) return (0); return (pmap_ppn(pte) | (va & PAGE_MASK)); } /* * Add a list of wired pages to the kva this routine is only used for * temporary kernel mappings that do not need to have page modification * or references recorded. Note that old mappings are simply written * over. The page is effectively wired, but it's customary to not have * the PTE reflect that, nor update statistics. */ void pmap_qenter(vm_offset_t va, vm_page_t *m, int count) { struct ia64_lpte *pte; int i; for (i = 0; i < count; i++) { pte = pmap_find_kpte(va); if (pmap_present(pte)) pmap_invalidate_page(kernel_pmap, va); else pmap_enter_vhpt(pte, va); pmap_pte_prot(kernel_pmap, pte, VM_PROT_ALL); pmap_set_pte(pte, va, VM_PAGE_TO_PHYS(m[i]), FALSE, FALSE); va += PAGE_SIZE; } } /* * this routine jerks page mappings from the * kernel -- it is meant only for temporary mappings. */ void pmap_qremove(vm_offset_t va, int count) { struct ia64_lpte *pte; int i; for (i = 0; i < count; i++) { pte = pmap_find_kpte(va); if (pmap_present(pte)) { pmap_remove_vhpt(va); pmap_invalidate_page(kernel_pmap, va); pmap_clear_present(pte); } va += PAGE_SIZE; } } /* * Add a wired page to the kva. As for pmap_qenter(), it's customary * to not have the PTE reflect that, nor update statistics. */ void pmap_kenter(vm_offset_t va, vm_offset_t pa) { struct ia64_lpte *pte; pte = pmap_find_kpte(va); if (pmap_present(pte)) pmap_invalidate_page(kernel_pmap, va); else pmap_enter_vhpt(pte, va); pmap_pte_prot(kernel_pmap, pte, VM_PROT_ALL); pmap_set_pte(pte, va, pa, FALSE, FALSE); } /* * Remove a page from the kva */ void pmap_kremove(vm_offset_t va) { struct ia64_lpte *pte; pte = pmap_find_kpte(va); if (pmap_present(pte)) { pmap_remove_vhpt(va); pmap_invalidate_page(kernel_pmap, va); pmap_clear_present(pte); } } /* * Used to map a range of physical addresses into kernel * virtual address space. * * The value passed in '*virt' is a suggested virtual address for * the mapping. Architectures which can support a direct-mapped * physical to virtual region can return the appropriate address * within that region, leaving '*virt' unchanged. Other * architectures should map the pages starting at '*virt' and * update '*virt' with the first usable address after the mapped * region. */ vm_offset_t pmap_map(vm_offset_t *virt, vm_offset_t start, vm_offset_t end, int prot) { return IA64_PHYS_TO_RR7(start); } /* * Remove a single page from a process address space */ static void pmap_remove_page(pmap_t pmap, vm_offset_t va) { struct ia64_lpte *pte; KASSERT((pmap == kernel_pmap || pmap == PCPU_GET(current_pmap)), ("removing page for non-current pmap")); pte = pmap_find_vhpt(va); if (pte != NULL) pmap_remove_pte(pmap, pte, va, 0, 1); return; } /* * Remove the given range of addresses from the specified map. * * It is assumed that the start and end are properly * rounded to the page size. */ void pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { pmap_t oldpmap; vm_offset_t va; pv_entry_t npv, pv; struct ia64_lpte *pte; if (pmap->pm_stats.resident_count == 0) return; vm_page_lock_queues(); PMAP_LOCK(pmap); oldpmap = pmap_switch(pmap); /* * special handling of removing one page. a very * common operation and easy to short circuit some * code. */ if (sva + PAGE_SIZE == eva) { pmap_remove_page(pmap, sva); goto out; } if (pmap->pm_stats.resident_count < ((eva - sva) >> PAGE_SHIFT)) { TAILQ_FOREACH_SAFE(pv, &pmap->pm_pvlist, pv_plist, npv) { va = pv->pv_va; if (va >= sva && va < eva) { pte = pmap_find_vhpt(va); KASSERT(pte != NULL, ("pte")); pmap_remove_pte(pmap, pte, va, pv, 1); } } } else { for (va = sva; va < eva; va += PAGE_SIZE) { pte = pmap_find_vhpt(va); if (pte != NULL) pmap_remove_pte(pmap, pte, va, 0, 1); } } out: vm_page_unlock_queues(); pmap_switch(oldpmap); PMAP_UNLOCK(pmap); } /* * Routine: pmap_remove_all * Function: * Removes this physical page from * all physical maps in which it resides. * Reflects back modify bits to the pager. * * Notes: * Original versions of this routine were very * inefficient because they iteratively called * pmap_remove (slow...) */ void pmap_remove_all(vm_page_t m) { pmap_t oldpmap; pv_entry_t pv; #if defined(DIAGNOSTIC) /* * XXX this makes pmap_page_protect(NONE) illegal for non-managed * pages! */ if (m->flags & PG_FICTITIOUS) { panic("pmap_page_protect: illegal for unmanaged page, va: 0x%lx", VM_PAGE_TO_PHYS(m)); } #endif mtx_assert(&vm_page_queue_mtx, MA_OWNED); while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { struct ia64_lpte *pte; pmap_t pmap = pv->pv_pmap; vm_offset_t va = pv->pv_va; PMAP_LOCK(pmap); oldpmap = pmap_switch(pmap); pte = pmap_find_vhpt(va); KASSERT(pte != NULL, ("pte")); if (pmap_ppn(pte) != VM_PAGE_TO_PHYS(m)) panic("pmap_remove_all: pv_table for %lx is inconsistent", VM_PAGE_TO_PHYS(m)); pmap_remove_pte(pmap, pte, va, pv, 1); pmap_switch(oldpmap); PMAP_UNLOCK(pmap); } vm_page_flag_clear(m, PG_WRITEABLE); } /* * Set the physical protection on the * specified range of this map as requested. */ void pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) { pmap_t oldpmap; struct ia64_lpte *pte; if ((prot & VM_PROT_READ) == VM_PROT_NONE) { pmap_remove(pmap, sva, eva); return; } if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) == (VM_PROT_WRITE|VM_PROT_EXECUTE)) return; if ((sva & PAGE_MASK) || (eva & PAGE_MASK)) panic("pmap_protect: unaligned addresses"); vm_page_lock_queues(); PMAP_LOCK(pmap); oldpmap = pmap_switch(pmap); while (sva < eva) { /* * If page is invalid, skip this page */ pte = pmap_find_vhpt(sva); if (pte == NULL) { sva += PAGE_SIZE; continue; } if (pmap_prot(pte) != prot) { if (pmap_managed(pte)) { vm_offset_t pa = pmap_ppn(pte); vm_page_t m = PHYS_TO_VM_PAGE(pa); if (pmap_dirty(pte)) { vm_page_dirty(m); pmap_clear_dirty(pte); } if (pmap_accessed(pte)) { vm_page_flag_set(m, PG_REFERENCED); pmap_clear_accessed(pte); } } pmap_pte_prot(pmap, pte, prot); pmap_invalidate_page(pmap, sva); } sva += PAGE_SIZE; } vm_page_unlock_queues(); pmap_switch(oldpmap); PMAP_UNLOCK(pmap); } /* * Insert the given physical page (p) at * the specified virtual address (v) in the * target physical map with the protection requested. * * If specified, the page will be wired down, meaning * that the related pte can not be reclaimed. * * NB: This is the only routine which MAY NOT lazy-evaluate * or lose information. That is, this routine must actually * insert this page into the given map NOW. */ void pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, boolean_t wired) { pmap_t oldpmap; vm_offset_t pa; vm_offset_t opa; struct ia64_lpte origpte; struct ia64_lpte *pte; boolean_t managed; vm_page_lock_queues(); PMAP_LOCK(pmap); oldpmap = pmap_switch(pmap); va &= ~PAGE_MASK; #ifdef DIAGNOSTIC if (va > VM_MAX_KERNEL_ADDRESS) panic("pmap_enter: toobig"); #endif /* * Find (or create) a pte for the given mapping. */ while ((pte = pmap_find_pte(va)) == NULL) { pmap_switch(oldpmap); PMAP_UNLOCK(pmap); vm_page_unlock_queues(); VM_WAIT; vm_page_lock_queues(); PMAP_LOCK(pmap); oldpmap = pmap_switch(pmap); } origpte = *pte; if (!pmap_present(pte)) { opa = ~0UL; pmap_enter_vhpt(pte, va); } else opa = pmap_ppn(pte); managed = FALSE; pa = VM_PAGE_TO_PHYS(m); /* * Mapping has not changed, must be protection or wiring change. */ if (opa == pa) { /* * Wiring change, just update stats. We don't worry about * wiring PT pages as they remain resident as long as there * are valid mappings in them. Hence, if a user page is wired, * the PT page will be also. */ if (wired && !pmap_wired(&origpte)) pmap->pm_stats.wired_count++; else if (!wired && pmap_wired(&origpte)) pmap->pm_stats.wired_count--; managed = (pmap_managed(&origpte)) ? TRUE : FALSE; /* * We might be turning off write access to the page, * so we go ahead and sense modify status. */ if (managed && pmap_dirty(&origpte)) vm_page_dirty(m); pmap_invalidate_page(pmap, va); goto validate; } /* * Mapping has changed, invalidate old range and fall * through to handle validating new mapping. */ if (opa != ~0UL) { pmap_remove_pte(pmap, pte, va, 0, 0); pmap_enter_vhpt(pte, va); } /* * Enter on the PV list if part of our managed memory. */ if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0) { KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva, ("pmap_enter: managed mapping within the clean submap")); pmap_insert_entry(pmap, va, m); managed = TRUE; } /* * Increment counters */ pmap->pm_stats.resident_count++; if (wired) pmap->pm_stats.wired_count++; validate: /* * Now validate mapping with desired protection/wiring. This * adds the pte to the VHPT if necessary. */ pmap_pte_prot(pmap, pte, prot); pmap_set_pte(pte, va, pa, wired, managed); if ((prot & VM_PROT_WRITE) != 0) vm_page_flag_set(m, PG_WRITEABLE); vm_page_unlock_queues(); pmap_switch(oldpmap); PMAP_UNLOCK(pmap); } /* * Maps a sequence of resident pages belonging to the same object. * The sequence begins with the given page m_start. This page is * mapped at the given virtual address start. Each subsequent page is * mapped at a virtual address that is offset from start by the same * amount as the page is offset from m_start within the object. The * last page in the sequence is the page with the largest offset from * m_start that can be mapped at a virtual address less than the given * virtual address end. Not every virtual page between start and end * is mapped; only those for which a resident page exists with the * corresponding offset from m_start are mapped. */ void pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, vm_page_t m_start, vm_prot_t prot) { pmap_t oldpmap; vm_page_t m; vm_pindex_t diff, psize; VM_OBJECT_LOCK_ASSERT(m_start->object, MA_OWNED); psize = atop(end - start); m = m_start; PMAP_LOCK(pmap); oldpmap = pmap_switch(pmap); while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { pmap_enter_quick_locked(pmap, start + ptoa(diff), m, prot); m = TAILQ_NEXT(m, listq); } pmap_switch(oldpmap); PMAP_UNLOCK(pmap); } /* * this code makes some *MAJOR* assumptions: * 1. Current pmap & pmap exists. * 2. Not wired. * 3. Read access. * 4. No page table pages. * but is *MUCH* faster than pmap_enter... */ void pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) { pmap_t oldpmap; PMAP_LOCK(pmap); oldpmap = pmap_switch(pmap); pmap_enter_quick_locked(pmap, va, m, prot); pmap_switch(oldpmap); PMAP_UNLOCK(pmap); } static void pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) { struct ia64_lpte *pte; boolean_t managed; KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva || (m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) != 0, ("pmap_enter_quick_locked: managed mapping within the clean submap")); mtx_assert(&vm_page_queue_mtx, MA_OWNED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); if ((pte = pmap_find_pte(va)) == NULL) return; if (!pmap_present(pte)) { /* Enter on the PV list if the page is managed. */ if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0) { if (!pmap_try_insert_pv_entry(pmap, va, m)) { pmap_free_pte(pte, va); return; } managed = TRUE; } else managed = FALSE; /* Increment counters. */ pmap->pm_stats.resident_count++; /* Initialise with R/O protection and enter into VHPT. */ pmap_enter_vhpt(pte, va); pmap_pte_prot(pmap, pte, prot & (VM_PROT_READ | VM_PROT_EXECUTE)); pmap_set_pte(pte, va, VM_PAGE_TO_PHYS(m), FALSE, managed); } } /* * pmap_object_init_pt preloads the ptes for a given object * into the specified pmap. This eliminates the blast of soft * faults on process startup and immediately after an mmap. */ void pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, vm_pindex_t pindex, vm_size_t size) { VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); KASSERT(object->type == OBJT_DEVICE, ("pmap_object_init_pt: non-device object")); } /* * Routine: pmap_change_wiring * Function: Change the wiring attribute for a map/virtual-address * pair. * In/out conditions: * The mapping must already exist in the pmap. */ void pmap_change_wiring(pmap, va, wired) register pmap_t pmap; vm_offset_t va; boolean_t wired; { pmap_t oldpmap; struct ia64_lpte *pte; PMAP_LOCK(pmap); oldpmap = pmap_switch(pmap); pte = pmap_find_vhpt(va); KASSERT(pte != NULL, ("pte")); if (wired && !pmap_wired(pte)) { pmap->pm_stats.wired_count++; pmap_set_wired(pte); } else if (!wired && pmap_wired(pte)) { pmap->pm_stats.wired_count--; pmap_clear_wired(pte); } pmap_switch(oldpmap); PMAP_UNLOCK(pmap); } /* * Copy the range specified by src_addr/len * from the source map to the range dst_addr/len * in the destination map. * * This routine is only advisory and need not do anything. */ void pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, vm_offset_t src_addr) { } /* * pmap_zero_page zeros the specified hardware page by * mapping it into virtual memory and using bzero to clear * its contents. */ void pmap_zero_page(vm_page_t m) { vm_offset_t va = IA64_PHYS_TO_RR7(VM_PAGE_TO_PHYS(m)); bzero((caddr_t) va, PAGE_SIZE); } /* * pmap_zero_page_area zeros the specified hardware page by * mapping it into virtual memory and using bzero to clear * its contents. * * off and size must reside within a single page. */ void pmap_zero_page_area(vm_page_t m, int off, int size) { vm_offset_t va = IA64_PHYS_TO_RR7(VM_PAGE_TO_PHYS(m)); bzero((char *)(caddr_t)va + off, size); } /* * pmap_zero_page_idle zeros the specified hardware page by * mapping it into virtual memory and using bzero to clear * its contents. This is for the vm_idlezero process. */ void pmap_zero_page_idle(vm_page_t m) { vm_offset_t va = IA64_PHYS_TO_RR7(VM_PAGE_TO_PHYS(m)); bzero((caddr_t) va, PAGE_SIZE); } /* * pmap_copy_page copies the specified (machine independent) * page by mapping the page into virtual memory and using * bcopy to copy the page, one machine dependent page at a * time. */ void pmap_copy_page(vm_page_t msrc, vm_page_t mdst) { vm_offset_t src = IA64_PHYS_TO_RR7(VM_PAGE_TO_PHYS(msrc)); vm_offset_t dst = IA64_PHYS_TO_RR7(VM_PAGE_TO_PHYS(mdst)); bcopy((caddr_t) src, (caddr_t) dst, PAGE_SIZE); } /* * Returns true if the pmap's pv is one of the first * 16 pvs linked to from this page. This count may * be changed upwards or downwards in the future; it * is only necessary that true be returned for a small * subset of pmaps for proper page aging. */ boolean_t pmap_page_exists_quick(pmap_t pmap, vm_page_t m) { pv_entry_t pv; int loops = 0; if (m->flags & PG_FICTITIOUS) return FALSE; /* * Not found, check current mappings returning immediately if found. */ mtx_assert(&vm_page_queue_mtx, MA_OWNED); TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { if (pv->pv_pmap == pmap) { return TRUE; } loops++; if (loops >= 16) break; } return (FALSE); } /* * Remove all pages from specified address space * this aids process exit speeds. Also, this code * is special cased for current process only, but * can have the more generic (and slightly slower) * mode enabled. This is much faster than pmap_remove * in the case of running down an entire address space. */ void pmap_remove_pages(pmap_t pmap) { pmap_t oldpmap; pv_entry_t pv, npv; if (pmap != vmspace_pmap(curthread->td_proc->p_vmspace)) { printf("warning: pmap_remove_pages called with non-current pmap\n"); return; } vm_page_lock_queues(); PMAP_LOCK(pmap); oldpmap = pmap_switch(pmap); for (pv = TAILQ_FIRST(&pmap->pm_pvlist); pv; pv = npv) { struct ia64_lpte *pte; npv = TAILQ_NEXT(pv, pv_plist); pte = pmap_find_vhpt(pv->pv_va); KASSERT(pte != NULL, ("pte")); if (!pmap_wired(pte)) pmap_remove_pte(pmap, pte, pv->pv_va, pv, 1); } pmap_switch(oldpmap); PMAP_UNLOCK(pmap); vm_page_unlock_queues(); } /* * pmap_ts_referenced: * * Return a count of reference bits for a page, clearing those bits. * It is not necessary for every reference bit to be cleared, but it * is necessary that 0 only be returned when there are truly no * reference bits set. * * XXX: The exact number of bits to check and clear is a matter that * should be tested and standardized at some point in the future for * optimal aging of shared pages. */ int pmap_ts_referenced(vm_page_t m) { struct ia64_lpte *pte; pmap_t oldpmap; pv_entry_t pv; int count = 0; if (m->flags & PG_FICTITIOUS) return 0; TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { PMAP_LOCK(pv->pv_pmap); oldpmap = pmap_switch(pv->pv_pmap); pte = pmap_find_vhpt(pv->pv_va); KASSERT(pte != NULL, ("pte")); if (pmap_accessed(pte)) { count++; pmap_clear_accessed(pte); pmap_invalidate_page(pv->pv_pmap, pv->pv_va); } pmap_switch(oldpmap); PMAP_UNLOCK(pv->pv_pmap); } return count; } /* * pmap_is_modified: * * Return whether or not the specified physical page was modified * in any physical maps. */ boolean_t pmap_is_modified(vm_page_t m) { struct ia64_lpte *pte; pmap_t oldpmap; pv_entry_t pv; boolean_t rv; rv = FALSE; if (m->flags & PG_FICTITIOUS) return (rv); TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { PMAP_LOCK(pv->pv_pmap); oldpmap = pmap_switch(pv->pv_pmap); pte = pmap_find_vhpt(pv->pv_va); pmap_switch(oldpmap); KASSERT(pte != NULL, ("pte")); rv = pmap_dirty(pte) ? TRUE : FALSE; PMAP_UNLOCK(pv->pv_pmap); if (rv) break; } return (rv); } /* * pmap_is_prefaultable: * * Return whether or not the specified virtual address is elgible * for prefault. */ boolean_t pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) { struct ia64_lpte *pte; pte = pmap_find_vhpt(addr); if (pte != NULL && pmap_present(pte)) return (FALSE); return (TRUE); } /* * Clear the modify bits on the specified physical page. */ void pmap_clear_modify(vm_page_t m) { struct ia64_lpte *pte; pmap_t oldpmap; pv_entry_t pv; if (m->flags & PG_FICTITIOUS) return; TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { PMAP_LOCK(pv->pv_pmap); oldpmap = pmap_switch(pv->pv_pmap); pte = pmap_find_vhpt(pv->pv_va); KASSERT(pte != NULL, ("pte")); if (pmap_dirty(pte)) { pmap_clear_dirty(pte); pmap_invalidate_page(pv->pv_pmap, pv->pv_va); } pmap_switch(oldpmap); PMAP_UNLOCK(pv->pv_pmap); } } /* * pmap_clear_reference: * * Clear the reference bit on the specified physical page. */ void pmap_clear_reference(vm_page_t m) { struct ia64_lpte *pte; pmap_t oldpmap; pv_entry_t pv; if (m->flags & PG_FICTITIOUS) return; TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { PMAP_LOCK(pv->pv_pmap); oldpmap = pmap_switch(pv->pv_pmap); pte = pmap_find_vhpt(pv->pv_va); KASSERT(pte != NULL, ("pte")); if (pmap_accessed(pte)) { pmap_clear_accessed(pte); pmap_invalidate_page(pv->pv_pmap, pv->pv_va); } pmap_switch(oldpmap); PMAP_UNLOCK(pv->pv_pmap); } } /* * Clear the write and modified bits in each of the given page's mappings. */ void pmap_remove_write(vm_page_t m) { struct ia64_lpte *pte; pmap_t oldpmap, pmap; pv_entry_t pv; vm_prot_t prot; mtx_assert(&vm_page_queue_mtx, MA_OWNED); if ((m->flags & PG_FICTITIOUS) != 0 || (m->flags & PG_WRITEABLE) == 0) return; TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { pmap = pv->pv_pmap; PMAP_LOCK(pmap); oldpmap = pmap_switch(pmap); pte = pmap_find_vhpt(pv->pv_va); KASSERT(pte != NULL, ("pte")); prot = pmap_prot(pte); if ((prot & VM_PROT_WRITE) != 0) { if (pmap_dirty(pte)) { vm_page_dirty(m); pmap_clear_dirty(pte); } prot &= ~VM_PROT_WRITE; pmap_pte_prot(pmap, pte, prot); pmap_invalidate_page(pmap, pv->pv_va); } pmap_switch(oldpmap); PMAP_UNLOCK(pmap); } vm_page_flag_clear(m, PG_WRITEABLE); } /* * Map a set of physical memory pages into the kernel virtual * address space. Return a pointer to where it is mapped. This * routine is intended to be used for mapping device memory, * NOT real memory. */ void * pmap_mapdev(vm_offset_t pa, vm_size_t size) { return (void*) IA64_PHYS_TO_RR6(pa); } /* * 'Unmap' a range mapped by pmap_mapdev(). */ void pmap_unmapdev(vm_offset_t va, vm_size_t size) { return; } /* * perform the pmap work for mincore */ int pmap_mincore(pmap_t pmap, vm_offset_t addr) { pmap_t oldpmap; struct ia64_lpte *pte, tpte; int val = 0; PMAP_LOCK(pmap); oldpmap = pmap_switch(pmap); pte = pmap_find_vhpt(addr); if (pte != NULL) { tpte = *pte; pte = &tpte; } pmap_switch(oldpmap); PMAP_UNLOCK(pmap); if (pte == NULL) return 0; if (pmap_present(pte)) { vm_page_t m; vm_offset_t pa; val = MINCORE_INCORE; if (!pmap_managed(pte)) return val; pa = pmap_ppn(pte); m = PHYS_TO_VM_PAGE(pa); /* * Modified by us */ if (pmap_dirty(pte)) val |= MINCORE_MODIFIED|MINCORE_MODIFIED_OTHER; else { /* * Modified by someone */ vm_page_lock_queues(); if (pmap_is_modified(m)) val |= MINCORE_MODIFIED_OTHER; vm_page_unlock_queues(); } /* * Referenced by us */ if (pmap_accessed(pte)) val |= MINCORE_REFERENCED|MINCORE_REFERENCED_OTHER; else { /* * Referenced by someone */ vm_page_lock_queues(); if (pmap_ts_referenced(m)) { val |= MINCORE_REFERENCED_OTHER; vm_page_flag_set(m, PG_REFERENCED); } vm_page_unlock_queues(); } } return val; } void pmap_activate(struct thread *td) { pmap_switch(vmspace_pmap(td->td_proc->p_vmspace)); } pmap_t pmap_switch(pmap_t pm) { pmap_t prevpm; int i; critical_enter(); prevpm = PCPU_GET(current_pmap); if (prevpm == pm) goto out; if (prevpm != NULL) atomic_clear_32(&prevpm->pm_active, PCPU_GET(cpumask)); if (pm == NULL) { for (i = 0; i < 5; i++) { ia64_set_rr(IA64_RR_BASE(i), (i << 8)|(PAGE_SHIFT << 2)|1); } } else { for (i = 0; i < 5; i++) { ia64_set_rr(IA64_RR_BASE(i), (pm->pm_rid[i] << 8)|(PAGE_SHIFT << 2)|1); } atomic_set_32(&pm->pm_active, PCPU_GET(cpumask)); } PCPU_SET(current_pmap, pm); ia64_srlz_d(); out: critical_exit(); return (prevpm); } vm_offset_t pmap_addr_hint(vm_object_t obj, vm_offset_t addr, vm_size_t size) { return addr; } #include "opt_ddb.h" #ifdef DDB #include static const char* psnames[] = { "1B", "2B", "4B", "8B", "16B", "32B", "64B", "128B", "256B", "512B", "1K", "2K", "4K", "8K", "16K", "32K", "64K", "128K", "256K", "512K", "1M", "2M", "4M", "8M", "16M", "32M", "64M", "128M", "256M", "512M", "1G", "2G" }; static void print_trs(int type) { struct ia64_pal_result res; int i, maxtr; struct { pt_entry_t pte; uint64_t itir; uint64_t ifa; struct ia64_rr rr; } buf; static const char *manames[] = { "WB", "bad", "bad", "bad", "UC", "UCE", "WC", "NaT", }; res = ia64_call_pal_static(PAL_VM_SUMMARY, 0, 0, 0); if (res.pal_status != 0) { db_printf("Can't get VM summary\n"); return; } if (type == 0) maxtr = (res.pal_result[0] >> 40) & 0xff; else maxtr = (res.pal_result[0] >> 32) & 0xff; db_printf("V RID Virtual Page Physical Page PgSz ED AR PL D A MA P KEY\n"); for (i = 0; i <= maxtr; i++) { bzero(&buf, sizeof(buf)); res = ia64_call_pal_stacked_physical (PAL_VM_TR_READ, i, type, ia64_tpa((uint64_t) &buf)); if (!(res.pal_result[0] & 1)) buf.pte &= ~PTE_AR_MASK; if (!(res.pal_result[0] & 2)) buf.pte &= ~PTE_PL_MASK; if (!(res.pal_result[0] & 4)) pmap_clear_dirty(&buf); if (!(res.pal_result[0] & 8)) buf.pte &= ~PTE_MA_MASK; db_printf("%d %06x %013lx %013lx %4s %d %d %d %d %d %-3s " "%d %06x\n", (int)buf.ifa & 1, buf.rr.rr_rid, buf.ifa >> 12, (buf.pte & PTE_PPN_MASK) >> 12, psnames[(buf.itir & ITIR_PS_MASK) >> 2], (buf.pte & PTE_ED) ? 1 : 0, (int)(buf.pte & PTE_AR_MASK) >> 9, (int)(buf.pte & PTE_PL_MASK) >> 7, (pmap_dirty(&buf)) ? 1 : 0, (pmap_accessed(&buf)) ? 1 : 0, manames[(buf.pte & PTE_MA_MASK) >> 2], (pmap_present(&buf)) ? 1 : 0, (int)((buf.itir & ITIR_KEY_MASK) >> 8)); } } DB_COMMAND(itr, db_itr) { print_trs(0); } DB_COMMAND(dtr, db_dtr) { print_trs(1); } DB_COMMAND(rr, db_rr) { int i; uint64_t t; struct ia64_rr rr; printf("RR RID PgSz VE\n"); for (i = 0; i < 8; i++) { __asm __volatile ("mov %0=rr[%1]" : "=r"(t) : "r"(IA64_RR_BASE(i))); *(uint64_t *) &rr = t; printf("%d %06x %4s %d\n", i, rr.rr_rid, psnames[rr.rr_ps], rr.rr_ve); } } DB_COMMAND(thash, db_thash) { if (!have_addr) return; db_printf("%p\n", (void *) ia64_thash(addr)); } DB_COMMAND(ttag, db_ttag) { if (!have_addr) return; db_printf("0x%lx\n", ia64_ttag(addr)); } DB_COMMAND(kpte, db_kpte) { struct ia64_lpte *pte; if (!have_addr) { db_printf("usage: kpte \n"); return; } if (addr < VM_MIN_KERNEL_ADDRESS) { db_printf("kpte: error: invalid \n"); return; } pte = pmap_find_kpte(addr); db_printf("kpte at %p:\n", pte); db_printf(" pte =%016lx\n", pte->pte); db_printf(" itir =%016lx\n", pte->itir); db_printf(" tag =%016lx\n", pte->tag); db_printf(" chain=%016lx\n", pte->chain); } #endif Index: head/sys/kern/imgact_aout.c =================================================================== --- head/sys/kern/imgact_aout.c (revision 173360) +++ head/sys/kern/imgact_aout.c (revision 173361) @@ -1,272 +1,274 @@ /*- * Copyright (c) 1993, David Greenman * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int exec_aout_imgact(struct image_params *imgp); static int aout_fixup(register_t **stack_base, struct image_params *imgp); struct sysentvec aout_sysvec = { SYS_MAXSYSCALL, sysent, 0, 0, NULL, 0, NULL, NULL, aout_fixup, sendsig, sigcode, &szsigcode, NULL, "FreeBSD a.out", NULL, NULL, MINSIGSTKSZ, PAGE_SIZE, VM_MIN_ADDRESS, VM_MAXUSER_ADDRESS, USRSTACK, PS_STRINGS, VM_PROT_ALL, exec_copyout_strings, exec_setregs, NULL }; static int aout_fixup(stack_base, imgp) register_t **stack_base; struct image_params *imgp; { return (suword(--(*stack_base), imgp->args->argc)); } static int exec_aout_imgact(imgp) struct image_params *imgp; { const struct exec *a_out = (const struct exec *) imgp->image_header; struct thread *td = curthread; struct vmspace *vmspace; vm_map_t map; vm_object_t object; vm_offset_t text_end, data_end; unsigned long virtual_offset; unsigned long file_offset; unsigned long bss_size; int error; /* * Linux and *BSD binaries look very much alike, * only the machine id is different: * 0x64 for Linux, 0x86 for *BSD, 0x00 for BSDI. * NetBSD is in network byte order.. ugh. */ if (((a_out->a_magic >> 16) & 0xff) != 0x86 && ((a_out->a_magic >> 16) & 0xff) != 0 && ((((int)ntohl(a_out->a_magic)) >> 16) & 0xff) != 0x86) return -1; /* * Set file/virtual offset based on a.out variant. * We do two cases: host byte order and network byte order * (for NetBSD compatibility) */ switch ((int)(a_out->a_magic & 0xffff)) { case ZMAGIC: virtual_offset = 0; if (a_out->a_text) { file_offset = PAGE_SIZE; } else { /* Bill's "screwball mode" */ file_offset = 0; } break; case QMAGIC: virtual_offset = PAGE_SIZE; file_offset = 0; /* Pass PS_STRINGS for BSD/OS binaries only. */ if (N_GETMID(*a_out) == MID_ZERO) imgp->ps_strings = aout_sysvec.sv_psstrings; break; default: /* NetBSD compatibility */ switch ((int)(ntohl(a_out->a_magic) & 0xffff)) { case ZMAGIC: case QMAGIC: virtual_offset = PAGE_SIZE; file_offset = 0; break; default: return (-1); } } bss_size = roundup(a_out->a_bss, PAGE_SIZE); /* * Check various fields in header for validity/bounds. */ if (/* entry point must lay with text region */ a_out->a_entry < virtual_offset || a_out->a_entry >= virtual_offset + a_out->a_text || /* text and data size must each be page rounded */ a_out->a_text & PAGE_MASK || a_out->a_data & PAGE_MASK) return (-1); /* text + data can't exceed file size */ if (a_out->a_data + a_out->a_text > imgp->attr->va_size) return (EFAULT); /* * text/data/bss must not exceed limits */ PROC_LOCK(imgp->proc); if (/* text can't exceed maximum text size */ a_out->a_text > maxtsiz || /* data + bss can't exceed rlimit */ a_out->a_data + bss_size > lim_cur(imgp->proc, RLIMIT_DATA)) { PROC_UNLOCK(imgp->proc); return (ENOMEM); } PROC_UNLOCK(imgp->proc); /* * Avoid a possible deadlock if the current address space is destroyed * and that address space maps the locked vnode. In the common case, * the locked vnode's v_usecount is decremented but remains greater * than zero. Consequently, the vnode lock is not needed by vrele(). * However, in cases where the vnode lock is external, such as nullfs, * v_usecount may become zero. */ VOP_UNLOCK(imgp->vp, 0, td); /* * Destroy old process VM and create a new one (with a new stack) */ - exec_new_vmspace(imgp, &aout_sysvec); + error = exec_new_vmspace(imgp, &aout_sysvec); vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY, td); + if (error) + return (error); /* * The vm space can be changed by exec_new_vmspace */ vmspace = imgp->proc->p_vmspace; object = imgp->object; map = &vmspace->vm_map; vm_map_lock(map); vm_object_reference(object); text_end = virtual_offset + a_out->a_text; error = vm_map_insert(map, object, file_offset, virtual_offset, text_end, VM_PROT_READ | VM_PROT_EXECUTE, VM_PROT_ALL, MAP_COPY_ON_WRITE | MAP_PREFAULT); if (error) { vm_map_unlock(map); vm_object_deallocate(object); return (error); } data_end = text_end + a_out->a_data; if (a_out->a_data) { vm_object_reference(object); error = vm_map_insert(map, object, file_offset + a_out->a_text, text_end, data_end, VM_PROT_ALL, VM_PROT_ALL, MAP_COPY_ON_WRITE | MAP_PREFAULT); if (error) { vm_map_unlock(map); vm_object_deallocate(object); return (error); } } if (bss_size) { error = vm_map_insert(map, NULL, 0, data_end, data_end + bss_size, VM_PROT_ALL, VM_PROT_ALL, 0); if (error) { vm_map_unlock(map); return (error); } } vm_map_unlock(map); /* Fill in process VM information */ vmspace->vm_tsize = a_out->a_text >> PAGE_SHIFT; vmspace->vm_dsize = (a_out->a_data + bss_size) >> PAGE_SHIFT; vmspace->vm_taddr = (caddr_t) (uintptr_t) virtual_offset; vmspace->vm_daddr = (caddr_t) (uintptr_t) (virtual_offset + a_out->a_text); /* Fill in image_params */ imgp->interpreted = 0; imgp->entry_addr = a_out->a_entry; imgp->proc->p_sysent = &aout_sysvec; return (0); } /* * Tell kern_execve.c about it, with a little help from the linker. */ static struct execsw aout_execsw = { exec_aout_imgact, "a.out" }; EXEC_SET(aout, aout_execsw); Index: head/sys/kern/imgact_elf.c =================================================================== --- head/sys/kern/imgact_elf.c (revision 173360) +++ head/sys/kern/imgact_elf.c (revision 173361) @@ -1,1301 +1,1303 @@ /*- * Copyright (c) 2000 David O'Brien * Copyright (c) 1995-1996 Søren Schmidt * Copyright (c) 1996 Peter Wemm * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer * in this position and unchanged. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(COMPAT_IA32) && __ELF_WORD_SIZE == 32 #include #include #endif #define OLD_EI_BRAND 8 static int __elfN(check_header)(const Elf_Ehdr *hdr); static Elf_Brandinfo *__elfN(get_brandinfo)(const Elf_Ehdr *hdr, const char *interp); static int __elfN(load_file)(struct proc *p, const char *file, u_long *addr, u_long *entry, size_t pagesize); static int __elfN(load_section)(struct vmspace *vmspace, vm_object_t object, vm_offset_t offset, caddr_t vmaddr, size_t memsz, size_t filsz, vm_prot_t prot, size_t pagesize); static int __CONCAT(exec_, __elfN(imgact))(struct image_params *imgp); SYSCTL_NODE(_kern, OID_AUTO, __CONCAT(elf, __ELF_WORD_SIZE), CTLFLAG_RW, 0, ""); int __elfN(fallback_brand) = -1; SYSCTL_INT(__CONCAT(_kern_elf, __ELF_WORD_SIZE), OID_AUTO, fallback_brand, CTLFLAG_RW, &__elfN(fallback_brand), 0, __XSTRING(__CONCAT(ELF, __ELF_WORD_SIZE)) " brand of last resort"); TUNABLE_INT("kern.elf" __XSTRING(__ELF_WORD_SIZE) ".fallback_brand", &__elfN(fallback_brand)); static int elf_trace = 0; SYSCTL_INT(_debug, OID_AUTO, __elfN(trace), CTLFLAG_RW, &elf_trace, 0, ""); static int elf_legacy_coredump = 0; SYSCTL_INT(_debug, OID_AUTO, __elfN(legacy_coredump), CTLFLAG_RW, &elf_legacy_coredump, 0, ""); static Elf_Brandinfo *elf_brand_list[MAX_BRANDS]; int __elfN(insert_brand_entry)(Elf_Brandinfo *entry) { int i; for (i = 0; i < MAX_BRANDS; i++) { if (elf_brand_list[i] == NULL) { elf_brand_list[i] = entry; break; } } if (i == MAX_BRANDS) return (-1); return (0); } int __elfN(remove_brand_entry)(Elf_Brandinfo *entry) { int i; for (i = 0; i < MAX_BRANDS; i++) { if (elf_brand_list[i] == entry) { elf_brand_list[i] = NULL; break; } } if (i == MAX_BRANDS) return (-1); return (0); } int __elfN(brand_inuse)(Elf_Brandinfo *entry) { struct proc *p; int rval = FALSE; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { if (p->p_sysent == entry->sysvec) { rval = TRUE; break; } } sx_sunlock(&allproc_lock); return (rval); } static Elf_Brandinfo * __elfN(get_brandinfo)(const Elf_Ehdr *hdr, const char *interp) { Elf_Brandinfo *bi; int i; /* * We support three types of branding -- (1) the ELF EI_OSABI field * that SCO added to the ELF spec, (2) FreeBSD 3.x's traditional string * branding w/in the ELF header, and (3) path of the `interp_path' * field. We should also look for an ".note.ABI-tag" ELF section now * in all Linux ELF binaries, FreeBSD 4.1+, and some NetBSD ones. */ /* If the executable has a brand, search for it in the brand list. */ for (i = 0; i < MAX_BRANDS; i++) { bi = elf_brand_list[i]; if (bi != NULL && hdr->e_machine == bi->machine && (hdr->e_ident[EI_OSABI] == bi->brand || strncmp((const char *)&hdr->e_ident[OLD_EI_BRAND], bi->compat_3_brand, strlen(bi->compat_3_brand)) == 0)) return (bi); } /* Lacking a known brand, search for a recognized interpreter. */ if (interp != NULL) { for (i = 0; i < MAX_BRANDS; i++) { bi = elf_brand_list[i]; if (bi != NULL && hdr->e_machine == bi->machine && strcmp(interp, bi->interp_path) == 0) return (bi); } } /* Lacking a recognized interpreter, try the default brand */ for (i = 0; i < MAX_BRANDS; i++) { bi = elf_brand_list[i]; if (bi != NULL && hdr->e_machine == bi->machine && __elfN(fallback_brand) == bi->brand) return (bi); } return (NULL); } static int __elfN(check_header)(const Elf_Ehdr *hdr) { Elf_Brandinfo *bi; int i; if (!IS_ELF(*hdr) || hdr->e_ident[EI_CLASS] != ELF_TARG_CLASS || hdr->e_ident[EI_DATA] != ELF_TARG_DATA || hdr->e_ident[EI_VERSION] != EV_CURRENT || hdr->e_phentsize != sizeof(Elf_Phdr) || hdr->e_version != ELF_TARG_VER) return (ENOEXEC); /* * Make sure we have at least one brand for this machine. */ for (i = 0; i < MAX_BRANDS; i++) { bi = elf_brand_list[i]; if (bi != NULL && bi->machine == hdr->e_machine) break; } if (i == MAX_BRANDS) return (ENOEXEC); return (0); } static int __elfN(map_partial)(vm_map_t map, vm_object_t object, vm_ooffset_t offset, vm_offset_t start, vm_offset_t end, vm_prot_t prot) { struct sf_buf *sf; int error; vm_offset_t off; /* * Create the page if it doesn't exist yet. Ignore errors. */ vm_map_lock(map); vm_map_insert(map, NULL, 0, trunc_page(start), round_page(end), VM_PROT_ALL, VM_PROT_ALL, 0); vm_map_unlock(map); /* * Find the page from the underlying object. */ if (object) { sf = vm_imgact_map_page(object, offset); if (sf == NULL) return (KERN_FAILURE); off = offset - trunc_page(offset); error = copyout((caddr_t)sf_buf_kva(sf) + off, (caddr_t)start, end - start); vm_imgact_unmap_page(sf); if (error) { return (KERN_FAILURE); } } return (KERN_SUCCESS); } static int __elfN(map_insert)(vm_map_t map, vm_object_t object, vm_ooffset_t offset, vm_offset_t start, vm_offset_t end, vm_prot_t prot, int cow) { struct sf_buf *sf; vm_offset_t off; vm_size_t sz; int error, rv; if (start != trunc_page(start)) { rv = __elfN(map_partial)(map, object, offset, start, round_page(start), prot); if (rv) return (rv); offset += round_page(start) - start; start = round_page(start); } if (end != round_page(end)) { rv = __elfN(map_partial)(map, object, offset + trunc_page(end) - start, trunc_page(end), end, prot); if (rv) return (rv); end = trunc_page(end); } if (end > start) { if (offset & PAGE_MASK) { /* * The mapping is not page aligned. This means we have * to copy the data. Sigh. */ rv = vm_map_find(map, NULL, 0, &start, end - start, FALSE, prot | VM_PROT_WRITE, VM_PROT_ALL, 0); if (rv) return (rv); if (object == NULL) return (KERN_SUCCESS); for (; start < end; start += sz) { sf = vm_imgact_map_page(object, offset); if (sf == NULL) return (KERN_FAILURE); off = offset - trunc_page(offset); sz = end - start; if (sz > PAGE_SIZE - off) sz = PAGE_SIZE - off; error = copyout((caddr_t)sf_buf_kva(sf) + off, (caddr_t)start, sz); vm_imgact_unmap_page(sf); if (error) { return (KERN_FAILURE); } offset += sz; } rv = KERN_SUCCESS; } else { vm_object_reference(object); vm_map_lock(map); rv = vm_map_insert(map, object, offset, start, end, prot, VM_PROT_ALL, cow); vm_map_unlock(map); if (rv != KERN_SUCCESS) vm_object_deallocate(object); } return (rv); } else { return (KERN_SUCCESS); } } static int __elfN(load_section)(struct vmspace *vmspace, vm_object_t object, vm_offset_t offset, caddr_t vmaddr, size_t memsz, size_t filsz, vm_prot_t prot, size_t pagesize) { struct sf_buf *sf; size_t map_len; vm_offset_t map_addr; int error, rv, cow; size_t copy_len; vm_offset_t file_addr; /* * It's necessary to fail if the filsz + offset taken from the * header is greater than the actual file pager object's size. * If we were to allow this, then the vm_map_find() below would * walk right off the end of the file object and into the ether. * * While I'm here, might as well check for something else that * is invalid: filsz cannot be greater than memsz. */ if ((off_t)filsz + offset > object->un_pager.vnp.vnp_size || filsz > memsz) { uprintf("elf_load_section: truncated ELF file\n"); return (ENOEXEC); } #define trunc_page_ps(va, ps) ((va) & ~(ps - 1)) #define round_page_ps(va, ps) (((va) + (ps - 1)) & ~(ps - 1)) map_addr = trunc_page_ps((vm_offset_t)vmaddr, pagesize); file_addr = trunc_page_ps(offset, pagesize); /* * We have two choices. We can either clear the data in the last page * of an oversized mapping, or we can start the anon mapping a page * early and copy the initialized data into that first page. We * choose the second.. */ if (memsz > filsz) map_len = trunc_page_ps(offset + filsz, pagesize) - file_addr; else map_len = round_page_ps(offset + filsz, pagesize) - file_addr; if (map_len != 0) { /* cow flags: don't dump readonly sections in core */ cow = MAP_COPY_ON_WRITE | MAP_PREFAULT | (prot & VM_PROT_WRITE ? 0 : MAP_DISABLE_COREDUMP); rv = __elfN(map_insert)(&vmspace->vm_map, object, file_addr, /* file offset */ map_addr, /* virtual start */ map_addr + map_len,/* virtual end */ prot, cow); if (rv != KERN_SUCCESS) return (EINVAL); /* we can stop now if we've covered it all */ if (memsz == filsz) { return (0); } } /* * We have to get the remaining bit of the file into the first part * of the oversized map segment. This is normally because the .data * segment in the file is extended to provide bss. It's a neat idea * to try and save a page, but it's a pain in the behind to implement. */ copy_len = (offset + filsz) - trunc_page_ps(offset + filsz, pagesize); map_addr = trunc_page_ps((vm_offset_t)vmaddr + filsz, pagesize); map_len = round_page_ps((vm_offset_t)vmaddr + memsz, pagesize) - map_addr; /* This had damn well better be true! */ if (map_len != 0) { rv = __elfN(map_insert)(&vmspace->vm_map, NULL, 0, map_addr, map_addr + map_len, VM_PROT_ALL, 0); if (rv != KERN_SUCCESS) { return (EINVAL); } } if (copy_len != 0) { vm_offset_t off; sf = vm_imgact_map_page(object, offset + filsz); if (sf == NULL) return (EIO); /* send the page fragment to user space */ off = trunc_page_ps(offset + filsz, pagesize) - trunc_page(offset + filsz); error = copyout((caddr_t)sf_buf_kva(sf) + off, (caddr_t)map_addr, copy_len); vm_imgact_unmap_page(sf); if (error) { return (error); } } /* * set it to the specified protection. * XXX had better undo the damage from pasting over the cracks here! */ vm_map_protect(&vmspace->vm_map, trunc_page(map_addr), round_page(map_addr + map_len), prot, FALSE); return (0); } /* * Load the file "file" into memory. It may be either a shared object * or an executable. * * The "addr" reference parameter is in/out. On entry, it specifies * the address where a shared object should be loaded. If the file is * an executable, this value is ignored. On exit, "addr" specifies * where the file was actually loaded. * * The "entry" reference parameter is out only. On exit, it specifies * the entry point for the loaded file. */ static int __elfN(load_file)(struct proc *p, const char *file, u_long *addr, u_long *entry, size_t pagesize) { struct { struct nameidata nd; struct vattr attr; struct image_params image_params; } *tempdata; const Elf_Ehdr *hdr = NULL; const Elf_Phdr *phdr = NULL; struct nameidata *nd; struct vmspace *vmspace = p->p_vmspace; struct vattr *attr; struct image_params *imgp; vm_prot_t prot; u_long rbase; u_long base_addr = 0; int vfslocked, error, i, numsegs; if (curthread->td_proc != p) panic("elf_load_file - thread"); /* XXXKSE DIAGNOSTIC */ tempdata = malloc(sizeof(*tempdata), M_TEMP, M_WAITOK); nd = &tempdata->nd; attr = &tempdata->attr; imgp = &tempdata->image_params; /* * Initialize part of the common data */ imgp->proc = p; imgp->attr = attr; imgp->firstpage = NULL; imgp->image_header = NULL; imgp->object = NULL; imgp->execlabel = NULL; /* XXXKSE */ NDINIT(nd, LOOKUP, MPSAFE|LOCKLEAF|FOLLOW, UIO_SYSSPACE, file, curthread); vfslocked = 0; if ((error = namei(nd)) != 0) { nd->ni_vp = NULL; goto fail; } vfslocked = NDHASGIANT(nd); NDFREE(nd, NDF_ONLY_PNBUF); imgp->vp = nd->ni_vp; /* * Check permissions, modes, uid, etc on the file, and "open" it. */ error = exec_check_permissions(imgp); if (error) goto fail; error = exec_map_first_page(imgp); if (error) goto fail; /* * Also make certain that the interpreter stays the same, so set * its VV_TEXT flag, too. */ nd->ni_vp->v_vflag |= VV_TEXT; imgp->object = nd->ni_vp->v_object; hdr = (const Elf_Ehdr *)imgp->image_header; if ((error = __elfN(check_header)(hdr)) != 0) goto fail; if (hdr->e_type == ET_DYN) rbase = *addr; else if (hdr->e_type == ET_EXEC) rbase = 0; else { error = ENOEXEC; goto fail; } /* Only support headers that fit within first page for now */ /* (multiplication of two Elf_Half fields will not overflow) */ if ((hdr->e_phoff > PAGE_SIZE) || (hdr->e_phentsize * hdr->e_phnum) > PAGE_SIZE - hdr->e_phoff) { error = ENOEXEC; goto fail; } phdr = (const Elf_Phdr *)(imgp->image_header + hdr->e_phoff); for (i = 0, numsegs = 0; i < hdr->e_phnum; i++) { if (phdr[i].p_type == PT_LOAD) { /* Loadable segment */ prot = 0; if (phdr[i].p_flags & PF_X) prot |= VM_PROT_EXECUTE; if (phdr[i].p_flags & PF_W) prot |= VM_PROT_WRITE; if (phdr[i].p_flags & PF_R) prot |= VM_PROT_READ; if ((error = __elfN(load_section)(vmspace, imgp->object, phdr[i].p_offset, (caddr_t)(uintptr_t)phdr[i].p_vaddr + rbase, phdr[i].p_memsz, phdr[i].p_filesz, prot, pagesize)) != 0) goto fail; /* * Establish the base address if this is the * first segment. */ if (numsegs == 0) base_addr = trunc_page(phdr[i].p_vaddr + rbase); numsegs++; } } *addr = base_addr; *entry = (unsigned long)hdr->e_entry + rbase; fail: if (imgp->firstpage) exec_unmap_first_page(imgp); if (nd->ni_vp) vput(nd->ni_vp); VFS_UNLOCK_GIANT(vfslocked); free(tempdata, M_TEMP); return (error); } static int __CONCAT(exec_, __elfN(imgact))(struct image_params *imgp) { const Elf_Ehdr *hdr = (const Elf_Ehdr *)imgp->image_header; const Elf_Phdr *phdr; Elf_Auxargs *elf_auxargs; struct vmspace *vmspace; vm_prot_t prot; u_long text_size = 0, data_size = 0, total_size = 0; u_long text_addr = 0, data_addr = 0; u_long seg_size, seg_addr; u_long addr, entry = 0, proghdr = 0; int error = 0, i; const char *interp = NULL; Elf_Brandinfo *brand_info; char *path; struct thread *td = curthread; struct sysentvec *sv; /* * Do we have a valid ELF header ? * * Only allow ET_EXEC & ET_DYN here, reject ET_DYN later * if particular brand doesn't support it. */ if (__elfN(check_header)(hdr) != 0 || (hdr->e_type != ET_EXEC && hdr->e_type != ET_DYN)) return (-1); /* * From here on down, we return an errno, not -1, as we've * detected an ELF file. */ if ((hdr->e_phoff > PAGE_SIZE) || (hdr->e_phoff + hdr->e_phentsize * hdr->e_phnum) > PAGE_SIZE) { /* Only support headers in first page for now */ return (ENOEXEC); } phdr = (const Elf_Phdr *)(imgp->image_header + hdr->e_phoff); for (i = 0; i < hdr->e_phnum; i++) { if (phdr[i].p_type == PT_INTERP) { /* Path to interpreter */ if (phdr[i].p_filesz > MAXPATHLEN || phdr[i].p_offset + phdr[i].p_filesz > PAGE_SIZE) return (ENOEXEC); interp = imgp->image_header + phdr[i].p_offset; break; } } brand_info = __elfN(get_brandinfo)(hdr, interp); if (brand_info == NULL) { uprintf("ELF binary type \"%u\" not known.\n", hdr->e_ident[EI_OSABI]); return (ENOEXEC); } if (hdr->e_type == ET_DYN && (brand_info->flags & BI_CAN_EXEC_DYN) == 0) return (ENOEXEC); sv = brand_info->sysvec; if (interp != NULL && brand_info->interp_newpath != NULL) interp = brand_info->interp_newpath; /* * Avoid a possible deadlock if the current address space is destroyed * and that address space maps the locked vnode. In the common case, * the locked vnode's v_usecount is decremented but remains greater * than zero. Consequently, the vnode lock is not needed by vrele(). * However, in cases where the vnode lock is external, such as nullfs, * v_usecount may become zero. */ VOP_UNLOCK(imgp->vp, 0, td); - exec_new_vmspace(imgp, sv); + error = exec_new_vmspace(imgp, sv); imgp->proc->p_sysent = sv; vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY, td); + if (error) + return (error); vmspace = imgp->proc->p_vmspace; for (i = 0; i < hdr->e_phnum; i++) { switch (phdr[i].p_type) { case PT_LOAD: /* Loadable segment */ prot = 0; if (phdr[i].p_flags & PF_X) prot |= VM_PROT_EXECUTE; if (phdr[i].p_flags & PF_W) prot |= VM_PROT_WRITE; if (phdr[i].p_flags & PF_R) prot |= VM_PROT_READ; #if defined(__ia64__) && __ELF_WORD_SIZE == 32 && defined(IA32_ME_HARDER) /* * Some x86 binaries assume read == executable, * notably the M3 runtime and therefore cvsup */ if (prot & VM_PROT_READ) prot |= VM_PROT_EXECUTE; #endif if ((error = __elfN(load_section)(vmspace, imgp->object, phdr[i].p_offset, (caddr_t)(uintptr_t)phdr[i].p_vaddr, phdr[i].p_memsz, phdr[i].p_filesz, prot, sv->sv_pagesize)) != 0) return (error); /* * If this segment contains the program headers, * remember their virtual address for the AT_PHDR * aux entry. Static binaries don't usually include * a PT_PHDR entry. */ if (phdr[i].p_offset == 0 && hdr->e_phoff + hdr->e_phnum * hdr->e_phentsize <= phdr[i].p_filesz) proghdr = phdr[i].p_vaddr + hdr->e_phoff; seg_addr = trunc_page(phdr[i].p_vaddr); seg_size = round_page(phdr[i].p_memsz + phdr[i].p_vaddr - seg_addr); /* * Is this .text or .data? We can't use * VM_PROT_WRITE or VM_PROT_EXEC, it breaks the * alpha terribly and possibly does other bad * things so we stick to the old way of figuring * it out: If the segment contains the program * entry point, it's a text segment, otherwise it * is a data segment. * * Note that obreak() assumes that data_addr + * data_size == end of data load area, and the ELF * file format expects segments to be sorted by * address. If multiple data segments exist, the * last one will be used. */ if (hdr->e_entry >= phdr[i].p_vaddr && hdr->e_entry < (phdr[i].p_vaddr + phdr[i].p_memsz)) { text_size = seg_size; text_addr = seg_addr; entry = (u_long)hdr->e_entry; } else { data_size = seg_size; data_addr = seg_addr; } total_size += seg_size; break; case PT_PHDR: /* Program header table info */ proghdr = phdr[i].p_vaddr; break; default: break; } } if (data_addr == 0 && data_size == 0) { data_addr = text_addr; data_size = text_size; } /* * Check limits. It should be safe to check the * limits after loading the segments since we do * not actually fault in all the segments pages. */ PROC_LOCK(imgp->proc); if (data_size > lim_cur(imgp->proc, RLIMIT_DATA) || text_size > maxtsiz || total_size > lim_cur(imgp->proc, RLIMIT_VMEM)) { PROC_UNLOCK(imgp->proc); return (ENOMEM); } vmspace->vm_tsize = text_size >> PAGE_SHIFT; vmspace->vm_taddr = (caddr_t)(uintptr_t)text_addr; vmspace->vm_dsize = data_size >> PAGE_SHIFT; vmspace->vm_daddr = (caddr_t)(uintptr_t)data_addr; /* * We load the dynamic linker where a userland call * to mmap(0, ...) would put it. The rationale behind this * calculation is that it leaves room for the heap to grow to * its maximum allowed size. */ addr = round_page((vm_offset_t)imgp->proc->p_vmspace->vm_daddr + lim_max(imgp->proc, RLIMIT_DATA)); PROC_UNLOCK(imgp->proc); imgp->entry_addr = entry; if (interp != NULL) { VOP_UNLOCK(imgp->vp, 0, td); if (brand_info->emul_path != NULL && brand_info->emul_path[0] != '\0') { path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); snprintf(path, MAXPATHLEN, "%s%s", brand_info->emul_path, interp); error = __elfN(load_file)(imgp->proc, path, &addr, &imgp->entry_addr, sv->sv_pagesize); free(path, M_TEMP); if (error == 0) interp = NULL; } if (interp != NULL) { error = __elfN(load_file)(imgp->proc, interp, &addr, &imgp->entry_addr, sv->sv_pagesize); } vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY, td); if (error != 0) { uprintf("ELF interpreter %s not found\n", interp); return (error); } } /* * Construct auxargs table (used by the fixup routine) */ elf_auxargs = malloc(sizeof(Elf_Auxargs), M_TEMP, M_WAITOK); elf_auxargs->execfd = -1; elf_auxargs->phdr = proghdr; elf_auxargs->phent = hdr->e_phentsize; elf_auxargs->phnum = hdr->e_phnum; elf_auxargs->pagesz = PAGE_SIZE; elf_auxargs->base = addr; elf_auxargs->flags = 0; elf_auxargs->entry = entry; elf_auxargs->trace = elf_trace; imgp->auxargs = elf_auxargs; imgp->interpreted = 0; return (error); } #define suword __CONCAT(suword, __ELF_WORD_SIZE) int __elfN(freebsd_fixup)(register_t **stack_base, struct image_params *imgp) { Elf_Auxargs *args = (Elf_Auxargs *)imgp->auxargs; Elf_Addr *base; Elf_Addr *pos; base = (Elf_Addr *)*stack_base; pos = base + (imgp->args->argc + imgp->args->envc + 2); if (args->trace) { AUXARGS_ENTRY(pos, AT_DEBUG, 1); } if (args->execfd != -1) { AUXARGS_ENTRY(pos, AT_EXECFD, args->execfd); } AUXARGS_ENTRY(pos, AT_PHDR, args->phdr); AUXARGS_ENTRY(pos, AT_PHENT, args->phent); AUXARGS_ENTRY(pos, AT_PHNUM, args->phnum); AUXARGS_ENTRY(pos, AT_PAGESZ, args->pagesz); AUXARGS_ENTRY(pos, AT_FLAGS, args->flags); AUXARGS_ENTRY(pos, AT_ENTRY, args->entry); AUXARGS_ENTRY(pos, AT_BASE, args->base); AUXARGS_ENTRY(pos, AT_NULL, 0); free(imgp->auxargs, M_TEMP); imgp->auxargs = NULL; base--; suword(base, (long)imgp->args->argc); *stack_base = (register_t *)base; return (0); } /* * Code for generating ELF core dumps. */ typedef void (*segment_callback)(vm_map_entry_t, void *); /* Closure for cb_put_phdr(). */ struct phdr_closure { Elf_Phdr *phdr; /* Program header to fill in */ Elf_Off offset; /* Offset of segment in core file */ }; /* Closure for cb_size_segment(). */ struct sseg_closure { int count; /* Count of writable segments. */ size_t size; /* Total size of all writable segments. */ }; static void cb_put_phdr(vm_map_entry_t, void *); static void cb_size_segment(vm_map_entry_t, void *); static void each_writable_segment(struct thread *, segment_callback, void *); static int __elfN(corehdr)(struct thread *, struct vnode *, struct ucred *, int, void *, size_t); static void __elfN(puthdr)(struct thread *, void *, size_t *, int); static void __elfN(putnote)(void *, size_t *, const char *, int, const void *, size_t); extern int osreldate; int __elfN(coredump)(td, vp, limit) struct thread *td; struct vnode *vp; off_t limit; { struct ucred *cred = td->td_ucred; int error = 0; struct sseg_closure seginfo; void *hdr; size_t hdrsize; /* Size the program segments. */ seginfo.count = 0; seginfo.size = 0; each_writable_segment(td, cb_size_segment, &seginfo); /* * Calculate the size of the core file header area by making * a dry run of generating it. Nothing is written, but the * size is calculated. */ hdrsize = 0; __elfN(puthdr)(td, (void *)NULL, &hdrsize, seginfo.count); if (hdrsize + seginfo.size >= limit) return (EFAULT); /* * Allocate memory for building the header, fill it up, * and write it out. */ hdr = malloc(hdrsize, M_TEMP, M_WAITOK); if (hdr == NULL) { return (EINVAL); } error = __elfN(corehdr)(td, vp, cred, seginfo.count, hdr, hdrsize); /* Write the contents of all of the writable segments. */ if (error == 0) { Elf_Phdr *php; off_t offset; int i; php = (Elf_Phdr *)((char *)hdr + sizeof(Elf_Ehdr)) + 1; offset = hdrsize; for (i = 0; i < seginfo.count; i++) { error = vn_rdwr_inchunks(UIO_WRITE, vp, (caddr_t)(uintptr_t)php->p_vaddr, php->p_filesz, offset, UIO_USERSPACE, IO_UNIT | IO_DIRECT, cred, NOCRED, NULL, curthread); /* XXXKSE */ if (error != 0) break; offset += php->p_filesz; php++; } } free(hdr, M_TEMP); return (error); } /* * A callback for each_writable_segment() to write out the segment's * program header entry. */ static void cb_put_phdr(entry, closure) vm_map_entry_t entry; void *closure; { struct phdr_closure *phc = (struct phdr_closure *)closure; Elf_Phdr *phdr = phc->phdr; phc->offset = round_page(phc->offset); phdr->p_type = PT_LOAD; phdr->p_offset = phc->offset; phdr->p_vaddr = entry->start; phdr->p_paddr = 0; phdr->p_filesz = phdr->p_memsz = entry->end - entry->start; phdr->p_align = PAGE_SIZE; phdr->p_flags = 0; if (entry->protection & VM_PROT_READ) phdr->p_flags |= PF_R; if (entry->protection & VM_PROT_WRITE) phdr->p_flags |= PF_W; if (entry->protection & VM_PROT_EXECUTE) phdr->p_flags |= PF_X; phc->offset += phdr->p_filesz; phc->phdr++; } /* * A callback for each_writable_segment() to gather information about * the number of segments and their total size. */ static void cb_size_segment(entry, closure) vm_map_entry_t entry; void *closure; { struct sseg_closure *ssc = (struct sseg_closure *)closure; ssc->count++; ssc->size += entry->end - entry->start; } /* * For each writable segment in the process's memory map, call the given * function with a pointer to the map entry and some arbitrary * caller-supplied data. */ static void each_writable_segment(td, func, closure) struct thread *td; segment_callback func; void *closure; { struct proc *p = td->td_proc; vm_map_t map = &p->p_vmspace->vm_map; vm_map_entry_t entry; vm_object_t backing_object, object; boolean_t ignore_entry; vm_map_lock_read(map); for (entry = map->header.next; entry != &map->header; entry = entry->next) { /* * Don't dump inaccessible mappings, deal with legacy * coredump mode. * * Note that read-only segments related to the elf binary * are marked MAP_ENTRY_NOCOREDUMP now so we no longer * need to arbitrarily ignore such segments. */ if (elf_legacy_coredump) { if ((entry->protection & VM_PROT_RW) != VM_PROT_RW) continue; } else { if ((entry->protection & VM_PROT_ALL) == 0) continue; } /* * Dont include memory segment in the coredump if * MAP_NOCORE is set in mmap(2) or MADV_NOCORE in * madvise(2). Do not dump submaps (i.e. parts of the * kernel map). */ if (entry->eflags & (MAP_ENTRY_NOCOREDUMP|MAP_ENTRY_IS_SUB_MAP)) continue; if ((object = entry->object.vm_object) == NULL) continue; /* Ignore memory-mapped devices and such things. */ VM_OBJECT_LOCK(object); while ((backing_object = object->backing_object) != NULL) { VM_OBJECT_LOCK(backing_object); VM_OBJECT_UNLOCK(object); object = backing_object; } ignore_entry = object->type != OBJT_DEFAULT && object->type != OBJT_SWAP && object->type != OBJT_VNODE; VM_OBJECT_UNLOCK(object); if (ignore_entry) continue; (*func)(entry, closure); } vm_map_unlock_read(map); } /* * Write the core file header to the file, including padding up to * the page boundary. */ static int __elfN(corehdr)(td, vp, cred, numsegs, hdr, hdrsize) struct thread *td; struct vnode *vp; struct ucred *cred; int numsegs; size_t hdrsize; void *hdr; { size_t off; /* Fill in the header. */ bzero(hdr, hdrsize); off = 0; __elfN(puthdr)(td, hdr, &off, numsegs); /* Write it to the core file. */ return (vn_rdwr_inchunks(UIO_WRITE, vp, hdr, hdrsize, (off_t)0, UIO_SYSSPACE, IO_UNIT | IO_DIRECT, cred, NOCRED, NULL, td)); /* XXXKSE */ } #if defined(COMPAT_IA32) && __ELF_WORD_SIZE == 32 typedef struct prstatus32 elf_prstatus_t; typedef struct prpsinfo32 elf_prpsinfo_t; typedef struct fpreg32 elf_prfpregset_t; typedef struct fpreg32 elf_fpregset_t; typedef struct reg32 elf_gregset_t; #else typedef prstatus_t elf_prstatus_t; typedef prpsinfo_t elf_prpsinfo_t; typedef prfpregset_t elf_prfpregset_t; typedef prfpregset_t elf_fpregset_t; typedef gregset_t elf_gregset_t; #endif static void __elfN(puthdr)(struct thread *td, void *dst, size_t *off, int numsegs) { struct { elf_prstatus_t status; elf_prfpregset_t fpregset; elf_prpsinfo_t psinfo; } *tempdata; elf_prstatus_t *status; elf_prfpregset_t *fpregset; elf_prpsinfo_t *psinfo; struct proc *p; struct thread *thr; size_t ehoff, noteoff, notesz, phoff; p = td->td_proc; ehoff = *off; *off += sizeof(Elf_Ehdr); phoff = *off; *off += (numsegs + 1) * sizeof(Elf_Phdr); noteoff = *off; /* * Don't allocate space for the notes if we're just calculating * the size of the header. We also don't collect the data. */ if (dst != NULL) { tempdata = malloc(sizeof(*tempdata), M_TEMP, M_ZERO|M_WAITOK); status = &tempdata->status; fpregset = &tempdata->fpregset; psinfo = &tempdata->psinfo; } else { tempdata = NULL; status = NULL; fpregset = NULL; psinfo = NULL; } if (dst != NULL) { psinfo->pr_version = PRPSINFO_VERSION; psinfo->pr_psinfosz = sizeof(elf_prpsinfo_t); strlcpy(psinfo->pr_fname, p->p_comm, sizeof(psinfo->pr_fname)); /* * XXX - We don't fill in the command line arguments properly * yet. */ strlcpy(psinfo->pr_psargs, p->p_comm, sizeof(psinfo->pr_psargs)); } __elfN(putnote)(dst, off, "FreeBSD", NT_PRPSINFO, psinfo, sizeof *psinfo); /* * To have the debugger select the right thread (LWP) as the initial * thread, we dump the state of the thread passed to us in td first. * This is the thread that causes the core dump and thus likely to * be the right thread one wants to have selected in the debugger. */ thr = td; while (thr != NULL) { if (dst != NULL) { status->pr_version = PRSTATUS_VERSION; status->pr_statussz = sizeof(elf_prstatus_t); status->pr_gregsetsz = sizeof(elf_gregset_t); status->pr_fpregsetsz = sizeof(elf_fpregset_t); status->pr_osreldate = osreldate; status->pr_cursig = p->p_sig; status->pr_pid = thr->td_tid; #if defined(COMPAT_IA32) && __ELF_WORD_SIZE == 32 fill_regs32(thr, &status->pr_reg); fill_fpregs32(thr, fpregset); #else fill_regs(thr, &status->pr_reg); fill_fpregs(thr, fpregset); #endif } __elfN(putnote)(dst, off, "FreeBSD", NT_PRSTATUS, status, sizeof *status); __elfN(putnote)(dst, off, "FreeBSD", NT_FPREGSET, fpregset, sizeof *fpregset); /* * Allow for MD specific notes, as well as any MD * specific preparations for writing MI notes. */ __elfN(dump_thread)(thr, dst, off); thr = (thr == td) ? TAILQ_FIRST(&p->p_threads) : TAILQ_NEXT(thr, td_plist); if (thr == td) thr = TAILQ_NEXT(thr, td_plist); } notesz = *off - noteoff; if (dst != NULL) free(tempdata, M_TEMP); /* Align up to a page boundary for the program segments. */ *off = round_page(*off); if (dst != NULL) { Elf_Ehdr *ehdr; Elf_Phdr *phdr; struct phdr_closure phc; /* * Fill in the ELF header. */ ehdr = (Elf_Ehdr *)((char *)dst + ehoff); ehdr->e_ident[EI_MAG0] = ELFMAG0; ehdr->e_ident[EI_MAG1] = ELFMAG1; ehdr->e_ident[EI_MAG2] = ELFMAG2; ehdr->e_ident[EI_MAG3] = ELFMAG3; ehdr->e_ident[EI_CLASS] = ELF_CLASS; ehdr->e_ident[EI_DATA] = ELF_DATA; ehdr->e_ident[EI_VERSION] = EV_CURRENT; ehdr->e_ident[EI_OSABI] = ELFOSABI_FREEBSD; ehdr->e_ident[EI_ABIVERSION] = 0; ehdr->e_ident[EI_PAD] = 0; ehdr->e_type = ET_CORE; #if defined(COMPAT_IA32) && __ELF_WORD_SIZE == 32 ehdr->e_machine = EM_386; #else ehdr->e_machine = ELF_ARCH; #endif ehdr->e_version = EV_CURRENT; ehdr->e_entry = 0; ehdr->e_phoff = phoff; ehdr->e_flags = 0; ehdr->e_ehsize = sizeof(Elf_Ehdr); ehdr->e_phentsize = sizeof(Elf_Phdr); ehdr->e_phnum = numsegs + 1; ehdr->e_shentsize = sizeof(Elf_Shdr); ehdr->e_shnum = 0; ehdr->e_shstrndx = SHN_UNDEF; /* * Fill in the program header entries. */ phdr = (Elf_Phdr *)((char *)dst + phoff); /* The note segement. */ phdr->p_type = PT_NOTE; phdr->p_offset = noteoff; phdr->p_vaddr = 0; phdr->p_paddr = 0; phdr->p_filesz = notesz; phdr->p_memsz = 0; phdr->p_flags = 0; phdr->p_align = 0; phdr++; /* All the writable segments from the program. */ phc.phdr = phdr; phc.offset = *off; each_writable_segment(td, cb_put_phdr, &phc); } } static void __elfN(putnote)(void *dst, size_t *off, const char *name, int type, const void *desc, size_t descsz) { Elf_Note note; note.n_namesz = strlen(name) + 1; note.n_descsz = descsz; note.n_type = type; if (dst != NULL) bcopy(¬e, (char *)dst + *off, sizeof note); *off += sizeof note; if (dst != NULL) bcopy(name, (char *)dst + *off, note.n_namesz); *off += roundup2(note.n_namesz, sizeof(Elf_Size)); if (dst != NULL) bcopy(desc, (char *)dst + *off, note.n_descsz); *off += roundup2(note.n_descsz, sizeof(Elf_Size)); } /* * Tell kern_execve.c about it, with a little help from the linker. */ static struct execsw __elfN(execsw) = { __CONCAT(exec_, __elfN(imgact)), __XSTRING(__CONCAT(ELF, __ELF_WORD_SIZE)) }; EXEC_SET(__CONCAT(elf, __ELF_WORD_SIZE), __elfN(execsw)); Index: head/sys/kern/imgact_gzip.c =================================================================== --- head/sys/kern/imgact_gzip.c (revision 173360) +++ head/sys/kern/imgact_gzip.c (revision 173361) @@ -1,399 +1,403 @@ /*- * ---------------------------------------------------------------------------- * "THE BEER-WARE LICENSE" (Revision 42): * wrote this file. As long as you retain this notice you * can do whatever you want with this stuff. If we meet some day, and you think * this stuff is worth it, you can buy me a beer in return. Poul-Henning Kamp * ---------------------------------------------------------------------------- */ /* * This module handles execution of a.out files which have been run through * "gzip". This saves diskspace, but wastes cpu-cycles and VM. * * TODO: * text-segments should be made R/O after being filled * is the vm-stuff safe ? * should handle the entire header of gzip'ed stuff. * inflate isn't quite reentrant yet... * error-handling is a mess... * so is the rest... * tidy up unnecesary includes */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include struct imgact_gzip { struct image_params *ip; struct exec a_out; int error; int gotheader; int where; u_char *inbuf; u_long offset; u_long output; u_long len; int idx; u_long virtual_offset, file_offset, file_end, bss_size; }; static int exec_gzip_imgact(struct image_params *imgp); static int NextByte(void *vp); static int do_aout_hdr(struct imgact_gzip *); static int Flush(void *vp, u_char *, u_long siz); static int exec_gzip_imgact(imgp) struct image_params *imgp; { int error, error2 = 0; const u_char *p = (const u_char *) imgp->image_header; struct imgact_gzip igz; struct inflate infl; struct vmspace *vmspace; /* If these four are not OK, it isn't a gzip file */ if (p[0] != 0x1f) return -1; /* 0 Simply magic */ if (p[1] != 0x8b) return -1; /* 1 Simply magic */ if (p[2] != 0x08) return -1; /* 2 Compression method */ if (p[9] != 0x03) return -1; /* 9 OS compressed on */ /* * If this one contains anything but a comment or a filename marker, * we don't want to chew on it */ if (p[3] & ~(0x18)) return ENOEXEC; /* 3 Flags */ /* These are of no use to us */ /* 4-7 Timestamp */ /* 8 Extra flags */ bzero(&igz, sizeof igz); bzero(&infl, sizeof infl); infl.gz_private = (void *) &igz; infl.gz_input = NextByte; infl.gz_output = Flush; igz.ip = imgp; igz.idx = 10; if (p[3] & 0x08) { /* skip a filename */ while (p[igz.idx++]) if (igz.idx >= PAGE_SIZE) return ENOEXEC; } if (p[3] & 0x10) { /* skip a comment */ while (p[igz.idx++]) if (igz.idx >= PAGE_SIZE) return ENOEXEC; } igz.len = imgp->attr->va_size; error = inflate(&infl); /* * The unzipped file may not even have been long enough to contain * a header giving Flush() a chance to return error. Check for this. */ if ( !igz.gotheader ) return ENOEXEC; if ( !error ) { vmspace = imgp->proc->p_vmspace; error = vm_map_protect(&vmspace->vm_map, (vm_offset_t) vmspace->vm_taddr, (vm_offset_t) (vmspace->vm_taddr + (vmspace->vm_tsize << PAGE_SHIFT)) , VM_PROT_READ|VM_PROT_EXECUTE,0); } if (igz.inbuf) { error2 = vm_map_remove(kernel_map, (vm_offset_t) igz.inbuf, (vm_offset_t) igz.inbuf + PAGE_SIZE); } if (igz.error || error || error2) { printf("Output=%lu ", igz.output); printf("Inflate_error=%d igz.error=%d error2=%d where=%d\n", error, igz.error, error2, igz.where); } if (igz.error) return igz.error; if (error) return ENOEXEC; if (error2) return error2; return 0; } static int do_aout_hdr(struct imgact_gzip * gz) { int error; struct thread *td = curthread; struct vmspace *vmspace; vm_offset_t vmaddr; /* * Set file/virtual offset based on a.out variant. We do two cases: * host byte order and network byte order (for NetBSD compatibility) */ switch ((int) (gz->a_out.a_magic & 0xffff)) { case ZMAGIC: gz->virtual_offset = 0; if (gz->a_out.a_text) { gz->file_offset = PAGE_SIZE; } else { /* Bill's "screwball mode" */ gz->file_offset = 0; } break; case QMAGIC: gz->virtual_offset = PAGE_SIZE; gz->file_offset = 0; break; default: /* NetBSD compatibility */ switch ((int) (ntohl(gz->a_out.a_magic) & 0xffff)) { case ZMAGIC: case QMAGIC: gz->virtual_offset = PAGE_SIZE; gz->file_offset = 0; break; default: gz->where = __LINE__; return (-1); } } gz->bss_size = roundup(gz->a_out.a_bss, PAGE_SIZE); /* * Check various fields in header for validity/bounds. */ if ( /* entry point must lay with text region */ gz->a_out.a_entry < gz->virtual_offset || gz->a_out.a_entry >= gz->virtual_offset + gz->a_out.a_text || /* text and data size must each be page rounded */ gz->a_out.a_text & PAGE_MASK || gz->a_out.a_data & PAGE_MASK) { gz->where = __LINE__; return (-1); } /* * text/data/bss must not exceed limits */ PROC_LOCK(gz->ip->proc); if ( /* text can't exceed maximum text size */ gz->a_out.a_text > maxtsiz || /* data + bss can't exceed rlimit */ gz->a_out.a_data + gz->bss_size > lim_cur(gz->ip->proc, RLIMIT_DATA)) { PROC_UNLOCK(gz->ip->proc); gz->where = __LINE__; return (ENOMEM); } PROC_UNLOCK(gz->ip->proc); /* Find out how far we should go */ gz->file_end = gz->file_offset + gz->a_out.a_text + gz->a_out.a_data; /* * Avoid a possible deadlock if the current address space is destroyed * and that address space maps the locked vnode. In the common case, * the locked vnode's v_usecount is decremented but remains greater * than zero. Consequently, the vnode lock is not needed by vrele(). * However, in cases where the vnode lock is external, such as nullfs, * v_usecount may become zero. */ VOP_UNLOCK(gz->ip->vp, 0, td); /* * Destroy old process VM and create a new one (with a new stack) */ - exec_new_vmspace(gz->ip, &aout_sysvec); + error = exec_new_vmspace(gz->ip, &aout_sysvec); vn_lock(gz->ip->vp, LK_EXCLUSIVE | LK_RETRY, td); + if (error) { + gz->where = __LINE__; + return (error); + } vmspace = gz->ip->proc->p_vmspace; vmaddr = gz->virtual_offset; error = vm_mmap(&vmspace->vm_map, &vmaddr, gz->a_out.a_text + gz->a_out.a_data, VM_PROT_ALL, VM_PROT_ALL, MAP_ANON | MAP_FIXED, OBJT_DEFAULT, NULL, 0); if (error) { gz->where = __LINE__; return (error); } if (gz->bss_size != 0) { /* * Allocate demand-zeroed area for uninitialized data. * "bss" = 'block started by symbol' - named after the * IBM 7090 instruction of the same name. */ vmaddr = gz->virtual_offset + gz->a_out.a_text + gz->a_out.a_data; error = vm_map_find(&vmspace->vm_map, NULL, 0, &vmaddr, gz->bss_size, FALSE, VM_PROT_ALL, VM_PROT_ALL, 0); if (error) { gz->where = __LINE__; return (error); } } /* Fill in process VM information */ vmspace->vm_tsize = gz->a_out.a_text >> PAGE_SHIFT; vmspace->vm_dsize = (gz->a_out.a_data + gz->bss_size) >> PAGE_SHIFT; vmspace->vm_taddr = (caddr_t) (uintptr_t) gz->virtual_offset; vmspace->vm_daddr = (caddr_t) (uintptr_t) (gz->virtual_offset + gz->a_out.a_text); /* Fill in image_params */ gz->ip->interpreted = 0; gz->ip->entry_addr = gz->a_out.a_entry; gz->ip->proc->p_sysent = &aout_sysvec; return 0; } static int NextByte(void *vp) { int error; struct imgact_gzip *igz = (struct imgact_gzip *) vp; if (igz->idx >= igz->len) { igz->where = __LINE__; return GZ_EOF; } if (igz->inbuf && igz->idx < (igz->offset + PAGE_SIZE)) { return igz->inbuf[(igz->idx++) - igz->offset]; } if (igz->inbuf) { error = vm_map_remove(kernel_map, (vm_offset_t) igz->inbuf, (vm_offset_t) igz->inbuf + PAGE_SIZE); if (error) { igz->where = __LINE__; igz->error = error; return GZ_EOF; } } igz->offset = igz->idx & ~PAGE_MASK; error = vm_mmap(kernel_map, /* map */ (vm_offset_t *) & igz->inbuf, /* address */ PAGE_SIZE, /* size */ VM_PROT_READ, /* protection */ VM_PROT_READ, /* max protection */ 0, /* flags */ OBJT_VNODE, /* handle type */ igz->ip->vp, /* vnode */ igz->offset); /* offset */ if (error) { igz->where = __LINE__; igz->error = error; return GZ_EOF; } return igz->inbuf[(igz->idx++) - igz->offset]; } static int Flush(void *vp, u_char * ptr, u_long siz) { struct imgact_gzip *gz = (struct imgact_gzip *) vp; u_char *p = ptr, *q; int i; /* First, find an a.out-header. */ if (gz->output < sizeof gz->a_out) { q = (u_char *) & gz->a_out; i = min(siz, sizeof gz->a_out - gz->output); bcopy(p, q + gz->output, i); gz->output += i; p += i; siz -= i; if (gz->output == sizeof gz->a_out) { gz->gotheader = 1; i = do_aout_hdr(gz); if (i == -1) { if (!gz->where) gz->where = __LINE__; gz->error = ENOEXEC; return ENOEXEC; } else if (i) { gz->where = __LINE__; gz->error = i; return ENOEXEC; } if (gz->file_offset == 0) { q = (u_char *) (uintptr_t) gz->virtual_offset; copyout(&gz->a_out, q, sizeof gz->a_out); } } } /* Skip over zero-padded first PAGE if needed */ if (gz->output < gz->file_offset && gz->output + siz > gz->file_offset) { i = min(siz, gz->file_offset - gz->output); gz->output += i; p += i; siz -= i; } if (gz->output >= gz->file_offset && gz->output < gz->file_end) { i = min(siz, gz->file_end - gz->output); q = (u_char *) (uintptr_t) (gz->virtual_offset + gz->output - gz->file_offset); copyout(p, q, i); gz->output += i; p += i; siz -= i; } gz->output += siz; return 0; } /* * Tell kern_execve.c about it, with a little help from the linker. */ static struct execsw gzip_execsw = {exec_gzip_imgact, "gzip"}; EXEC_SET(execgzip, gzip_execsw); Index: head/sys/kern/kern_exec.c =================================================================== --- head/sys/kern/kern_exec.c (revision 173360) +++ head/sys/kern/kern_exec.c (revision 173361) @@ -1,1299 +1,1301 @@ /*- * Copyright (c) 1993, David Greenman * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_hwpmc_hooks.h" #include "opt_ktrace.h" #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #include #include #include #include #include #include #include #include #include #ifdef HWPMC_HOOKS #include #endif #include #include #include MALLOC_DEFINE(M_PARGS, "proc-args", "Process arguments"); static int sysctl_kern_ps_strings(SYSCTL_HANDLER_ARGS); static int sysctl_kern_usrstack(SYSCTL_HANDLER_ARGS); static int sysctl_kern_stackprot(SYSCTL_HANDLER_ARGS); static int do_execve(struct thread *td, struct image_args *args, struct mac *mac_p); static void exec_free_args(struct image_args *); /* XXX This should be vm_size_t. */ SYSCTL_PROC(_kern, KERN_PS_STRINGS, ps_strings, CTLTYPE_ULONG|CTLFLAG_RD, NULL, 0, sysctl_kern_ps_strings, "LU", ""); /* XXX This should be vm_size_t. */ SYSCTL_PROC(_kern, KERN_USRSTACK, usrstack, CTLTYPE_ULONG|CTLFLAG_RD, NULL, 0, sysctl_kern_usrstack, "LU", ""); SYSCTL_PROC(_kern, OID_AUTO, stackprot, CTLTYPE_INT|CTLFLAG_RD, NULL, 0, sysctl_kern_stackprot, "I", ""); u_long ps_arg_cache_limit = PAGE_SIZE / 16; SYSCTL_ULONG(_kern, OID_AUTO, ps_arg_cache_limit, CTLFLAG_RW, &ps_arg_cache_limit, 0, ""); static int sysctl_kern_ps_strings(SYSCTL_HANDLER_ARGS) { struct proc *p; int error; p = curproc; #ifdef SCTL_MASK32 if (req->flags & SCTL_MASK32) { unsigned int val; val = (unsigned int)p->p_sysent->sv_psstrings; error = SYSCTL_OUT(req, &val, sizeof(val)); } else #endif error = SYSCTL_OUT(req, &p->p_sysent->sv_psstrings, sizeof(p->p_sysent->sv_psstrings)); return error; } static int sysctl_kern_usrstack(SYSCTL_HANDLER_ARGS) { struct proc *p; int error; p = curproc; #ifdef SCTL_MASK32 if (req->flags & SCTL_MASK32) { unsigned int val; val = (unsigned int)p->p_sysent->sv_usrstack; error = SYSCTL_OUT(req, &val, sizeof(val)); } else #endif error = SYSCTL_OUT(req, &p->p_sysent->sv_usrstack, sizeof(p->p_sysent->sv_usrstack)); return error; } static int sysctl_kern_stackprot(SYSCTL_HANDLER_ARGS) { struct proc *p; p = curproc; return (SYSCTL_OUT(req, &p->p_sysent->sv_stackprot, sizeof(p->p_sysent->sv_stackprot))); } /* * Each of the items is a pointer to a `const struct execsw', hence the * double pointer here. */ static const struct execsw **execsw; #ifndef _SYS_SYSPROTO_H_ struct execve_args { char *fname; char **argv; char **envv; }; #endif int execve(td, uap) struct thread *td; struct execve_args /* { char *fname; char **argv; char **envv; } */ *uap; { int error; struct image_args args; error = exec_copyin_args(&args, uap->fname, UIO_USERSPACE, uap->argv, uap->envv); if (error == 0) error = kern_execve(td, &args, NULL); return (error); } #ifndef _SYS_SYSPROTO_H_ struct __mac_execve_args { char *fname; char **argv; char **envv; struct mac *mac_p; }; #endif int __mac_execve(td, uap) struct thread *td; struct __mac_execve_args /* { char *fname; char **argv; char **envv; struct mac *mac_p; } */ *uap; { #ifdef MAC int error; struct image_args args; error = exec_copyin_args(&args, uap->fname, UIO_USERSPACE, uap->argv, uap->envv); if (error == 0) error = kern_execve(td, &args, uap->mac_p); return (error); #else return (ENOSYS); #endif } /* * XXX: kern_execve has the astonishing property of not always returning to * the caller. If sufficiently bad things happen during the call to * do_execve(), it can end up calling exit1(); as a result, callers must * avoid doing anything which they might need to undo (e.g., allocating * memory). */ int kern_execve(td, args, mac_p) struct thread *td; struct image_args *args; struct mac *mac_p; { struct proc *p = td->td_proc; int error; AUDIT_ARG(argv, args->begin_argv, args->argc, args->begin_envv - args->begin_argv); AUDIT_ARG(envv, args->begin_envv, args->envc, args->endp - args->begin_envv); if (p->p_flag & P_HADTHREADS) { PROC_LOCK(p); if (thread_single(SINGLE_BOUNDARY)) { PROC_UNLOCK(p); exec_free_args(args); return (ERESTART); /* Try again later. */ } PROC_UNLOCK(p); } error = do_execve(td, args, mac_p); if (p->p_flag & P_HADTHREADS) { PROC_LOCK(p); /* * If success, we upgrade to SINGLE_EXIT state to * force other threads to suicide. */ if (error == 0) thread_single(SINGLE_EXIT); else thread_single_end(); PROC_UNLOCK(p); } return (error); } /* * In-kernel implementation of execve(). All arguments are assumed to be * userspace pointers from the passed thread. */ static int do_execve(td, args, mac_p) struct thread *td; struct image_args *args; struct mac *mac_p; { struct proc *p = td->td_proc; struct nameidata nd, *ndp; struct ucred *newcred = NULL, *oldcred; struct uidinfo *euip; register_t *stack_base; int error, len, i; struct image_params image_params, *imgp; struct vattr attr; int (*img_first)(struct image_params *); struct pargs *oldargs = NULL, *newargs = NULL; struct sigacts *oldsigacts, *newsigacts; #ifdef KTRACE struct vnode *tracevp = NULL; struct ucred *tracecred = NULL; #endif struct vnode *textvp = NULL; int credential_changing; int vfslocked; int textset; #ifdef MAC struct label *interplabel = NULL; int will_transition; #endif #ifdef HWPMC_HOOKS struct pmckern_procexec pe; #endif vfslocked = 0; imgp = &image_params; /* * Lock the process and set the P_INEXEC flag to indicate that * it should be left alone until we're done here. This is * necessary to avoid race conditions - e.g. in ptrace() - * that might allow a local user to illicitly obtain elevated * privileges. */ PROC_LOCK(p); KASSERT((p->p_flag & P_INEXEC) == 0, ("%s(): process already has P_INEXEC flag", __func__)); p->p_flag |= P_INEXEC; PROC_UNLOCK(p); /* * Initialize part of the common data */ imgp->proc = p; imgp->execlabel = NULL; imgp->attr = &attr; imgp->entry_addr = 0; imgp->vmspace_destroyed = 0; imgp->interpreted = 0; imgp->interpreter_name = args->buf + PATH_MAX + ARG_MAX; imgp->auxargs = NULL; imgp->vp = NULL; imgp->object = NULL; imgp->firstpage = NULL; imgp->ps_strings = 0; imgp->auxarg_size = 0; imgp->args = args; #ifdef MAC error = mac_execve_enter(imgp, mac_p); if (error) goto exec_fail; #endif imgp->image_header = NULL; /* * Translate the file name. namei() returns a vnode pointer * in ni_vp amoung other things. * * XXXAUDIT: It would be desirable to also audit the name of the * interpreter if this is an interpreted binary. */ ndp = &nd; NDINIT(ndp, LOOKUP, ISOPEN | LOCKLEAF | FOLLOW | SAVENAME | MPSAFE | AUDITVNODE1, UIO_SYSSPACE, args->fname, td); interpret: error = namei(ndp); if (error) goto exec_fail; vfslocked = NDHASGIANT(ndp); imgp->vp = ndp->ni_vp; /* * Check file permissions (also 'opens' file) */ error = exec_check_permissions(imgp); if (error) goto exec_fail_dealloc; imgp->object = imgp->vp->v_object; if (imgp->object != NULL) vm_object_reference(imgp->object); /* * Set VV_TEXT now so no one can write to the executable while we're * activating it. * * Remember if this was set before and unset it in case this is not * actually an executable image. */ textset = imgp->vp->v_vflag & VV_TEXT; imgp->vp->v_vflag |= VV_TEXT; error = exec_map_first_page(imgp); if (error) goto exec_fail_dealloc; /* * If the current process has a special image activator it * wants to try first, call it. For example, emulating shell * scripts differently. */ error = -1; if ((img_first = imgp->proc->p_sysent->sv_imgact_try) != NULL) error = img_first(imgp); /* * Loop through the list of image activators, calling each one. * An activator returns -1 if there is no match, 0 on success, * and an error otherwise. */ for (i = 0; error == -1 && execsw[i]; ++i) { if (execsw[i]->ex_imgact == NULL || execsw[i]->ex_imgact == img_first) { continue; } error = (*execsw[i]->ex_imgact)(imgp); } if (error) { if (error == -1) { if (textset == 0) imgp->vp->v_vflag &= ~VV_TEXT; error = ENOEXEC; } goto exec_fail_dealloc; } /* * Special interpreter operation, cleanup and loop up to try to * activate the interpreter. */ if (imgp->interpreted) { exec_unmap_first_page(imgp); /* * VV_TEXT needs to be unset for scripts. There is a short * period before we determine that something is a script where * VV_TEXT will be set. The vnode lock is held over this * entire period so nothing should illegitimately be blocked. */ imgp->vp->v_vflag &= ~VV_TEXT; /* free name buffer and old vnode */ NDFREE(ndp, NDF_ONLY_PNBUF); #ifdef MAC interplabel = mac_vnode_label_alloc(); mac_vnode_copy_label(ndp->ni_vp->v_label, interplabel); #endif vput(ndp->ni_vp); vm_object_deallocate(imgp->object); imgp->object = NULL; VFS_UNLOCK_GIANT(vfslocked); vfslocked = 0; /* set new name to that of the interpreter */ NDINIT(ndp, LOOKUP, LOCKLEAF | FOLLOW | SAVENAME | MPSAFE, UIO_SYSSPACE, imgp->interpreter_name, td); goto interpret; } /* * Copy out strings (args and env) and initialize stack base */ if (p->p_sysent->sv_copyout_strings) stack_base = (*p->p_sysent->sv_copyout_strings)(imgp); else stack_base = exec_copyout_strings(imgp); /* * If custom stack fixup routine present for this process * let it do the stack setup. * Else stuff argument count as first item on stack */ if (p->p_sysent->sv_fixup != NULL) (*p->p_sysent->sv_fixup)(&stack_base, imgp); else suword(--stack_base, imgp->args->argc); /* * For security and other reasons, the file descriptor table cannot * be shared after an exec. */ fdunshare(p, td); /* * Malloc things before we need locks. */ newcred = crget(); euip = uifind(attr.va_uid); i = imgp->args->begin_envv - imgp->args->begin_argv; /* Cache arguments if they fit inside our allowance */ if (ps_arg_cache_limit >= i + sizeof(struct pargs)) { newargs = pargs_alloc(i); bcopy(imgp->args->begin_argv, newargs->ar_args, i); } /* close files on exec */ VOP_UNLOCK(imgp->vp, 0, td); fdcloseexec(td); vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY, td); /* Get a reference to the vnode prior to locking the proc */ VREF(ndp->ni_vp); /* * For security and other reasons, signal handlers cannot * be shared after an exec. The new process gets a copy of the old * handlers. In execsigs(), the new process will have its signals * reset. */ PROC_LOCK(p); if (sigacts_shared(p->p_sigacts)) { oldsigacts = p->p_sigacts; PROC_UNLOCK(p); newsigacts = sigacts_alloc(); sigacts_copy(newsigacts, oldsigacts); PROC_LOCK(p); p->p_sigacts = newsigacts; } else oldsigacts = NULL; /* Stop profiling */ stopprofclock(p); /* reset caught signals */ execsigs(p); /* name this process - nameiexec(p, ndp) */ len = min(ndp->ni_cnd.cn_namelen,MAXCOMLEN); bcopy(ndp->ni_cnd.cn_nameptr, p->p_comm, len); p->p_comm[len] = 0; /* * mark as execed, wakeup the process that vforked (if any) and tell * it that it now has its own resources back */ p->p_flag |= P_EXEC; if (p->p_pptr && (p->p_flag & P_PPWAIT)) { p->p_flag &= ~P_PPWAIT; wakeup(p->p_pptr); } /* * Implement image setuid/setgid. * * Don't honor setuid/setgid if the filesystem prohibits it or if * the process is being traced. * * XXXMAC: For the time being, use NOSUID to also prohibit * transitions on the file system. */ oldcred = p->p_ucred; credential_changing = 0; credential_changing |= (attr.va_mode & VSUID) && oldcred->cr_uid != attr.va_uid; credential_changing |= (attr.va_mode & VSGID) && oldcred->cr_gid != attr.va_gid; #ifdef MAC will_transition = mac_vnode_execve_will_transition(oldcred, imgp->vp, interplabel, imgp); credential_changing |= will_transition; #endif if (credential_changing && (imgp->vp->v_mount->mnt_flag & MNT_NOSUID) == 0 && (p->p_flag & P_TRACED) == 0) { /* * Turn off syscall tracing for set-id programs, except for * root. Record any set-id flags first to make sure that * we do not regain any tracing during a possible block. */ setsugid(p); #ifdef KTRACE if (p->p_tracevp != NULL && priv_check_cred(oldcred, PRIV_DEBUG_DIFFCRED, 0)) { mtx_lock(&ktrace_mtx); p->p_traceflag = 0; tracevp = p->p_tracevp; p->p_tracevp = NULL; tracecred = p->p_tracecred; p->p_tracecred = NULL; mtx_unlock(&ktrace_mtx); } #endif /* * Close any file descriptors 0..2 that reference procfs, * then make sure file descriptors 0..2 are in use. * * setugidsafety() may call closef() and then pfind() * which may grab the process lock. * fdcheckstd() may call falloc() which may block to * allocate memory, so temporarily drop the process lock. */ PROC_UNLOCK(p); setugidsafety(td); VOP_UNLOCK(imgp->vp, 0, td); error = fdcheckstd(td); vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY, td); if (error != 0) goto done1; PROC_LOCK(p); /* * Set the new credentials. */ crcopy(newcred, oldcred); if (attr.va_mode & VSUID) change_euid(newcred, euip); if (attr.va_mode & VSGID) change_egid(newcred, attr.va_gid); #ifdef MAC if (will_transition) { mac_vnode_execve_transition(oldcred, newcred, imgp->vp, interplabel, imgp); } #endif /* * Implement correct POSIX saved-id behavior. * * XXXMAC: Note that the current logic will save the * uid and gid if a MAC domain transition occurs, even * though maybe it shouldn't. */ change_svuid(newcred, newcred->cr_uid); change_svgid(newcred, newcred->cr_gid); p->p_ucred = newcred; newcred = NULL; } else { if (oldcred->cr_uid == oldcred->cr_ruid && oldcred->cr_gid == oldcred->cr_rgid) p->p_flag &= ~P_SUGID; /* * Implement correct POSIX saved-id behavior. * * XXX: It's not clear that the existing behavior is * POSIX-compliant. A number of sources indicate that the * saved uid/gid should only be updated if the new ruid is * not equal to the old ruid, or the new euid is not equal * to the old euid and the new euid is not equal to the old * ruid. The FreeBSD code always updates the saved uid/gid. * Also, this code uses the new (replaced) euid and egid as * the source, which may or may not be the right ones to use. */ if (oldcred->cr_svuid != oldcred->cr_uid || oldcred->cr_svgid != oldcred->cr_gid) { crcopy(newcred, oldcred); change_svuid(newcred, newcred->cr_uid); change_svgid(newcred, newcred->cr_gid); p->p_ucred = newcred; newcred = NULL; } } /* * Store the vp for use in procfs. This vnode was referenced prior * to locking the proc lock. */ textvp = p->p_textvp; p->p_textvp = ndp->ni_vp; /* * Notify others that we exec'd, and clear the P_INEXEC flag * as we're now a bona fide freshly-execed process. */ KNOTE_LOCKED(&p->p_klist, NOTE_EXEC); p->p_flag &= ~P_INEXEC; /* * If tracing the process, trap to debugger so breakpoints * can be set before the program executes. * Use tdsignal to deliver signal to current thread, use * psignal may cause the signal to be delivered to wrong thread * because that thread will exit, remember we are going to enter * single thread mode. */ if (p->p_flag & P_TRACED) tdsignal(p, td, SIGTRAP, NULL); /* clear "fork but no exec" flag, as we _are_ execing */ p->p_acflag &= ~AFORK; /* * Free any previous argument cache and replace it with * the new argument cache, if any. */ oldargs = p->p_args; p->p_args = newargs; newargs = NULL; #ifdef HWPMC_HOOKS /* * Check if system-wide sampling is in effect or if the * current process is using PMCs. If so, do exec() time * processing. This processing needs to happen AFTER the * P_INEXEC flag is cleared. * * The proc lock needs to be released before taking the PMC * SX. */ if (PMC_SYSTEM_SAMPLING_ACTIVE() || PMC_PROC_IS_USING_PMCS(p)) { PROC_UNLOCK(p); pe.pm_credentialschanged = credential_changing; pe.pm_entryaddr = imgp->entry_addr; PMC_CALL_HOOK_X(td, PMC_FN_PROCESS_EXEC, (void *) &pe); } else PROC_UNLOCK(p); #else /* !HWPMC_HOOKS */ PROC_UNLOCK(p); #endif /* Set values passed into the program in registers. */ if (p->p_sysent->sv_setregs) (*p->p_sysent->sv_setregs)(td, imgp->entry_addr, (u_long)(uintptr_t)stack_base, imgp->ps_strings); else exec_setregs(td, imgp->entry_addr, (u_long)(uintptr_t)stack_base, imgp->ps_strings); vfs_mark_atime(imgp->vp, td); done1: /* * Free any resources malloc'd earlier that we didn't use. */ uifree(euip); if (newcred == NULL) crfree(oldcred); else crfree(newcred); VOP_UNLOCK(imgp->vp, 0, td); /* * Handle deferred decrement of ref counts. */ if (textvp != NULL) { int tvfslocked; tvfslocked = VFS_LOCK_GIANT(textvp->v_mount); vrele(textvp); VFS_UNLOCK_GIANT(tvfslocked); } if (ndp->ni_vp && error != 0) vrele(ndp->ni_vp); #ifdef KTRACE if (tracevp != NULL) { int tvfslocked; tvfslocked = VFS_LOCK_GIANT(tracevp->v_mount); vrele(tracevp); VFS_UNLOCK_GIANT(tvfslocked); } if (tracecred != NULL) crfree(tracecred); #endif vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY, td); if (oldargs != NULL) pargs_drop(oldargs); if (newargs != NULL) pargs_drop(newargs); if (oldsigacts != NULL) sigacts_free(oldsigacts); exec_fail_dealloc: /* * free various allocated resources */ if (imgp->firstpage != NULL) exec_unmap_first_page(imgp); if (imgp->vp != NULL) { NDFREE(ndp, NDF_ONLY_PNBUF); vput(imgp->vp); } if (imgp->object != NULL) vm_object_deallocate(imgp->object); if (error == 0) { /* * Stop the process here if its stop event mask has * the S_EXEC bit set. */ STOPEVENT(p, S_EXEC, 0); goto done2; } exec_fail: /* we're done here, clear P_INEXEC */ PROC_LOCK(p); p->p_flag &= ~P_INEXEC; PROC_UNLOCK(p); done2: #ifdef MAC mac_execve_exit(imgp); if (interplabel != NULL) mac_vnode_label_free(interplabel); #endif VFS_UNLOCK_GIANT(vfslocked); exec_free_args(args); if (error && imgp->vmspace_destroyed) { /* sorry, no more process anymore. exit gracefully */ exit1(td, W_EXITCODE(0, SIGABRT)); /* NOT REACHED */ } return (error); } int exec_map_first_page(imgp) struct image_params *imgp; { int rv, i; int initial_pagein; vm_page_t ma[VM_INITIAL_PAGEIN]; vm_object_t object; if (imgp->firstpage != NULL) exec_unmap_first_page(imgp); object = imgp->vp->v_object; if (object == NULL) return (EACCES); VM_OBJECT_LOCK(object); ma[0] = vm_page_grab(object, 0, VM_ALLOC_NORMAL | VM_ALLOC_RETRY); if ((ma[0]->valid & VM_PAGE_BITS_ALL) != VM_PAGE_BITS_ALL) { initial_pagein = VM_INITIAL_PAGEIN; if (initial_pagein > object->size) initial_pagein = object->size; for (i = 1; i < initial_pagein; i++) { if ((ma[i] = vm_page_lookup(object, i)) != NULL) { if (ma[i]->valid) break; if ((ma[i]->oflags & VPO_BUSY) || ma[i]->busy) break; vm_page_busy(ma[i]); } else { ma[i] = vm_page_alloc(object, i, VM_ALLOC_NORMAL | VM_ALLOC_IFNOTCACHED); if (ma[i] == NULL) break; } } initial_pagein = i; rv = vm_pager_get_pages(object, ma, initial_pagein, 0); ma[0] = vm_page_lookup(object, 0); if ((rv != VM_PAGER_OK) || (ma[0] == NULL) || (ma[0]->valid == 0)) { if (ma[0]) { vm_page_lock_queues(); vm_page_free(ma[0]); vm_page_unlock_queues(); } VM_OBJECT_UNLOCK(object); return (EIO); } } vm_page_lock_queues(); vm_page_hold(ma[0]); vm_page_unlock_queues(); vm_page_wakeup(ma[0]); VM_OBJECT_UNLOCK(object); imgp->firstpage = sf_buf_alloc(ma[0], 0); imgp->image_header = (char *)sf_buf_kva(imgp->firstpage); return (0); } void exec_unmap_first_page(imgp) struct image_params *imgp; { vm_page_t m; if (imgp->firstpage != NULL) { m = sf_buf_page(imgp->firstpage); sf_buf_free(imgp->firstpage); imgp->firstpage = NULL; vm_page_lock_queues(); vm_page_unhold(m); vm_page_unlock_queues(); } } /* * Destroy old address space, and allocate a new stack * The new stack is only SGROWSIZ large because it is grown * automatically in trap.c. */ int exec_new_vmspace(imgp, sv) struct image_params *imgp; struct sysentvec *sv; { int error; struct proc *p = imgp->proc; struct vmspace *vmspace = p->p_vmspace; vm_offset_t stack_addr; vm_map_t map; u_long ssiz; imgp->vmspace_destroyed = 1; imgp->sysent = sv; /* May be called with Giant held */ EVENTHANDLER_INVOKE(process_exec, p, imgp); /* * Blow away entire process VM, if address space not shared, * otherwise, create a new VM space so that other threads are * not disrupted */ map = &vmspace->vm_map; if (vmspace->vm_refcnt == 1 && vm_map_min(map) == sv->sv_minuser && vm_map_max(map) == sv->sv_maxuser) { shmexit(vmspace); pmap_remove_pages(vmspace_pmap(vmspace)); vm_map_remove(map, vm_map_min(map), vm_map_max(map)); } else { - vmspace_exec(p, sv->sv_minuser, sv->sv_maxuser); + error = vmspace_exec(p, sv->sv_minuser, sv->sv_maxuser); + if (error) + return (error); vmspace = p->p_vmspace; map = &vmspace->vm_map; } /* Allocate a new stack */ if (sv->sv_maxssiz != NULL) ssiz = *sv->sv_maxssiz; else ssiz = maxssiz; stack_addr = sv->sv_usrstack - ssiz; error = vm_map_stack(map, stack_addr, (vm_size_t)ssiz, sv->sv_stackprot, VM_PROT_ALL, MAP_STACK_GROWS_DOWN); if (error) return (error); #ifdef __ia64__ /* Allocate a new register stack */ stack_addr = IA64_BACKINGSTORE; error = vm_map_stack(map, stack_addr, (vm_size_t)ssiz, sv->sv_stackprot, VM_PROT_ALL, MAP_STACK_GROWS_UP); if (error) return (error); #endif /* vm_ssize and vm_maxsaddr are somewhat antiquated concepts in the * VM_STACK case, but they are still used to monitor the size of the * process stack so we can check the stack rlimit. */ vmspace->vm_ssize = sgrowsiz >> PAGE_SHIFT; vmspace->vm_maxsaddr = (char *)sv->sv_usrstack - ssiz; return (0); } /* * Copy out argument and environment strings from the old process address * space into the temporary string buffer. */ int exec_copyin_args(struct image_args *args, char *fname, enum uio_seg segflg, char **argv, char **envv) { char *argp, *envp; int error; size_t length; error = 0; bzero(args, sizeof(*args)); if (argv == NULL) return (EFAULT); /* * Allocate temporary demand zeroed space for argument and * environment strings: * * o ARG_MAX for argument and environment; * o MAXSHELLCMDLEN for the name of interpreters. */ args->buf = (char *) kmem_alloc_wait(exec_map, PATH_MAX + ARG_MAX + MAXSHELLCMDLEN); if (args->buf == NULL) return (ENOMEM); args->begin_argv = args->buf; args->endp = args->begin_argv; args->stringspace = ARG_MAX; args->fname = args->buf + ARG_MAX; /* * Copy the file name. */ error = (segflg == UIO_SYSSPACE) ? copystr(fname, args->fname, PATH_MAX, &length) : copyinstr(fname, args->fname, PATH_MAX, &length); if (error != 0) goto err_exit; /* * extract arguments first */ while ((argp = (caddr_t) (intptr_t) fuword(argv++))) { if (argp == (caddr_t) -1) { error = EFAULT; goto err_exit; } if ((error = copyinstr(argp, args->endp, args->stringspace, &length))) { if (error == ENAMETOOLONG) error = E2BIG; goto err_exit; } args->stringspace -= length; args->endp += length; args->argc++; } args->begin_envv = args->endp; /* * extract environment strings */ if (envv) { while ((envp = (caddr_t)(intptr_t)fuword(envv++))) { if (envp == (caddr_t)-1) { error = EFAULT; goto err_exit; } if ((error = copyinstr(envp, args->endp, args->stringspace, &length))) { if (error == ENAMETOOLONG) error = E2BIG; goto err_exit; } args->stringspace -= length; args->endp += length; args->envc++; } } return (0); err_exit: exec_free_args(args); return (error); } static void exec_free_args(struct image_args *args) { if (args->buf) { kmem_free_wakeup(exec_map, (vm_offset_t)args->buf, PATH_MAX + ARG_MAX + MAXSHELLCMDLEN); args->buf = NULL; } } /* * Copy strings out to the new process address space, constructing new arg * and env vector tables. Return a pointer to the base so that it can be used * as the initial stack pointer. */ register_t * exec_copyout_strings(imgp) struct image_params *imgp; { int argc, envc; char **vectp; char *stringp, *destp; register_t *stack_base; struct ps_strings *arginfo; struct proc *p; int szsigcode; /* * Calculate string base and vector table pointers. * Also deal with signal trampoline code for this exec type. */ p = imgp->proc; szsigcode = 0; arginfo = (struct ps_strings *)p->p_sysent->sv_psstrings; if (p->p_sysent->sv_szsigcode != NULL) szsigcode = *(p->p_sysent->sv_szsigcode); destp = (caddr_t)arginfo - szsigcode - SPARE_USRSPACE - roundup((ARG_MAX - imgp->args->stringspace), sizeof(char *)); /* * install sigcode */ if (szsigcode) copyout(p->p_sysent->sv_sigcode, ((caddr_t)arginfo - szsigcode), szsigcode); /* * If we have a valid auxargs ptr, prepare some room * on the stack. */ if (imgp->auxargs) { /* * 'AT_COUNT*2' is size for the ELF Auxargs data. This is for * lower compatibility. */ imgp->auxarg_size = (imgp->auxarg_size) ? imgp->auxarg_size : (AT_COUNT * 2); /* * The '+ 2' is for the null pointers at the end of each of * the arg and env vector sets,and imgp->auxarg_size is room * for argument of Runtime loader. */ vectp = (char **)(destp - (imgp->args->argc + imgp->args->envc + 2 + imgp->auxarg_size) * sizeof(char *)); } else { /* * The '+ 2' is for the null pointers at the end of each of * the arg and env vector sets */ vectp = (char **)(destp - (imgp->args->argc + imgp->args->envc + 2) * sizeof(char *)); } /* * vectp also becomes our initial stack base */ stack_base = (register_t *)vectp; stringp = imgp->args->begin_argv; argc = imgp->args->argc; envc = imgp->args->envc; /* * Copy out strings - arguments and environment. */ copyout(stringp, destp, ARG_MAX - imgp->args->stringspace); /* * Fill in "ps_strings" struct for ps, w, etc. */ suword(&arginfo->ps_argvstr, (long)(intptr_t)vectp); suword(&arginfo->ps_nargvstr, argc); /* * Fill in argument portion of vector table. */ for (; argc > 0; --argc) { suword(vectp++, (long)(intptr_t)destp); while (*stringp++ != 0) destp++; destp++; } /* a null vector table pointer separates the argp's from the envp's */ suword(vectp++, 0); suword(&arginfo->ps_envstr, (long)(intptr_t)vectp); suword(&arginfo->ps_nenvstr, envc); /* * Fill in environment portion of vector table. */ for (; envc > 0; --envc) { suword(vectp++, (long)(intptr_t)destp); while (*stringp++ != 0) destp++; destp++; } /* end of vector table is a null pointer */ suword(vectp, 0); return (stack_base); } /* * Check permissions of file to execute. * Called with imgp->vp locked. * Return 0 for success or error code on failure. */ int exec_check_permissions(imgp) struct image_params *imgp; { struct vnode *vp = imgp->vp; struct vattr *attr = imgp->attr; struct thread *td; int error; td = curthread; /* XXXKSE */ /* Get file attributes */ error = VOP_GETATTR(vp, attr, td->td_ucred, td); if (error) return (error); #ifdef MAC error = mac_vnode_check_exec(td->td_ucred, imgp->vp, imgp); if (error) return (error); #endif /* * 1) Check if file execution is disabled for the filesystem that this * file resides on. * 2) Insure that at least one execute bit is on - otherwise root * will always succeed, and we don't want to happen unless the * file really is executable. * 3) Insure that the file is a regular file. */ if ((vp->v_mount->mnt_flag & MNT_NOEXEC) || ((attr->va_mode & 0111) == 0) || (attr->va_type != VREG)) return (EACCES); /* * Zero length files can't be exec'd */ if (attr->va_size == 0) return (ENOEXEC); /* * Check for execute permission to file based on current credentials. */ error = VOP_ACCESS(vp, VEXEC, td->td_ucred, td); if (error) return (error); /* * Check number of open-for-writes on the file and deny execution * if there are any. */ if (vp->v_writecount) return (ETXTBSY); /* * Call filesystem specific open routine (which does nothing in the * general case). */ error = VOP_OPEN(vp, FREAD, td->td_ucred, td, NULL); return (error); } /* * Exec handler registration */ int exec_register(execsw_arg) const struct execsw *execsw_arg; { const struct execsw **es, **xs, **newexecsw; int count = 2; /* New slot and trailing NULL */ if (execsw) for (es = execsw; *es; es++) count++; newexecsw = malloc(count * sizeof(*es), M_TEMP, M_WAITOK); if (newexecsw == NULL) return (ENOMEM); xs = newexecsw; if (execsw) for (es = execsw; *es; es++) *xs++ = *es; *xs++ = execsw_arg; *xs = NULL; if (execsw) free(execsw, M_TEMP); execsw = newexecsw; return (0); } int exec_unregister(execsw_arg) const struct execsw *execsw_arg; { const struct execsw **es, **xs, **newexecsw; int count = 1; if (execsw == NULL) panic("unregister with no handlers left?\n"); for (es = execsw; *es; es++) { if (*es == execsw_arg) break; } if (*es == NULL) return (ENOENT); for (es = execsw; *es; es++) if (*es != execsw_arg) count++; newexecsw = malloc(count * sizeof(*es), M_TEMP, M_WAITOK); if (newexecsw == NULL) return (ENOMEM); xs = newexecsw; for (es = execsw; *es; es++) if (*es != execsw_arg) *xs++ = *es; *xs = NULL; if (execsw) free(execsw, M_TEMP); execsw = newexecsw; return (0); } Index: head/sys/kern/kern_fork.c =================================================================== --- head/sys/kern/kern_fork.c (revision 173360) +++ head/sys/kern/kern_fork.c (revision 173361) @@ -1,794 +1,821 @@ /*- * Copyright (c) 1982, 1986, 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_fork.c 8.6 (Berkeley) 4/8/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_ktrace.h" #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifndef _SYS_SYSPROTO_H_ struct fork_args { int dummy; }; #endif /* ARGSUSED */ int fork(td, uap) struct thread *td; struct fork_args *uap; { int error; struct proc *p2; error = fork1(td, RFFDG | RFPROC, 0, &p2); if (error == 0) { td->td_retval[0] = p2->p_pid; td->td_retval[1] = 0; } return (error); } /* ARGSUSED */ int vfork(td, uap) struct thread *td; struct vfork_args *uap; { int error; struct proc *p2; error = fork1(td, RFFDG | RFPROC | RFPPWAIT | RFMEM, 0, &p2); if (error == 0) { td->td_retval[0] = p2->p_pid; td->td_retval[1] = 0; } return (error); } int rfork(td, uap) struct thread *td; struct rfork_args *uap; { struct proc *p2; int error; /* Don't allow kernel-only flags. */ if ((uap->flags & RFKERNELONLY) != 0) return (EINVAL); AUDIT_ARG(fflags, uap->flags); error = fork1(td, uap->flags, 0, &p2); if (error == 0) { td->td_retval[0] = p2 ? p2->p_pid : 0; td->td_retval[1] = 0; } return (error); } int nprocs = 1; /* process 0 */ int lastpid = 0; SYSCTL_INT(_kern, OID_AUTO, lastpid, CTLFLAG_RD, &lastpid, 0, "Last used PID"); /* * Random component to lastpid generation. We mix in a random factor to make * it a little harder to predict. We sanity check the modulus value to avoid * doing it in critical paths. Don't let it be too small or we pointlessly * waste randomness entropy, and don't let it be impossibly large. Using a * modulus that is too big causes a LOT more process table scans and slows * down fork processing as the pidchecked caching is defeated. */ static int randompid = 0; static int sysctl_kern_randompid(SYSCTL_HANDLER_ARGS) { int error, pid; error = sysctl_wire_old_buffer(req, sizeof(int)); if (error != 0) return(error); sx_xlock(&allproc_lock); pid = randompid; error = sysctl_handle_int(oidp, &pid, 0, req); if (error == 0 && req->newptr != NULL) { if (pid < 0 || pid > PID_MAX - 100) /* out of range */ pid = PID_MAX - 100; else if (pid < 2) /* NOP */ pid = 0; else if (pid < 100) /* Make it reasonable */ pid = 100; randompid = pid; } sx_xunlock(&allproc_lock); return (error); } SYSCTL_PROC(_kern, OID_AUTO, randompid, CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_kern_randompid, "I", "Random PID modulus"); int fork1(td, flags, pages, procp) struct thread *td; int flags; int pages; struct proc **procp; { struct proc *p1, *p2, *pptr; struct proc *newproc; int ok, trypid; static int curfail, pidchecked = 0; static struct timeval lastfail; struct filedesc *fd; struct filedesc_to_leader *fdtol; struct thread *td2; struct sigacts *newsigacts; + struct vmspace *vm2; int error; /* Can't copy and clear. */ if ((flags & (RFFDG|RFCFDG)) == (RFFDG|RFCFDG)) return (EINVAL); p1 = td->td_proc; /* * Here we don't create a new process, but we divorce * certain parts of a process from itself. */ if ((flags & RFPROC) == 0) { if (((p1->p_flag & (P_HADTHREADS|P_SYSTEM)) == P_HADTHREADS) && (flags & (RFCFDG | RFFDG))) { PROC_LOCK(p1); if (thread_single(SINGLE_BOUNDARY)) { PROC_UNLOCK(p1); return (ERESTART); } PROC_UNLOCK(p1); } - vm_forkproc(td, NULL, NULL, flags); + error = vm_forkproc(td, NULL, NULL, NULL, flags); + if (error) + goto norfproc_fail; /* * Close all file descriptors. */ if (flags & RFCFDG) { struct filedesc *fdtmp; fdtmp = fdinit(td->td_proc->p_fd); fdfree(td); p1->p_fd = fdtmp; } /* * Unshare file descriptors (from parent). */ if (flags & RFFDG) fdunshare(p1, td); +norfproc_fail: if (((p1->p_flag & (P_HADTHREADS|P_SYSTEM)) == P_HADTHREADS) && (flags & (RFCFDG | RFFDG))) { PROC_LOCK(p1); thread_single_end(); PROC_UNLOCK(p1); } *procp = NULL; - return (0); + return (error); } /* * XXX * We did have single-threading code here * however it proved un-needed and caused problems */ /* Allocate new proc. */ newproc = uma_zalloc(proc_zone, M_WAITOK); + if (TAILQ_EMPTY(&newproc->p_threads)) { + td2 = thread_alloc(); + if (td2 == NULL) { + error = ENOMEM; + goto fail1; + } + proc_linkup(newproc, td2); + sched_newproc(newproc, td2); + } else + td2 = FIRST_THREAD_IN_PROC(newproc); + + /* Allocate and switch to an alternate kstack if specified. */ + if (pages != 0) { + if (!vm_thread_new_altkstack(td2, pages)) { + error = ENOMEM; + goto fail1; + } + } + if ((flags & RFMEM) == 0) { + vm2 = vmspace_fork(p1->p_vmspace); + if (vm2 == NULL) { + error = ENOMEM; + goto fail1; + } + } else + vm2 = NULL; #ifdef MAC mac_proc_init(newproc); #endif knlist_init(&newproc->p_klist, &newproc->p_mtx, NULL, NULL, NULL); STAILQ_INIT(&newproc->p_ktr); /* We have to lock the process tree while we look for a pid. */ sx_slock(&proctree_lock); /* * Although process entries are dynamically created, we still keep * a global limit on the maximum number we will create. Don't allow * a nonprivileged user to use the last ten processes; don't let root * exceed the limit. The variable nprocs is the current number of * processes, maxproc is the limit. */ sx_xlock(&allproc_lock); if ((nprocs >= maxproc - 10 && priv_check_cred(td->td_ucred, PRIV_MAXPROC, 0) != 0) || nprocs >= maxproc) { error = EAGAIN; goto fail; } /* * Increment the count of procs running with this uid. Don't allow * a nonprivileged user to exceed their current limit. * * XXXRW: Can we avoid privilege here if it's not needed? */ error = priv_check_cred(td->td_ucred, PRIV_PROC_LIMIT, 0); if (error == 0) ok = chgproccnt(td->td_ucred->cr_ruidinfo, 1, 0); else { PROC_LOCK(p1); ok = chgproccnt(td->td_ucred->cr_ruidinfo, 1, lim_cur(p1, RLIMIT_NPROC)); PROC_UNLOCK(p1); } if (!ok) { error = EAGAIN; goto fail; } /* * Increment the nprocs resource before blocking can occur. There * are hard-limits as to the number of processes that can run. */ nprocs++; /* * Find an unused process ID. We remember a range of unused IDs * ready to use (from lastpid+1 through pidchecked-1). * * If RFHIGHPID is set (used during system boot), do not allocate * low-numbered pids. */ trypid = lastpid + 1; if (flags & RFHIGHPID) { if (trypid < 10) trypid = 10; } else { if (randompid) trypid += arc4random() % randompid; } retry: /* * If the process ID prototype has wrapped around, * restart somewhat above 0, as the low-numbered procs * tend to include daemons that don't exit. */ if (trypid >= PID_MAX) { trypid = trypid % PID_MAX; if (trypid < 100) trypid += 100; pidchecked = 0; } if (trypid >= pidchecked) { int doingzomb = 0; pidchecked = PID_MAX; /* * Scan the active and zombie procs to check whether this pid * is in use. Remember the lowest pid that's greater * than trypid, so we can avoid checking for a while. */ p2 = LIST_FIRST(&allproc); again: for (; p2 != NULL; p2 = LIST_NEXT(p2, p_list)) { while (p2->p_pid == trypid || (p2->p_pgrp != NULL && (p2->p_pgrp->pg_id == trypid || (p2->p_session != NULL && p2->p_session->s_sid == trypid)))) { trypid++; if (trypid >= pidchecked) goto retry; } if (p2->p_pid > trypid && pidchecked > p2->p_pid) pidchecked = p2->p_pid; if (p2->p_pgrp != NULL) { if (p2->p_pgrp->pg_id > trypid && pidchecked > p2->p_pgrp->pg_id) pidchecked = p2->p_pgrp->pg_id; if (p2->p_session != NULL && p2->p_session->s_sid > trypid && pidchecked > p2->p_session->s_sid) pidchecked = p2->p_session->s_sid; } } if (!doingzomb) { doingzomb = 1; p2 = LIST_FIRST(&zombproc); goto again; } } sx_sunlock(&proctree_lock); /* * RFHIGHPID does not mess with the lastpid counter during boot. */ if (flags & RFHIGHPID) pidchecked = 0; else lastpid = trypid; p2 = newproc; - td2 = FIRST_THREAD_IN_PROC(newproc); p2->p_state = PRS_NEW; /* protect against others */ p2->p_pid = trypid; /* * Allow the scheduler to initialize the child. */ thread_lock(td); sched_fork(td, td2); thread_unlock(td); AUDIT_ARG(pid, p2->p_pid); LIST_INSERT_HEAD(&allproc, p2, p_list); LIST_INSERT_HEAD(PIDHASH(p2->p_pid), p2, p_hash); PROC_LOCK(p2); PROC_LOCK(p1); sx_xunlock(&allproc_lock); bcopy(&p1->p_startcopy, &p2->p_startcopy, __rangeof(struct proc, p_startcopy, p_endcopy)); PROC_UNLOCK(p1); bzero(&p2->p_startzero, __rangeof(struct proc, p_startzero, p_endzero)); p2->p_ucred = crhold(td->td_ucred); PROC_UNLOCK(p2); /* * Malloc things while we don't hold any locks. */ if (flags & RFSIGSHARE) newsigacts = NULL; else newsigacts = sigacts_alloc(); /* * Copy filedesc. */ if (flags & RFCFDG) { fd = fdinit(p1->p_fd); fdtol = NULL; } else if (flags & RFFDG) { fd = fdcopy(p1->p_fd); fdtol = NULL; } else { fd = fdshare(p1->p_fd); if (p1->p_fdtol == NULL) p1->p_fdtol = filedesc_to_leader_alloc(NULL, NULL, p1->p_leader); if ((flags & RFTHREAD) != 0) { /* * Shared file descriptor table and * shared process leaders. */ fdtol = p1->p_fdtol; FILEDESC_XLOCK(p1->p_fd); fdtol->fdl_refcount++; FILEDESC_XUNLOCK(p1->p_fd); } else { /* * Shared file descriptor table, and * different process leaders */ fdtol = filedesc_to_leader_alloc(p1->p_fdtol, p1->p_fd, p2); } } /* * Make a proc table entry for the new process. * Start by zeroing the section of proc that is zero-initialized, * then copy the section that is copied directly from the parent. */ - /* Allocate and switch to an alternate kstack if specified. */ - if (pages != 0) - vm_thread_new_altkstack(td2, pages); PROC_LOCK(p2); PROC_LOCK(p1); bzero(&td2->td_startzero, __rangeof(struct thread, td_startzero, td_endzero)); bcopy(&td->td_startcopy, &td2->td_startcopy, __rangeof(struct thread, td_startcopy, td_endcopy)); td2->td_sigstk = td->td_sigstk; td2->td_sigmask = td->td_sigmask; td2->td_flags = TDF_INMEM; /* * Duplicate sub-structures as needed. * Increase reference counts on shared objects. */ p2->p_flag = P_INMEM; p2->p_swtick = ticks; if (p1->p_flag & P_PROFIL) startprofclock(p2); td2->td_ucred = crhold(p2->p_ucred); pargs_hold(p2->p_args); if (flags & RFSIGSHARE) { p2->p_sigacts = sigacts_hold(p1->p_sigacts); } else { sigacts_copy(newsigacts, p1->p_sigacts); p2->p_sigacts = newsigacts; } if (flags & RFLINUXTHPN) p2->p_sigparent = SIGUSR1; else p2->p_sigparent = SIGCHLD; p2->p_textvp = p1->p_textvp; p2->p_fd = fd; p2->p_fdtol = fdtol; /* * p_limit is copy-on-write. Bump its refcount. */ lim_fork(p1, p2); pstats_fork(p1->p_stats, p2->p_stats); PROC_UNLOCK(p1); PROC_UNLOCK(p2); /* Bump references to the text vnode (for procfs) */ if (p2->p_textvp) vref(p2->p_textvp); /* * Set up linkage for kernel based threading. */ if ((flags & RFTHREAD) != 0) { mtx_lock(&ppeers_lock); p2->p_peers = p1->p_peers; p1->p_peers = p2; p2->p_leader = p1->p_leader; mtx_unlock(&ppeers_lock); PROC_LOCK(p1->p_leader); if ((p1->p_leader->p_flag & P_WEXIT) != 0) { PROC_UNLOCK(p1->p_leader); /* * The task leader is exiting, so process p1 is * going to be killed shortly. Since p1 obviously * isn't dead yet, we know that the leader is either * sending SIGKILL's to all the processes in this * task or is sleeping waiting for all the peers to * exit. We let p1 complete the fork, but we need * to go ahead and kill the new process p2 since * the task leader may not get a chance to send * SIGKILL to it. We leave it on the list so that * the task leader will wait for this new process * to commit suicide. */ PROC_LOCK(p2); psignal(p2, SIGKILL); PROC_UNLOCK(p2); } else PROC_UNLOCK(p1->p_leader); } else { p2->p_peers = NULL; p2->p_leader = p2; } sx_xlock(&proctree_lock); PGRP_LOCK(p1->p_pgrp); PROC_LOCK(p2); PROC_LOCK(p1); /* * Preserve some more flags in subprocess. P_PROFIL has already * been preserved. */ p2->p_flag |= p1->p_flag & P_SUGID; td2->td_pflags |= td->td_pflags & TDP_ALTSTACK; SESS_LOCK(p1->p_session); if (p1->p_session->s_ttyvp != NULL && p1->p_flag & P_CONTROLT) p2->p_flag |= P_CONTROLT; SESS_UNLOCK(p1->p_session); if (flags & RFPPWAIT) p2->p_flag |= P_PPWAIT; p2->p_pgrp = p1->p_pgrp; LIST_INSERT_AFTER(p1, p2, p_pglist); PGRP_UNLOCK(p1->p_pgrp); LIST_INIT(&p2->p_children); callout_init(&p2->p_itcallout, CALLOUT_MPSAFE); #ifdef KTRACE /* * Copy traceflag and tracefile if enabled. */ mtx_lock(&ktrace_mtx); KASSERT(p2->p_tracevp == NULL, ("new process has a ktrace vnode")); if (p1->p_traceflag & KTRFAC_INHERIT) { p2->p_traceflag = p1->p_traceflag; if ((p2->p_tracevp = p1->p_tracevp) != NULL) { VREF(p2->p_tracevp); KASSERT(p1->p_tracecred != NULL, ("ktrace vnode with no cred")); p2->p_tracecred = crhold(p1->p_tracecred); } } mtx_unlock(&ktrace_mtx); #endif /* * If PF_FORK is set, the child process inherits the * procfs ioctl flags from its parent. */ if (p1->p_pfsflags & PF_FORK) { p2->p_stops = p1->p_stops; p2->p_pfsflags = p1->p_pfsflags; } /* * This begins the section where we must prevent the parent * from being swapped. */ _PHOLD(p1); PROC_UNLOCK(p1); /* * Attach the new process to its parent. * * If RFNOWAIT is set, the newly created process becomes a child * of init. This effectively disassociates the child from the * parent. */ if (flags & RFNOWAIT) pptr = initproc; else pptr = p1; p2->p_pptr = pptr; LIST_INSERT_HEAD(&pptr->p_children, p2, p_sibling); sx_xunlock(&proctree_lock); /* Inform accounting that we have forked. */ p2->p_acflag = AFORK; PROC_UNLOCK(p2); /* * Finish creating the child process. It will return via a different * execution path later. (ie: directly into user mode) */ - vm_forkproc(td, p2, td2, flags); + vm_forkproc(td, p2, td2, vm2, flags); if (flags == (RFFDG | RFPROC)) { PCPU_INC(cnt.v_forks); PCPU_ADD(cnt.v_forkpages, p2->p_vmspace->vm_dsize + p2->p_vmspace->vm_ssize); } else if (flags == (RFFDG | RFPROC | RFPPWAIT | RFMEM)) { PCPU_INC(cnt.v_vforks); PCPU_ADD(cnt.v_vforkpages, p2->p_vmspace->vm_dsize + p2->p_vmspace->vm_ssize); } else if (p1 == &proc0) { PCPU_INC(cnt.v_kthreads); PCPU_ADD(cnt.v_kthreadpages, p2->p_vmspace->vm_dsize + p2->p_vmspace->vm_ssize); } else { PCPU_INC(cnt.v_rforks); PCPU_ADD(cnt.v_rforkpages, p2->p_vmspace->vm_dsize + p2->p_vmspace->vm_ssize); } /* * Both processes are set up, now check if any loadable modules want * to adjust anything. * What if they have an error? XXX */ EVENTHANDLER_INVOKE(process_fork, p1, p2, flags); /* * Set the child start time and mark the process as being complete. */ microuptime(&p2->p_stats->p_start); PROC_SLOCK(p2); p2->p_state = PRS_NORMAL; PROC_SUNLOCK(p2); /* * If RFSTOPPED not requested, make child runnable and add to * run queue. */ if ((flags & RFSTOPPED) == 0) { thread_lock(td2); TD_SET_CAN_RUN(td2); sched_add(td2, SRQ_BORING); thread_unlock(td2); } /* * Now can be swapped. */ PROC_LOCK(p1); _PRELE(p1); /* * Tell any interested parties about the new process. */ KNOTE_LOCKED(&p1->p_klist, NOTE_FORK | p2->p_pid); PROC_UNLOCK(p1); /* * Preserve synchronization semantics of vfork. If waiting for * child to exec or exit, set P_PPWAIT on child, and sleep on our * proc (in case of exit). */ PROC_LOCK(p2); while (p2->p_flag & P_PPWAIT) msleep(p1, &p2->p_mtx, PWAIT, "ppwait", 0); PROC_UNLOCK(p2); /* * Return child proc pointer to parent. */ *procp = p2; return (0); fail: sx_sunlock(&proctree_lock); if (ppsratecheck(&lastfail, &curfail, 1)) printf("maxproc limit exceeded by uid %i, please see tuning(7) and login.conf(5).\n", td->td_ucred->cr_ruid); sx_xunlock(&allproc_lock); #ifdef MAC mac_proc_destroy(newproc); #endif +fail1: uma_zfree(proc_zone, newproc); pause("fork", hz / 2); return (error); } /* * Handle the return of a child process from fork1(). This function * is called from the MD fork_trampoline() entry point. */ void fork_exit(callout, arg, frame) void (*callout)(void *, struct trapframe *); void *arg; struct trapframe *frame; { struct proc *p; struct thread *td; struct thread *dtd; td = curthread; p = td->td_proc; KASSERT(p->p_state == PRS_NORMAL, ("executing process is still new")); CTR4(KTR_PROC, "fork_exit: new thread %p (kse %p, pid %d, %s)", td, td->td_sched, p->p_pid, p->p_comm); sched_fork_exit(td); /* * Processes normally resume in mi_switch() after being * cpu_switch()'ed to, but when children start up they arrive here * instead, so we must do much the same things as mi_switch() would. */ if ((dtd = PCPU_GET(deadthread))) { PCPU_SET(deadthread, NULL); thread_stash(dtd); } thread_unlock(td); /* * cpu_set_fork_handler intercepts this function call to * have this call a non-return function to stay in kernel mode. * initproc has its own fork handler, but it does return. */ KASSERT(callout != NULL, ("NULL callout in fork_exit")); callout(arg, frame); /* * Check if a kernel thread misbehaved and returned from its main * function. */ if (p->p_flag & P_KTHREAD) { printf("Kernel thread \"%s\" (pid %d) exited prematurely.\n", p->p_comm, p->p_pid); kproc_exit(0); } mtx_assert(&Giant, MA_NOTOWNED); EVENTHANDLER_INVOKE(schedtail, p); } /* * Simplified back end of syscall(), used when returning from fork() * directly into user mode. Giant is not held on entry, and must not * be held on return. This function is passed in to fork_exit() as the * first parameter and is called when returning to a new userland process. */ void fork_return(td, frame) struct thread *td; struct trapframe *frame; { userret(td, frame); #ifdef KTRACE if (KTRPOINT(td, KTR_SYSRET)) ktrsysret(SYS_fork, 0, 0); #endif mtx_assert(&Giant, MA_NOTOWNED); } Index: head/sys/kern/kern_kse.c =================================================================== --- head/sys/kern/kern_kse.c (revision 173360) +++ head/sys/kern/kern_kse.c (revision 173361) @@ -1,1427 +1,1444 @@ /*- * Copyright (C) 2001 Julian Elischer . * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice(s), this list of conditions and the following disclaimer as * the first lines of this file unmodified other than the possible * addition of one or more copyright notices. * 2. Redistributions in binary form must reproduce the above copyright * notice(s), this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #ifdef KSE static uma_zone_t upcall_zone; /* DEBUG ONLY */ extern int virtual_cpu; extern int thread_debug; extern int max_threads_per_proc; extern int max_groups_per_proc; extern int max_threads_hits; extern struct mtx kse_lock; TAILQ_HEAD(, kse_upcall) zombie_upcalls = TAILQ_HEAD_INITIALIZER(zombie_upcalls); static int thread_update_usr_ticks(struct thread *td); -static void thread_alloc_spare(struct thread *td); +static int thread_alloc_spare(struct thread *td); static struct thread *thread_schedule_upcall(struct thread *td, struct kse_upcall *ku); static struct kse_upcall *upcall_alloc(void); struct mtx kse_lock; MTX_SYSINIT(kse_lock, &kse_lock, "kse lock", MTX_SPIN); struct kse_upcall * upcall_alloc(void) { struct kse_upcall *ku; ku = uma_zalloc(upcall_zone, M_WAITOK | M_ZERO); return (ku); } void upcall_reap(void) { TAILQ_HEAD(, kse_upcall) zupcalls; struct kse_upcall *ku_item, *ku_tmp; TAILQ_INIT(&zupcalls); mtx_lock_spin(&kse_lock); if (!TAILQ_EMPTY(&zombie_upcalls)) { TAILQ_CONCAT(&zupcalls, &zombie_upcalls, ku_link); TAILQ_INIT(&zombie_upcalls); } mtx_unlock_spin(&kse_lock); TAILQ_FOREACH_SAFE(ku_item, &zupcalls, ku_link, ku_tmp) uma_zfree(upcall_zone, ku_item); } void upcall_remove(struct thread *td) { PROC_SLOCK_ASSERT(td->td_proc, MA_OWNED); THREAD_LOCK_ASSERT(td, MA_OWNED); if (td->td_upcall != NULL) { /* * If we are not a bound thread then decrement the count of * possible upcall sources */ if (td->td_pflags & TDP_SA) td->td_proc->p_numupcalls--; mtx_lock_spin(&kse_lock); td->td_upcall->ku_owner = NULL; TAILQ_REMOVE(&td->td_upcall->ku_proc->p_upcalls, td->td_upcall, ku_link); TAILQ_INSERT_HEAD(&zombie_upcalls, td->td_upcall, ku_link); mtx_unlock_spin(&kse_lock); td->td_upcall = NULL; } } #endif #ifndef _SYS_SYSPROTO_H_ struct kse_switchin_args { struct kse_thr_mailbox *tmbx; int flags; }; #endif #ifdef KSE void kse_unlink(struct thread *td) { mtx_lock_spin(&kse_lock); thread_unlink(td); mtx_unlock_spin(&kse_lock); upcall_remove(td); } #endif int kse_switchin(struct thread *td, struct kse_switchin_args *uap) { #ifdef KSE struct kse_thr_mailbox tmbx; struct kse_upcall *ku; int error; thread_lock(td); if ((ku = td->td_upcall) == NULL || TD_CAN_UNBIND(td)) { thread_unlock(td); return (EINVAL); } thread_unlock(td); error = (uap->tmbx == NULL) ? EINVAL : 0; if (!error) error = copyin(uap->tmbx, &tmbx, sizeof(tmbx)); if (!error && (uap->flags & KSE_SWITCHIN_SETTMBX)) error = (suword(&ku->ku_mailbox->km_curthread, (long)uap->tmbx) != 0 ? EINVAL : 0); if (!error) error = set_mcontext(td, &tmbx.tm_context.uc_mcontext); if (!error) { suword32(&uap->tmbx->tm_lwp, td->td_tid); if (uap->flags & KSE_SWITCHIN_SETTMBX) { td->td_mailbox = uap->tmbx; td->td_pflags |= TDP_CAN_UNBIND; } PROC_LOCK(td->td_proc); if (td->td_proc->p_flag & P_TRACED) { _PHOLD(td->td_proc); if (tmbx.tm_dflags & TMDF_SSTEP) ptrace_single_step(td); else ptrace_clear_single_step(td); if (tmbx.tm_dflags & TMDF_SUSPEND) { thread_lock(td); /* fuword can block, check again */ if (td->td_upcall) ku->ku_flags |= KUF_DOUPCALL; thread_unlock(td); } _PRELE(td->td_proc); } PROC_UNLOCK(td->td_proc); } return ((error == 0) ? EJUSTRETURN : error); #else /* !KSE */ return (EOPNOTSUPP); #endif } /* struct kse_thr_interrupt_args { struct kse_thr_mailbox * tmbx; int cmd; long data; }; */ int kse_thr_interrupt(struct thread *td, struct kse_thr_interrupt_args *uap) { #ifdef KSE struct kse_execve_args args; struct image_args iargs; struct proc *p; struct thread *td2; struct kse_upcall *ku; struct kse_thr_mailbox *tmbx; uint32_t flags; int error; p = td->td_proc; PROC_LOCK(p); if (!(p->p_flag & P_SA)) { PROC_UNLOCK(p); return (EINVAL); } PROC_UNLOCK(p); switch (uap->cmd) { case KSE_INTR_SENDSIG: if (uap->data < 0 || uap->data > _SIG_MAXSIG) return (EINVAL); case KSE_INTR_INTERRUPT: case KSE_INTR_RESTART: PROC_LOCK(p); PROC_SLOCK(p); FOREACH_THREAD_IN_PROC(p, td2) { if (td2->td_mailbox == uap->tmbx) break; } if (td2 == NULL) { PROC_SUNLOCK(p); PROC_UNLOCK(p); return (ESRCH); } thread_lock(td2); PROC_SUNLOCK(p); if (uap->cmd == KSE_INTR_SENDSIG) { if (uap->data > 0) { td2->td_flags &= ~TDF_INTERRUPT; thread_unlock(td2); tdsignal(p, td2, (int)uap->data, NULL); } else { thread_unlock(td2); } } else { td2->td_flags |= TDF_INTERRUPT | TDF_ASTPENDING; if (TD_CAN_UNBIND(td2)) td2->td_upcall->ku_flags |= KUF_DOUPCALL; if (uap->cmd == KSE_INTR_INTERRUPT) td2->td_intrval = EINTR; else td2->td_intrval = ERESTART; if (TD_ON_SLEEPQ(td2) && (td2->td_flags & TDF_SINTR)) sleepq_abort(td2, td2->td_intrval); thread_unlock(td2); } PROC_UNLOCK(p); break; case KSE_INTR_SIGEXIT: if (uap->data < 1 || uap->data > _SIG_MAXSIG) return (EINVAL); PROC_LOCK(p); sigexit(td, (int)uap->data); break; case KSE_INTR_DBSUSPEND: /* this sub-function is only for bound thread */ if (td->td_pflags & TDP_SA) return (EINVAL); thread_lock(td); ku = td->td_upcall; thread_unlock(td); tmbx = (void *)fuword((void *)&ku->ku_mailbox->km_curthread); if (tmbx == NULL || tmbx == (void *)-1) return (EINVAL); flags = 0; PROC_LOCK(p); while ((p->p_flag & P_TRACED) && !(p->p_flag & P_SINGLE_EXIT)) { flags = fuword32(&tmbx->tm_dflags); if (!(flags & TMDF_SUSPEND)) break; PROC_SLOCK(p); thread_stopped(p); PROC_UNLOCK(p); thread_lock(td); thread_suspend_one(td); PROC_SUNLOCK(p); mi_switch(SW_VOL, NULL); thread_unlock(td); PROC_LOCK(p); } PROC_UNLOCK(p); return (0); case KSE_INTR_EXECVE: error = copyin((void *)uap->data, &args, sizeof(args)); if (error) return (error); error = exec_copyin_args(&iargs, args.path, UIO_USERSPACE, args.argv, args.envp); if (error == 0) error = kern_execve(td, &iargs, NULL); if (error == 0) { PROC_LOCK(p); SIGSETOR(td->td_siglist, args.sigpend); PROC_UNLOCK(p); kern_sigprocmask(td, SIG_SETMASK, &args.sigmask, NULL, 0); } return (error); default: return (EINVAL); } return (0); #else /* !KSE */ return (EOPNOTSUPP); #endif } /* struct kse_exit_args { register_t dummy; }; */ int kse_exit(struct thread *td, struct kse_exit_args *uap) { #ifdef KSE struct proc *p; struct kse_upcall *ku, *ku2; int error, count; p = td->td_proc; /* * Ensure that this is only called from the UTS */ thread_lock(td); if ((ku = td->td_upcall) == NULL || TD_CAN_UNBIND(td)) { thread_unlock(td); return (EINVAL); } thread_unlock(td); /* * Calculate the existing non-exiting upcalls in this process. * If we are the last upcall but there are still other threads, * then do not exit. We need the other threads to be able to * complete whatever they are doing. * XXX This relies on the userland knowing what to do if we return. * It may be a better choice to convert ourselves into a kse_release * ( or similar) and wait in the kernel to be needed. * XXX Where are those other threads? I suppose they are waiting in * the kernel. We should wait for them all at the user boundary after * turning into an exit. */ count = 0; PROC_LOCK(p); PROC_SLOCK(p); FOREACH_UPCALL_IN_PROC(p, ku2) { if ((ku2->ku_flags & KUF_EXITING) == 0) count++; } if (count == 1 && (p->p_numthreads > 1)) { PROC_SUNLOCK(p); PROC_UNLOCK(p); return (EDEADLK); } ku->ku_flags |= KUF_EXITING; PROC_SUNLOCK(p); PROC_UNLOCK(p); /* * Mark the UTS mailbox as having been finished with. * If that fails then just go for a segfault. * XXX need to check it that can be deliverred without a mailbox. */ error = suword32(&ku->ku_mailbox->km_flags, ku->ku_mflags|KMF_DONE); if (!(td->td_pflags & TDP_SA)) if (suword32(&td->td_mailbox->tm_lwp, 0)) error = EFAULT; PROC_LOCK(p); if (error) psignal(p, SIGSEGV); sigqueue_flush(&td->td_sigqueue); PROC_SLOCK(p); thread_lock(td); upcall_remove(td); thread_unlock(td); if (p->p_numthreads != 1) { thread_stopped(p); thread_exit(); /* NOTREACHED */ } /* * This is the last thread. Just return to the user. * Effectively we have left threading mode.. * The only real thing left to do is ensure that the * scheduler sets out concurrency back to 1 as that may be a * resource leak otherwise. * This is an A[PB]I issue.. what SHOULD we do? * One possibility is to return to the user. It may not cope well. * The other possibility would be to let the process exit. */ thread_unthread(td); PROC_SUNLOCK(p); PROC_UNLOCK(p); #if 0 return (0); #else printf("kse_exit: called on last thread. Calling exit1()"); exit1(td, 0); #endif #else /* !KSE */ return (EOPNOTSUPP); #endif } /* * Either becomes an upcall or waits for an awakening event and * then becomes an upcall. Only error cases return. */ /* struct kse_release_args { struct timespec *timeout; }; */ int kse_release(struct thread *td, struct kse_release_args *uap) { #ifdef KSE struct proc *p; struct kse_upcall *ku; struct timespec timeout; struct timeval tv; sigset_t sigset; int error; p = td->td_proc; thread_lock(td); if ((ku = td->td_upcall) == NULL || TD_CAN_UNBIND(td)) { thread_unlock(td); printf("kse_release: called outside of threading. exiting"); exit1(td, 0); } thread_unlock(td); if (uap->timeout != NULL) { if ((error = copyin(uap->timeout, &timeout, sizeof(timeout)))) return (error); TIMESPEC_TO_TIMEVAL(&tv, &timeout); } if (td->td_pflags & TDP_SA) td->td_pflags |= TDP_UPCALLING; else { ku->ku_mflags = fuword32(&ku->ku_mailbox->km_flags); if (ku->ku_mflags == -1) { PROC_LOCK(p); sigexit(td, SIGSEGV); } } PROC_LOCK(p); if (ku->ku_mflags & KMF_WAITSIGEVENT) { /* UTS wants to wait for signal event */ if (!(p->p_flag & P_SIGEVENT) && !(ku->ku_flags & KUF_DOUPCALL)) { td->td_kflags |= TDK_KSERELSIG; error = msleep(&p->p_siglist, &p->p_mtx, PPAUSE|PCATCH, "ksesigwait", (uap->timeout ? tvtohz(&tv) : 0)); td->td_kflags &= ~(TDK_KSERELSIG | TDK_WAKEUP); } p->p_flag &= ~P_SIGEVENT; sigset = p->p_siglist; PROC_UNLOCK(p); error = copyout(&sigset, &ku->ku_mailbox->km_sigscaught, sizeof(sigset)); } else { if ((ku->ku_flags & KUF_DOUPCALL) == 0 && ((ku->ku_mflags & KMF_NOCOMPLETED) || (p->p_completed == NULL))) { p->p_upsleeps++; td->td_kflags |= TDK_KSEREL; error = msleep(&p->p_completed, &p->p_mtx, PPAUSE|PCATCH, "kserel", (uap->timeout ? tvtohz(&tv) : 0)); td->td_kflags &= ~(TDK_KSEREL | TDK_WAKEUP); p->p_upsleeps--; } PROC_UNLOCK(p); } if (ku->ku_flags & KUF_DOUPCALL) { PROC_SLOCK(p); ku->ku_flags &= ~KUF_DOUPCALL; PROC_SUNLOCK(p); } return (0); #else /* !KSE */ return (EOPNOTSUPP); #endif } /* struct kse_wakeup_args { struct kse_mailbox *mbx; }; */ int kse_wakeup(struct thread *td, struct kse_wakeup_args *uap) { #ifdef KSE struct proc *p; struct kse_upcall *ku; struct thread *td2; p = td->td_proc; td2 = NULL; ku = NULL; /* KSE-enabled processes only, please. */ PROC_LOCK(p); if (!(p->p_flag & P_SA)) { PROC_UNLOCK(p); return (EINVAL); } PROC_SLOCK(p); if (uap->mbx) { FOREACH_UPCALL_IN_PROC(p, ku) { if (ku->ku_mailbox == uap->mbx) break; } } else { if (p->p_upsleeps) { PROC_SUNLOCK(p); wakeup(&p->p_completed); PROC_UNLOCK(p); return (0); } ku = TAILQ_FIRST(&p->p_upcalls); } if (ku == NULL) { PROC_SUNLOCK(p); PROC_UNLOCK(p); return (ESRCH); } mtx_lock_spin(&kse_lock); if ((td2 = ku->ku_owner) == NULL) { mtx_unlock_spin(&kse_lock); PROC_SUNLOCK(p); PROC_UNLOCK(p); panic("%s: no owner", __func__); } else if (td2->td_kflags & (TDK_KSEREL | TDK_KSERELSIG)) { mtx_unlock_spin(&kse_lock); if (!(td2->td_kflags & TDK_WAKEUP)) { td2->td_kflags |= TDK_WAKEUP; if (td2->td_kflags & TDK_KSEREL) sleepq_remove(td2, &p->p_completed); else sleepq_remove(td2, &p->p_siglist); } } else { ku->ku_flags |= KUF_DOUPCALL; mtx_unlock_spin(&kse_lock); } PROC_SUNLOCK(p); PROC_UNLOCK(p); return (0); #else /* !KSE */ return (EOPNOTSUPP); #endif } /* * newgroup == 0: first call: use current KSE, don't schedule an upcall * All other situations, do allocate max new KSEs and schedule an upcall. * * XXX should be changed so that 'first' behaviour lasts for as long * as you have not made a thread in this proc. i.e. as long as we do not have * a mailbox.. */ /* struct kse_create_args { struct kse_mailbox *mbx; int newgroup; }; */ int kse_create(struct thread *td, struct kse_create_args *uap) { #ifdef KSE struct proc *p; struct kse_mailbox mbx; struct kse_upcall *newku; int err, ncpus, sa = 0, first = 0; struct thread *newtd; p = td->td_proc; /* * Processes using the other threading model can't * suddenly start calling this one * XXX maybe... */ PROC_LOCK(p); if ((p->p_flag & (P_SA|P_HADTHREADS)) == P_HADTHREADS) { PROC_UNLOCK(p); return (EINVAL); } if (!(p->p_flag & P_SA)) { first = 1; p->p_flag |= P_SA|P_HADTHREADS; } PROC_UNLOCK(p); if ((err = copyin(uap->mbx, &mbx, sizeof(mbx)))) return (err); ncpus = mp_ncpus; if (virtual_cpu != 0) ncpus = virtual_cpu; /* * If the new UTS mailbox says that this * will be a BOUND lwp, then it had better * have its thread mailbox already there. */ if ((mbx.km_flags & KMF_BOUND) || uap->newgroup) { /* It's a bound thread (1:1) */ if (mbx.km_curthread == NULL) return (EINVAL); ncpus = 1; if (!(uap->newgroup || first)) return (EINVAL); } else { /* It's an upcall capable thread */ sa = TDP_SA; PROC_LOCK(p); /* * Limit it to NCPU upcall contexts per proc in any case. * numupcalls will soon be numkse or something * as it will represent the number of * non-bound upcalls available. (i.e. ones that can * actually call up). */ if (p->p_numupcalls >= ncpus) { PROC_UNLOCK(p); return (EPROCLIM); } p->p_numupcalls++; PROC_UNLOCK(p); } + /* + * For the first call this may not have been set. + * Of course nor may it actually be needed. + * thread_schedule_upcall() will look for it. + */ + if (td->td_standin == NULL) { + if (!thread_alloc_spare(td)) + return (ENOMEM); + } + /* * Even bound LWPs get a mailbox and an upcall to hold it. * XXX This should change. */ newku = upcall_alloc(); newku->ku_mailbox = uap->mbx; newku->ku_func = mbx.km_func; bcopy(&mbx.km_stack, &newku->ku_stack, sizeof(stack_t)); - /* - * For the first call this may not have been set. - * Of course nor may it actually be needed. - * thread_schedule_upcall() will look for it. - */ - if (td->td_standin == NULL) - thread_alloc_spare(td); PROC_LOCK(p); PROC_SLOCK(p); /* * If we are the first time, and a normal thread, * then transfer all the signals back to the 'process'. * SA threading will make a special thread to handle them. */ if (first) { sigqueue_move_set(&td->td_sigqueue, &p->p_sigqueue, &td->td_sigqueue.sq_signals); SIGFILLSET(td->td_sigmask); SIG_CANTMASK(td->td_sigmask); } /* * Make the new upcall available to the process. * It may or may not use it, but it's available. */ TAILQ_INSERT_TAIL(&p->p_upcalls, newku, ku_link); newku->ku_proc = p; PROC_UNLOCK(p); if (mbx.km_quantum) /* XXX should this be in the thread? */ p->p_upquantum = max(1, mbx.km_quantum / tick); /* * Each upcall structure has an owner thread, find which * one owns it. */ thread_lock(td); mtx_lock_spin(&kse_lock); if (uap->newgroup) { /* * The newgroup parameter now means * "bound, non SA, system scope" * It is only used for the interrupt thread at the * moment I think.. (or system scope threads dopey). * We'll rename it later. */ newtd = thread_schedule_upcall(td, newku); } else { /* * If the current thread hasn't an upcall structure, * just assign the upcall to it. * It'll just return. */ if (td->td_upcall == NULL) { newku->ku_owner = td; td->td_upcall = newku; newtd = td; } else { /* * Create a new upcall thread to own it. */ newtd = thread_schedule_upcall(td, newku); } } mtx_unlock_spin(&kse_lock); thread_unlock(td); PROC_SUNLOCK(p); /* * Let the UTS instance know its LWPID. * It doesn't really care. But the debugger will. * XXX warning.. remember that this moves. */ suword32(&newku->ku_mailbox->km_lwp, newtd->td_tid); /* * In the same manner, if the UTS has a current user thread, * then it is also running on this LWP so set it as well. * The library could do that of course.. but why not.. * XXX I'm not sure this can ever happen but ... * XXX does the UTS ever set this in the mailbox before calling this? */ if (mbx.km_curthread) suword32(&mbx.km_curthread->tm_lwp, newtd->td_tid); if (sa) { newtd->td_pflags |= TDP_SA; /* * If we are starting a new thread, kick it off. */ if (newtd != td) { thread_lock(newtd); sched_add(newtd, SRQ_BORING); thread_unlock(newtd); } } else { newtd->td_pflags &= ~TDP_SA; /* * Since a library will use the mailbox pointer to * identify even a bound thread, and the mailbox pointer * will never be allowed to change after this syscall * for a bound thread, set it here so the library can * find the thread after the syscall returns. */ newtd->td_mailbox = mbx.km_curthread; if (newtd != td) { /* * If we did create a new thread then * make sure it goes to the right place * when it starts up, and make sure that it runs * at full speed when it gets there. * thread_schedule_upcall() copies all cpu state * to the new thread, so we should clear single step * flag here. */ cpu_set_upcall_kse(newtd, newku->ku_func, newku->ku_mailbox, &newku->ku_stack); PROC_LOCK(p); if (p->p_flag & P_TRACED) { _PHOLD(p); ptrace_clear_single_step(newtd); _PRELE(p); } PROC_UNLOCK(p); thread_lock(newtd); sched_add(newtd, SRQ_BORING); thread_unlock(newtd); } } return (0); #else /* !KSE */ return (EOPNOTSUPP); #endif } #ifdef KSE /* * Initialize global thread allocation resources. */ void kseinit(void) { upcall_zone = uma_zcreate("UPCALL", sizeof(struct kse_upcall), NULL, NULL, NULL, NULL, UMA_ALIGN_CACHE, 0); } /* * Store the thread context in the UTS's mailbox. * then add the mailbox at the head of a list we are building in user space. * The list is anchored in the proc structure. */ int thread_export_context(struct thread *td, int willexit) { struct proc *p; uintptr_t mbx; void *addr; int error = 0, sig; mcontext_t mc; p = td->td_proc; /* * Post sync signal, or process SIGKILL and SIGSTOP. * For sync signal, it is only possible when the signal is not * caught by userland or process is being debugged. */ PROC_LOCK(p); if (td->td_flags & TDF_NEEDSIGCHK) { thread_lock(td); td->td_flags &= ~TDF_NEEDSIGCHK; thread_unlock(td); mtx_lock(&p->p_sigacts->ps_mtx); while ((sig = cursig(td)) != 0) postsig(sig); mtx_unlock(&p->p_sigacts->ps_mtx); } if (willexit) SIGFILLSET(td->td_sigmask); PROC_UNLOCK(p); /* Export the user/machine context. */ get_mcontext(td, &mc, 0); addr = (void *)(&td->td_mailbox->tm_context.uc_mcontext); error = copyout(&mc, addr, sizeof(mcontext_t)); if (error) goto bad; addr = (caddr_t)(&td->td_mailbox->tm_lwp); if (suword32(addr, 0)) { error = EFAULT; goto bad; } /* Get address in latest mbox of list pointer */ addr = (void *)(&td->td_mailbox->tm_next); /* * Put the saved address of the previous first * entry into this one */ for (;;) { mbx = (uintptr_t)p->p_completed; if (suword(addr, mbx)) { error = EFAULT; goto bad; } PROC_LOCK(p); if (mbx == (uintptr_t)p->p_completed) { thread_lock(td); p->p_completed = td->td_mailbox; /* * The thread context may be taken away by * other upcall threads when we unlock * process lock. it's no longer valid to * use it again in any other places. */ td->td_mailbox = NULL; thread_unlock(td); PROC_UNLOCK(p); break; } PROC_UNLOCK(p); } td->td_usticks = 0; return (0); bad: PROC_LOCK(p); sigexit(td, SIGILL); return (error); } /* * Take the list of completed mailboxes for this Process and put them on this * upcall's mailbox as it's the next one going up. */ static int thread_link_mboxes(struct proc *p, struct kse_upcall *ku) { void *addr; uintptr_t mbx; addr = (void *)(&ku->ku_mailbox->km_completed); for (;;) { mbx = (uintptr_t)p->p_completed; if (suword(addr, mbx)) { PROC_LOCK(p); psignal(p, SIGSEGV); PROC_UNLOCK(p); return (EFAULT); } PROC_LOCK(p); if (mbx == (uintptr_t)p->p_completed) { p->p_completed = NULL; PROC_UNLOCK(p); break; } PROC_UNLOCK(p); } return (0); } /* * This function should be called at statclock interrupt time */ int thread_statclock(int user) { struct thread *td = curthread; if (!(td->td_pflags & TDP_SA)) return (0); if (user) { /* Current always do via ast() */ thread_lock(td); td->td_flags |= TDF_ASTPENDING; thread_unlock(td); td->td_uuticks++; } else if (td->td_mailbox != NULL) td->td_usticks++; return (0); } /* * Export state clock ticks for userland */ static int thread_update_usr_ticks(struct thread *td) { struct proc *p = td->td_proc; caddr_t addr; u_int uticks; thread_lock(td); if (td->td_mailbox == NULL) { thread_unlock(td); return (-1); } thread_unlock(td); if ((uticks = td->td_uuticks) != 0) { td->td_uuticks = 0; addr = (caddr_t)&td->td_mailbox->tm_uticks; if (suword32(addr, uticks+fuword32(addr))) goto error; } if ((uticks = td->td_usticks) != 0) { td->td_usticks = 0; addr = (caddr_t)&td->td_mailbox->tm_sticks; if (suword32(addr, uticks+fuword32(addr))) goto error; } return (0); error: PROC_LOCK(p); psignal(p, SIGSEGV); PROC_UNLOCK(p); return (-2); } /* * This function is intended to be used to initialize a spare thread * for upcall. Initialize thread's large data area outside the thread lock * for thread_schedule_upcall(). The crhold is also here to get it out * from the schedlock as it has a mutex op itself. * XXX BUG.. we need to get the cr ref after the thread has * checked and chenged its own, not 6 months before... */ -void +int thread_alloc_spare(struct thread *td) { struct thread *spare; if (td->td_standin) - return; + return (1); spare = thread_alloc(); + if (spare == NULL) + return (0); td->td_standin = spare; bzero(&spare->td_startzero, __rangeof(struct thread, td_startzero, td_endzero)); spare->td_proc = td->td_proc; spare->td_ucred = crhold(td->td_ucred); spare->td_flags = TDF_INMEM; + return (1); } /* * Create a thread and schedule it for upcall on the KSE given. * Use our thread's standin so that we don't have to allocate one. */ struct thread * thread_schedule_upcall(struct thread *td, struct kse_upcall *ku) { struct thread *td2; THREAD_LOCK_ASSERT(td, MA_OWNED); mtx_assert(&kse_lock, MA_OWNED); /* * Schedule an upcall thread on specified kse_upcall, * the kse_upcall must be free. * td must have a spare thread. */ KASSERT(ku->ku_owner == NULL, ("%s: upcall has owner", __func__)); if ((td2 = td->td_standin) != NULL) { td->td_standin = NULL; } else { panic("no reserve thread when scheduling an upcall"); return (NULL); } CTR3(KTR_PROC, "thread_schedule_upcall: thread %p (pid %d, %s)", td2, td->td_proc->p_pid, td->td_proc->p_comm); /* * Bzero already done in thread_alloc_spare() because we can't * do the crhold here because we are in schedlock already. */ bcopy(&td->td_startcopy, &td2->td_startcopy, __rangeof(struct thread, td_startcopy, td_endcopy)); sched_fork_thread(td, td2); thread_link(td2, ku->ku_proc); /* inherit parts of blocked thread's context as a good template */ cpu_set_upcall(td2, td); /* Let the new thread become owner of the upcall */ ku->ku_owner = td2; td2->td_upcall = ku; td2->td_pflags = TDP_SA|TDP_UPCALLING; td2->td_state = TDS_CAN_RUN; td2->td_inhibitors = 0; SIGFILLSET(td2->td_sigmask); SIG_CANTMASK(td2->td_sigmask); return (td2); /* bogus.. should be a void function */ } /* * It is only used when thread generated a trap and process is being * debugged. */ void thread_signal_add(struct thread *td, ksiginfo_t *ksi) { struct proc *p; struct sigacts *ps; int error; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); ps = p->p_sigacts; mtx_assert(&ps->ps_mtx, MA_OWNED); mtx_unlock(&ps->ps_mtx); SIGADDSET(td->td_sigmask, ksi->ksi_signo); PROC_UNLOCK(p); error = copyout(&ksi->ksi_info, &td->td_mailbox->tm_syncsig, sizeof(siginfo_t)); if (error) { PROC_LOCK(p); sigexit(td, SIGSEGV); } PROC_LOCK(p); mtx_lock(&ps->ps_mtx); } #include "opt_sched.h" struct thread * thread_switchout(struct thread *td, int flags, struct thread *nextthread) { struct kse_upcall *ku; struct thread *td2; THREAD_LOCK_ASSERT(td, MA_OWNED); /* * If the outgoing thread is in threaded group and has never * scheduled an upcall, decide whether this is a short * or long term event and thus whether or not to schedule * an upcall. * If it is a short term event, just suspend it in * a way that takes its KSE with it. * Select the events for which we want to schedule upcalls. * For now it's just sleep or if thread is suspended but * process wide suspending flag is not set (debugger * suspends thread). * XXXKSE eventually almost any inhibition could do. */ if (TD_CAN_UNBIND(td) && (td->td_standin) && (TD_ON_SLEEPQ(td) || (TD_IS_SUSPENDED(td) && !P_SHOULDSTOP(td->td_proc)))) { /* * Release ownership of upcall, and schedule an upcall * thread, this new upcall thread becomes the owner of * the upcall structure. It will be ahead of us in the * run queue, so as we are stopping, it should either * start up immediatly, or at least before us if * we release our slot. */ mtx_lock_spin(&kse_lock); ku = td->td_upcall; ku->ku_owner = NULL; td->td_upcall = NULL; td->td_pflags &= ~TDP_CAN_UNBIND; td2 = thread_schedule_upcall(td, ku); mtx_unlock_spin(&kse_lock); if (flags & SW_INVOL || nextthread) { thread_lock(td2); sched_add(td2, SRQ_YIELDING); thread_unlock(td2); } else { /* Keep up with reality.. we have one extra thread * in the picture.. and it's 'running'. */ return td2; } } return (nextthread); } /* * Setup done on the thread when it enters the kernel. */ void thread_user_enter(struct thread *td) { struct proc *p = td->td_proc; struct kse_upcall *ku; struct kse_thr_mailbox *tmbx; uint32_t flags; /* * First check that we shouldn't just abort. we * can suspend it here or just exit. */ if (__predict_false(P_SHOULDSTOP(p))) { PROC_LOCK(p); thread_suspend_check(0); PROC_UNLOCK(p); } if (!(td->td_pflags & TDP_SA)) return; /* * If we are doing a syscall in a KSE environment, * note where our mailbox is. */ thread_lock(td); ku = td->td_upcall; thread_unlock(td); KASSERT(ku != NULL, ("no upcall owned")); KASSERT(ku->ku_owner == td, ("wrong owner")); KASSERT(!TD_CAN_UNBIND(td), ("can unbind")); - if (td->td_standin == NULL) - thread_alloc_spare(td); + if (td->td_standin == NULL) { + if (!thread_alloc_spare(td)) { + PROC_LOCK(p); + if (kern_logsigexit) + log(LOG_INFO, + "pid %d (%s), uid %d: thread_alloc_spare failed\n", + p->p_pid, p->p_comm, + td->td_ucred ? td->td_ucred->cr_uid : -1); + sigexit(td, SIGSEGV); /* XXX ? */ + /* panic("thread_user_enter: thread_alloc_spare failed"); */ + } + } ku->ku_mflags = fuword32((void *)&ku->ku_mailbox->km_flags); tmbx = (void *)fuword((void *)&ku->ku_mailbox->km_curthread); if ((tmbx == NULL) || (tmbx == (void *)-1L) || (ku->ku_mflags & KMF_NOUPCALL)) { td->td_mailbox = NULL; } else { flags = fuword32(&tmbx->tm_flags); /* * On some architectures, TP register points to thread * mailbox but not points to kse mailbox, and userland * can not atomically clear km_curthread, but can * use TP register, and set TMF_NOUPCALL in thread * flag to indicate a critical region. */ if (flags & TMF_NOUPCALL) { td->td_mailbox = NULL; } else { td->td_mailbox = tmbx; td->td_pflags |= TDP_CAN_UNBIND; PROC_LOCK(p); if (__predict_false(p->p_flag & P_TRACED)) { flags = fuword32(&tmbx->tm_dflags); if (flags & TMDF_SUSPEND) { thread_lock(td); /* fuword can block, check again */ if (td->td_upcall) ku->ku_flags |= KUF_DOUPCALL; thread_unlock(td); } } PROC_UNLOCK(p); } } } /* * The extra work we go through if we are a threaded process when we * return to userland. * * If we are a KSE process and returning to user mode, check for * extra work to do before we return (e.g. for more syscalls * to complete first). If we were in a critical section, we should * just return to let it finish. Same if we were in the UTS (in * which case the mailbox's context's busy indicator will be set). * The only traps we suport will have set the mailbox. * We will clear it here. */ int thread_userret(struct thread *td, struct trapframe *frame) { struct kse_upcall *ku; struct proc *p; struct timespec ts; int error = 0, uts_crit; /* Nothing to do with bound thread */ if (!(td->td_pflags & TDP_SA)) return (0); /* * Update stat clock count for userland */ if (td->td_mailbox != NULL) { thread_update_usr_ticks(td); uts_crit = 0; } else { uts_crit = 1; } p = td->td_proc; thread_lock(td); ku = td->td_upcall; /* * Optimisation: * This thread has not started any upcall. * If there is no work to report other than ourself, * then it can return direct to userland. */ if (TD_CAN_UNBIND(td)) { thread_unlock(td); td->td_pflags &= ~TDP_CAN_UNBIND; if ((td->td_flags & TDF_NEEDSIGCHK) == 0 && (p->p_completed == NULL) && (ku->ku_flags & KUF_DOUPCALL) == 0 && (p->p_upquantum && ticks < p->p_nextupcall)) { nanotime(&ts); error = copyout(&ts, (caddr_t)&ku->ku_mailbox->km_timeofday, sizeof(ts)); td->td_mailbox = 0; ku->ku_mflags = 0; if (error) goto out; return (0); } thread_export_context(td, 0); /* * There is something to report, and we own an upcall * structure, we can go to userland. * Turn ourself into an upcall thread. */ td->td_pflags |= TDP_UPCALLING; } else if (td->td_mailbox && (ku == NULL)) { thread_unlock(td); thread_export_context(td, 1); PROC_LOCK(p); if (p->p_upsleeps) wakeup(&p->p_completed); WITNESS_WARN(WARN_PANIC, &p->p_mtx.lock_object, "thread exiting in userret"); sigqueue_flush(&td->td_sigqueue); PROC_SLOCK(p); thread_stopped(p); thread_exit(); /* NOTREACHED */ } else thread_unlock(td); KASSERT(ku != NULL, ("upcall is NULL")); KASSERT(TD_CAN_UNBIND(td) == 0, ("can unbind")); PROC_LOCK(p); PROC_SLOCK(p); if (p->p_numthreads > max_threads_per_proc) { max_threads_hits++; while (p->p_numthreads > max_threads_per_proc) { if (p->p_numupcalls >= max_threads_per_proc) break; PROC_SUNLOCK(p); if (msleep(&p->p_numthreads, &p->p_mtx, PPAUSE|PCATCH, "maxthreads", hz/10) != EWOULDBLOCK) { PROC_SLOCK(p); break; } else PROC_SLOCK(p); } } PROC_SUNLOCK(p); PROC_UNLOCK(p); if (td->td_pflags & TDP_UPCALLING) { uts_crit = 0; p->p_nextupcall = ticks + p->p_upquantum; /* * There is no more work to do and we are going to ride * this thread up to userland as an upcall. * Do the last parts of the setup needed for the upcall. */ CTR3(KTR_PROC, "userret: upcall thread %p (pid %d, %s)", td, td->td_proc->p_pid, td->td_proc->p_comm); td->td_pflags &= ~TDP_UPCALLING; if (ku->ku_flags & KUF_DOUPCALL) { PROC_SLOCK(p); ku->ku_flags &= ~KUF_DOUPCALL; PROC_SUNLOCK(p); } /* * Set user context to the UTS */ if (!(ku->ku_mflags & KMF_NOUPCALL)) { cpu_set_upcall_kse(td, ku->ku_func, ku->ku_mailbox, &ku->ku_stack); PROC_LOCK(p); if (p->p_flag & P_TRACED) { _PHOLD(p); ptrace_clear_single_step(td); _PRELE(p); } PROC_UNLOCK(p); error = suword32(&ku->ku_mailbox->km_lwp, td->td_tid); if (error) goto out; error = suword(&ku->ku_mailbox->km_curthread, 0); if (error) goto out; } /* * Unhook the list of completed threads. * anything that completes after this gets to * come in next time. * Put the list of completed thread mailboxes on * this KSE's mailbox. */ if (!(ku->ku_mflags & KMF_NOCOMPLETED) && (error = thread_link_mboxes(p, ku)) != 0) goto out; } if (!uts_crit) { nanotime(&ts); error = copyout(&ts, &ku->ku_mailbox->km_timeofday, sizeof(ts)); } out: if (error) { /* * Things are going to be so screwed we should just kill * the process. * how do we do that? */ PROC_LOCK(p); psignal(p, SIGSEGV); PROC_UNLOCK(p); } else { /* * Optimisation: * Ensure that we have a spare thread available, * for when we re-enter the kernel. */ if (td->td_standin == NULL) - thread_alloc_spare(td); + thread_alloc_spare(td); /* XXX care of failure ? */ } ku->ku_mflags = 0; td->td_mailbox = NULL; td->td_usticks = 0; return (error); /* go sync */ } /* * called after ptrace resumed a process, force all * virtual CPUs to schedule upcall for SA process, * because debugger may have changed something in userland, * we should notice UTS as soon as possible. */ void thread_continued(struct proc *p) { struct kse_upcall *ku; struct thread *td; PROC_LOCK_ASSERT(p, MA_OWNED); KASSERT(P_SHOULDSTOP(p), ("process not stopped")); if (!(p->p_flag & P_SA)) return; if (p->p_flag & P_TRACED) { td = TAILQ_FIRST(&p->p_threads); if (td && (td->td_pflags & TDP_SA)) { FOREACH_UPCALL_IN_PROC(p, ku) { PROC_SLOCK(p); ku->ku_flags |= KUF_DOUPCALL; PROC_SUNLOCK(p); wakeup(&p->p_completed); } } } } #endif Index: head/sys/kern/kern_proc.c =================================================================== --- head/sys/kern/kern_proc.c (revision 173360) +++ head/sys/kern/kern_proc.c (revision 173361) @@ -1,1348 +1,1346 @@ /*- * Copyright (c) 1982, 1986, 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_proc.c 8.7 (Berkeley) 2/14/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_ktrace.h" #include "opt_kstack_pages.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #include #endif #include #include #include #include #include MALLOC_DEFINE(M_PGRP, "pgrp", "process group header"); MALLOC_DEFINE(M_SESSION, "session", "session header"); static MALLOC_DEFINE(M_PROC, "proc", "Proc structures"); MALLOC_DEFINE(M_SUBPROC, "subproc", "Proc sub-structures"); static void doenterpgrp(struct proc *, struct pgrp *); static void orphanpg(struct pgrp *pg); static void fill_kinfo_proc_only(struct proc *p, struct kinfo_proc *kp); static void fill_kinfo_thread(struct thread *td, struct kinfo_proc *kp); static void pgadjustjobc(struct pgrp *pgrp, int entering); static void pgdelete(struct pgrp *); static int proc_ctor(void *mem, int size, void *arg, int flags); static void proc_dtor(void *mem, int size, void *arg); static int proc_init(void *mem, int size, int flags); static void proc_fini(void *mem, int size); /* * Other process lists */ struct pidhashhead *pidhashtbl; u_long pidhash; struct pgrphashhead *pgrphashtbl; u_long pgrphash; struct proclist allproc; struct proclist zombproc; struct sx allproc_lock; struct sx proctree_lock; struct mtx ppeers_lock; uma_zone_t proc_zone; uma_zone_t ithread_zone; int kstack_pages = KSTACK_PAGES; SYSCTL_INT(_kern, OID_AUTO, kstack_pages, CTLFLAG_RD, &kstack_pages, 0, ""); CTASSERT(sizeof(struct kinfo_proc) == KINFO_PROC_SIZE); /* * Initialize global process hashing structures. */ void procinit() { sx_init(&allproc_lock, "allproc"); sx_init(&proctree_lock, "proctree"); mtx_init(&ppeers_lock, "p_peers", NULL, MTX_DEF); LIST_INIT(&allproc); LIST_INIT(&zombproc); pidhashtbl = hashinit(maxproc / 4, M_PROC, &pidhash); pgrphashtbl = hashinit(maxproc / 4, M_PROC, &pgrphash); proc_zone = uma_zcreate("PROC", sched_sizeof_proc(), proc_ctor, proc_dtor, proc_init, proc_fini, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); uihashinit(); } /* * Prepare a proc for use. */ static int proc_ctor(void *mem, int size, void *arg, int flags) { struct proc *p; p = (struct proc *)mem; return (0); } /* * Reclaim a proc after use. */ static void proc_dtor(void *mem, int size, void *arg) { struct proc *p; struct thread *td; /* INVARIANTS checks go here */ p = (struct proc *)mem; td = FIRST_THREAD_IN_PROC(p); + if (td != NULL) { #ifdef INVARIANTS - KASSERT((p->p_numthreads == 1), - ("bad number of threads in exiting process")); - KASSERT((td != NULL), ("proc_dtor: bad thread pointer")); - KASSERT(STAILQ_EMPTY(&p->p_ktr), ("proc_dtor: non-empty p_ktr")); + KASSERT((p->p_numthreads == 1), + ("bad number of threads in exiting process")); + KASSERT(STAILQ_EMPTY(&p->p_ktr), ("proc_dtor: non-empty p_ktr")); #endif - /* Dispose of an alternate kstack, if it exists. - * XXX What if there are more than one thread in the proc? - * The first thread in the proc is special and not - * freed, so you gotta do this here. - */ - if (((p->p_flag & P_KTHREAD) != 0) && (td->td_altkstack != 0)) - vm_thread_dispose_altkstack(td); + /* Dispose of an alternate kstack, if it exists. + * XXX What if there are more than one thread in the proc? + * The first thread in the proc is special and not + * freed, so you gotta do this here. + */ + if (((p->p_flag & P_KTHREAD) != 0) && (td->td_altkstack != 0)) + vm_thread_dispose_altkstack(td); + } if (p->p_ksi != NULL) KASSERT(! KSI_ONQ(p->p_ksi), ("SIGCHLD queue")); } /* * Initialize type-stable parts of a proc (when newly created). */ static int proc_init(void *mem, int size, int flags) { struct proc *p; - struct thread *td; p = (struct proc *)mem; p->p_sched = (struct p_sched *)&p[1]; - td = thread_alloc(); bzero(&p->p_mtx, sizeof(struct mtx)); mtx_init(&p->p_mtx, "process lock", NULL, MTX_DEF | MTX_DUPOK); mtx_init(&p->p_slock, "process slock", NULL, MTX_SPIN | MTX_RECURSE); + TAILQ_INIT(&p->p_threads); /* all threads in proc */ p->p_stats = pstats_alloc(); - proc_linkup(p, td); - sched_newproc(p, td); return (0); } /* * UMA should ensure that this function is never called. * Freeing a proc structure would violate type stability. */ static void proc_fini(void *mem, int size) { #ifdef notnow struct proc *p; p = (struct proc *)mem; pstats_free(p->p_stats); thread_free(FIRST_THREAD_IN_PROC(p)); mtx_destroy(&p->p_mtx); if (p->p_ksi != NULL) ksiginfo_free(p->p_ksi); #else panic("proc reclaimed"); #endif } /* * Is p an inferior of the current process? */ int inferior(p) register struct proc *p; { sx_assert(&proctree_lock, SX_LOCKED); for (; p != curproc; p = p->p_pptr) if (p->p_pid == 0) return (0); return (1); } /* * Locate a process by number; return only "live" processes -- i.e., neither * zombies nor newly born but incompletely initialized processes. By not * returning processes in the PRS_NEW state, we allow callers to avoid * testing for that condition to avoid dereferencing p_ucred, et al. */ struct proc * pfind(pid) register pid_t pid; { register struct proc *p; sx_slock(&allproc_lock); LIST_FOREACH(p, PIDHASH(pid), p_hash) if (p->p_pid == pid) { if (p->p_state == PRS_NEW) { p = NULL; break; } PROC_LOCK(p); break; } sx_sunlock(&allproc_lock); return (p); } /* * Locate a process group by number. * The caller must hold proctree_lock. */ struct pgrp * pgfind(pgid) register pid_t pgid; { register struct pgrp *pgrp; sx_assert(&proctree_lock, SX_LOCKED); LIST_FOREACH(pgrp, PGRPHASH(pgid), pg_hash) { if (pgrp->pg_id == pgid) { PGRP_LOCK(pgrp); return (pgrp); } } return (NULL); } /* * Create a new process group. * pgid must be equal to the pid of p. * Begin a new session if required. */ int enterpgrp(p, pgid, pgrp, sess) register struct proc *p; pid_t pgid; struct pgrp *pgrp; struct session *sess; { struct pgrp *pgrp2; sx_assert(&proctree_lock, SX_XLOCKED); KASSERT(pgrp != NULL, ("enterpgrp: pgrp == NULL")); KASSERT(p->p_pid == pgid, ("enterpgrp: new pgrp and pid != pgid")); pgrp2 = pgfind(pgid); KASSERT(pgrp2 == NULL, ("enterpgrp: pgrp with pgid exists")); KASSERT(!SESS_LEADER(p), ("enterpgrp: session leader attempted setpgrp")); mtx_init(&pgrp->pg_mtx, "process group", NULL, MTX_DEF | MTX_DUPOK); if (sess != NULL) { /* * new session */ mtx_init(&sess->s_mtx, "session", NULL, MTX_DEF); mtx_lock(&Giant); /* XXX TTY */ PROC_LOCK(p); p->p_flag &= ~P_CONTROLT; PROC_UNLOCK(p); PGRP_LOCK(pgrp); sess->s_leader = p; sess->s_sid = p->p_pid; sess->s_count = 1; sess->s_ttyvp = NULL; sess->s_ttyp = NULL; bcopy(p->p_session->s_login, sess->s_login, sizeof(sess->s_login)); pgrp->pg_session = sess; KASSERT(p == curproc, ("enterpgrp: mksession and p != curproc")); } else { mtx_lock(&Giant); /* XXX TTY */ pgrp->pg_session = p->p_session; SESS_LOCK(pgrp->pg_session); pgrp->pg_session->s_count++; SESS_UNLOCK(pgrp->pg_session); PGRP_LOCK(pgrp); } pgrp->pg_id = pgid; LIST_INIT(&pgrp->pg_members); /* * As we have an exclusive lock of proctree_lock, * this should not deadlock. */ LIST_INSERT_HEAD(PGRPHASH(pgid), pgrp, pg_hash); pgrp->pg_jobc = 0; SLIST_INIT(&pgrp->pg_sigiolst); PGRP_UNLOCK(pgrp); mtx_unlock(&Giant); /* XXX TTY */ doenterpgrp(p, pgrp); return (0); } /* * Move p to an existing process group */ int enterthispgrp(p, pgrp) register struct proc *p; struct pgrp *pgrp; { sx_assert(&proctree_lock, SX_XLOCKED); PROC_LOCK_ASSERT(p, MA_NOTOWNED); PGRP_LOCK_ASSERT(pgrp, MA_NOTOWNED); PGRP_LOCK_ASSERT(p->p_pgrp, MA_NOTOWNED); SESS_LOCK_ASSERT(p->p_session, MA_NOTOWNED); KASSERT(pgrp->pg_session == p->p_session, ("%s: pgrp's session %p, p->p_session %p.\n", __func__, pgrp->pg_session, p->p_session)); KASSERT(pgrp != p->p_pgrp, ("%s: p belongs to pgrp.", __func__)); doenterpgrp(p, pgrp); return (0); } /* * Move p to a process group */ static void doenterpgrp(p, pgrp) struct proc *p; struct pgrp *pgrp; { struct pgrp *savepgrp; sx_assert(&proctree_lock, SX_XLOCKED); PROC_LOCK_ASSERT(p, MA_NOTOWNED); PGRP_LOCK_ASSERT(pgrp, MA_NOTOWNED); PGRP_LOCK_ASSERT(p->p_pgrp, MA_NOTOWNED); SESS_LOCK_ASSERT(p->p_session, MA_NOTOWNED); savepgrp = p->p_pgrp; /* * Adjust eligibility of affected pgrps to participate in job control. * Increment eligibility counts before decrementing, otherwise we * could reach 0 spuriously during the first call. */ fixjobc(p, pgrp, 1); fixjobc(p, p->p_pgrp, 0); mtx_lock(&Giant); /* XXX TTY */ PGRP_LOCK(pgrp); PGRP_LOCK(savepgrp); PROC_LOCK(p); LIST_REMOVE(p, p_pglist); p->p_pgrp = pgrp; PROC_UNLOCK(p); LIST_INSERT_HEAD(&pgrp->pg_members, p, p_pglist); PGRP_UNLOCK(savepgrp); PGRP_UNLOCK(pgrp); mtx_unlock(&Giant); /* XXX TTY */ if (LIST_EMPTY(&savepgrp->pg_members)) pgdelete(savepgrp); } /* * remove process from process group */ int leavepgrp(p) register struct proc *p; { struct pgrp *savepgrp; sx_assert(&proctree_lock, SX_XLOCKED); savepgrp = p->p_pgrp; mtx_lock(&Giant); /* XXX TTY */ PGRP_LOCK(savepgrp); PROC_LOCK(p); LIST_REMOVE(p, p_pglist); p->p_pgrp = NULL; PROC_UNLOCK(p); PGRP_UNLOCK(savepgrp); mtx_unlock(&Giant); /* XXX TTY */ if (LIST_EMPTY(&savepgrp->pg_members)) pgdelete(savepgrp); return (0); } /* * delete a process group */ static void pgdelete(pgrp) register struct pgrp *pgrp; { struct session *savesess; sx_assert(&proctree_lock, SX_XLOCKED); PGRP_LOCK_ASSERT(pgrp, MA_NOTOWNED); SESS_LOCK_ASSERT(pgrp->pg_session, MA_NOTOWNED); /* * Reset any sigio structures pointing to us as a result of * F_SETOWN with our pgid. */ funsetownlst(&pgrp->pg_sigiolst); mtx_lock(&Giant); /* XXX TTY */ PGRP_LOCK(pgrp); if (pgrp->pg_session->s_ttyp != NULL && pgrp->pg_session->s_ttyp->t_pgrp == pgrp) pgrp->pg_session->s_ttyp->t_pgrp = NULL; LIST_REMOVE(pgrp, pg_hash); savesess = pgrp->pg_session; SESSRELE(savesess); PGRP_UNLOCK(pgrp); mtx_destroy(&pgrp->pg_mtx); FREE(pgrp, M_PGRP); mtx_unlock(&Giant); /* XXX TTY */ } static void pgadjustjobc(pgrp, entering) struct pgrp *pgrp; int entering; { PGRP_LOCK(pgrp); if (entering) pgrp->pg_jobc++; else { --pgrp->pg_jobc; if (pgrp->pg_jobc == 0) orphanpg(pgrp); } PGRP_UNLOCK(pgrp); } /* * Adjust pgrp jobc counters when specified process changes process group. * We count the number of processes in each process group that "qualify" * the group for terminal job control (those with a parent in a different * process group of the same session). If that count reaches zero, the * process group becomes orphaned. Check both the specified process' * process group and that of its children. * entering == 0 => p is leaving specified group. * entering == 1 => p is entering specified group. */ void fixjobc(p, pgrp, entering) register struct proc *p; register struct pgrp *pgrp; int entering; { register struct pgrp *hispgrp; register struct session *mysession; sx_assert(&proctree_lock, SX_LOCKED); PROC_LOCK_ASSERT(p, MA_NOTOWNED); PGRP_LOCK_ASSERT(pgrp, MA_NOTOWNED); SESS_LOCK_ASSERT(pgrp->pg_session, MA_NOTOWNED); /* * Check p's parent to see whether p qualifies its own process * group; if so, adjust count for p's process group. */ mysession = pgrp->pg_session; if ((hispgrp = p->p_pptr->p_pgrp) != pgrp && hispgrp->pg_session == mysession) pgadjustjobc(pgrp, entering); /* * Check this process' children to see whether they qualify * their process groups; if so, adjust counts for children's * process groups. */ LIST_FOREACH(p, &p->p_children, p_sibling) { hispgrp = p->p_pgrp; if (hispgrp == pgrp || hispgrp->pg_session != mysession) continue; PROC_LOCK(p); if (p->p_state == PRS_ZOMBIE) { PROC_UNLOCK(p); continue; } PROC_UNLOCK(p); pgadjustjobc(hispgrp, entering); } } /* * A process group has become orphaned; * if there are any stopped processes in the group, * hang-up all process in that group. */ static void orphanpg(pg) struct pgrp *pg; { register struct proc *p; PGRP_LOCK_ASSERT(pg, MA_OWNED); LIST_FOREACH(p, &pg->pg_members, p_pglist) { PROC_LOCK(p); if (P_SHOULDSTOP(p)) { PROC_UNLOCK(p); LIST_FOREACH(p, &pg->pg_members, p_pglist) { PROC_LOCK(p); psignal(p, SIGHUP); psignal(p, SIGCONT); PROC_UNLOCK(p); } return; } PROC_UNLOCK(p); } } void sessrele(struct session *s) { int i; SESS_LOCK(s); i = --s->s_count; SESS_UNLOCK(s); if (i == 0) { if (s->s_ttyp != NULL) ttyrel(s->s_ttyp); mtx_destroy(&s->s_mtx); FREE(s, M_SESSION); } } #include "opt_ddb.h" #ifdef DDB #include DB_SHOW_COMMAND(pgrpdump, pgrpdump) { register struct pgrp *pgrp; register struct proc *p; register int i; for (i = 0; i <= pgrphash; i++) { if (!LIST_EMPTY(&pgrphashtbl[i])) { printf("\tindx %d\n", i); LIST_FOREACH(pgrp, &pgrphashtbl[i], pg_hash) { printf( "\tpgrp %p, pgid %ld, sess %p, sesscnt %d, mem %p\n", (void *)pgrp, (long)pgrp->pg_id, (void *)pgrp->pg_session, pgrp->pg_session->s_count, (void *)LIST_FIRST(&pgrp->pg_members)); LIST_FOREACH(p, &pgrp->pg_members, p_pglist) { printf("\t\tpid %ld addr %p pgrp %p\n", (long)p->p_pid, (void *)p, (void *)p->p_pgrp); } } } } } #endif /* DDB */ /* * Clear kinfo_proc and fill in any information that is common * to all threads in the process. * Must be called with the target process locked. */ static void fill_kinfo_proc_only(struct proc *p, struct kinfo_proc *kp) { struct thread *td0; struct tty *tp; struct session *sp; struct ucred *cred; struct sigacts *ps; bzero(kp, sizeof(*kp)); kp->ki_structsize = sizeof(*kp); kp->ki_paddr = p; PROC_LOCK_ASSERT(p, MA_OWNED); kp->ki_addr =/* p->p_addr; */0; /* XXXKSE */ kp->ki_args = p->p_args; kp->ki_textvp = p->p_textvp; #ifdef KTRACE kp->ki_tracep = p->p_tracevp; mtx_lock(&ktrace_mtx); kp->ki_traceflag = p->p_traceflag; mtx_unlock(&ktrace_mtx); #endif kp->ki_fd = p->p_fd; kp->ki_vmspace = p->p_vmspace; kp->ki_flag = p->p_flag; cred = p->p_ucred; if (cred) { kp->ki_uid = cred->cr_uid; kp->ki_ruid = cred->cr_ruid; kp->ki_svuid = cred->cr_svuid; /* XXX bde doesn't like KI_NGROUPS */ kp->ki_ngroups = min(cred->cr_ngroups, KI_NGROUPS); bcopy(cred->cr_groups, kp->ki_groups, kp->ki_ngroups * sizeof(gid_t)); kp->ki_rgid = cred->cr_rgid; kp->ki_svgid = cred->cr_svgid; /* If jailed(cred), emulate the old P_JAILED flag. */ if (jailed(cred)) { kp->ki_flag |= P_JAILED; /* If inside a jail, use 0 as a jail ID. */ if (!jailed(curthread->td_ucred)) kp->ki_jid = cred->cr_prison->pr_id; } } ps = p->p_sigacts; if (ps) { mtx_lock(&ps->ps_mtx); kp->ki_sigignore = ps->ps_sigignore; kp->ki_sigcatch = ps->ps_sigcatch; mtx_unlock(&ps->ps_mtx); } PROC_SLOCK(p); if (p->p_state != PRS_NEW && p->p_state != PRS_ZOMBIE && p->p_vmspace != NULL) { struct vmspace *vm = p->p_vmspace; kp->ki_size = vm->vm_map.size; kp->ki_rssize = vmspace_resident_count(vm); /*XXX*/ FOREACH_THREAD_IN_PROC(p, td0) { if (!TD_IS_SWAPPED(td0)) kp->ki_rssize += td0->td_kstack_pages; if (td0->td_altkstack_obj != NULL) kp->ki_rssize += td0->td_altkstack_pages; } kp->ki_swrss = vm->vm_swrss; kp->ki_tsize = vm->vm_tsize; kp->ki_dsize = vm->vm_dsize; kp->ki_ssize = vm->vm_ssize; } else if (p->p_state == PRS_ZOMBIE) kp->ki_stat = SZOMB; if (kp->ki_flag & P_INMEM) kp->ki_sflag = PS_INMEM; else kp->ki_sflag = 0; /* Calculate legacy swtime as seconds since 'swtick'. */ kp->ki_swtime = (ticks - p->p_swtick) / hz; kp->ki_pid = p->p_pid; kp->ki_nice = p->p_nice; rufetch(p, &kp->ki_rusage); kp->ki_runtime = cputick2usec(p->p_rux.rux_runtime); PROC_SUNLOCK(p); if ((p->p_flag & P_INMEM) && p->p_stats != NULL) { kp->ki_start = p->p_stats->p_start; timevaladd(&kp->ki_start, &boottime); PROC_SLOCK(p); calcru(p, &kp->ki_rusage.ru_utime, &kp->ki_rusage.ru_stime); PROC_SUNLOCK(p); calccru(p, &kp->ki_childutime, &kp->ki_childstime); /* Some callers want child-times in a single value */ kp->ki_childtime = kp->ki_childstime; timevaladd(&kp->ki_childtime, &kp->ki_childutime); } tp = NULL; if (p->p_pgrp) { kp->ki_pgid = p->p_pgrp->pg_id; kp->ki_jobc = p->p_pgrp->pg_jobc; sp = p->p_pgrp->pg_session; if (sp != NULL) { kp->ki_sid = sp->s_sid; SESS_LOCK(sp); strlcpy(kp->ki_login, sp->s_login, sizeof(kp->ki_login)); if (sp->s_ttyvp) kp->ki_kiflag |= KI_CTTY; if (SESS_LEADER(p)) kp->ki_kiflag |= KI_SLEADER; tp = sp->s_ttyp; SESS_UNLOCK(sp); } } if ((p->p_flag & P_CONTROLT) && tp != NULL) { kp->ki_tdev = dev2udev(tp->t_dev); kp->ki_tpgid = tp->t_pgrp ? tp->t_pgrp->pg_id : NO_PID; if (tp->t_session) kp->ki_tsid = tp->t_session->s_sid; } else kp->ki_tdev = NODEV; if (p->p_comm[0] != '\0') strlcpy(kp->ki_comm, p->p_comm, sizeof(kp->ki_comm)); if (p->p_sysent && p->p_sysent->sv_name != NULL && p->p_sysent->sv_name[0] != '\0') strlcpy(kp->ki_emul, p->p_sysent->sv_name, sizeof(kp->ki_emul)); kp->ki_siglist = p->p_siglist; kp->ki_xstat = p->p_xstat; kp->ki_acflag = p->p_acflag; kp->ki_lock = p->p_lock; if (p->p_pptr) kp->ki_ppid = p->p_pptr->p_pid; } /* * Fill in information that is thread specific. * Must be called with p_slock locked. */ static void fill_kinfo_thread(struct thread *td, struct kinfo_proc *kp) { struct proc *p; p = td->td_proc; PROC_SLOCK_ASSERT(p, MA_OWNED); thread_lock(td); if (td->td_wmesg != NULL) strlcpy(kp->ki_wmesg, td->td_wmesg, sizeof(kp->ki_wmesg)); else bzero(kp->ki_wmesg, sizeof(kp->ki_wmesg)); if (td->td_name[0] != '\0') strlcpy(kp->ki_ocomm, td->td_name, sizeof(kp->ki_ocomm)); if (TD_ON_LOCK(td)) { kp->ki_kiflag |= KI_LOCKBLOCK; strlcpy(kp->ki_lockname, td->td_lockname, sizeof(kp->ki_lockname)); } else { kp->ki_kiflag &= ~KI_LOCKBLOCK; bzero(kp->ki_lockname, sizeof(kp->ki_lockname)); } if (p->p_state == PRS_NORMAL) { /* XXXKSE very approximate */ if (TD_ON_RUNQ(td) || TD_CAN_RUN(td) || TD_IS_RUNNING(td)) { kp->ki_stat = SRUN; } else if (P_SHOULDSTOP(p)) { kp->ki_stat = SSTOP; } else if (TD_IS_SLEEPING(td)) { kp->ki_stat = SSLEEP; } else if (TD_ON_LOCK(td)) { kp->ki_stat = SLOCK; } else { kp->ki_stat = SWAIT; } } else if (p->p_state == PRS_ZOMBIE) { kp->ki_stat = SZOMB; } else { kp->ki_stat = SIDL; } /* Things in the thread */ kp->ki_wchan = td->td_wchan; kp->ki_pri.pri_level = td->td_priority; kp->ki_pri.pri_native = td->td_base_pri; kp->ki_lastcpu = td->td_lastcpu; kp->ki_oncpu = td->td_oncpu; kp->ki_tdflags = td->td_flags; kp->ki_tid = td->td_tid; kp->ki_numthreads = p->p_numthreads; kp->ki_pcb = td->td_pcb; kp->ki_kstack = (void *)td->td_kstack; kp->ki_pctcpu = sched_pctcpu(td); kp->ki_estcpu = td->td_estcpu; kp->ki_slptime = (ticks - td->td_slptick) / hz; kp->ki_pri.pri_class = td->td_pri_class; kp->ki_pri.pri_user = td->td_user_pri; /* We can't get this anymore but ps etc never used it anyway. */ kp->ki_rqindex = 0; SIGSETOR(kp->ki_siglist, td->td_siglist); kp->ki_sigmask = td->td_sigmask; thread_unlock(td); } /* * Fill in a kinfo_proc structure for the specified process. * Must be called with the target process locked. */ void fill_kinfo_proc(struct proc *p, struct kinfo_proc *kp) { fill_kinfo_proc_only(p, kp); PROC_SLOCK(p); if (FIRST_THREAD_IN_PROC(p) != NULL) fill_kinfo_thread(FIRST_THREAD_IN_PROC(p), kp); PROC_SUNLOCK(p); } struct pstats * pstats_alloc(void) { return (malloc(sizeof(struct pstats), M_SUBPROC, M_ZERO|M_WAITOK)); } /* * Copy parts of p_stats; zero the rest of p_stats (statistics). */ void pstats_fork(struct pstats *src, struct pstats *dst) { bzero(&dst->pstat_startzero, __rangeof(struct pstats, pstat_startzero, pstat_endzero)); bcopy(&src->pstat_startcopy, &dst->pstat_startcopy, __rangeof(struct pstats, pstat_startcopy, pstat_endcopy)); } void pstats_free(struct pstats *ps) { free(ps, M_SUBPROC); } /* * Locate a zombie process by number */ struct proc * zpfind(pid_t pid) { struct proc *p; sx_slock(&allproc_lock); LIST_FOREACH(p, &zombproc, p_list) if (p->p_pid == pid) { PROC_LOCK(p); break; } sx_sunlock(&allproc_lock); return (p); } #define KERN_PROC_ZOMBMASK 0x3 #define KERN_PROC_NOTHREADS 0x4 /* * Must be called with the process locked and will return with it unlocked. */ static int sysctl_out_proc(struct proc *p, struct sysctl_req *req, int flags) { struct thread *td; struct kinfo_proc kinfo_proc; int error = 0; struct proc *np; pid_t pid = p->p_pid; PROC_LOCK_ASSERT(p, MA_OWNED); fill_kinfo_proc_only(p, &kinfo_proc); if (flags & KERN_PROC_NOTHREADS) { PROC_SLOCK(p); if (FIRST_THREAD_IN_PROC(p) != NULL) fill_kinfo_thread(FIRST_THREAD_IN_PROC(p), &kinfo_proc); PROC_SUNLOCK(p); error = SYSCTL_OUT(req, (caddr_t)&kinfo_proc, sizeof(kinfo_proc)); } else { PROC_SLOCK(p); if (FIRST_THREAD_IN_PROC(p) != NULL) FOREACH_THREAD_IN_PROC(p, td) { fill_kinfo_thread(td, &kinfo_proc); error = SYSCTL_OUT(req, (caddr_t)&kinfo_proc, sizeof(kinfo_proc)); if (error) break; } else error = SYSCTL_OUT(req, (caddr_t)&kinfo_proc, sizeof(kinfo_proc)); PROC_SUNLOCK(p); } PROC_UNLOCK(p); if (error) return (error); if (flags & KERN_PROC_ZOMBMASK) np = zpfind(pid); else { if (pid == 0) return (0); np = pfind(pid); } if (np == NULL) return EAGAIN; if (np != p) { PROC_UNLOCK(np); return EAGAIN; } PROC_UNLOCK(np); return (0); } static int sysctl_kern_proc(SYSCTL_HANDLER_ARGS) { int *name = (int*) arg1; u_int namelen = arg2; struct proc *p; int flags, doingzomb, oid_number; int error = 0; oid_number = oidp->oid_number; if (oid_number != KERN_PROC_ALL && (oid_number & KERN_PROC_INC_THREAD) == 0) flags = KERN_PROC_NOTHREADS; else { flags = 0; oid_number &= ~KERN_PROC_INC_THREAD; } if (oid_number == KERN_PROC_PID) { if (namelen != 1) return (EINVAL); error = sysctl_wire_old_buffer(req, 0); if (error) return (error); p = pfind((pid_t)name[0]); if (!p) return (ESRCH); if ((error = p_cansee(curthread, p))) { PROC_UNLOCK(p); return (error); } error = sysctl_out_proc(p, req, flags); return (error); } switch (oid_number) { case KERN_PROC_ALL: if (namelen != 0) return (EINVAL); break; case KERN_PROC_PROC: if (namelen != 0 && namelen != 1) return (EINVAL); break; default: if (namelen != 1) return (EINVAL); break; } if (!req->oldptr) { /* overestimate by 5 procs */ error = SYSCTL_OUT(req, 0, sizeof (struct kinfo_proc) * 5); if (error) return (error); } error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); sx_slock(&allproc_lock); for (doingzomb=0 ; doingzomb < 2 ; doingzomb++) { if (!doingzomb) p = LIST_FIRST(&allproc); else p = LIST_FIRST(&zombproc); for (; p != 0; p = LIST_NEXT(p, p_list)) { /* * Skip embryonic processes. */ PROC_SLOCK(p); if (p->p_state == PRS_NEW) { PROC_SUNLOCK(p); continue; } PROC_SUNLOCK(p); PROC_LOCK(p); KASSERT(p->p_ucred != NULL, ("process credential is NULL for non-NEW proc")); /* * Show a user only appropriate processes. */ if (p_cansee(curthread, p)) { PROC_UNLOCK(p); continue; } /* * TODO - make more efficient (see notes below). * do by session. */ switch (oid_number) { case KERN_PROC_GID: if (p->p_ucred->cr_gid != (gid_t)name[0]) { PROC_UNLOCK(p); continue; } break; case KERN_PROC_PGRP: /* could do this by traversing pgrp */ if (p->p_pgrp == NULL || p->p_pgrp->pg_id != (pid_t)name[0]) { PROC_UNLOCK(p); continue; } break; case KERN_PROC_RGID: if (p->p_ucred->cr_rgid != (gid_t)name[0]) { PROC_UNLOCK(p); continue; } break; case KERN_PROC_SESSION: if (p->p_session == NULL || p->p_session->s_sid != (pid_t)name[0]) { PROC_UNLOCK(p); continue; } break; case KERN_PROC_TTY: if ((p->p_flag & P_CONTROLT) == 0 || p->p_session == NULL) { PROC_UNLOCK(p); continue; } SESS_LOCK(p->p_session); if (p->p_session->s_ttyp == NULL || dev2udev(p->p_session->s_ttyp->t_dev) != (dev_t)name[0]) { SESS_UNLOCK(p->p_session); PROC_UNLOCK(p); continue; } SESS_UNLOCK(p->p_session); break; case KERN_PROC_UID: if (p->p_ucred->cr_uid != (uid_t)name[0]) { PROC_UNLOCK(p); continue; } break; case KERN_PROC_RUID: if (p->p_ucred->cr_ruid != (uid_t)name[0]) { PROC_UNLOCK(p); continue; } break; case KERN_PROC_PROC: break; default: break; } error = sysctl_out_proc(p, req, flags | doingzomb); if (error) { sx_sunlock(&allproc_lock); return (error); } } } sx_sunlock(&allproc_lock); return (0); } struct pargs * pargs_alloc(int len) { struct pargs *pa; MALLOC(pa, struct pargs *, sizeof(struct pargs) + len, M_PARGS, M_WAITOK); refcount_init(&pa->ar_ref, 1); pa->ar_length = len; return (pa); } void pargs_free(struct pargs *pa) { FREE(pa, M_PARGS); } void pargs_hold(struct pargs *pa) { if (pa == NULL) return; refcount_acquire(&pa->ar_ref); } void pargs_drop(struct pargs *pa) { if (pa == NULL) return; if (refcount_release(&pa->ar_ref)) pargs_free(pa); } /* * This sysctl allows a process to retrieve the argument list or process * title for another process without groping around in the address space * of the other process. It also allow a process to set its own "process * title to a string of its own choice. */ static int sysctl_kern_proc_args(SYSCTL_HANDLER_ARGS) { int *name = (int*) arg1; u_int namelen = arg2; struct pargs *newpa, *pa; struct proc *p; int error = 0; if (namelen != 1) return (EINVAL); p = pfind((pid_t)name[0]); if (!p) return (ESRCH); if ((error = p_cansee(curthread, p)) != 0) { PROC_UNLOCK(p); return (error); } if (req->newptr && curproc != p) { PROC_UNLOCK(p); return (EPERM); } pa = p->p_args; pargs_hold(pa); PROC_UNLOCK(p); if (req->oldptr != NULL && pa != NULL) error = SYSCTL_OUT(req, pa->ar_args, pa->ar_length); pargs_drop(pa); if (error != 0 || req->newptr == NULL) return (error); if (req->newlen + sizeof(struct pargs) > ps_arg_cache_limit) return (ENOMEM); newpa = pargs_alloc(req->newlen); error = SYSCTL_IN(req, newpa->ar_args, req->newlen); if (error != 0) { pargs_free(newpa); return (error); } PROC_LOCK(p); pa = p->p_args; p->p_args = newpa; PROC_UNLOCK(p); pargs_drop(pa); return (0); } /* * This sysctl allows a process to retrieve the path of the executable for * itself or another process. */ static int sysctl_kern_proc_pathname(SYSCTL_HANDLER_ARGS) { pid_t *pidp = (pid_t *)arg1; unsigned int arglen = arg2; struct proc *p; struct vnode *vp; char *retbuf, *freebuf; int error; if (arglen != 1) return (EINVAL); if (*pidp == -1) { /* -1 means this process */ p = req->td->td_proc; } else { p = pfind(*pidp); if (p == NULL) return (ESRCH); if ((error = p_cansee(curthread, p)) != 0) { PROC_UNLOCK(p); return (error); } } vp = p->p_textvp; vref(vp); if (*pidp != -1) PROC_UNLOCK(p); error = vn_fullpath(req->td, vp, &retbuf, &freebuf); vrele(vp); if (error) return (error); error = SYSCTL_OUT(req, retbuf, strlen(retbuf) + 1); free(freebuf, M_TEMP); return (error); } static int sysctl_kern_proc_sv_name(SYSCTL_HANDLER_ARGS) { struct proc *p; char *sv_name; int *name; int namelen; int error; namelen = arg2; if (namelen != 1) return (EINVAL); name = (int *)arg1; if ((p = pfind((pid_t)name[0])) == NULL) return (ESRCH); if ((error = p_cansee(curthread, p))) { PROC_UNLOCK(p); return (error); } sv_name = p->p_sysent->sv_name; PROC_UNLOCK(p); return (sysctl_handle_string(oidp, sv_name, 0, req)); } static SYSCTL_NODE(_kern, KERN_PROC, proc, CTLFLAG_RD, 0, "Process table"); SYSCTL_PROC(_kern_proc, KERN_PROC_ALL, all, CTLFLAG_RD|CTLTYPE_STRUCT, 0, 0, sysctl_kern_proc, "S,proc", "Return entire process table"); static SYSCTL_NODE(_kern_proc, KERN_PROC_GID, gid, CTLFLAG_RD, sysctl_kern_proc, "Process table"); static SYSCTL_NODE(_kern_proc, KERN_PROC_PGRP, pgrp, CTLFLAG_RD, sysctl_kern_proc, "Process table"); static SYSCTL_NODE(_kern_proc, KERN_PROC_RGID, rgid, CTLFLAG_RD, sysctl_kern_proc, "Process table"); static SYSCTL_NODE(_kern_proc, KERN_PROC_SESSION, sid, CTLFLAG_RD, sysctl_kern_proc, "Process table"); static SYSCTL_NODE(_kern_proc, KERN_PROC_TTY, tty, CTLFLAG_RD, sysctl_kern_proc, "Process table"); static SYSCTL_NODE(_kern_proc, KERN_PROC_UID, uid, CTLFLAG_RD, sysctl_kern_proc, "Process table"); static SYSCTL_NODE(_kern_proc, KERN_PROC_RUID, ruid, CTLFLAG_RD, sysctl_kern_proc, "Process table"); static SYSCTL_NODE(_kern_proc, KERN_PROC_PID, pid, CTLFLAG_RD, sysctl_kern_proc, "Process table"); static SYSCTL_NODE(_kern_proc, KERN_PROC_PROC, proc, CTLFLAG_RD, sysctl_kern_proc, "Return process table, no threads"); static SYSCTL_NODE(_kern_proc, KERN_PROC_ARGS, args, CTLFLAG_RW | CTLFLAG_ANYBODY, sysctl_kern_proc_args, "Process argument list"); static SYSCTL_NODE(_kern_proc, KERN_PROC_PATHNAME, pathname, CTLFLAG_RD, sysctl_kern_proc_pathname, "Process executable path"); static SYSCTL_NODE(_kern_proc, KERN_PROC_SV_NAME, sv_name, CTLFLAG_RD, sysctl_kern_proc_sv_name, "Process syscall vector name (ABI type)"); static SYSCTL_NODE(_kern_proc, (KERN_PROC_GID | KERN_PROC_INC_THREAD), gid_td, CTLFLAG_RD, sysctl_kern_proc, "Process table"); static SYSCTL_NODE(_kern_proc, (KERN_PROC_PGRP | KERN_PROC_INC_THREAD), pgrp_td, CTLFLAG_RD, sysctl_kern_proc, "Process table"); static SYSCTL_NODE(_kern_proc, (KERN_PROC_RGID | KERN_PROC_INC_THREAD), rgid_td, CTLFLAG_RD, sysctl_kern_proc, "Process table"); static SYSCTL_NODE(_kern_proc, (KERN_PROC_SESSION | KERN_PROC_INC_THREAD), sid_td, CTLFLAG_RD, sysctl_kern_proc, "Process table"); static SYSCTL_NODE(_kern_proc, (KERN_PROC_TTY | KERN_PROC_INC_THREAD), tty_td, CTLFLAG_RD, sysctl_kern_proc, "Process table"); static SYSCTL_NODE(_kern_proc, (KERN_PROC_UID | KERN_PROC_INC_THREAD), uid_td, CTLFLAG_RD, sysctl_kern_proc, "Process table"); static SYSCTL_NODE(_kern_proc, (KERN_PROC_RUID | KERN_PROC_INC_THREAD), ruid_td, CTLFLAG_RD, sysctl_kern_proc, "Process table"); static SYSCTL_NODE(_kern_proc, (KERN_PROC_PID | KERN_PROC_INC_THREAD), pid_td, CTLFLAG_RD, sysctl_kern_proc, "Process table"); static SYSCTL_NODE(_kern_proc, (KERN_PROC_PROC | KERN_PROC_INC_THREAD), proc_td, CTLFLAG_RD, sysctl_kern_proc, "Return process table, no threads"); Index: head/sys/kern/kern_sig.c =================================================================== --- head/sys/kern/kern_sig.c (revision 173360) +++ head/sys/kern/kern_sig.c (revision 173361) @@ -1,3324 +1,3324 @@ /*- * Copyright (c) 1982, 1986, 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_sig.c 8.7 (Berkeley) 4/18/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define ONSIG 32 /* NSIG for osig* syscalls. XXX. */ static int coredump(struct thread *); static char *expand_name(const char *, uid_t, pid_t); static int killpg1(struct thread *td, int sig, int pgid, int all); static int issignal(struct thread *p); static int sigprop(int sig); static void tdsigwakeup(struct thread *, int, sig_t, int); static void sig_suspend_threads(struct thread *, struct proc *, int); static int filt_sigattach(struct knote *kn); static void filt_sigdetach(struct knote *kn); static int filt_signal(struct knote *kn, long hint); static struct thread *sigtd(struct proc *p, int sig, int prop); #ifdef KSE static int do_tdsignal(struct proc *, struct thread *, int, ksiginfo_t *); #endif static void sigqueue_start(void); static uma_zone_t ksiginfo_zone = NULL; struct filterops sig_filtops = { 0, filt_sigattach, filt_sigdetach, filt_signal }; -static int kern_logsigexit = 1; +int kern_logsigexit = 1; SYSCTL_INT(_kern, KERN_LOGSIGEXIT, logsigexit, CTLFLAG_RW, &kern_logsigexit, 0, "Log processes quitting on abnormal signals to syslog(3)"); static int kern_forcesigexit = 1; SYSCTL_INT(_kern, OID_AUTO, forcesigexit, CTLFLAG_RW, &kern_forcesigexit, 0, "Force trap signal to be handled"); SYSCTL_NODE(_kern, OID_AUTO, sigqueue, CTLFLAG_RW, 0, "POSIX real time signal"); static int max_pending_per_proc = 128; SYSCTL_INT(_kern_sigqueue, OID_AUTO, max_pending_per_proc, CTLFLAG_RW, &max_pending_per_proc, 0, "Max pending signals per proc"); static int preallocate_siginfo = 1024; TUNABLE_INT("kern.sigqueue.preallocate", &preallocate_siginfo); SYSCTL_INT(_kern_sigqueue, OID_AUTO, preallocate, CTLFLAG_RD, &preallocate_siginfo, 0, "Preallocated signal memory size"); static int signal_overflow = 0; SYSCTL_INT(_kern_sigqueue, OID_AUTO, overflow, CTLFLAG_RD, &signal_overflow, 0, "Number of signals overflew"); static int signal_alloc_fail = 0; SYSCTL_INT(_kern_sigqueue, OID_AUTO, alloc_fail, CTLFLAG_RD, &signal_alloc_fail, 0, "signals failed to be allocated"); SYSINIT(signal, SI_SUB_P1003_1B, SI_ORDER_FIRST+3, sigqueue_start, NULL); /* * Policy -- Can ucred cr1 send SIGIO to process cr2? * Should use cr_cansignal() once cr_cansignal() allows SIGIO and SIGURG * in the right situations. */ #define CANSIGIO(cr1, cr2) \ ((cr1)->cr_uid == 0 || \ (cr1)->cr_ruid == (cr2)->cr_ruid || \ (cr1)->cr_uid == (cr2)->cr_ruid || \ (cr1)->cr_ruid == (cr2)->cr_uid || \ (cr1)->cr_uid == (cr2)->cr_uid) int sugid_coredump; SYSCTL_INT(_kern, OID_AUTO, sugid_coredump, CTLFLAG_RW, &sugid_coredump, 0, "Enable coredumping set user/group ID processes"); static int do_coredump = 1; SYSCTL_INT(_kern, OID_AUTO, coredump, CTLFLAG_RW, &do_coredump, 0, "Enable/Disable coredumps"); static int set_core_nodump_flag = 0; SYSCTL_INT(_kern, OID_AUTO, nodump_coredump, CTLFLAG_RW, &set_core_nodump_flag, 0, "Enable setting the NODUMP flag on coredump files"); /* * Signal properties and actions. * The array below categorizes the signals and their default actions * according to the following properties: */ #define SA_KILL 0x01 /* terminates process by default */ #define SA_CORE 0x02 /* ditto and coredumps */ #define SA_STOP 0x04 /* suspend process */ #define SA_TTYSTOP 0x08 /* ditto, from tty */ #define SA_IGNORE 0x10 /* ignore by default */ #define SA_CONT 0x20 /* continue if suspended */ #define SA_CANTMASK 0x40 /* non-maskable, catchable */ #define SA_PROC 0x80 /* deliverable to any thread */ static int sigproptbl[NSIG] = { SA_KILL|SA_PROC, /* SIGHUP */ SA_KILL|SA_PROC, /* SIGINT */ SA_KILL|SA_CORE|SA_PROC, /* SIGQUIT */ SA_KILL|SA_CORE, /* SIGILL */ SA_KILL|SA_CORE, /* SIGTRAP */ SA_KILL|SA_CORE, /* SIGABRT */ SA_KILL|SA_CORE|SA_PROC, /* SIGEMT */ SA_KILL|SA_CORE, /* SIGFPE */ SA_KILL|SA_PROC, /* SIGKILL */ SA_KILL|SA_CORE, /* SIGBUS */ SA_KILL|SA_CORE, /* SIGSEGV */ SA_KILL|SA_CORE, /* SIGSYS */ SA_KILL|SA_PROC, /* SIGPIPE */ SA_KILL|SA_PROC, /* SIGALRM */ SA_KILL|SA_PROC, /* SIGTERM */ SA_IGNORE|SA_PROC, /* SIGURG */ SA_STOP|SA_PROC, /* SIGSTOP */ SA_STOP|SA_TTYSTOP|SA_PROC, /* SIGTSTP */ SA_IGNORE|SA_CONT|SA_PROC, /* SIGCONT */ SA_IGNORE|SA_PROC, /* SIGCHLD */ SA_STOP|SA_TTYSTOP|SA_PROC, /* SIGTTIN */ SA_STOP|SA_TTYSTOP|SA_PROC, /* SIGTTOU */ SA_IGNORE|SA_PROC, /* SIGIO */ SA_KILL, /* SIGXCPU */ SA_KILL, /* SIGXFSZ */ SA_KILL|SA_PROC, /* SIGVTALRM */ SA_KILL|SA_PROC, /* SIGPROF */ SA_IGNORE|SA_PROC, /* SIGWINCH */ SA_IGNORE|SA_PROC, /* SIGINFO */ SA_KILL|SA_PROC, /* SIGUSR1 */ SA_KILL|SA_PROC, /* SIGUSR2 */ }; static void sigqueue_start(void) { ksiginfo_zone = uma_zcreate("ksiginfo", sizeof(ksiginfo_t), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); uma_prealloc(ksiginfo_zone, preallocate_siginfo); p31b_setcfg(CTL_P1003_1B_REALTIME_SIGNALS, _POSIX_REALTIME_SIGNALS); p31b_setcfg(CTL_P1003_1B_RTSIG_MAX, SIGRTMAX - SIGRTMIN + 1); p31b_setcfg(CTL_P1003_1B_SIGQUEUE_MAX, max_pending_per_proc); } ksiginfo_t * ksiginfo_alloc(int wait) { int flags; flags = M_ZERO; if (! wait) flags |= M_NOWAIT; if (ksiginfo_zone != NULL) return ((ksiginfo_t *)uma_zalloc(ksiginfo_zone, flags)); return (NULL); } void ksiginfo_free(ksiginfo_t *ksi) { uma_zfree(ksiginfo_zone, ksi); } static __inline int ksiginfo_tryfree(ksiginfo_t *ksi) { if (!(ksi->ksi_flags & KSI_EXT)) { uma_zfree(ksiginfo_zone, ksi); return (1); } return (0); } void sigqueue_init(sigqueue_t *list, struct proc *p) { SIGEMPTYSET(list->sq_signals); SIGEMPTYSET(list->sq_kill); TAILQ_INIT(&list->sq_list); list->sq_proc = p; list->sq_flags = SQ_INIT; } /* * Get a signal's ksiginfo. * Return: * 0 - signal not found * others - signal number */ int sigqueue_get(sigqueue_t *sq, int signo, ksiginfo_t *si) { struct proc *p = sq->sq_proc; struct ksiginfo *ksi, *next; int count = 0; KASSERT(sq->sq_flags & SQ_INIT, ("sigqueue not inited")); if (!SIGISMEMBER(sq->sq_signals, signo)) return (0); if (SIGISMEMBER(sq->sq_kill, signo)) { count++; SIGDELSET(sq->sq_kill, signo); } TAILQ_FOREACH_SAFE(ksi, &sq->sq_list, ksi_link, next) { if (ksi->ksi_signo == signo) { if (count == 0) { TAILQ_REMOVE(&sq->sq_list, ksi, ksi_link); ksi->ksi_sigq = NULL; ksiginfo_copy(ksi, si); if (ksiginfo_tryfree(ksi) && p != NULL) p->p_pendingcnt--; } if (++count > 1) break; } } if (count <= 1) SIGDELSET(sq->sq_signals, signo); si->ksi_signo = signo; return (signo); } void sigqueue_take(ksiginfo_t *ksi) { struct ksiginfo *kp; struct proc *p; sigqueue_t *sq; if (ksi == NULL || (sq = ksi->ksi_sigq) == NULL) return; p = sq->sq_proc; TAILQ_REMOVE(&sq->sq_list, ksi, ksi_link); ksi->ksi_sigq = NULL; if (!(ksi->ksi_flags & KSI_EXT) && p != NULL) p->p_pendingcnt--; for (kp = TAILQ_FIRST(&sq->sq_list); kp != NULL; kp = TAILQ_NEXT(kp, ksi_link)) { if (kp->ksi_signo == ksi->ksi_signo) break; } if (kp == NULL && !SIGISMEMBER(sq->sq_kill, ksi->ksi_signo)) SIGDELSET(sq->sq_signals, ksi->ksi_signo); } int sigqueue_add(sigqueue_t *sq, int signo, ksiginfo_t *si) { struct proc *p = sq->sq_proc; struct ksiginfo *ksi; int ret = 0; KASSERT(sq->sq_flags & SQ_INIT, ("sigqueue not inited")); if (signo == SIGKILL || signo == SIGSTOP || si == NULL) { SIGADDSET(sq->sq_kill, signo); goto out_set_bit; } /* directly insert the ksi, don't copy it */ if (si->ksi_flags & KSI_INS) { TAILQ_INSERT_TAIL(&sq->sq_list, si, ksi_link); si->ksi_sigq = sq; goto out_set_bit; } if (__predict_false(ksiginfo_zone == NULL)) { SIGADDSET(sq->sq_kill, signo); goto out_set_bit; } if (p != NULL && p->p_pendingcnt >= max_pending_per_proc) { signal_overflow++; ret = EAGAIN; } else if ((ksi = ksiginfo_alloc(0)) == NULL) { signal_alloc_fail++; ret = EAGAIN; } else { if (p != NULL) p->p_pendingcnt++; ksiginfo_copy(si, ksi); ksi->ksi_signo = signo; TAILQ_INSERT_TAIL(&sq->sq_list, ksi, ksi_link); ksi->ksi_sigq = sq; } if ((si->ksi_flags & KSI_TRAP) != 0) { if (ret != 0) SIGADDSET(sq->sq_kill, signo); ret = 0; goto out_set_bit; } if (ret != 0) return (ret); out_set_bit: SIGADDSET(sq->sq_signals, signo); return (ret); } void sigqueue_flush(sigqueue_t *sq) { struct proc *p = sq->sq_proc; ksiginfo_t *ksi; KASSERT(sq->sq_flags & SQ_INIT, ("sigqueue not inited")); if (p != NULL) PROC_LOCK_ASSERT(p, MA_OWNED); while ((ksi = TAILQ_FIRST(&sq->sq_list)) != NULL) { TAILQ_REMOVE(&sq->sq_list, ksi, ksi_link); ksi->ksi_sigq = NULL; if (ksiginfo_tryfree(ksi) && p != NULL) p->p_pendingcnt--; } SIGEMPTYSET(sq->sq_signals); SIGEMPTYSET(sq->sq_kill); } void sigqueue_collect_set(sigqueue_t *sq, sigset_t *set) { ksiginfo_t *ksi; KASSERT(sq->sq_flags & SQ_INIT, ("sigqueue not inited")); TAILQ_FOREACH(ksi, &sq->sq_list, ksi_link) SIGADDSET(*set, ksi->ksi_signo); SIGSETOR(*set, sq->sq_kill); } void sigqueue_move_set(sigqueue_t *src, sigqueue_t *dst, sigset_t *setp) { sigset_t tmp, set; struct proc *p1, *p2; ksiginfo_t *ksi, *next; KASSERT(src->sq_flags & SQ_INIT, ("src sigqueue not inited")); KASSERT(dst->sq_flags & SQ_INIT, ("dst sigqueue not inited")); /* * make a copy, this allows setp to point to src or dst * sq_signals without trouble. */ set = *setp; p1 = src->sq_proc; p2 = dst->sq_proc; /* Move siginfo to target list */ TAILQ_FOREACH_SAFE(ksi, &src->sq_list, ksi_link, next) { if (SIGISMEMBER(set, ksi->ksi_signo)) { TAILQ_REMOVE(&src->sq_list, ksi, ksi_link); if (p1 != NULL) p1->p_pendingcnt--; TAILQ_INSERT_TAIL(&dst->sq_list, ksi, ksi_link); ksi->ksi_sigq = dst; if (p2 != NULL) p2->p_pendingcnt++; } } /* Move pending bits to target list */ tmp = src->sq_kill; SIGSETAND(tmp, set); SIGSETOR(dst->sq_kill, tmp); SIGSETNAND(src->sq_kill, tmp); tmp = src->sq_signals; SIGSETAND(tmp, set); SIGSETOR(dst->sq_signals, tmp); SIGSETNAND(src->sq_signals, tmp); /* Finally, rescan src queue and set pending bits for it */ sigqueue_collect_set(src, &src->sq_signals); } void sigqueue_move(sigqueue_t *src, sigqueue_t *dst, int signo) { sigset_t set; SIGEMPTYSET(set); SIGADDSET(set, signo); sigqueue_move_set(src, dst, &set); } void sigqueue_delete_set(sigqueue_t *sq, sigset_t *set) { struct proc *p = sq->sq_proc; ksiginfo_t *ksi, *next; KASSERT(sq->sq_flags & SQ_INIT, ("src sigqueue not inited")); /* Remove siginfo queue */ TAILQ_FOREACH_SAFE(ksi, &sq->sq_list, ksi_link, next) { if (SIGISMEMBER(*set, ksi->ksi_signo)) { TAILQ_REMOVE(&sq->sq_list, ksi, ksi_link); ksi->ksi_sigq = NULL; if (ksiginfo_tryfree(ksi) && p != NULL) p->p_pendingcnt--; } } SIGSETNAND(sq->sq_kill, *set); SIGSETNAND(sq->sq_signals, *set); /* Finally, rescan queue and set pending bits for it */ sigqueue_collect_set(sq, &sq->sq_signals); } void sigqueue_delete(sigqueue_t *sq, int signo) { sigset_t set; SIGEMPTYSET(set); SIGADDSET(set, signo); sigqueue_delete_set(sq, &set); } /* Remove a set of signals for a process */ void sigqueue_delete_set_proc(struct proc *p, sigset_t *set) { sigqueue_t worklist; struct thread *td0; PROC_LOCK_ASSERT(p, MA_OWNED); sigqueue_init(&worklist, NULL); sigqueue_move_set(&p->p_sigqueue, &worklist, set); PROC_SLOCK(p); FOREACH_THREAD_IN_PROC(p, td0) sigqueue_move_set(&td0->td_sigqueue, &worklist, set); PROC_SUNLOCK(p); sigqueue_flush(&worklist); } void sigqueue_delete_proc(struct proc *p, int signo) { sigset_t set; SIGEMPTYSET(set); SIGADDSET(set, signo); sigqueue_delete_set_proc(p, &set); } void sigqueue_delete_stopmask_proc(struct proc *p) { sigset_t set; SIGEMPTYSET(set); SIGADDSET(set, SIGSTOP); SIGADDSET(set, SIGTSTP); SIGADDSET(set, SIGTTIN); SIGADDSET(set, SIGTTOU); sigqueue_delete_set_proc(p, &set); } /* * Determine signal that should be delivered to process p, the current * process, 0 if none. If there is a pending stop signal with default * action, the process stops in issignal(). */ int cursig(struct thread *td) { PROC_LOCK_ASSERT(td->td_proc, MA_OWNED); mtx_assert(&td->td_proc->p_sigacts->ps_mtx, MA_OWNED); THREAD_LOCK_ASSERT(td, MA_NOTOWNED); return (SIGPENDING(td) ? issignal(td) : 0); } /* * Arrange for ast() to handle unmasked pending signals on return to user * mode. This must be called whenever a signal is added to td_sigqueue or * unmasked in td_sigmask. */ void signotify(struct thread *td) { struct proc *p; #ifdef KSE sigset_t set, saved; #else sigset_t set; #endif p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); /* * If our mask changed we may have to move signal that were * previously masked by all threads to our sigqueue. */ set = p->p_sigqueue.sq_signals; #ifdef KSE if (p->p_flag & P_SA) saved = p->p_sigqueue.sq_signals; #endif SIGSETNAND(set, td->td_sigmask); if (! SIGISEMPTY(set)) sigqueue_move_set(&p->p_sigqueue, &td->td_sigqueue, &set); if (SIGPENDING(td)) { thread_lock(td); td->td_flags |= TDF_NEEDSIGCHK | TDF_ASTPENDING; thread_unlock(td); } #ifdef KSE if ((p->p_flag & P_SA) && !(p->p_flag & P_SIGEVENT)) { if (!SIGSETEQ(saved, p->p_sigqueue.sq_signals)) { /* pending set changed */ p->p_flag |= P_SIGEVENT; wakeup(&p->p_siglist); } } #endif } int sigonstack(size_t sp) { struct thread *td = curthread; return ((td->td_pflags & TDP_ALTSTACK) ? #if defined(COMPAT_43) ((td->td_sigstk.ss_size == 0) ? (td->td_sigstk.ss_flags & SS_ONSTACK) : ((sp - (size_t)td->td_sigstk.ss_sp) < td->td_sigstk.ss_size)) #else ((sp - (size_t)td->td_sigstk.ss_sp) < td->td_sigstk.ss_size) #endif : 0); } static __inline int sigprop(int sig) { if (sig > 0 && sig < NSIG) return (sigproptbl[_SIG_IDX(sig)]); return (0); } int sig_ffs(sigset_t *set) { int i; for (i = 0; i < _SIG_WORDS; i++) if (set->__bits[i]) return (ffs(set->__bits[i]) + (i * 32)); return (0); } /* * kern_sigaction * sigaction * freebsd4_sigaction * osigaction */ int kern_sigaction(td, sig, act, oact, flags) struct thread *td; register int sig; struct sigaction *act, *oact; int flags; { struct sigacts *ps; struct proc *p = td->td_proc; if (!_SIG_VALID(sig)) return (EINVAL); PROC_LOCK(p); ps = p->p_sigacts; mtx_lock(&ps->ps_mtx); if (oact) { oact->sa_handler = ps->ps_sigact[_SIG_IDX(sig)]; oact->sa_mask = ps->ps_catchmask[_SIG_IDX(sig)]; oact->sa_flags = 0; if (SIGISMEMBER(ps->ps_sigonstack, sig)) oact->sa_flags |= SA_ONSTACK; if (!SIGISMEMBER(ps->ps_sigintr, sig)) oact->sa_flags |= SA_RESTART; if (SIGISMEMBER(ps->ps_sigreset, sig)) oact->sa_flags |= SA_RESETHAND; if (SIGISMEMBER(ps->ps_signodefer, sig)) oact->sa_flags |= SA_NODEFER; if (SIGISMEMBER(ps->ps_siginfo, sig)) oact->sa_flags |= SA_SIGINFO; if (sig == SIGCHLD && ps->ps_flag & PS_NOCLDSTOP) oact->sa_flags |= SA_NOCLDSTOP; if (sig == SIGCHLD && ps->ps_flag & PS_NOCLDWAIT) oact->sa_flags |= SA_NOCLDWAIT; } if (act) { if ((sig == SIGKILL || sig == SIGSTOP) && act->sa_handler != SIG_DFL) { mtx_unlock(&ps->ps_mtx); PROC_UNLOCK(p); return (EINVAL); } /* * Change setting atomically. */ ps->ps_catchmask[_SIG_IDX(sig)] = act->sa_mask; SIG_CANTMASK(ps->ps_catchmask[_SIG_IDX(sig)]); if (act->sa_flags & SA_SIGINFO) { ps->ps_sigact[_SIG_IDX(sig)] = (__sighandler_t *)act->sa_sigaction; SIGADDSET(ps->ps_siginfo, sig); } else { ps->ps_sigact[_SIG_IDX(sig)] = act->sa_handler; SIGDELSET(ps->ps_siginfo, sig); } if (!(act->sa_flags & SA_RESTART)) SIGADDSET(ps->ps_sigintr, sig); else SIGDELSET(ps->ps_sigintr, sig); if (act->sa_flags & SA_ONSTACK) SIGADDSET(ps->ps_sigonstack, sig); else SIGDELSET(ps->ps_sigonstack, sig); if (act->sa_flags & SA_RESETHAND) SIGADDSET(ps->ps_sigreset, sig); else SIGDELSET(ps->ps_sigreset, sig); if (act->sa_flags & SA_NODEFER) SIGADDSET(ps->ps_signodefer, sig); else SIGDELSET(ps->ps_signodefer, sig); if (sig == SIGCHLD) { if (act->sa_flags & SA_NOCLDSTOP) ps->ps_flag |= PS_NOCLDSTOP; else ps->ps_flag &= ~PS_NOCLDSTOP; if (act->sa_flags & SA_NOCLDWAIT) { /* * Paranoia: since SA_NOCLDWAIT is implemented * by reparenting the dying child to PID 1 (and * trust it to reap the zombie), PID 1 itself * is forbidden to set SA_NOCLDWAIT. */ if (p->p_pid == 1) ps->ps_flag &= ~PS_NOCLDWAIT; else ps->ps_flag |= PS_NOCLDWAIT; } else ps->ps_flag &= ~PS_NOCLDWAIT; if (ps->ps_sigact[_SIG_IDX(SIGCHLD)] == SIG_IGN) ps->ps_flag |= PS_CLDSIGIGN; else ps->ps_flag &= ~PS_CLDSIGIGN; } /* * Set bit in ps_sigignore for signals that are set to SIG_IGN, * and for signals set to SIG_DFL where the default is to * ignore. However, don't put SIGCONT in ps_sigignore, as we * have to restart the process. */ if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN || (sigprop(sig) & SA_IGNORE && ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL)) { #ifdef KSE if ((p->p_flag & P_SA) && SIGISMEMBER(p->p_sigqueue.sq_signals, sig)) { p->p_flag |= P_SIGEVENT; wakeup(&p->p_siglist); } #endif /* never to be seen again */ PROC_SLOCK(p); sigqueue_delete_proc(p, sig); PROC_SUNLOCK(p); if (sig != SIGCONT) /* easier in psignal */ SIGADDSET(ps->ps_sigignore, sig); SIGDELSET(ps->ps_sigcatch, sig); } else { SIGDELSET(ps->ps_sigignore, sig); if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL) SIGDELSET(ps->ps_sigcatch, sig); else SIGADDSET(ps->ps_sigcatch, sig); } #ifdef COMPAT_FREEBSD4 if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN || ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL || (flags & KSA_FREEBSD4) == 0) SIGDELSET(ps->ps_freebsd4, sig); else SIGADDSET(ps->ps_freebsd4, sig); #endif #ifdef COMPAT_43 if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN || ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL || (flags & KSA_OSIGSET) == 0) SIGDELSET(ps->ps_osigset, sig); else SIGADDSET(ps->ps_osigset, sig); #endif } mtx_unlock(&ps->ps_mtx); PROC_UNLOCK(p); return (0); } #ifndef _SYS_SYSPROTO_H_ struct sigaction_args { int sig; struct sigaction *act; struct sigaction *oact; }; #endif int sigaction(td, uap) struct thread *td; register struct sigaction_args *uap; { struct sigaction act, oact; register struct sigaction *actp, *oactp; int error; actp = (uap->act != NULL) ? &act : NULL; oactp = (uap->oact != NULL) ? &oact : NULL; if (actp) { error = copyin(uap->act, actp, sizeof(act)); if (error) return (error); } error = kern_sigaction(td, uap->sig, actp, oactp, 0); if (oactp && !error) error = copyout(oactp, uap->oact, sizeof(oact)); return (error); } #ifdef COMPAT_FREEBSD4 #ifndef _SYS_SYSPROTO_H_ struct freebsd4_sigaction_args { int sig; struct sigaction *act; struct sigaction *oact; }; #endif int freebsd4_sigaction(td, uap) struct thread *td; register struct freebsd4_sigaction_args *uap; { struct sigaction act, oact; register struct sigaction *actp, *oactp; int error; actp = (uap->act != NULL) ? &act : NULL; oactp = (uap->oact != NULL) ? &oact : NULL; if (actp) { error = copyin(uap->act, actp, sizeof(act)); if (error) return (error); } error = kern_sigaction(td, uap->sig, actp, oactp, KSA_FREEBSD4); if (oactp && !error) error = copyout(oactp, uap->oact, sizeof(oact)); return (error); } #endif /* COMAPT_FREEBSD4 */ #ifdef COMPAT_43 /* XXX - COMPAT_FBSD3 */ #ifndef _SYS_SYSPROTO_H_ struct osigaction_args { int signum; struct osigaction *nsa; struct osigaction *osa; }; #endif int osigaction(td, uap) struct thread *td; register struct osigaction_args *uap; { struct osigaction sa; struct sigaction nsa, osa; register struct sigaction *nsap, *osap; int error; if (uap->signum <= 0 || uap->signum >= ONSIG) return (EINVAL); nsap = (uap->nsa != NULL) ? &nsa : NULL; osap = (uap->osa != NULL) ? &osa : NULL; if (nsap) { error = copyin(uap->nsa, &sa, sizeof(sa)); if (error) return (error); nsap->sa_handler = sa.sa_handler; nsap->sa_flags = sa.sa_flags; OSIG2SIG(sa.sa_mask, nsap->sa_mask); } error = kern_sigaction(td, uap->signum, nsap, osap, KSA_OSIGSET); if (osap && !error) { sa.sa_handler = osap->sa_handler; sa.sa_flags = osap->sa_flags; SIG2OSIG(osap->sa_mask, sa.sa_mask); error = copyout(&sa, uap->osa, sizeof(sa)); } return (error); } #if !defined(__i386__) /* Avoid replicating the same stub everywhere */ int osigreturn(td, uap) struct thread *td; struct osigreturn_args *uap; { return (nosys(td, (struct nosys_args *)uap)); } #endif #endif /* COMPAT_43 */ /* * Initialize signal state for process 0; * set to ignore signals that are ignored by default. */ void siginit(p) struct proc *p; { register int i; struct sigacts *ps; PROC_LOCK(p); ps = p->p_sigacts; mtx_lock(&ps->ps_mtx); for (i = 1; i <= NSIG; i++) if (sigprop(i) & SA_IGNORE && i != SIGCONT) SIGADDSET(ps->ps_sigignore, i); mtx_unlock(&ps->ps_mtx); PROC_UNLOCK(p); } /* * Reset signals for an exec of the specified process. */ void execsigs(struct proc *p) { struct sigacts *ps; int sig; struct thread *td; /* * Reset caught signals. Held signals remain held * through td_sigmask (unless they were caught, * and are now ignored by default). */ PROC_LOCK_ASSERT(p, MA_OWNED); td = FIRST_THREAD_IN_PROC(p); ps = p->p_sigacts; mtx_lock(&ps->ps_mtx); while (SIGNOTEMPTY(ps->ps_sigcatch)) { sig = sig_ffs(&ps->ps_sigcatch); SIGDELSET(ps->ps_sigcatch, sig); if (sigprop(sig) & SA_IGNORE) { if (sig != SIGCONT) SIGADDSET(ps->ps_sigignore, sig); PROC_SLOCK(p); sigqueue_delete_proc(p, sig); PROC_SUNLOCK(p); } ps->ps_sigact[_SIG_IDX(sig)] = SIG_DFL; } /* * Reset stack state to the user stack. * Clear set of signals caught on the signal stack. */ td->td_sigstk.ss_flags = SS_DISABLE; td->td_sigstk.ss_size = 0; td->td_sigstk.ss_sp = 0; td->td_pflags &= ~TDP_ALTSTACK; /* * Reset no zombies if child dies flag as Solaris does. */ ps->ps_flag &= ~(PS_NOCLDWAIT | PS_CLDSIGIGN); if (ps->ps_sigact[_SIG_IDX(SIGCHLD)] == SIG_IGN) ps->ps_sigact[_SIG_IDX(SIGCHLD)] = SIG_DFL; mtx_unlock(&ps->ps_mtx); } /* * kern_sigprocmask() * * Manipulate signal mask. */ int kern_sigprocmask(td, how, set, oset, old) struct thread *td; int how; sigset_t *set, *oset; int old; { int error; PROC_LOCK(td->td_proc); if (oset != NULL) *oset = td->td_sigmask; error = 0; if (set != NULL) { switch (how) { case SIG_BLOCK: SIG_CANTMASK(*set); SIGSETOR(td->td_sigmask, *set); break; case SIG_UNBLOCK: SIGSETNAND(td->td_sigmask, *set); signotify(td); break; case SIG_SETMASK: SIG_CANTMASK(*set); if (old) SIGSETLO(td->td_sigmask, *set); else td->td_sigmask = *set; signotify(td); break; default: error = EINVAL; break; } } PROC_UNLOCK(td->td_proc); return (error); } #ifndef _SYS_SYSPROTO_H_ struct sigprocmask_args { int how; const sigset_t *set; sigset_t *oset; }; #endif int sigprocmask(td, uap) register struct thread *td; struct sigprocmask_args *uap; { sigset_t set, oset; sigset_t *setp, *osetp; int error; setp = (uap->set != NULL) ? &set : NULL; osetp = (uap->oset != NULL) ? &oset : NULL; if (setp) { error = copyin(uap->set, setp, sizeof(set)); if (error) return (error); } error = kern_sigprocmask(td, uap->how, setp, osetp, 0); if (osetp && !error) { error = copyout(osetp, uap->oset, sizeof(oset)); } return (error); } #ifdef COMPAT_43 /* XXX - COMPAT_FBSD3 */ #ifndef _SYS_SYSPROTO_H_ struct osigprocmask_args { int how; osigset_t mask; }; #endif int osigprocmask(td, uap) register struct thread *td; struct osigprocmask_args *uap; { sigset_t set, oset; int error; OSIG2SIG(uap->mask, set); error = kern_sigprocmask(td, uap->how, &set, &oset, 1); SIG2OSIG(oset, td->td_retval[0]); return (error); } #endif /* COMPAT_43 */ int sigwait(struct thread *td, struct sigwait_args *uap) { ksiginfo_t ksi; sigset_t set; int error; error = copyin(uap->set, &set, sizeof(set)); if (error) { td->td_retval[0] = error; return (0); } error = kern_sigtimedwait(td, set, &ksi, NULL); if (error) { if (error == ERESTART) return (error); td->td_retval[0] = error; return (0); } error = copyout(&ksi.ksi_signo, uap->sig, sizeof(ksi.ksi_signo)); td->td_retval[0] = error; return (0); } int sigtimedwait(struct thread *td, struct sigtimedwait_args *uap) { struct timespec ts; struct timespec *timeout; sigset_t set; ksiginfo_t ksi; int error; if (uap->timeout) { error = copyin(uap->timeout, &ts, sizeof(ts)); if (error) return (error); timeout = &ts; } else timeout = NULL; error = copyin(uap->set, &set, sizeof(set)); if (error) return (error); error = kern_sigtimedwait(td, set, &ksi, timeout); if (error) return (error); if (uap->info) error = copyout(&ksi.ksi_info, uap->info, sizeof(siginfo_t)); if (error == 0) td->td_retval[0] = ksi.ksi_signo; return (error); } int sigwaitinfo(struct thread *td, struct sigwaitinfo_args *uap) { ksiginfo_t ksi; sigset_t set; int error; error = copyin(uap->set, &set, sizeof(set)); if (error) return (error); error = kern_sigtimedwait(td, set, &ksi, NULL); if (error) return (error); if (uap->info) error = copyout(&ksi.ksi_info, uap->info, sizeof(siginfo_t)); if (error == 0) td->td_retval[0] = ksi.ksi_signo; return (error); } int kern_sigtimedwait(struct thread *td, sigset_t waitset, ksiginfo_t *ksi, struct timespec *timeout) { struct sigacts *ps; sigset_t savedmask; struct proc *p; int error, sig, hz, i, timevalid = 0; struct timespec rts, ets, ts; struct timeval tv; p = td->td_proc; error = 0; sig = 0; ets.tv_sec = 0; ets.tv_nsec = 0; SIG_CANTMASK(waitset); PROC_LOCK(p); ps = p->p_sigacts; savedmask = td->td_sigmask; if (timeout) { if (timeout->tv_nsec >= 0 && timeout->tv_nsec < 1000000000) { timevalid = 1; getnanouptime(&rts); ets = rts; timespecadd(&ets, timeout); } } restart: for (i = 1; i <= _SIG_MAXSIG; ++i) { if (!SIGISMEMBER(waitset, i)) continue; if (!SIGISMEMBER(td->td_sigqueue.sq_signals, i)) { if (SIGISMEMBER(p->p_sigqueue.sq_signals, i)) { #ifdef KSE if (p->p_flag & P_SA) { p->p_flag |= P_SIGEVENT; wakeup(&p->p_siglist); } #endif sigqueue_move(&p->p_sigqueue, &td->td_sigqueue, i); } else continue; } SIGFILLSET(td->td_sigmask); SIG_CANTMASK(td->td_sigmask); SIGDELSET(td->td_sigmask, i); mtx_lock(&ps->ps_mtx); sig = cursig(td); mtx_unlock(&ps->ps_mtx); if (sig) goto out; else { /* * Because cursig() may have stopped current thread, * after it is resumed, things may have already been * changed, it should rescan any pending signals. */ goto restart; } } if (error) goto out; /* * POSIX says this must be checked after looking for pending * signals. */ if (timeout) { if (!timevalid) { error = EINVAL; goto out; } getnanouptime(&rts); if (timespeccmp(&rts, &ets, >=)) { error = EAGAIN; goto out; } ts = ets; timespecsub(&ts, &rts); TIMESPEC_TO_TIMEVAL(&tv, &ts); hz = tvtohz(&tv); } else hz = 0; td->td_sigmask = savedmask; SIGSETNAND(td->td_sigmask, waitset); signotify(td); error = msleep(&ps, &p->p_mtx, PPAUSE|PCATCH, "sigwait", hz); if (timeout) { if (error == ERESTART) { /* timeout can not be restarted. */ error = EINTR; } else if (error == EAGAIN) { /* will calculate timeout by ourself. */ error = 0; } } goto restart; out: td->td_sigmask = savedmask; signotify(td); if (sig) { ksiginfo_init(ksi); sigqueue_get(&td->td_sigqueue, sig, ksi); ksi->ksi_signo = sig; if (ksi->ksi_code == SI_TIMER) itimer_accept(p, ksi->ksi_timerid, ksi); error = 0; #ifdef KTRACE if (KTRPOINT(td, KTR_PSIG)) { sig_t action; mtx_lock(&ps->ps_mtx); action = ps->ps_sigact[_SIG_IDX(sig)]; mtx_unlock(&ps->ps_mtx); ktrpsig(sig, action, &td->td_sigmask, 0); } #endif if (sig == SIGKILL) sigexit(td, sig); } PROC_UNLOCK(p); return (error); } #ifndef _SYS_SYSPROTO_H_ struct sigpending_args { sigset_t *set; }; #endif int sigpending(td, uap) struct thread *td; struct sigpending_args *uap; { struct proc *p = td->td_proc; sigset_t pending; PROC_LOCK(p); pending = p->p_sigqueue.sq_signals; SIGSETOR(pending, td->td_sigqueue.sq_signals); PROC_UNLOCK(p); return (copyout(&pending, uap->set, sizeof(sigset_t))); } #ifdef COMPAT_43 /* XXX - COMPAT_FBSD3 */ #ifndef _SYS_SYSPROTO_H_ struct osigpending_args { int dummy; }; #endif int osigpending(td, uap) struct thread *td; struct osigpending_args *uap; { struct proc *p = td->td_proc; sigset_t pending; PROC_LOCK(p); pending = p->p_sigqueue.sq_signals; SIGSETOR(pending, td->td_sigqueue.sq_signals); PROC_UNLOCK(p); SIG2OSIG(pending, td->td_retval[0]); return (0); } #endif /* COMPAT_43 */ #if defined(COMPAT_43) /* * Generalized interface signal handler, 4.3-compatible. */ #ifndef _SYS_SYSPROTO_H_ struct osigvec_args { int signum; struct sigvec *nsv; struct sigvec *osv; }; #endif /* ARGSUSED */ int osigvec(td, uap) struct thread *td; register struct osigvec_args *uap; { struct sigvec vec; struct sigaction nsa, osa; register struct sigaction *nsap, *osap; int error; if (uap->signum <= 0 || uap->signum >= ONSIG) return (EINVAL); nsap = (uap->nsv != NULL) ? &nsa : NULL; osap = (uap->osv != NULL) ? &osa : NULL; if (nsap) { error = copyin(uap->nsv, &vec, sizeof(vec)); if (error) return (error); nsap->sa_handler = vec.sv_handler; OSIG2SIG(vec.sv_mask, nsap->sa_mask); nsap->sa_flags = vec.sv_flags; nsap->sa_flags ^= SA_RESTART; /* opposite of SV_INTERRUPT */ } error = kern_sigaction(td, uap->signum, nsap, osap, KSA_OSIGSET); if (osap && !error) { vec.sv_handler = osap->sa_handler; SIG2OSIG(osap->sa_mask, vec.sv_mask); vec.sv_flags = osap->sa_flags; vec.sv_flags &= ~SA_NOCLDWAIT; vec.sv_flags ^= SA_RESTART; error = copyout(&vec, uap->osv, sizeof(vec)); } return (error); } #ifndef _SYS_SYSPROTO_H_ struct osigblock_args { int mask; }; #endif int osigblock(td, uap) register struct thread *td; struct osigblock_args *uap; { struct proc *p = td->td_proc; sigset_t set; OSIG2SIG(uap->mask, set); SIG_CANTMASK(set); PROC_LOCK(p); SIG2OSIG(td->td_sigmask, td->td_retval[0]); SIGSETOR(td->td_sigmask, set); PROC_UNLOCK(p); return (0); } #ifndef _SYS_SYSPROTO_H_ struct osigsetmask_args { int mask; }; #endif int osigsetmask(td, uap) struct thread *td; struct osigsetmask_args *uap; { struct proc *p = td->td_proc; sigset_t set; OSIG2SIG(uap->mask, set); SIG_CANTMASK(set); PROC_LOCK(p); SIG2OSIG(td->td_sigmask, td->td_retval[0]); SIGSETLO(td->td_sigmask, set); signotify(td); PROC_UNLOCK(p); return (0); } #endif /* COMPAT_43 */ /* * Suspend calling thread until signal, providing mask to be set in the * meantime. */ #ifndef _SYS_SYSPROTO_H_ struct sigsuspend_args { const sigset_t *sigmask; }; #endif /* ARGSUSED */ int sigsuspend(td, uap) struct thread *td; struct sigsuspend_args *uap; { sigset_t mask; int error; error = copyin(uap->sigmask, &mask, sizeof(mask)); if (error) return (error); return (kern_sigsuspend(td, mask)); } int kern_sigsuspend(struct thread *td, sigset_t mask) { struct proc *p = td->td_proc; /* * When returning from sigsuspend, we want * the old mask to be restored after the * signal handler has finished. Thus, we * save it here and mark the sigacts structure * to indicate this. */ PROC_LOCK(p); td->td_oldsigmask = td->td_sigmask; td->td_pflags |= TDP_OLDMASK; SIG_CANTMASK(mask); td->td_sigmask = mask; signotify(td); while (msleep(&p->p_sigacts, &p->p_mtx, PPAUSE|PCATCH, "pause", 0) == 0) /* void */; PROC_UNLOCK(p); /* always return EINTR rather than ERESTART... */ return (EINTR); } #ifdef COMPAT_43 /* XXX - COMPAT_FBSD3 */ /* * Compatibility sigsuspend call for old binaries. Note nonstandard calling * convention: libc stub passes mask, not pointer, to save a copyin. */ #ifndef _SYS_SYSPROTO_H_ struct osigsuspend_args { osigset_t mask; }; #endif /* ARGSUSED */ int osigsuspend(td, uap) struct thread *td; struct osigsuspend_args *uap; { struct proc *p = td->td_proc; sigset_t mask; PROC_LOCK(p); td->td_oldsigmask = td->td_sigmask; td->td_pflags |= TDP_OLDMASK; OSIG2SIG(uap->mask, mask); SIG_CANTMASK(mask); SIGSETLO(td->td_sigmask, mask); signotify(td); while (msleep(&p->p_sigacts, &p->p_mtx, PPAUSE|PCATCH, "opause", 0) == 0) /* void */; PROC_UNLOCK(p); /* always return EINTR rather than ERESTART... */ return (EINTR); } #endif /* COMPAT_43 */ #if defined(COMPAT_43) #ifndef _SYS_SYSPROTO_H_ struct osigstack_args { struct sigstack *nss; struct sigstack *oss; }; #endif /* ARGSUSED */ int osigstack(td, uap) struct thread *td; register struct osigstack_args *uap; { struct sigstack nss, oss; int error = 0; if (uap->nss != NULL) { error = copyin(uap->nss, &nss, sizeof(nss)); if (error) return (error); } oss.ss_sp = td->td_sigstk.ss_sp; oss.ss_onstack = sigonstack(cpu_getstack(td)); if (uap->nss != NULL) { td->td_sigstk.ss_sp = nss.ss_sp; td->td_sigstk.ss_size = 0; td->td_sigstk.ss_flags |= nss.ss_onstack & SS_ONSTACK; td->td_pflags |= TDP_ALTSTACK; } if (uap->oss != NULL) error = copyout(&oss, uap->oss, sizeof(oss)); return (error); } #endif /* COMPAT_43 */ #ifndef _SYS_SYSPROTO_H_ struct sigaltstack_args { stack_t *ss; stack_t *oss; }; #endif /* ARGSUSED */ int sigaltstack(td, uap) struct thread *td; register struct sigaltstack_args *uap; { stack_t ss, oss; int error; if (uap->ss != NULL) { error = copyin(uap->ss, &ss, sizeof(ss)); if (error) return (error); } error = kern_sigaltstack(td, (uap->ss != NULL) ? &ss : NULL, (uap->oss != NULL) ? &oss : NULL); if (error) return (error); if (uap->oss != NULL) error = copyout(&oss, uap->oss, sizeof(stack_t)); return (error); } int kern_sigaltstack(struct thread *td, stack_t *ss, stack_t *oss) { struct proc *p = td->td_proc; int oonstack; oonstack = sigonstack(cpu_getstack(td)); if (oss != NULL) { *oss = td->td_sigstk; oss->ss_flags = (td->td_pflags & TDP_ALTSTACK) ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE; } if (ss != NULL) { if (oonstack) return (EPERM); if ((ss->ss_flags & ~SS_DISABLE) != 0) return (EINVAL); if (!(ss->ss_flags & SS_DISABLE)) { if (ss->ss_size < p->p_sysent->sv_minsigstksz) return (ENOMEM); td->td_sigstk = *ss; td->td_pflags |= TDP_ALTSTACK; } else { td->td_pflags &= ~TDP_ALTSTACK; } } return (0); } /* * Common code for kill process group/broadcast kill. * cp is calling process. */ static int killpg1(td, sig, pgid, all) register struct thread *td; int sig, pgid, all; { register struct proc *p; struct pgrp *pgrp; int nfound = 0; if (all) { /* * broadcast */ sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); if (p->p_pid <= 1 || p->p_flag & P_SYSTEM || p == td->td_proc || p->p_state == PRS_NEW) { PROC_UNLOCK(p); continue; } if (p_cansignal(td, p, sig) == 0) { nfound++; if (sig) psignal(p, sig); } PROC_UNLOCK(p); } sx_sunlock(&allproc_lock); } else { sx_slock(&proctree_lock); if (pgid == 0) { /* * zero pgid means send to my process group. */ pgrp = td->td_proc->p_pgrp; PGRP_LOCK(pgrp); } else { pgrp = pgfind(pgid); if (pgrp == NULL) { sx_sunlock(&proctree_lock); return (ESRCH); } } sx_sunlock(&proctree_lock); LIST_FOREACH(p, &pgrp->pg_members, p_pglist) { PROC_LOCK(p); if (p->p_pid <= 1 || p->p_flag & P_SYSTEM || p->p_state == PRS_NEW ) { PROC_UNLOCK(p); continue; } if (p_cansignal(td, p, sig) == 0) { nfound++; if (sig) psignal(p, sig); } PROC_UNLOCK(p); } PGRP_UNLOCK(pgrp); } return (nfound ? 0 : ESRCH); } #ifndef _SYS_SYSPROTO_H_ struct kill_args { int pid; int signum; }; #endif /* ARGSUSED */ int kill(td, uap) register struct thread *td; register struct kill_args *uap; { register struct proc *p; int error; AUDIT_ARG(signum, uap->signum); AUDIT_ARG(pid, uap->pid); if ((u_int)uap->signum > _SIG_MAXSIG) return (EINVAL); if (uap->pid > 0) { /* kill single process */ if ((p = pfind(uap->pid)) == NULL) { if ((p = zpfind(uap->pid)) == NULL) return (ESRCH); } AUDIT_ARG(process, p); error = p_cansignal(td, p, uap->signum); if (error == 0 && uap->signum) psignal(p, uap->signum); PROC_UNLOCK(p); return (error); } switch (uap->pid) { case -1: /* broadcast signal */ return (killpg1(td, uap->signum, 0, 1)); case 0: /* signal own process group */ return (killpg1(td, uap->signum, 0, 0)); default: /* negative explicit process group */ return (killpg1(td, uap->signum, -uap->pid, 0)); } /* NOTREACHED */ } #if defined(COMPAT_43) #ifndef _SYS_SYSPROTO_H_ struct okillpg_args { int pgid; int signum; }; #endif /* ARGSUSED */ int okillpg(td, uap) struct thread *td; register struct okillpg_args *uap; { AUDIT_ARG(signum, uap->signum); AUDIT_ARG(pid, uap->pgid); if ((u_int)uap->signum > _SIG_MAXSIG) return (EINVAL); return (killpg1(td, uap->signum, uap->pgid, 0)); } #endif /* COMPAT_43 */ #ifndef _SYS_SYSPROTO_H_ struct sigqueue_args { pid_t pid; int signum; /* union sigval */ void *value; }; #endif int sigqueue(struct thread *td, struct sigqueue_args *uap) { ksiginfo_t ksi; struct proc *p; int error; if ((u_int)uap->signum > _SIG_MAXSIG) return (EINVAL); /* * Specification says sigqueue can only send signal to * single process. */ if (uap->pid <= 0) return (EINVAL); if ((p = pfind(uap->pid)) == NULL) { if ((p = zpfind(uap->pid)) == NULL) return (ESRCH); } error = p_cansignal(td, p, uap->signum); if (error == 0 && uap->signum != 0) { ksiginfo_init(&ksi); ksi.ksi_signo = uap->signum; ksi.ksi_code = SI_QUEUE; ksi.ksi_pid = td->td_proc->p_pid; ksi.ksi_uid = td->td_ucred->cr_ruid; ksi.ksi_value.sival_ptr = uap->value; error = tdsignal(p, NULL, ksi.ksi_signo, &ksi); } PROC_UNLOCK(p); return (error); } /* * Send a signal to a process group. */ void gsignal(pgid, sig) int pgid, sig; { struct pgrp *pgrp; if (pgid != 0) { sx_slock(&proctree_lock); pgrp = pgfind(pgid); sx_sunlock(&proctree_lock); if (pgrp != NULL) { pgsignal(pgrp, sig, 0); PGRP_UNLOCK(pgrp); } } } /* * Send a signal to a process group. If checktty is 1, * limit to members which have a controlling terminal. */ void pgsignal(pgrp, sig, checkctty) struct pgrp *pgrp; int sig, checkctty; { register struct proc *p; if (pgrp) { PGRP_LOCK_ASSERT(pgrp, MA_OWNED); LIST_FOREACH(p, &pgrp->pg_members, p_pglist) { PROC_LOCK(p); if (checkctty == 0 || p->p_flag & P_CONTROLT) psignal(p, sig); PROC_UNLOCK(p); } } } /* * Send a signal caused by a trap to the current thread. If it will be * caught immediately, deliver it with correct code. Otherwise, post it * normally. */ void trapsignal(struct thread *td, ksiginfo_t *ksi) { struct sigacts *ps; struct proc *p; #ifdef KSE int error; #endif int sig; int code; p = td->td_proc; sig = ksi->ksi_signo; code = ksi->ksi_code; KASSERT(_SIG_VALID(sig), ("invalid signal")); #ifdef KSE if (td->td_pflags & TDP_SA) { if (td->td_mailbox == NULL) thread_user_enter(td); PROC_LOCK(p); SIGDELSET(td->td_sigmask, sig); thread_lock(td); /* * Force scheduling an upcall, so UTS has chance to * process the signal before thread runs again in * userland. */ if (td->td_upcall) td->td_upcall->ku_flags |= KUF_DOUPCALL; thread_unlock(td); } else { PROC_LOCK(p); } #else PROC_LOCK(p); #endif ps = p->p_sigacts; mtx_lock(&ps->ps_mtx); if ((p->p_flag & P_TRACED) == 0 && SIGISMEMBER(ps->ps_sigcatch, sig) && !SIGISMEMBER(td->td_sigmask, sig)) { td->td_ru.ru_nsignals++; #ifdef KTRACE if (KTRPOINT(curthread, KTR_PSIG)) ktrpsig(sig, ps->ps_sigact[_SIG_IDX(sig)], &td->td_sigmask, code); #endif #ifdef KSE if (!(td->td_pflags & TDP_SA)) (*p->p_sysent->sv_sendsig)(ps->ps_sigact[_SIG_IDX(sig)], ksi, &td->td_sigmask); #else (*p->p_sysent->sv_sendsig)(ps->ps_sigact[_SIG_IDX(sig)], ksi, &td->td_sigmask); #endif #ifdef KSE else if (td->td_mailbox == NULL) { mtx_unlock(&ps->ps_mtx); /* UTS caused a sync signal */ p->p_code = code; /* XXX for core dump/debugger */ p->p_sig = sig; /* XXX to verify code */ sigexit(td, sig); } else { mtx_unlock(&ps->ps_mtx); SIGADDSET(td->td_sigmask, sig); PROC_UNLOCK(p); error = copyout(&ksi->ksi_info, &td->td_mailbox->tm_syncsig, sizeof(siginfo_t)); PROC_LOCK(p); /* UTS memory corrupted */ if (error) sigexit(td, SIGSEGV); mtx_lock(&ps->ps_mtx); } #endif SIGSETOR(td->td_sigmask, ps->ps_catchmask[_SIG_IDX(sig)]); if (!SIGISMEMBER(ps->ps_signodefer, sig)) SIGADDSET(td->td_sigmask, sig); if (SIGISMEMBER(ps->ps_sigreset, sig)) { /* * See kern_sigaction() for origin of this code. */ SIGDELSET(ps->ps_sigcatch, sig); if (sig != SIGCONT && sigprop(sig) & SA_IGNORE) SIGADDSET(ps->ps_sigignore, sig); ps->ps_sigact[_SIG_IDX(sig)] = SIG_DFL; } mtx_unlock(&ps->ps_mtx); } else { /* * Avoid a possible infinite loop if the thread * masking the signal or process is ignoring the * signal. */ if (kern_forcesigexit && (SIGISMEMBER(td->td_sigmask, sig) || ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN)) { SIGDELSET(td->td_sigmask, sig); SIGDELSET(ps->ps_sigcatch, sig); SIGDELSET(ps->ps_sigignore, sig); ps->ps_sigact[_SIG_IDX(sig)] = SIG_DFL; } mtx_unlock(&ps->ps_mtx); p->p_code = code; /* XXX for core dump/debugger */ p->p_sig = sig; /* XXX to verify code */ tdsignal(p, td, sig, ksi); } PROC_UNLOCK(p); } static struct thread * sigtd(struct proc *p, int sig, int prop) { struct thread *td, *signal_td; PROC_LOCK_ASSERT(p, MA_OWNED); /* * Check if current thread can handle the signal without * switching conetxt to another thread. */ if (curproc == p && !SIGISMEMBER(curthread->td_sigmask, sig)) return (curthread); signal_td = NULL; PROC_SLOCK(p); FOREACH_THREAD_IN_PROC(p, td) { if (!SIGISMEMBER(td->td_sigmask, sig)) { signal_td = td; break; } } if (signal_td == NULL) signal_td = FIRST_THREAD_IN_PROC(p); PROC_SUNLOCK(p); return (signal_td); } /* * Send the signal to the process. If the signal has an action, the action * is usually performed by the target process rather than the caller; we add * the signal to the set of pending signals for the process. * * Exceptions: * o When a stop signal is sent to a sleeping process that takes the * default action, the process is stopped without awakening it. * o SIGCONT restarts stopped processes (or puts them back to sleep) * regardless of the signal action (eg, blocked or ignored). * * Other ignored signals are discarded immediately. * * NB: This function may be entered from the debugger via the "kill" DDB * command. There is little that can be done to mitigate the possibly messy * side effects of this unwise possibility. */ void psignal(struct proc *p, int sig) { (void) tdsignal(p, NULL, sig, NULL); } int psignal_event(struct proc *p, struct sigevent *sigev, ksiginfo_t *ksi) { struct thread *td = NULL; PROC_LOCK_ASSERT(p, MA_OWNED); KASSERT(!KSI_ONQ(ksi), ("psignal_event: ksi on queue")); /* * ksi_code and other fields should be set before * calling this function. */ ksi->ksi_signo = sigev->sigev_signo; ksi->ksi_value = sigev->sigev_value; if (sigev->sigev_notify == SIGEV_THREAD_ID) { td = thread_find(p, sigev->sigev_notify_thread_id); if (td == NULL) return (ESRCH); } return (tdsignal(p, td, ksi->ksi_signo, ksi)); } int tdsignal(struct proc *p, struct thread *td, int sig, ksiginfo_t *ksi) { #ifdef KSE sigset_t saved; int ret; if (p->p_flag & P_SA) saved = p->p_sigqueue.sq_signals; ret = do_tdsignal(p, td, sig, ksi); if ((p->p_flag & P_SA) && !(p->p_flag & P_SIGEVENT)) { if (!SIGSETEQ(saved, p->p_sigqueue.sq_signals)) { /* pending set changed */ p->p_flag |= P_SIGEVENT; wakeup(&p->p_siglist); } } return (ret); } static int do_tdsignal(struct proc *p, struct thread *td, int sig, ksiginfo_t *ksi) { #endif sig_t action; sigqueue_t *sigqueue; int prop; struct sigacts *ps; int intrval; int ret = 0; PROC_LOCK_ASSERT(p, MA_OWNED); if (!_SIG_VALID(sig)) #ifdef KSE panic("do_tdsignal(): invalid signal %d", sig); #else panic("tdsignal(): invalid signal %d", sig); #endif #ifdef KSE KASSERT(ksi == NULL || !KSI_ONQ(ksi), ("do_tdsignal: ksi on queue")); #else KASSERT(ksi == NULL || !KSI_ONQ(ksi), ("tdsignal: ksi on queue")); #endif /* * IEEE Std 1003.1-2001: return success when killing a zombie. */ if (p->p_state == PRS_ZOMBIE) { if (ksi && (ksi->ksi_flags & KSI_INS)) ksiginfo_tryfree(ksi); return (ret); } ps = p->p_sigacts; KNOTE_LOCKED(&p->p_klist, NOTE_SIGNAL | sig); prop = sigprop(sig); /* * If the signal is blocked and not destined for this thread, then * assign it to the process so that we can find it later in the first * thread that unblocks it. Otherwise, assign it to this thread now. */ if (td == NULL) { td = sigtd(p, sig, prop); if (SIGISMEMBER(td->td_sigmask, sig)) sigqueue = &p->p_sigqueue; else sigqueue = &td->td_sigqueue; } else { KASSERT(td->td_proc == p, ("invalid thread")); sigqueue = &td->td_sigqueue; } /* * If the signal is being ignored, * then we forget about it immediately. * (Note: we don't set SIGCONT in ps_sigignore, * and if it is set to SIG_IGN, * action will be SIG_DFL here.) */ mtx_lock(&ps->ps_mtx); if (SIGISMEMBER(ps->ps_sigignore, sig)) { mtx_unlock(&ps->ps_mtx); if (ksi && (ksi->ksi_flags & KSI_INS)) ksiginfo_tryfree(ksi); return (ret); } if (SIGISMEMBER(td->td_sigmask, sig)) action = SIG_HOLD; else if (SIGISMEMBER(ps->ps_sigcatch, sig)) action = SIG_CATCH; else action = SIG_DFL; if (SIGISMEMBER(ps->ps_sigintr, sig)) intrval = EINTR; else intrval = ERESTART; mtx_unlock(&ps->ps_mtx); if (prop & SA_CONT) sigqueue_delete_stopmask_proc(p); else if (prop & SA_STOP) { /* * If sending a tty stop signal to a member of an orphaned * process group, discard the signal here if the action * is default; don't stop the process below if sleeping, * and don't clear any pending SIGCONT. */ if ((prop & SA_TTYSTOP) && (p->p_pgrp->pg_jobc == 0) && (action == SIG_DFL)) { if (ksi && (ksi->ksi_flags & KSI_INS)) ksiginfo_tryfree(ksi); return (ret); } PROC_SLOCK(p); sigqueue_delete_proc(p, SIGCONT); PROC_SUNLOCK(p); if (p->p_flag & P_CONTINUED) { p->p_flag &= ~P_CONTINUED; PROC_LOCK(p->p_pptr); sigqueue_take(p->p_ksi); PROC_UNLOCK(p->p_pptr); } } ret = sigqueue_add(sigqueue, sig, ksi); if (ret != 0) return (ret); signotify(td); /* * Defer further processing for signals which are held, * except that stopped processes must be continued by SIGCONT. */ if (action == SIG_HOLD && !((prop & SA_CONT) && (p->p_flag & P_STOPPED_SIG))) return (ret); /* * SIGKILL: Remove procfs STOPEVENTs. */ if (sig == SIGKILL) { /* from procfs_ioctl.c: PIOCBIC */ p->p_stops = 0; /* from procfs_ioctl.c: PIOCCONT */ p->p_step = 0; wakeup(&p->p_step); } /* * Some signals have a process-wide effect and a per-thread * component. Most processing occurs when the process next * tries to cross the user boundary, however there are some * times when processing needs to be done immediatly, such as * waking up threads so that they can cross the user boundary. * We try do the per-process part here. */ PROC_SLOCK(p); if (P_SHOULDSTOP(p)) { /* * The process is in stopped mode. All the threads should be * either winding down or already on the suspended queue. */ if (p->p_flag & P_TRACED) { /* * The traced process is already stopped, * so no further action is necessary. * No signal can restart us. */ PROC_SUNLOCK(p); goto out; } if (sig == SIGKILL) { /* * SIGKILL sets process running. * It will die elsewhere. * All threads must be restarted. */ p->p_flag &= ~P_STOPPED_SIG; goto runfast; } if (prop & SA_CONT) { /* * If SIGCONT is default (or ignored), we continue the * process but don't leave the signal in sigqueue as * it has no further action. If SIGCONT is held, we * continue the process and leave the signal in * sigqueue. If the process catches SIGCONT, let it * handle the signal itself. If it isn't waiting on * an event, it goes back to run state. * Otherwise, process goes back to sleep state. */ p->p_flag &= ~P_STOPPED_SIG; if (p->p_numthreads == p->p_suspcount) { PROC_SUNLOCK(p); p->p_flag |= P_CONTINUED; p->p_xstat = SIGCONT; PROC_LOCK(p->p_pptr); childproc_continued(p); PROC_UNLOCK(p->p_pptr); PROC_SLOCK(p); } if (action == SIG_DFL) { thread_unsuspend(p); PROC_SUNLOCK(p); sigqueue_delete(sigqueue, sig); goto out; } if (action == SIG_CATCH) { #ifdef KSE /* * The process wants to catch it so it needs * to run at least one thread, but which one? * It would seem that the answer would be to * run an upcall in the next KSE to run, and * deliver the signal that way. In a NON KSE * process, we need to make sure that the * single thread is runnable asap. * XXXKSE for now however, make them all run. */ #endif /* * The process wants to catch it so it needs * to run at least one thread, but which one? */ goto runfast; } /* * The signal is not ignored or caught. */ thread_unsuspend(p); PROC_SUNLOCK(p); goto out; } if (prop & SA_STOP) { /* * Already stopped, don't need to stop again * (If we did the shell could get confused). * Just make sure the signal STOP bit set. */ PROC_SUNLOCK(p); p->p_flag |= P_STOPPED_SIG; sigqueue_delete(sigqueue, sig); goto out; } /* * All other kinds of signals: * If a thread is sleeping interruptibly, simulate a * wakeup so that when it is continued it will be made * runnable and can look at the signal. However, don't make * the PROCESS runnable, leave it stopped. * It may run a bit until it hits a thread_suspend_check(). */ thread_lock(td); if (TD_ON_SLEEPQ(td) && (td->td_flags & TDF_SINTR)) sleepq_abort(td, intrval); thread_unlock(td); PROC_SUNLOCK(p); goto out; /* * Mutexes are short lived. Threads waiting on them will * hit thread_suspend_check() soon. */ } else if (p->p_state == PRS_NORMAL) { if (p->p_flag & P_TRACED || action == SIG_CATCH) { thread_lock(td); tdsigwakeup(td, sig, action, intrval); thread_unlock(td); PROC_SUNLOCK(p); goto out; } MPASS(action == SIG_DFL); if (prop & SA_STOP) { if (p->p_flag & P_PPWAIT) { PROC_SUNLOCK(p); goto out; } p->p_flag |= P_STOPPED_SIG; p->p_xstat = sig; sig_suspend_threads(td, p, 1); if (p->p_numthreads == p->p_suspcount) { /* * only thread sending signal to another * process can reach here, if thread is sending * signal to its process, because thread does * not suspend itself here, p_numthreads * should never be equal to p_suspcount. */ thread_stopped(p); PROC_SUNLOCK(p); sigqueue_delete_proc(p, p->p_xstat); } else PROC_SUNLOCK(p); goto out; } else goto runfast; /* NOTREACHED */ } else { /* Not in "NORMAL" state. discard the signal. */ PROC_SUNLOCK(p); sigqueue_delete(sigqueue, sig); goto out; } /* * The process is not stopped so we need to apply the signal to all the * running threads. */ runfast: thread_lock(td); tdsigwakeup(td, sig, action, intrval); thread_unlock(td); thread_unsuspend(p); PROC_SUNLOCK(p); out: /* If we jump here, proc slock should not be owned. */ PROC_SLOCK_ASSERT(p, MA_NOTOWNED); return (ret); } /* * The force of a signal has been directed against a single * thread. We need to see what we can do about knocking it * out of any sleep it may be in etc. */ static void tdsigwakeup(struct thread *td, int sig, sig_t action, int intrval) { struct proc *p = td->td_proc; register int prop; PROC_LOCK_ASSERT(p, MA_OWNED); PROC_SLOCK_ASSERT(p, MA_OWNED); THREAD_LOCK_ASSERT(td, MA_OWNED); prop = sigprop(sig); /* * Bring the priority of a thread up if we want it to get * killed in this lifetime. */ if (action == SIG_DFL && (prop & SA_KILL) && td->td_priority > PUSER) sched_prio(td, PUSER); if (TD_ON_SLEEPQ(td)) { /* * If thread is sleeping uninterruptibly * we can't interrupt the sleep... the signal will * be noticed when the process returns through * trap() or syscall(). */ if ((td->td_flags & TDF_SINTR) == 0) return; /* * If SIGCONT is default (or ignored) and process is * asleep, we are finished; the process should not * be awakened. */ if ((prop & SA_CONT) && action == SIG_DFL) { thread_unlock(td); PROC_SUNLOCK(p); sigqueue_delete(&p->p_sigqueue, sig); /* * It may be on either list in this state. * Remove from both for now. */ sigqueue_delete(&td->td_sigqueue, sig); PROC_SLOCK(p); thread_lock(td); return; } /* * Give low priority threads a better chance to run. */ if (td->td_priority > PUSER) sched_prio(td, PUSER); sleepq_abort(td, intrval); } else { /* * Other states do nothing with the signal immediately, * other than kicking ourselves if we are running. * It will either never be noticed, or noticed very soon. */ #ifdef SMP if (TD_IS_RUNNING(td) && td != curthread) forward_signal(td); #endif } } static void sig_suspend_threads(struct thread *td, struct proc *p, int sending) { struct thread *td2; PROC_LOCK_ASSERT(p, MA_OWNED); PROC_SLOCK_ASSERT(p, MA_OWNED); FOREACH_THREAD_IN_PROC(p, td2) { thread_lock(td2); if ((TD_IS_SLEEPING(td2) || TD_IS_SWAPPED(td2)) && (td2->td_flags & TDF_SINTR) && !TD_IS_SUSPENDED(td2)) { thread_suspend_one(td2); } else { if (sending || td != td2) td2->td_flags |= TDF_ASTPENDING; #ifdef SMP if (TD_IS_RUNNING(td2) && td2 != td) forward_signal(td2); #endif } thread_unlock(td2); } } int ptracestop(struct thread *td, int sig) { struct proc *p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, &p->p_mtx.lock_object, "Stopping for traced signal"); thread_lock(td); td->td_flags |= TDF_XSIG; thread_unlock(td); td->td_xsig = sig; PROC_SLOCK(p); while ((p->p_flag & P_TRACED) && (td->td_flags & TDF_XSIG)) { if (p->p_flag & P_SINGLE_EXIT) { thread_lock(td); td->td_flags &= ~TDF_XSIG; thread_unlock(td); PROC_SUNLOCK(p); return (sig); } /* * Just make wait() to work, the last stopped thread * will win. */ p->p_xstat = sig; p->p_xthread = td; p->p_flag |= (P_STOPPED_SIG|P_STOPPED_TRACE); sig_suspend_threads(td, p, 0); stopme: thread_suspend_switch(td); if (!(p->p_flag & P_TRACED)) { break; } if (td->td_flags & TDF_DBSUSPEND) { if (p->p_flag & P_SINGLE_EXIT) break; goto stopme; } } PROC_SUNLOCK(p); return (td->td_xsig); } /* * If the current process has received a signal (should be caught or cause * termination, should interrupt current syscall), return the signal number. * Stop signals with default action are processed immediately, then cleared; * they aren't returned. This is checked after each entry to the system for * a syscall or trap (though this can usually be done without calling issignal * by checking the pending signal masks in cursig.) The normal call * sequence is * * while (sig = cursig(curthread)) * postsig(sig); */ static int issignal(td) struct thread *td; { struct proc *p; struct sigacts *ps; sigset_t sigpending; int sig, prop, newsig; p = td->td_proc; ps = p->p_sigacts; mtx_assert(&ps->ps_mtx, MA_OWNED); PROC_LOCK_ASSERT(p, MA_OWNED); for (;;) { int traced = (p->p_flag & P_TRACED) || (p->p_stops & S_SIG); sigpending = td->td_sigqueue.sq_signals; SIGSETNAND(sigpending, td->td_sigmask); if (p->p_flag & P_PPWAIT) SIG_STOPSIGMASK(sigpending); if (SIGISEMPTY(sigpending)) /* no signal to send */ return (0); sig = sig_ffs(&sigpending); if (p->p_stops & S_SIG) { mtx_unlock(&ps->ps_mtx); stopevent(p, S_SIG, sig); mtx_lock(&ps->ps_mtx); } /* * We should see pending but ignored signals * only if P_TRACED was on when they were posted. */ if (SIGISMEMBER(ps->ps_sigignore, sig) && (traced == 0)) { sigqueue_delete(&td->td_sigqueue, sig); #ifdef KSE if (td->td_pflags & TDP_SA) SIGADDSET(td->td_sigmask, sig); #endif continue; } if (p->p_flag & P_TRACED && (p->p_flag & P_PPWAIT) == 0) { /* * If traced, always stop. */ mtx_unlock(&ps->ps_mtx); newsig = ptracestop(td, sig); mtx_lock(&ps->ps_mtx); #ifdef KSE if (td->td_pflags & TDP_SA) SIGADDSET(td->td_sigmask, sig); #endif if (sig != newsig) { ksiginfo_t ksi; /* * clear old signal. * XXX shrug off debugger, it causes siginfo to * be thrown away. */ sigqueue_get(&td->td_sigqueue, sig, &ksi); /* * If parent wants us to take the signal, * then it will leave it in p->p_xstat; * otherwise we just look for signals again. */ if (newsig == 0) continue; sig = newsig; /* * Put the new signal into td_sigqueue. If the * signal is being masked, look for other signals. */ SIGADDSET(td->td_sigqueue.sq_signals, sig); #ifdef KSE if (td->td_pflags & TDP_SA) SIGDELSET(td->td_sigmask, sig); #endif if (SIGISMEMBER(td->td_sigmask, sig)) continue; signotify(td); } /* * If the traced bit got turned off, go back up * to the top to rescan signals. This ensures * that p_sig* and p_sigact are consistent. */ if ((p->p_flag & P_TRACED) == 0) continue; } prop = sigprop(sig); /* * Decide whether the signal should be returned. * Return the signal's number, or fall through * to clear it from the pending mask. */ switch ((intptr_t)p->p_sigacts->ps_sigact[_SIG_IDX(sig)]) { case (intptr_t)SIG_DFL: /* * Don't take default actions on system processes. */ if (p->p_pid <= 1) { #ifdef DIAGNOSTIC /* * Are you sure you want to ignore SIGSEGV * in init? XXX */ printf("Process (pid %lu) got signal %d\n", (u_long)p->p_pid, sig); #endif break; /* == ignore */ } /* * If there is a pending stop signal to process * with default action, stop here, * then clear the signal. However, * if process is member of an orphaned * process group, ignore tty stop signals. */ if (prop & SA_STOP) { if (p->p_flag & P_TRACED || (p->p_pgrp->pg_jobc == 0 && prop & SA_TTYSTOP)) break; /* == ignore */ mtx_unlock(&ps->ps_mtx); WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, &p->p_mtx.lock_object, "Catching SIGSTOP"); p->p_flag |= P_STOPPED_SIG; p->p_xstat = sig; PROC_SLOCK(p); sig_suspend_threads(td, p, 0); thread_suspend_switch(td); PROC_SUNLOCK(p); mtx_lock(&ps->ps_mtx); break; } else if (prop & SA_IGNORE) { /* * Except for SIGCONT, shouldn't get here. * Default action is to ignore; drop it. */ break; /* == ignore */ } else return (sig); /*NOTREACHED*/ case (intptr_t)SIG_IGN: /* * Masking above should prevent us ever trying * to take action on an ignored signal other * than SIGCONT, unless process is traced. */ if ((prop & SA_CONT) == 0 && (p->p_flag & P_TRACED) == 0) printf("issignal\n"); break; /* == ignore */ default: /* * This signal has an action, let * postsig() process it. */ return (sig); } sigqueue_delete(&td->td_sigqueue, sig); /* take the signal! */ } /* NOTREACHED */ } void thread_stopped(struct proc *p) { int n; PROC_LOCK_ASSERT(p, MA_OWNED); PROC_SLOCK_ASSERT(p, MA_OWNED); n = p->p_suspcount; if (p == curproc) n++; if ((p->p_flag & P_STOPPED_SIG) && (n == p->p_numthreads)) { PROC_SUNLOCK(p); p->p_flag &= ~P_WAITED; PROC_LOCK(p->p_pptr); childproc_stopped(p, (p->p_flag & P_TRACED) ? CLD_TRAPPED : CLD_STOPPED); PROC_UNLOCK(p->p_pptr); PROC_SLOCK(p); } } /* * Take the action for the specified signal * from the current set of pending signals. */ void postsig(sig) register int sig; { struct thread *td = curthread; register struct proc *p = td->td_proc; struct sigacts *ps; sig_t action; ksiginfo_t ksi; sigset_t returnmask; int code; KASSERT(sig != 0, ("postsig")); PROC_LOCK_ASSERT(p, MA_OWNED); ps = p->p_sigacts; mtx_assert(&ps->ps_mtx, MA_OWNED); ksiginfo_init(&ksi); sigqueue_get(&td->td_sigqueue, sig, &ksi); ksi.ksi_signo = sig; if (ksi.ksi_code == SI_TIMER) itimer_accept(p, ksi.ksi_timerid, &ksi); action = ps->ps_sigact[_SIG_IDX(sig)]; #ifdef KTRACE if (KTRPOINT(td, KTR_PSIG)) ktrpsig(sig, action, td->td_pflags & TDP_OLDMASK ? &td->td_oldsigmask : &td->td_sigmask, 0); #endif if (p->p_stops & S_SIG) { mtx_unlock(&ps->ps_mtx); stopevent(p, S_SIG, sig); mtx_lock(&ps->ps_mtx); } #ifdef KSE if (!(td->td_pflags & TDP_SA) && action == SIG_DFL) { #else if (action == SIG_DFL) { #endif /* * Default action, where the default is to kill * the process. (Other cases were ignored above.) */ mtx_unlock(&ps->ps_mtx); sigexit(td, sig); /* NOTREACHED */ } else { #ifdef KSE if (td->td_pflags & TDP_SA) { if (sig == SIGKILL) { mtx_unlock(&ps->ps_mtx); sigexit(td, sig); } } #endif /* * If we get here, the signal must be caught. */ KASSERT(action != SIG_IGN && !SIGISMEMBER(td->td_sigmask, sig), ("postsig action")); /* * Set the new mask value and also defer further * occurrences of this signal. * * Special case: user has done a sigsuspend. Here the * current mask is not of interest, but rather the * mask from before the sigsuspend is what we want * restored after the signal processing is completed. */ if (td->td_pflags & TDP_OLDMASK) { returnmask = td->td_oldsigmask; td->td_pflags &= ~TDP_OLDMASK; } else returnmask = td->td_sigmask; SIGSETOR(td->td_sigmask, ps->ps_catchmask[_SIG_IDX(sig)]); if (!SIGISMEMBER(ps->ps_signodefer, sig)) SIGADDSET(td->td_sigmask, sig); if (SIGISMEMBER(ps->ps_sigreset, sig)) { /* * See kern_sigaction() for origin of this code. */ SIGDELSET(ps->ps_sigcatch, sig); if (sig != SIGCONT && sigprop(sig) & SA_IGNORE) SIGADDSET(ps->ps_sigignore, sig); ps->ps_sigact[_SIG_IDX(sig)] = SIG_DFL; } td->td_ru.ru_nsignals++; if (p->p_sig != sig) { code = 0; } else { code = p->p_code; p->p_code = 0; p->p_sig = 0; } #ifdef KSE if (td->td_pflags & TDP_SA) thread_signal_add(curthread, &ksi); else (*p->p_sysent->sv_sendsig)(action, &ksi, &returnmask); #else (*p->p_sysent->sv_sendsig)(action, &ksi, &returnmask); #endif } } /* * Kill the current process for stated reason. */ void killproc(p, why) struct proc *p; char *why; { PROC_LOCK_ASSERT(p, MA_OWNED); CTR3(KTR_PROC, "killproc: proc %p (pid %d, %s)", p, p->p_pid, p->p_comm); log(LOG_ERR, "pid %d (%s), uid %d, was killed: %s\n", p->p_pid, p->p_comm, p->p_ucred ? p->p_ucred->cr_uid : -1, why); psignal(p, SIGKILL); } /* * Force the current process to exit with the specified signal, dumping core * if appropriate. We bypass the normal tests for masked and caught signals, * allowing unrecoverable failures to terminate the process without changing * signal state. Mark the accounting record with the signal termination. * If dumping core, save the signal number for the debugger. Calls exit and * does not return. */ void sigexit(td, sig) struct thread *td; int sig; { struct proc *p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); p->p_acflag |= AXSIG; /* * We must be single-threading to generate a core dump. This * ensures that the registers in the core file are up-to-date. * Also, the ELF dump handler assumes that the thread list doesn't * change out from under it. * * XXX If another thread attempts to single-thread before us * (e.g. via fork()), we won't get a dump at all. */ if ((sigprop(sig) & SA_CORE) && (thread_single(SINGLE_NO_EXIT) == 0)) { p->p_sig = sig; /* * Log signals which would cause core dumps * (Log as LOG_INFO to appease those who don't want * these messages.) * XXX : Todo, as well as euid, write out ruid too * Note that coredump() drops proc lock. */ if (coredump(td) == 0) sig |= WCOREFLAG; if (kern_logsigexit) log(LOG_INFO, "pid %d (%s), uid %d: exited on signal %d%s\n", p->p_pid, p->p_comm, td->td_ucred ? td->td_ucred->cr_uid : -1, sig &~ WCOREFLAG, sig & WCOREFLAG ? " (core dumped)" : ""); } else PROC_UNLOCK(p); exit1(td, W_EXITCODE(0, sig)); /* NOTREACHED */ } /* * Send queued SIGCHLD to parent when child process's state * is changed. */ static void sigparent(struct proc *p, int reason, int status) { PROC_LOCK_ASSERT(p, MA_OWNED); PROC_LOCK_ASSERT(p->p_pptr, MA_OWNED); if (p->p_ksi != NULL) { p->p_ksi->ksi_signo = SIGCHLD; p->p_ksi->ksi_code = reason; p->p_ksi->ksi_status = status; p->p_ksi->ksi_pid = p->p_pid; p->p_ksi->ksi_uid = p->p_ucred->cr_ruid; if (KSI_ONQ(p->p_ksi)) return; } tdsignal(p->p_pptr, NULL, SIGCHLD, p->p_ksi); } static void childproc_jobstate(struct proc *p, int reason, int status) { struct sigacts *ps; PROC_LOCK_ASSERT(p, MA_OWNED); PROC_LOCK_ASSERT(p->p_pptr, MA_OWNED); /* * Wake up parent sleeping in kern_wait(), also send * SIGCHLD to parent, but SIGCHLD does not guarantee * that parent will awake, because parent may masked * the signal. */ p->p_pptr->p_flag |= P_STATCHILD; wakeup(p->p_pptr); ps = p->p_pptr->p_sigacts; mtx_lock(&ps->ps_mtx); if ((ps->ps_flag & PS_NOCLDSTOP) == 0) { mtx_unlock(&ps->ps_mtx); sigparent(p, reason, status); } else mtx_unlock(&ps->ps_mtx); } void childproc_stopped(struct proc *p, int reason) { childproc_jobstate(p, reason, p->p_xstat); } void childproc_continued(struct proc *p) { childproc_jobstate(p, CLD_CONTINUED, SIGCONT); } void childproc_exited(struct proc *p) { int reason; int status = p->p_xstat; /* convert to int */ reason = CLD_EXITED; if (WCOREDUMP(status)) reason = CLD_DUMPED; else if (WIFSIGNALED(status)) reason = CLD_KILLED; /* * XXX avoid calling wakeup(p->p_pptr), the work is * done in exit1(). */ sigparent(p, reason, status); } static char corefilename[MAXPATHLEN] = {"%N.core"}; SYSCTL_STRING(_kern, OID_AUTO, corefile, CTLFLAG_RW, corefilename, sizeof(corefilename), "process corefile name format string"); /* * expand_name(name, uid, pid) * Expand the name described in corefilename, using name, uid, and pid. * corefilename is a printf-like string, with three format specifiers: * %N name of process ("name") * %P process id (pid) * %U user id (uid) * For example, "%N.core" is the default; they can be disabled completely * by using "/dev/null", or all core files can be stored in "/cores/%U/%N-%P". * This is controlled by the sysctl variable kern.corefile (see above). */ static char * expand_name(name, uid, pid) const char *name; uid_t uid; pid_t pid; { const char *format, *appendstr; char *temp; char buf[11]; /* Buffer for pid/uid -- max 4B */ size_t i, l, n; format = corefilename; temp = malloc(MAXPATHLEN, M_TEMP, M_NOWAIT | M_ZERO); if (temp == NULL) return (NULL); for (i = 0, n = 0; n < MAXPATHLEN && format[i]; i++) { switch (format[i]) { case '%': /* Format character */ i++; switch (format[i]) { case '%': appendstr = "%"; break; case 'N': /* process name */ appendstr = name; break; case 'P': /* process id */ sprintf(buf, "%u", pid); appendstr = buf; break; case 'U': /* user id */ sprintf(buf, "%u", uid); appendstr = buf; break; default: appendstr = ""; log(LOG_ERR, "Unknown format character %c in `%s'\n", format[i], format); } l = strlen(appendstr); if ((n + l) >= MAXPATHLEN) goto toolong; memcpy(temp + n, appendstr, l); n += l; break; default: temp[n++] = format[i]; } } if (format[i] != '\0') goto toolong; return (temp); toolong: log(LOG_ERR, "pid %ld (%s), uid (%lu): corename is too long\n", (long)pid, name, (u_long)uid); free(temp, M_TEMP); return (NULL); } /* * Dump a process' core. The main routine does some * policy checking, and creates the name of the coredump; * then it passes on a vnode and a size limit to the process-specific * coredump routine if there is one; if there _is not_ one, it returns * ENOSYS; otherwise it returns the error from the process-specific routine. */ static int coredump(struct thread *td) { struct proc *p = td->td_proc; register struct vnode *vp; register struct ucred *cred = td->td_ucred; struct flock lf; struct nameidata nd; struct vattr vattr; int error, error1, flags, locked; struct mount *mp; char *name; /* name of corefile */ off_t limit; int vfslocked; PROC_LOCK_ASSERT(p, MA_OWNED); MPASS((p->p_flag & P_HADTHREADS) == 0 || p->p_singlethread == td); _STOPEVENT(p, S_CORE, 0); name = expand_name(p->p_comm, td->td_ucred->cr_uid, p->p_pid); if (name == NULL) { #ifdef AUDIT audit_proc_coredump(td, NULL, EINVAL); #endif return (EINVAL); } if (((sugid_coredump == 0) && p->p_flag & P_SUGID) || do_coredump == 0) { PROC_UNLOCK(p); #ifdef AUDIT audit_proc_coredump(td, name, EFAULT); #endif free(name, M_TEMP); return (EFAULT); } /* * Note that the bulk of limit checking is done after * the corefile is created. The exception is if the limit * for corefiles is 0, in which case we don't bother * creating the corefile at all. This layout means that * a corefile is truncated instead of not being created, * if it is larger than the limit. */ limit = (off_t)lim_cur(p, RLIMIT_CORE); PROC_UNLOCK(p); if (limit == 0) { #ifdef AUDIT audit_proc_coredump(td, name, EFBIG); #endif free(name, M_TEMP); return (EFBIG); } restart: NDINIT(&nd, LOOKUP, NOFOLLOW | MPSAFE, UIO_SYSSPACE, name, td); flags = O_CREAT | FWRITE | O_NOFOLLOW; error = vn_open(&nd, &flags, S_IRUSR | S_IWUSR, NULL); if (error) { #ifdef AUDIT audit_proc_coredump(td, name, error); #endif free(name, M_TEMP); return (error); } vfslocked = NDHASGIANT(&nd); NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; /* Don't dump to non-regular files or files with links. */ if (vp->v_type != VREG || VOP_GETATTR(vp, &vattr, cred, td) || vattr.va_nlink != 1) { VOP_UNLOCK(vp, 0, td); error = EFAULT; goto close; } VOP_UNLOCK(vp, 0, td); lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; lf.l_type = F_WRLCK; locked = (VOP_ADVLOCK(vp, (caddr_t)p, F_SETLK, &lf, F_FLOCK) == 0); if (vn_start_write(vp, &mp, V_NOWAIT) != 0) { lf.l_type = F_UNLCK; if (locked) VOP_ADVLOCK(vp, (caddr_t)p, F_UNLCK, &lf, F_FLOCK); if ((error = vn_close(vp, FWRITE, cred, td)) != 0) goto out; if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) goto out; VFS_UNLOCK_GIANT(vfslocked); goto restart; } VATTR_NULL(&vattr); vattr.va_size = 0; if (set_core_nodump_flag) vattr.va_flags = UF_NODUMP; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); VOP_LEASE(vp, td, cred, LEASE_WRITE); VOP_SETATTR(vp, &vattr, cred, td); VOP_UNLOCK(vp, 0, td); vn_finished_write(mp); PROC_LOCK(p); p->p_acflag |= ACORE; PROC_UNLOCK(p); error = p->p_sysent->sv_coredump ? p->p_sysent->sv_coredump(td, vp, limit) : ENOSYS; if (locked) { lf.l_type = F_UNLCK; VOP_ADVLOCK(vp, (caddr_t)p, F_UNLCK, &lf, F_FLOCK); } close: error1 = vn_close(vp, FWRITE, cred, td); if (error == 0) error = error1; out: #ifdef AUDIT audit_proc_coredump(td, name, error); #endif free(name, M_TEMP); VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Nonexistent system call-- signal process (may want to handle it). Flag * error in case process won't see signal immediately (blocked or ignored). */ #ifndef _SYS_SYSPROTO_H_ struct nosys_args { int dummy; }; #endif /* ARGSUSED */ int nosys(td, args) struct thread *td; struct nosys_args *args; { struct proc *p = td->td_proc; PROC_LOCK(p); psignal(p, SIGSYS); PROC_UNLOCK(p); return (ENOSYS); } /* * Send a SIGIO or SIGURG signal to a process or process group using stored * credentials rather than those of the current process. */ void pgsigio(sigiop, sig, checkctty) struct sigio **sigiop; int sig, checkctty; { struct sigio *sigio; SIGIO_LOCK(); sigio = *sigiop; if (sigio == NULL) { SIGIO_UNLOCK(); return; } if (sigio->sio_pgid > 0) { PROC_LOCK(sigio->sio_proc); if (CANSIGIO(sigio->sio_ucred, sigio->sio_proc->p_ucred)) psignal(sigio->sio_proc, sig); PROC_UNLOCK(sigio->sio_proc); } else if (sigio->sio_pgid < 0) { struct proc *p; PGRP_LOCK(sigio->sio_pgrp); LIST_FOREACH(p, &sigio->sio_pgrp->pg_members, p_pglist) { PROC_LOCK(p); if (CANSIGIO(sigio->sio_ucred, p->p_ucred) && (checkctty == 0 || (p->p_flag & P_CONTROLT))) psignal(p, sig); PROC_UNLOCK(p); } PGRP_UNLOCK(sigio->sio_pgrp); } SIGIO_UNLOCK(); } static int filt_sigattach(struct knote *kn) { struct proc *p = curproc; kn->kn_ptr.p_proc = p; kn->kn_flags |= EV_CLEAR; /* automatically set */ knlist_add(&p->p_klist, kn, 0); return (0); } static void filt_sigdetach(struct knote *kn) { struct proc *p = kn->kn_ptr.p_proc; knlist_remove(&p->p_klist, kn, 0); } /* * signal knotes are shared with proc knotes, so we apply a mask to * the hint in order to differentiate them from process hints. This * could be avoided by using a signal-specific knote list, but probably * isn't worth the trouble. */ static int filt_signal(struct knote *kn, long hint) { if (hint & NOTE_SIGNAL) { hint &= ~NOTE_SIGNAL; if (kn->kn_id == hint) kn->kn_data++; } return (kn->kn_data != 0); } struct sigacts * sigacts_alloc(void) { struct sigacts *ps; ps = malloc(sizeof(struct sigacts), M_SUBPROC, M_WAITOK | M_ZERO); ps->ps_refcnt = 1; mtx_init(&ps->ps_mtx, "sigacts", NULL, MTX_DEF); return (ps); } void sigacts_free(struct sigacts *ps) { mtx_lock(&ps->ps_mtx); ps->ps_refcnt--; if (ps->ps_refcnt == 0) { mtx_destroy(&ps->ps_mtx); free(ps, M_SUBPROC); } else mtx_unlock(&ps->ps_mtx); } struct sigacts * sigacts_hold(struct sigacts *ps) { mtx_lock(&ps->ps_mtx); ps->ps_refcnt++; mtx_unlock(&ps->ps_mtx); return (ps); } void sigacts_copy(struct sigacts *dest, struct sigacts *src) { KASSERT(dest->ps_refcnt == 1, ("sigacts_copy to shared dest")); mtx_lock(&src->ps_mtx); bcopy(src, dest, offsetof(struct sigacts, ps_refcnt)); mtx_unlock(&src->ps_mtx); } int sigacts_shared(struct sigacts *ps) { int shared; mtx_lock(&ps->ps_mtx); shared = ps->ps_refcnt > 1; mtx_unlock(&ps->ps_mtx); return (shared); } Index: head/sys/kern/kern_thr.c =================================================================== --- head/sys/kern/kern_thr.c (revision 173360) +++ head/sys/kern/kern_thr.c (revision 173361) @@ -1,509 +1,511 @@ /*- * Copyright (c) 2003, Jeffrey Roberson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_posix.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef COMPAT_IA32 extern struct sysentvec ia32_freebsd_sysvec; static inline int suword_lwpid(void *addr, lwpid_t lwpid) { int error; if (curproc->p_sysent != &ia32_freebsd_sysvec) error = suword(addr, lwpid); else error = suword32(addr, lwpid); return (error); } #else #define suword_lwpid suword #endif extern int max_threads_per_proc; static int create_thread(struct thread *td, mcontext_t *ctx, void (*start_func)(void *), void *arg, char *stack_base, size_t stack_size, char *tls_base, long *child_tid, long *parent_tid, int flags, struct rtprio *rtp); /* * System call interface. */ int thr_create(struct thread *td, struct thr_create_args *uap) /* ucontext_t *ctx, long *id, int flags */ { ucontext_t ctx; int error; if ((error = copyin(uap->ctx, &ctx, sizeof(ctx)))) return (error); error = create_thread(td, &ctx.uc_mcontext, NULL, NULL, NULL, 0, NULL, uap->id, NULL, uap->flags, NULL); return (error); } int thr_new(struct thread *td, struct thr_new_args *uap) /* struct thr_param * */ { struct thr_param param; int error; if (uap->param_size < 0 || uap->param_size > sizeof(param)) return (EINVAL); bzero(¶m, sizeof(param)); if ((error = copyin(uap->param, ¶m, uap->param_size))) return (error); return (kern_thr_new(td, ¶m)); } int kern_thr_new(struct thread *td, struct thr_param *param) { struct rtprio rtp, *rtpp; int error; rtpp = NULL; if (param->rtp != 0) { error = copyin(param->rtp, &rtp, sizeof(struct rtprio)); rtpp = &rtp; } error = create_thread(td, NULL, param->start_func, param->arg, param->stack_base, param->stack_size, param->tls_base, param->child_tid, param->parent_tid, param->flags, rtpp); return (error); } static int create_thread(struct thread *td, mcontext_t *ctx, void (*start_func)(void *), void *arg, char *stack_base, size_t stack_size, char *tls_base, long *child_tid, long *parent_tid, int flags, struct rtprio *rtp) { stack_t stack; struct thread *newtd; struct proc *p; int error; error = 0; p = td->td_proc; /* Have race condition but it is cheap. */ if (p->p_numthreads >= max_threads_per_proc) return (EPROCLIM); if (rtp != NULL) { switch(rtp->type) { case RTP_PRIO_REALTIME: case RTP_PRIO_FIFO: /* Only root can set scheduler policy */ if (priv_check(td, PRIV_SCHED_SETPOLICY) != 0) return (EPERM); if (rtp->prio > RTP_PRIO_MAX) return (EINVAL); break; case RTP_PRIO_NORMAL: rtp->prio = 0; break; default: return (EINVAL); } } /* Initialize our td */ newtd = thread_alloc(); + if (newtd == NULL) + return (ENOMEM); /* * Try the copyout as soon as we allocate the td so we don't * have to tear things down in a failure case below. * Here we copy out tid to two places, one for child and one * for parent, because pthread can create a detached thread, * if parent wants to safely access child tid, it has to provide * its storage, because child thread may exit quickly and * memory is freed before parent thread can access it. */ if ((child_tid != NULL && suword_lwpid(child_tid, newtd->td_tid)) || (parent_tid != NULL && suword_lwpid(parent_tid, newtd->td_tid))) { thread_free(newtd); return (EFAULT); } bzero(&newtd->td_startzero, __rangeof(struct thread, td_startzero, td_endzero)); bcopy(&td->td_startcopy, &newtd->td_startcopy, __rangeof(struct thread, td_startcopy, td_endcopy)); newtd->td_proc = td->td_proc; newtd->td_ucred = crhold(td->td_ucred); cpu_set_upcall(newtd, td); if (ctx != NULL) { /* old way to set user context */ error = set_mcontext(newtd, ctx); if (error != 0) { thread_free(newtd); crfree(td->td_ucred); return (error); } } else { /* Set up our machine context. */ stack.ss_sp = stack_base; stack.ss_size = stack_size; /* Set upcall address to user thread entry function. */ cpu_set_upcall_kse(newtd, start_func, arg, &stack); /* Setup user TLS address and TLS pointer register. */ error = cpu_set_user_tls(newtd, tls_base); if (error != 0) { thread_free(newtd); crfree(td->td_ucred); return (error); } } PROC_LOCK(td->td_proc); td->td_proc->p_flag |= P_HADTHREADS; newtd->td_sigmask = td->td_sigmask; PROC_SLOCK(p); thread_link(newtd, p); thread_lock(td); /* let the scheduler know about these things. */ sched_fork_thread(td, newtd); thread_unlock(td); PROC_SUNLOCK(p); PROC_UNLOCK(p); thread_lock(newtd); if (rtp != NULL) { if (!(td->td_pri_class == PRI_TIMESHARE && rtp->type == RTP_PRIO_NORMAL)) { rtp_to_pri(rtp, newtd); sched_prio(newtd, newtd->td_user_pri); } /* ignore timesharing class */ } TD_SET_CAN_RUN(newtd); /* if ((flags & THR_SUSPENDED) == 0) */ sched_add(newtd, SRQ_BORING); thread_unlock(newtd); return (error); } int thr_self(struct thread *td, struct thr_self_args *uap) /* long *id */ { int error; error = suword_lwpid(uap->id, (unsigned)td->td_tid); if (error == -1) return (EFAULT); return (0); } int thr_exit(struct thread *td, struct thr_exit_args *uap) /* long *state */ { struct proc *p; p = td->td_proc; /* Signal userland that it can free the stack. */ if ((void *)uap->state != NULL) { suword_lwpid(uap->state, 1); kern_umtx_wake(td, uap->state, INT_MAX); } PROC_LOCK(p); sigqueue_flush(&td->td_sigqueue); PROC_SLOCK(p); /* * Shutting down last thread in the proc. This will actually * call exit() in the trampoline when it returns. */ if (p->p_numthreads != 1) { thread_stopped(p); thread_exit(); /* NOTREACHED */ } PROC_SUNLOCK(p); PROC_UNLOCK(p); return (0); } int thr_kill(struct thread *td, struct thr_kill_args *uap) /* long id, int sig */ { struct thread *ttd; struct proc *p; int error; p = td->td_proc; error = 0; PROC_LOCK(p); if (uap->id == -1) { if (uap->sig != 0 && !_SIG_VALID(uap->sig)) { error = EINVAL; } else { error = ESRCH; FOREACH_THREAD_IN_PROC(p, ttd) { if (ttd != td) { error = 0; if (uap->sig == 0) break; tdsignal(p, ttd, uap->sig, NULL); } } } } else { if (uap->id != td->td_tid) ttd = thread_find(p, uap->id); else ttd = td; if (ttd == NULL) error = ESRCH; else if (uap->sig == 0) ; else if (!_SIG_VALID(uap->sig)) error = EINVAL; else tdsignal(p, ttd, uap->sig, NULL); } PROC_UNLOCK(p); return (error); } int thr_kill2(struct thread *td, struct thr_kill2_args *uap) /* pid_t pid, long id, int sig */ { struct thread *ttd; struct proc *p; int error; AUDIT_ARG(signum, uap->sig); if (uap->pid == td->td_proc->p_pid) { p = td->td_proc; PROC_LOCK(p); } else if ((p = pfind(uap->pid)) == NULL) { return (ESRCH); } AUDIT_ARG(process, p); error = p_cansignal(td, p, uap->sig); if (error == 0) { if (uap->id == -1) { if (uap->sig != 0 && !_SIG_VALID(uap->sig)) { error = EINVAL; } else { error = ESRCH; FOREACH_THREAD_IN_PROC(p, ttd) { if (ttd != td) { error = 0; if (uap->sig == 0) break; tdsignal(p, ttd, uap->sig, NULL); } } } } else { if (uap->id != td->td_tid) ttd = thread_find(p, uap->id); else ttd = td; if (ttd == NULL) error = ESRCH; else if (uap->sig == 0) ; else if (!_SIG_VALID(uap->sig)) error = EINVAL; else tdsignal(p, ttd, uap->sig, NULL); } } PROC_UNLOCK(p); return (error); } int thr_suspend(struct thread *td, struct thr_suspend_args *uap) /* const struct timespec *timeout */ { struct timespec ts, *tsp; int error; error = 0; tsp = NULL; if (uap->timeout != NULL) { error = copyin((const void *)uap->timeout, (void *)&ts, sizeof(struct timespec)); if (error != 0) return (error); tsp = &ts; } return (kern_thr_suspend(td, tsp)); } int kern_thr_suspend(struct thread *td, struct timespec *tsp) { struct timeval tv; int error = 0, hz = 0; if (tsp != NULL) { if (tsp->tv_nsec < 0 || tsp->tv_nsec > 1000000000) return (EINVAL); if (tsp->tv_sec == 0 && tsp->tv_nsec == 0) return (ETIMEDOUT); TIMESPEC_TO_TIMEVAL(&tv, tsp); hz = tvtohz(&tv); } if (td->td_pflags & TDP_WAKEUP) { td->td_pflags &= ~TDP_WAKEUP; return (0); } PROC_LOCK(td->td_proc); if ((td->td_flags & TDF_THRWAKEUP) == 0) error = msleep((void *)td, &td->td_proc->p_mtx, PCATCH, "lthr", hz); if (td->td_flags & TDF_THRWAKEUP) { thread_lock(td); td->td_flags &= ~TDF_THRWAKEUP; thread_unlock(td); PROC_UNLOCK(td->td_proc); return (0); } PROC_UNLOCK(td->td_proc); if (error == EWOULDBLOCK) error = ETIMEDOUT; else if (error == ERESTART) { if (hz != 0) error = EINTR; } return (error); } int thr_wake(struct thread *td, struct thr_wake_args *uap) /* long id */ { struct proc *p; struct thread *ttd; if (uap->id == td->td_tid) { td->td_pflags |= TDP_WAKEUP; return (0); } p = td->td_proc; PROC_LOCK(p); ttd = thread_find(p, uap->id); if (ttd == NULL) { PROC_UNLOCK(p); return (ESRCH); } thread_lock(ttd); ttd->td_flags |= TDF_THRWAKEUP; thread_unlock(ttd); wakeup((void *)ttd); PROC_UNLOCK(p); return (0); } int thr_set_name(struct thread *td, struct thr_set_name_args *uap) { struct proc *p = td->td_proc; char name[MAXCOMLEN + 1]; struct thread *ttd; int error; error = 0; name[0] = '\0'; if (uap->name != NULL) { error = copyinstr(uap->name, name, sizeof(name), NULL); if (error) return (error); } PROC_LOCK(p); if (uap->id == td->td_tid) ttd = td; else ttd = thread_find(p, uap->id); if (ttd != NULL) strcpy(ttd->td_name, name); else error = ESRCH; PROC_UNLOCK(p); return (error); } Index: head/sys/kern/kern_thread.c =================================================================== --- head/sys/kern/kern_thread.c (revision 173360) +++ head/sys/kern/kern_thread.c (revision 173361) @@ -1,963 +1,980 @@ /*- * Copyright (C) 2001 Julian Elischer . * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice(s), this list of conditions and the following disclaimer as * the first lines of this file unmodified other than the possible * addition of one or more copyright notices. * 2. Redistributions in binary form must reproduce the above copyright * notice(s), this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * thread related storage. */ static uma_zone_t thread_zone; SYSCTL_NODE(_kern, OID_AUTO, threads, CTLFLAG_RW, 0, "thread allocation"); int max_threads_per_proc = 1500; SYSCTL_INT(_kern_threads, OID_AUTO, max_threads_per_proc, CTLFLAG_RW, &max_threads_per_proc, 0, "Limit on threads per proc"); int max_threads_hits; SYSCTL_INT(_kern_threads, OID_AUTO, max_threads_hits, CTLFLAG_RD, &max_threads_hits, 0, ""); #ifdef KSE int virtual_cpu; #endif TAILQ_HEAD(, thread) zombie_threads = TAILQ_HEAD_INITIALIZER(zombie_threads); static struct mtx zombie_lock; MTX_SYSINIT(zombie_lock, &zombie_lock, "zombie lock", MTX_SPIN); static void thread_zombie(struct thread *); #ifdef KSE static int sysctl_kse_virtual_cpu(SYSCTL_HANDLER_ARGS) { int error, new_val; int def_val; def_val = mp_ncpus; if (virtual_cpu == 0) new_val = def_val; else new_val = virtual_cpu; error = sysctl_handle_int(oidp, &new_val, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (new_val < 0) return (EINVAL); virtual_cpu = new_val; return (0); } /* DEBUG ONLY */ SYSCTL_PROC(_kern_threads, OID_AUTO, virtual_cpu, CTLTYPE_INT|CTLFLAG_RW, 0, sizeof(virtual_cpu), sysctl_kse_virtual_cpu, "I", "debug virtual cpus"); #endif struct mtx tid_lock; static struct unrhdr *tid_unrhdr; /* * Prepare a thread for use. */ static int thread_ctor(void *mem, int size, void *arg, int flags) { struct thread *td; td = (struct thread *)mem; td->td_state = TDS_INACTIVE; td->td_oncpu = NOCPU; td->td_tid = alloc_unr(tid_unrhdr); td->td_syscalls = 0; /* * Note that td_critnest begins life as 1 because the thread is not * running and is thereby implicitly waiting to be on the receiving * end of a context switch. */ td->td_critnest = 1; #ifdef AUDIT audit_thread_alloc(td); #endif umtx_thread_alloc(td); return (0); } /* * Reclaim a thread after use. */ static void thread_dtor(void *mem, int size, void *arg) { struct thread *td; td = (struct thread *)mem; #ifdef INVARIANTS /* Verify that this thread is in a safe state to free. */ switch (td->td_state) { case TDS_INHIBITED: case TDS_RUNNING: case TDS_CAN_RUN: case TDS_RUNQ: /* * We must never unlink a thread that is in one of * these states, because it is currently active. */ panic("bad state for thread unlinking"); /* NOTREACHED */ case TDS_INACTIVE: break; default: panic("bad thread state"); /* NOTREACHED */ } #endif #ifdef AUDIT audit_thread_free(td); #endif free_unr(tid_unrhdr, td->td_tid); sched_newthread(td); } /* * Initialize type-stable parts of a thread (when newly created). */ static int thread_init(void *mem, int size, int flags) { struct thread *td; td = (struct thread *)mem; - vm_thread_new(td, 0); - cpu_thread_setup(td); td->td_sleepqueue = sleepq_alloc(); td->td_turnstile = turnstile_alloc(); td->td_sched = (struct td_sched *)&td[1]; sched_newthread(td); umtx_thread_init(td); + td->td_kstack = 0; return (0); } /* * Tear down type-stable parts of a thread (just before being discarded). */ static void thread_fini(void *mem, int size) { struct thread *td; td = (struct thread *)mem; turnstile_free(td->td_turnstile); sleepq_free(td->td_sleepqueue); umtx_thread_fini(td); - vm_thread_dispose(td); } /* * For a newly created process, * link up all the structures and its initial threads etc. * called from: * {arch}/{arch}/machdep.c ia64_init(), init386() etc. * proc_dtor() (should go away) * proc_init() */ void +proc_linkup0(struct proc *p, struct thread *td) +{ + TAILQ_INIT(&p->p_threads); /* all threads in proc */ + proc_linkup(p, td); +} + +void proc_linkup(struct proc *p, struct thread *td) { - TAILQ_INIT(&p->p_threads); /* all threads in proc */ #ifdef KSE TAILQ_INIT(&p->p_upcalls); /* upcall list */ #endif sigqueue_init(&p->p_sigqueue, p); p->p_ksi = ksiginfo_alloc(1); if (p->p_ksi != NULL) { /* XXX p_ksi may be null if ksiginfo zone is not ready */ p->p_ksi->ksi_flags = KSI_EXT | KSI_INS; } LIST_INIT(&p->p_mqnotifier); p->p_numthreads = 0; thread_link(td, p); } /* * Initialize global thread allocation resources. */ void threadinit(void) { mtx_init(&tid_lock, "TID lock", NULL, MTX_DEF); tid_unrhdr = new_unrhdr(PID_MAX + 1, INT_MAX, &tid_lock); thread_zone = uma_zcreate("THREAD", sched_sizeof_thread(), thread_ctor, thread_dtor, thread_init, thread_fini, 16 - 1, 0); #ifdef KSE kseinit(); /* set up kse specific stuff e.g. upcall zone*/ #endif } /* * Place an unused thread on the zombie list. * Use the slpq as that must be unused by now. */ void thread_zombie(struct thread *td) { mtx_lock_spin(&zombie_lock); TAILQ_INSERT_HEAD(&zombie_threads, td, td_slpq); mtx_unlock_spin(&zombie_lock); } /* * Release a thread that has exited after cpu_throw(). */ void thread_stash(struct thread *td) { atomic_subtract_rel_int(&td->td_proc->p_exitthreads, 1); thread_zombie(td); } /* * Reap zombie kse resource. */ void thread_reap(void) { struct thread *td_first, *td_next; /* * Don't even bother to lock if none at this instant, * we really don't care about the next instant.. */ if (!TAILQ_EMPTY(&zombie_threads)) { mtx_lock_spin(&zombie_lock); td_first = TAILQ_FIRST(&zombie_threads); if (td_first) TAILQ_INIT(&zombie_threads); mtx_unlock_spin(&zombie_lock); while (td_first) { td_next = TAILQ_NEXT(td_first, td_slpq); if (td_first->td_ucred) crfree(td_first->td_ucred); thread_free(td_first); td_first = td_next; } } #ifdef KSE upcall_reap(); #endif } /* * Allocate a thread. */ struct thread * thread_alloc(void) { + struct thread *td; thread_reap(); /* check if any zombies to get */ - return (uma_zalloc(thread_zone, M_WAITOK)); + + td = (struct thread *)uma_zalloc(thread_zone, M_WAITOK); + KASSERT(td->td_kstack == 0, ("thread_alloc got thread with kstack")); + if (!vm_thread_new(td, 0)) { + uma_zfree(thread_zone, td); + return (NULL); + } + cpu_thread_setup(td); + return (td); } /* * Deallocate a thread. */ void thread_free(struct thread *td) { cpu_thread_clean(td); + if (td->td_altkstack != 0) + vm_thread_dispose_altkstack(td); + if (td->td_kstack != 0) + vm_thread_dispose(td); uma_zfree(thread_zone, td); } /* * Discard the current thread and exit from its context. * Always called with scheduler locked. * * Because we can't free a thread while we're operating under its context, * push the current thread into our CPU's deadthread holder. This means * we needn't worry about someone else grabbing our context before we * do a cpu_throw(). This may not be needed now as we are under schedlock. * Maybe we can just do a thread_stash() as thr_exit1 does. */ /* XXX * libthr expects its thread exit to return for the last * thread, meaning that the program is back to non-threaded * mode I guess. Because we do this (cpu_throw) unconditionally * here, they have their own version of it. (thr_exit1()) * that doesn't do it all if this was the last thread. * It is also called from thread_suspend_check(). * Of course in the end, they end up coming here through exit1 * anyhow.. After fixing 'thr' to play by the rules we should be able * to merge these two functions together. * * called from: * exit1() * kse_exit() * thr_exit() * ifdef KSE * thread_user_enter() * thread_userret() * endif * thread_suspend_check() */ void thread_exit(void) { uint64_t new_switchtime; struct thread *td; struct thread *td2; struct proc *p; td = curthread; p = td->td_proc; PROC_SLOCK_ASSERT(p, MA_OWNED); mtx_assert(&Giant, MA_NOTOWNED); PROC_LOCK_ASSERT(p, MA_OWNED); KASSERT(p != NULL, ("thread exiting without a process")); CTR3(KTR_PROC, "thread_exit: thread %p (pid %ld, %s)", td, (long)p->p_pid, p->p_comm); KASSERT(TAILQ_EMPTY(&td->td_sigqueue.sq_list), ("signal pending")); #ifdef AUDIT AUDIT_SYSCALL_EXIT(0, td); #endif #ifdef KSE if (td->td_standin != NULL) { /* * Note that we don't need to free the cred here as it * is done in thread_reap(). */ thread_zombie(td->td_standin); td->td_standin = NULL; } #endif umtx_thread_exit(td); /* * drop FPU & debug register state storage, or any other * architecture specific resources that * would not be on a new untouched process. */ cpu_thread_exit(td); /* XXXSMP */ /* Do the same timestamp bookkeeping that mi_switch() would do. */ new_switchtime = cpu_ticks(); p->p_rux.rux_runtime += (new_switchtime - PCPU_GET(switchtime)); PCPU_SET(switchtime, new_switchtime); PCPU_SET(switchticks, ticks); PCPU_INC(cnt.v_swtch); /* Save our resource usage in our process. */ td->td_ru.ru_nvcsw++; rucollect(&p->p_ru, &td->td_ru); /* * The last thread is left attached to the process * So that the whole bundle gets recycled. Skip * all this stuff if we never had threads. * EXIT clears all sign of other threads when * it goes to single threading, so the last thread always * takes the short path. */ if (p->p_flag & P_HADTHREADS) { if (p->p_numthreads > 1) { thread_lock(td); #ifdef KSE kse_unlink(td); #else thread_unlink(td); #endif thread_unlock(td); td2 = FIRST_THREAD_IN_PROC(p); sched_exit_thread(td2, td); /* * The test below is NOT true if we are the * sole exiting thread. P_STOPPED_SNGL is unset * in exit1() after it is the only survivor. */ if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) { if (p->p_numthreads == p->p_suspcount) { thread_lock(p->p_singlethread); thread_unsuspend_one(p->p_singlethread); thread_unlock(p->p_singlethread); } } atomic_add_int(&td->td_proc->p_exitthreads, 1); PCPU_SET(deadthread, td); } else { /* * The last thread is exiting.. but not through exit() * what should we do? * Theoretically this can't happen * exit1() - clears threading flags before coming here * kse_exit() - treats last thread specially * thr_exit() - treats last thread specially * ifdef KSE * thread_user_enter() - only if more exist * thread_userret() - only if more exist * endif * thread_suspend_check() - only if more exist */ panic ("thread_exit: Last thread exiting on its own"); } } PROC_UNLOCK(p); thread_lock(td); /* Save our tick information with both the thread and proc locked */ ruxagg(&p->p_rux, td); PROC_SUNLOCK(p); td->td_state = TDS_INACTIVE; CTR1(KTR_PROC, "thread_exit: cpu_throw() thread %p", td); sched_throw(td); panic("I'm a teapot!"); /* NOTREACHED */ } /* * Do any thread specific cleanups that may be needed in wait() * called with Giant, proc and schedlock not held. */ void thread_wait(struct proc *p) { struct thread *td; mtx_assert(&Giant, MA_NOTOWNED); KASSERT((p->p_numthreads == 1), ("Multiple threads in wait1()")); td = FIRST_THREAD_IN_PROC(p); #ifdef KSE if (td->td_standin != NULL) { if (td->td_standin->td_ucred != NULL) { crfree(td->td_standin->td_ucred); td->td_standin->td_ucred = NULL; } thread_free(td->td_standin); td->td_standin = NULL; } #endif /* Lock the last thread so we spin until it exits cpu_throw(). */ thread_lock(td); thread_unlock(td); /* Wait for any remaining threads to exit cpu_throw(). */ while (p->p_exitthreads) sched_relinquish(curthread); cpu_thread_clean(td); crfree(td->td_ucred); thread_reap(); /* check for zombie threads etc. */ } /* * Link a thread to a process. * set up anything that needs to be initialized for it to * be used by the process. * * Note that we do not link to the proc's ucred here. * The thread is linked as if running but no KSE assigned. * Called from: * proc_linkup() * thread_schedule_upcall() * thr_create() */ void thread_link(struct thread *td, struct proc *p) { /* * XXX This can't be enabled because it's called for proc0 before * it's spinlock has been created. * PROC_SLOCK_ASSERT(p, MA_OWNED); */ td->td_state = TDS_INACTIVE; td->td_proc = p; td->td_flags = TDF_INMEM; LIST_INIT(&td->td_contested); sigqueue_init(&td->td_sigqueue, p); callout_init(&td->td_slpcallout, CALLOUT_MPSAFE); TAILQ_INSERT_HEAD(&p->p_threads, td, td_plist); p->p_numthreads++; } /* * Convert a process with one thread to an unthreaded process. * Called from: * thread_single(exit) (called from execve and exit) * kse_exit() XXX may need cleaning up wrt KSE stuff */ void thread_unthread(struct thread *td) { struct proc *p = td->td_proc; KASSERT((p->p_numthreads == 1), ("Unthreading with >1 threads")); #ifdef KSE thread_lock(td); upcall_remove(td); thread_unlock(td); p->p_flag &= ~(P_SA|P_HADTHREADS); td->td_mailbox = NULL; td->td_pflags &= ~(TDP_SA | TDP_CAN_UNBIND); if (td->td_standin != NULL) { thread_zombie(td->td_standin); td->td_standin = NULL; } #else p->p_flag &= ~P_HADTHREADS; #endif } /* * Called from: * thread_exit() */ void thread_unlink(struct thread *td) { struct proc *p = td->td_proc; PROC_SLOCK_ASSERT(p, MA_OWNED); TAILQ_REMOVE(&p->p_threads, td, td_plist); p->p_numthreads--; /* could clear a few other things here */ /* Must NOT clear links to proc! */ } /* * Enforce single-threading. * * Returns 1 if the caller must abort (another thread is waiting to * exit the process or similar). Process is locked! * Returns 0 when you are successfully the only thread running. * A process has successfully single threaded in the suspend mode when * There are no threads in user mode. Threads in the kernel must be * allowed to continue until they get to the user boundary. They may even * copy out their return values and data before suspending. They may however be * accelerated in reaching the user boundary as we will wake up * any sleeping threads that are interruptable. (PCATCH). */ int thread_single(int mode) { struct thread *td; struct thread *td2; struct proc *p; int remaining; td = curthread; p = td->td_proc; mtx_assert(&Giant, MA_NOTOWNED); PROC_LOCK_ASSERT(p, MA_OWNED); KASSERT((td != NULL), ("curthread is NULL")); if ((p->p_flag & P_HADTHREADS) == 0) return (0); /* Is someone already single threading? */ if (p->p_singlethread != NULL && p->p_singlethread != td) return (1); if (mode == SINGLE_EXIT) { p->p_flag |= P_SINGLE_EXIT; p->p_flag &= ~P_SINGLE_BOUNDARY; } else { p->p_flag &= ~P_SINGLE_EXIT; if (mode == SINGLE_BOUNDARY) p->p_flag |= P_SINGLE_BOUNDARY; else p->p_flag &= ~P_SINGLE_BOUNDARY; } p->p_flag |= P_STOPPED_SINGLE; PROC_SLOCK(p); p->p_singlethread = td; if (mode == SINGLE_EXIT) remaining = p->p_numthreads; else if (mode == SINGLE_BOUNDARY) remaining = p->p_numthreads - p->p_boundary_count; else remaining = p->p_numthreads - p->p_suspcount; while (remaining != 1) { if (P_SHOULDSTOP(p) != P_STOPPED_SINGLE) goto stopme; FOREACH_THREAD_IN_PROC(p, td2) { if (td2 == td) continue; thread_lock(td2); td2->td_flags |= TDF_ASTPENDING; if (TD_IS_INHIBITED(td2)) { switch (mode) { case SINGLE_EXIT: if (td->td_flags & TDF_DBSUSPEND) td->td_flags &= ~TDF_DBSUSPEND; if (TD_IS_SUSPENDED(td2)) thread_unsuspend_one(td2); if (TD_ON_SLEEPQ(td2) && (td2->td_flags & TDF_SINTR)) sleepq_abort(td2, EINTR); break; case SINGLE_BOUNDARY: break; default: if (TD_IS_SUSPENDED(td2)) { thread_unlock(td2); continue; } /* * maybe other inhibited states too? */ if ((td2->td_flags & TDF_SINTR) && (td2->td_inhibitors & (TDI_SLEEPING | TDI_SWAPPED))) thread_suspend_one(td2); break; } } #ifdef SMP else if (TD_IS_RUNNING(td2) && td != td2) { forward_signal(td2); } #endif thread_unlock(td2); } if (mode == SINGLE_EXIT) remaining = p->p_numthreads; else if (mode == SINGLE_BOUNDARY) remaining = p->p_numthreads - p->p_boundary_count; else remaining = p->p_numthreads - p->p_suspcount; /* * Maybe we suspended some threads.. was it enough? */ if (remaining == 1) break; stopme: /* * Wake us up when everyone else has suspended. * In the mean time we suspend as well. */ thread_suspend_switch(td); if (mode == SINGLE_EXIT) remaining = p->p_numthreads; else if (mode == SINGLE_BOUNDARY) remaining = p->p_numthreads - p->p_boundary_count; else remaining = p->p_numthreads - p->p_suspcount; } if (mode == SINGLE_EXIT) { /* * We have gotten rid of all the other threads and we * are about to either exit or exec. In either case, * we try our utmost to revert to being a non-threaded * process. */ p->p_singlethread = NULL; p->p_flag &= ~(P_STOPPED_SINGLE | P_SINGLE_EXIT); thread_unthread(td); } PROC_SUNLOCK(p); return (0); } /* * Called in from locations that can safely check to see * whether we have to suspend or at least throttle for a * single-thread event (e.g. fork). * * Such locations include userret(). * If the "return_instead" argument is non zero, the thread must be able to * accept 0 (caller may continue), or 1 (caller must abort) as a result. * * The 'return_instead' argument tells the function if it may do a * thread_exit() or suspend, or whether the caller must abort and back * out instead. * * If the thread that set the single_threading request has set the * P_SINGLE_EXIT bit in the process flags then this call will never return * if 'return_instead' is false, but will exit. * * P_SINGLE_EXIT | return_instead == 0| return_instead != 0 *---------------+--------------------+--------------------- * 0 | returns 0 | returns 0 or 1 * | when ST ends | immediatly *---------------+--------------------+--------------------- * 1 | thread exits | returns 1 * | | immediatly * 0 = thread_exit() or suspension ok, * other = return error instead of stopping the thread. * * While a full suspension is under effect, even a single threading * thread would be suspended if it made this call (but it shouldn't). * This call should only be made from places where * thread_exit() would be safe as that may be the outcome unless * return_instead is set. */ int thread_suspend_check(int return_instead) { struct thread *td; struct proc *p; td = curthread; p = td->td_proc; mtx_assert(&Giant, MA_NOTOWNED); PROC_LOCK_ASSERT(p, MA_OWNED); while (P_SHOULDSTOP(p) || ((p->p_flag & P_TRACED) && (td->td_flags & TDF_DBSUSPEND))) { if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) { KASSERT(p->p_singlethread != NULL, ("singlethread not set")); /* * The only suspension in action is a * single-threading. Single threader need not stop. * XXX Should be safe to access unlocked * as it can only be set to be true by us. */ if (p->p_singlethread == td) return (0); /* Exempt from stopping. */ } if ((p->p_flag & P_SINGLE_EXIT) && return_instead) return (EINTR); /* Should we goto user boundary if we didn't come from there? */ if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE && (p->p_flag & P_SINGLE_BOUNDARY) && return_instead) return (ERESTART); /* If thread will exit, flush its pending signals */ if ((p->p_flag & P_SINGLE_EXIT) && (p->p_singlethread != td)) sigqueue_flush(&td->td_sigqueue); PROC_SLOCK(p); thread_stopped(p); /* * If the process is waiting for us to exit, * this thread should just suicide. * Assumes that P_SINGLE_EXIT implies P_STOPPED_SINGLE. */ if ((p->p_flag & P_SINGLE_EXIT) && (p->p_singlethread != td)) thread_exit(); if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) { if (p->p_numthreads == p->p_suspcount + 1) { thread_lock(p->p_singlethread); thread_unsuspend_one(p->p_singlethread); thread_unlock(p->p_singlethread); } } PROC_UNLOCK(p); thread_lock(td); /* * When a thread suspends, it just * gets taken off all queues. */ thread_suspend_one(td); if (return_instead == 0) { p->p_boundary_count++; td->td_flags |= TDF_BOUNDARY; } PROC_SUNLOCK(p); mi_switch(SW_INVOL, NULL); if (return_instead == 0) td->td_flags &= ~TDF_BOUNDARY; thread_unlock(td); PROC_LOCK(p); if (return_instead == 0) p->p_boundary_count--; } return (0); } void thread_suspend_switch(struct thread *td) { struct proc *p; p = td->td_proc; KASSERT(!TD_IS_SUSPENDED(td), ("already suspended")); PROC_LOCK_ASSERT(p, MA_OWNED); PROC_SLOCK_ASSERT(p, MA_OWNED); /* * We implement thread_suspend_one in stages here to avoid * dropping the proc lock while the thread lock is owned. */ thread_stopped(p); p->p_suspcount++; PROC_UNLOCK(p); thread_lock(td); sched_sleep(td); TD_SET_SUSPENDED(td); PROC_SUNLOCK(p); DROP_GIANT(); mi_switch(SW_VOL, NULL); thread_unlock(td); PICKUP_GIANT(); PROC_LOCK(p); PROC_SLOCK(p); } void thread_suspend_one(struct thread *td) { struct proc *p = td->td_proc; PROC_SLOCK_ASSERT(p, MA_OWNED); THREAD_LOCK_ASSERT(td, MA_OWNED); KASSERT(!TD_IS_SUSPENDED(td), ("already suspended")); p->p_suspcount++; sched_sleep(td); TD_SET_SUSPENDED(td); } void thread_unsuspend_one(struct thread *td) { struct proc *p = td->td_proc; PROC_SLOCK_ASSERT(p, MA_OWNED); THREAD_LOCK_ASSERT(td, MA_OWNED); KASSERT(TD_IS_SUSPENDED(td), ("Thread not suspended")); TD_CLR_SUSPENDED(td); p->p_suspcount--; setrunnable(td); } /* * Allow all threads blocked by single threading to continue running. */ void thread_unsuspend(struct proc *p) { struct thread *td; PROC_LOCK_ASSERT(p, MA_OWNED); PROC_SLOCK_ASSERT(p, MA_OWNED); if (!P_SHOULDSTOP(p)) { FOREACH_THREAD_IN_PROC(p, td) { thread_lock(td); if (TD_IS_SUSPENDED(td)) { thread_unsuspend_one(td); } thread_unlock(td); } } else if ((P_SHOULDSTOP(p) == P_STOPPED_SINGLE) && (p->p_numthreads == p->p_suspcount)) { /* * Stopping everything also did the job for the single * threading request. Now we've downgraded to single-threaded, * let it continue. */ thread_lock(p->p_singlethread); thread_unsuspend_one(p->p_singlethread); thread_unlock(p->p_singlethread); } } /* * End the single threading mode.. */ void thread_single_end(void) { struct thread *td; struct proc *p; td = curthread; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); p->p_flag &= ~(P_STOPPED_SINGLE | P_SINGLE_EXIT | P_SINGLE_BOUNDARY); PROC_SLOCK(p); p->p_singlethread = NULL; /* * If there are other threads they mey now run, * unless of course there is a blanket 'stop order' * on the process. The single threader must be allowed * to continue however as this is a bad place to stop. */ if ((p->p_numthreads != 1) && (!P_SHOULDSTOP(p))) { FOREACH_THREAD_IN_PROC(p, td) { thread_lock(td); if (TD_IS_SUSPENDED(td)) { thread_unsuspend_one(td); } thread_unlock(td); } } PROC_SUNLOCK(p); } struct thread * thread_find(struct proc *p, lwpid_t tid) { struct thread *td; PROC_LOCK_ASSERT(p, MA_OWNED); PROC_SLOCK(p); FOREACH_THREAD_IN_PROC(p, td) { if (td->td_tid == tid) break; } PROC_SUNLOCK(p); return (td); } Index: head/sys/pc98/pc98/machdep.c =================================================================== --- head/sys/pc98/pc98/machdep.c (revision 173360) +++ head/sys/pc98/pc98/machdep.c (revision 173361) @@ -1,2800 +1,2800 @@ /*- * Copyright (c) 1992 Terrence R. Lambert. * Copyright (c) 1982, 1987, 1990 The Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91 */ #include __FBSDID("$FreeBSD$"); #include "opt_atalk.h" #include "opt_compat.h" #include "opt_cpu.h" #include "opt_ddb.h" #include "opt_inet.h" #include "opt_ipx.h" #include "opt_isa.h" #include "opt_kstack_pages.h" #include "opt_maxmem.h" #include "opt_msgbuf.h" #include "opt_npx.h" #include "opt_perfmon.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #ifndef KDB #error KDB must be enabled in order for DDB to work! #endif #include #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef PERFMON #include #endif #ifdef SMP #include #include #endif #ifdef DEV_ISA #include #endif /* Sanity check for __curthread() */ CTASSERT(offsetof(struct pcpu, pc_curthread) == 0); extern void init386(int first); extern void dblfault_handler(void); extern void printcpuinfo(void); /* XXX header file */ extern void finishidentcpu(void); extern void panicifcpuunsupported(void); extern void initializecpu(void); #define CS_SECURE(cs) (ISPL(cs) == SEL_UPL) #define EFL_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0) #if !defined(CPU_DISABLE_SSE) && defined(I686_CPU) #define CPU_ENABLE_SSE #endif static void cpu_startup(void *); static void fpstate_drop(struct thread *td); static void get_fpcontext(struct thread *td, mcontext_t *mcp); static int set_fpcontext(struct thread *td, const mcontext_t *mcp); #ifdef CPU_ENABLE_SSE static void set_fpregs_xmm(struct save87 *, struct savexmm *); static void fill_fpregs_xmm(struct savexmm *, struct save87 *); #endif /* CPU_ENABLE_SSE */ SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL) int need_pre_dma_flush; /* If 1, use wbinvd befor DMA transfer. */ int need_post_dma_flush; /* If 1, use invd after DMA transfer. */ #ifdef DDB extern vm_offset_t ksym_start, ksym_end; #endif int _udatasel, _ucodesel; u_int basemem; static int ispc98 = 1; SYSCTL_INT(_machdep, OID_AUTO, ispc98, CTLFLAG_RD, &ispc98, 0, ""); int cold = 1; #ifdef COMPAT_43 static void osendsig(sig_t catcher, ksiginfo_t *, sigset_t *mask); #endif #ifdef COMPAT_FREEBSD4 static void freebsd4_sendsig(sig_t catcher, ksiginfo_t *, sigset_t *mask); #endif long Maxmem = 0; long realmem = 0; /* * The number of PHYSMAP entries must be one less than the number of * PHYSSEG entries because the PHYSMAP entry that spans the largest * physical address that is accessible by ISA DMA is split into two * PHYSSEG entries. */ #define PHYSMAP_SIZE (2 * (VM_PHYSSEG_MAX - 1)) vm_paddr_t phys_avail[PHYSMAP_SIZE + 2]; vm_paddr_t dump_avail[PHYSMAP_SIZE + 2]; /* must be 2 less so 0 0 can signal end of chunks */ #define PHYS_AVAIL_ARRAY_END ((sizeof(phys_avail) / sizeof(phys_avail[0])) - 2) #define DUMP_AVAIL_ARRAY_END ((sizeof(dump_avail) / sizeof(dump_avail[0])) - 2) struct kva_md_info kmi; static struct trapframe proc0_tf; #ifndef SMP static struct pcpu __pcpu; #endif struct mtx icu_lock; struct mem_range_softc mem_range_softc; static void cpu_startup(dummy) void *dummy; { /* * Good {morning,afternoon,evening,night}. */ startrtclock(); printcpuinfo(); panicifcpuunsupported(); #ifdef PERFMON perfmon_init(); #endif printf("real memory = %ju (%ju MB)\n", ptoa((uintmax_t)Maxmem), ptoa((uintmax_t)Maxmem) / 1048576); realmem = Maxmem; /* * Display any holes after the first chunk of extended memory. */ if (bootverbose) { int indx; printf("Physical memory chunk(s):\n"); for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) { vm_paddr_t size; size = phys_avail[indx + 1] - phys_avail[indx]; printf( "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n", (uintmax_t)phys_avail[indx], (uintmax_t)phys_avail[indx + 1] - 1, (uintmax_t)size, (uintmax_t)size / PAGE_SIZE); } } vm_ksubmap_init(&kmi); printf("avail memory = %ju (%ju MB)\n", ptoa((uintmax_t)cnt.v_free_count), ptoa((uintmax_t)cnt.v_free_count) / 1048576); /* * Set up buffers, so they can be used to read disk labels. */ bufinit(); vm_pager_bufferinit(); cpu_setregs(); } /* * Send an interrupt to process. * * Stack is set up to allow sigcode stored * at top to call routine, followed by kcall * to sigreturn routine below. After sigreturn * resets the signal mask, the stack, and the * frame pointer, it returns to the user * specified pc, psl. */ #ifdef COMPAT_43 static void osendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) { struct osigframe sf, *fp; struct proc *p; struct thread *td; struct sigacts *psp; struct trapframe *regs; int sig; int oonstack; td = curthread; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); sig = ksi->ksi_signo; psp = p->p_sigacts; mtx_assert(&psp->ps_mtx, MA_OWNED); regs = td->td_frame; oonstack = sigonstack(regs->tf_esp); /* Allocate space for the signal handler context. */ if ((td->td_pflags & TDP_ALTSTACK) && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { fp = (struct osigframe *)(td->td_sigstk.ss_sp + td->td_sigstk.ss_size - sizeof(struct osigframe)); #if defined(COMPAT_43) td->td_sigstk.ss_flags |= SS_ONSTACK; #endif } else fp = (struct osigframe *)regs->tf_esp - 1; /* Translate the signal if appropriate. */ if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize) sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)]; /* Build the argument list for the signal handler. */ sf.sf_signum = sig; sf.sf_scp = (register_t)&fp->sf_siginfo.si_sc; if (SIGISMEMBER(psp->ps_siginfo, sig)) { /* Signal handler installed with SA_SIGINFO. */ sf.sf_arg2 = (register_t)&fp->sf_siginfo; sf.sf_siginfo.si_signo = sig; sf.sf_siginfo.si_code = ksi->ksi_code; sf.sf_ahu.sf_action = (__osiginfohandler_t *)catcher; } else { /* Old FreeBSD-style arguments. */ sf.sf_arg2 = ksi->ksi_code; sf.sf_addr = (register_t)ksi->ksi_addr; sf.sf_ahu.sf_handler = catcher; } mtx_unlock(&psp->ps_mtx); PROC_UNLOCK(p); /* Save most if not all of trap frame. */ sf.sf_siginfo.si_sc.sc_eax = regs->tf_eax; sf.sf_siginfo.si_sc.sc_ebx = regs->tf_ebx; sf.sf_siginfo.si_sc.sc_ecx = regs->tf_ecx; sf.sf_siginfo.si_sc.sc_edx = regs->tf_edx; sf.sf_siginfo.si_sc.sc_esi = regs->tf_esi; sf.sf_siginfo.si_sc.sc_edi = regs->tf_edi; sf.sf_siginfo.si_sc.sc_cs = regs->tf_cs; sf.sf_siginfo.si_sc.sc_ds = regs->tf_ds; sf.sf_siginfo.si_sc.sc_ss = regs->tf_ss; sf.sf_siginfo.si_sc.sc_es = regs->tf_es; sf.sf_siginfo.si_sc.sc_fs = regs->tf_fs; sf.sf_siginfo.si_sc.sc_gs = rgs(); sf.sf_siginfo.si_sc.sc_isp = regs->tf_isp; /* Build the signal context to be used by osigreturn(). */ sf.sf_siginfo.si_sc.sc_onstack = (oonstack) ? 1 : 0; SIG2OSIG(*mask, sf.sf_siginfo.si_sc.sc_mask); sf.sf_siginfo.si_sc.sc_sp = regs->tf_esp; sf.sf_siginfo.si_sc.sc_fp = regs->tf_ebp; sf.sf_siginfo.si_sc.sc_pc = regs->tf_eip; sf.sf_siginfo.si_sc.sc_ps = regs->tf_eflags; sf.sf_siginfo.si_sc.sc_trapno = regs->tf_trapno; sf.sf_siginfo.si_sc.sc_err = regs->tf_err; /* * If we're a vm86 process, we want to save the segment registers. * We also change eflags to be our emulated eflags, not the actual * eflags. */ if (regs->tf_eflags & PSL_VM) { /* XXX confusing names: `tf' isn't a trapframe; `regs' is. */ struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; struct vm86_kernel *vm86 = &td->td_pcb->pcb_ext->ext_vm86; sf.sf_siginfo.si_sc.sc_gs = tf->tf_vm86_gs; sf.sf_siginfo.si_sc.sc_fs = tf->tf_vm86_fs; sf.sf_siginfo.si_sc.sc_es = tf->tf_vm86_es; sf.sf_siginfo.si_sc.sc_ds = tf->tf_vm86_ds; if (vm86->vm86_has_vme == 0) sf.sf_siginfo.si_sc.sc_ps = (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) | (vm86->vm86_eflags & (PSL_VIF | PSL_VIP)); /* See sendsig() for comments. */ tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP); } /* * Copy the sigframe out to the user's stack. */ if (copyout(&sf, fp, sizeof(*fp)) != 0) { #ifdef DEBUG printf("process %ld has trashed its stack\n", (long)p->p_pid); #endif PROC_LOCK(p); sigexit(td, SIGILL); } regs->tf_esp = (int)fp; regs->tf_eip = PS_STRINGS - szosigcode; regs->tf_eflags &= ~PSL_T; regs->tf_cs = _ucodesel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _udatasel; load_gs(_udatasel); regs->tf_ss = _udatasel; PROC_LOCK(p); mtx_lock(&psp->ps_mtx); } #endif /* COMPAT_43 */ #ifdef COMPAT_FREEBSD4 static void freebsd4_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) { struct sigframe4 sf, *sfp; struct proc *p; struct thread *td; struct sigacts *psp; struct trapframe *regs; int sig; int oonstack; td = curthread; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); sig = ksi->ksi_signo; psp = p->p_sigacts; mtx_assert(&psp->ps_mtx, MA_OWNED); regs = td->td_frame; oonstack = sigonstack(regs->tf_esp); /* Save user context. */ bzero(&sf, sizeof(sf)); sf.sf_uc.uc_sigmask = *mask; sf.sf_uc.uc_stack = td->td_sigstk; sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE; sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0; sf.sf_uc.uc_mcontext.mc_gs = rgs(); bcopy(regs, &sf.sf_uc.uc_mcontext.mc_fs, sizeof(*regs)); /* Allocate space for the signal handler context. */ if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { sfp = (struct sigframe4 *)(td->td_sigstk.ss_sp + td->td_sigstk.ss_size - sizeof(struct sigframe4)); #if defined(COMPAT_43) td->td_sigstk.ss_flags |= SS_ONSTACK; #endif } else sfp = (struct sigframe4 *)regs->tf_esp - 1; /* Translate the signal if appropriate. */ if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize) sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)]; /* Build the argument list for the signal handler. */ sf.sf_signum = sig; sf.sf_ucontext = (register_t)&sfp->sf_uc; if (SIGISMEMBER(psp->ps_siginfo, sig)) { /* Signal handler installed with SA_SIGINFO. */ sf.sf_siginfo = (register_t)&sfp->sf_si; sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher; /* Fill in POSIX parts */ sf.sf_si.si_signo = sig; sf.sf_si.si_code = ksi->ksi_code; sf.sf_si.si_addr = ksi->ksi_addr; } else { /* Old FreeBSD-style arguments. */ sf.sf_siginfo = ksi->ksi_code; sf.sf_addr = (register_t)ksi->ksi_addr; sf.sf_ahu.sf_handler = catcher; } mtx_unlock(&psp->ps_mtx); PROC_UNLOCK(p); /* * If we're a vm86 process, we want to save the segment registers. * We also change eflags to be our emulated eflags, not the actual * eflags. */ if (regs->tf_eflags & PSL_VM) { struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; struct vm86_kernel *vm86 = &td->td_pcb->pcb_ext->ext_vm86; sf.sf_uc.uc_mcontext.mc_gs = tf->tf_vm86_gs; sf.sf_uc.uc_mcontext.mc_fs = tf->tf_vm86_fs; sf.sf_uc.uc_mcontext.mc_es = tf->tf_vm86_es; sf.sf_uc.uc_mcontext.mc_ds = tf->tf_vm86_ds; if (vm86->vm86_has_vme == 0) sf.sf_uc.uc_mcontext.mc_eflags = (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) | (vm86->vm86_eflags & (PSL_VIF | PSL_VIP)); /* * Clear PSL_NT to inhibit T_TSSFLT faults on return from * syscalls made by the signal handler. This just avoids * wasting time for our lazy fixup of such faults. PSL_NT * does nothing in vm86 mode, but vm86 programs can set it * almost legitimately in probes for old cpu types. */ tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP); } /* * Copy the sigframe out to the user's stack. */ if (copyout(&sf, sfp, sizeof(*sfp)) != 0) { #ifdef DEBUG printf("process %ld has trashed its stack\n", (long)p->p_pid); #endif PROC_LOCK(p); sigexit(td, SIGILL); } regs->tf_esp = (int)sfp; regs->tf_eip = PS_STRINGS - szfreebsd4_sigcode; regs->tf_eflags &= ~PSL_T; regs->tf_cs = _ucodesel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _udatasel; regs->tf_ss = _udatasel; PROC_LOCK(p); mtx_lock(&psp->ps_mtx); } #endif /* COMPAT_FREEBSD4 */ void sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) { struct sigframe sf, *sfp; struct proc *p; struct thread *td; struct sigacts *psp; char *sp; struct trapframe *regs; int sig; int oonstack; td = curthread; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); sig = ksi->ksi_signo; psp = p->p_sigacts; mtx_assert(&psp->ps_mtx, MA_OWNED); #ifdef COMPAT_FREEBSD4 if (SIGISMEMBER(psp->ps_freebsd4, sig)) { freebsd4_sendsig(catcher, ksi, mask); return; } #endif #ifdef COMPAT_43 if (SIGISMEMBER(psp->ps_osigset, sig)) { osendsig(catcher, ksi, mask); return; } #endif regs = td->td_frame; oonstack = sigonstack(regs->tf_esp); /* Save user context. */ bzero(&sf, sizeof(sf)); sf.sf_uc.uc_sigmask = *mask; sf.sf_uc.uc_stack = td->td_sigstk; sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE; sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0; sf.sf_uc.uc_mcontext.mc_gs = rgs(); bcopy(regs, &sf.sf_uc.uc_mcontext.mc_fs, sizeof(*regs)); sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); /* magic */ get_fpcontext(td, &sf.sf_uc.uc_mcontext); fpstate_drop(td); /* Allocate space for the signal handler context. */ if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { sp = td->td_sigstk.ss_sp + td->td_sigstk.ss_size - sizeof(struct sigframe); #if defined(COMPAT_43) td->td_sigstk.ss_flags |= SS_ONSTACK; #endif } else sp = (char *)regs->tf_esp - sizeof(struct sigframe); /* Align to 16 bytes. */ sfp = (struct sigframe *)((unsigned int)sp & ~0xF); /* Translate the signal if appropriate. */ if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize) sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)]; /* Build the argument list for the signal handler. */ sf.sf_signum = sig; sf.sf_ucontext = (register_t)&sfp->sf_uc; if (SIGISMEMBER(psp->ps_siginfo, sig)) { /* Signal handler installed with SA_SIGINFO. */ sf.sf_siginfo = (register_t)&sfp->sf_si; sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher; /* Fill in POSIX parts */ sf.sf_si = ksi->ksi_info; sf.sf_si.si_signo = sig; /* maybe a translated signal */ } else { /* Old FreeBSD-style arguments. */ sf.sf_siginfo = ksi->ksi_code; sf.sf_addr = (register_t)ksi->ksi_addr; sf.sf_ahu.sf_handler = catcher; } mtx_unlock(&psp->ps_mtx); PROC_UNLOCK(p); /* * If we're a vm86 process, we want to save the segment registers. * We also change eflags to be our emulated eflags, not the actual * eflags. */ if (regs->tf_eflags & PSL_VM) { struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; struct vm86_kernel *vm86 = &td->td_pcb->pcb_ext->ext_vm86; sf.sf_uc.uc_mcontext.mc_gs = tf->tf_vm86_gs; sf.sf_uc.uc_mcontext.mc_fs = tf->tf_vm86_fs; sf.sf_uc.uc_mcontext.mc_es = tf->tf_vm86_es; sf.sf_uc.uc_mcontext.mc_ds = tf->tf_vm86_ds; if (vm86->vm86_has_vme == 0) sf.sf_uc.uc_mcontext.mc_eflags = (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) | (vm86->vm86_eflags & (PSL_VIF | PSL_VIP)); /* * Clear PSL_NT to inhibit T_TSSFLT faults on return from * syscalls made by the signal handler. This just avoids * wasting time for our lazy fixup of such faults. PSL_NT * does nothing in vm86 mode, but vm86 programs can set it * almost legitimately in probes for old cpu types. */ tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP); } /* * Copy the sigframe out to the user's stack. */ if (copyout(&sf, sfp, sizeof(*sfp)) != 0) { #ifdef DEBUG printf("process %ld has trashed its stack\n", (long)p->p_pid); #endif PROC_LOCK(p); sigexit(td, SIGILL); } regs->tf_esp = (int)sfp; regs->tf_eip = PS_STRINGS - *(p->p_sysent->sv_szsigcode); regs->tf_eflags &= ~PSL_T; regs->tf_cs = _ucodesel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _udatasel; regs->tf_ss = _udatasel; PROC_LOCK(p); mtx_lock(&psp->ps_mtx); } /* * System call to cleanup state after a signal * has been taken. Reset signal mask and * stack state from context left by sendsig (above). * Return to previous pc and psl as specified by * context left by sendsig. Check carefully to * make sure that the user has not modified the * state to gain improper privileges. * * MPSAFE */ #ifdef COMPAT_43 int osigreturn(td, uap) struct thread *td; struct osigreturn_args /* { struct osigcontext *sigcntxp; } */ *uap; { struct osigcontext sc; struct trapframe *regs; struct osigcontext *scp; struct proc *p = td->td_proc; int eflags, error; ksiginfo_t ksi; regs = td->td_frame; error = copyin(uap->sigcntxp, &sc, sizeof(sc)); if (error != 0) return (error); scp = ≻ eflags = scp->sc_ps; if (eflags & PSL_VM) { struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; struct vm86_kernel *vm86; /* * if pcb_ext == 0 or vm86_inited == 0, the user hasn't * set up the vm86 area, and we can't enter vm86 mode. */ if (td->td_pcb->pcb_ext == 0) return (EINVAL); vm86 = &td->td_pcb->pcb_ext->ext_vm86; if (vm86->vm86_inited == 0) return (EINVAL); /* Go back to user mode if both flags are set. */ if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) { ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; ksi.ksi_addr = (void *)regs->tf_eip; trapsignal(td, &ksi); } if (vm86->vm86_has_vme) { eflags = (tf->tf_eflags & ~VME_USERCHANGE) | (eflags & VME_USERCHANGE) | PSL_VM; } else { vm86->vm86_eflags = eflags; /* save VIF, VIP */ eflags = (tf->tf_eflags & ~VM_USERCHANGE) | (eflags & VM_USERCHANGE) | PSL_VM; } tf->tf_vm86_ds = scp->sc_ds; tf->tf_vm86_es = scp->sc_es; tf->tf_vm86_fs = scp->sc_fs; tf->tf_vm86_gs = scp->sc_gs; tf->tf_ds = _udatasel; tf->tf_es = _udatasel; tf->tf_fs = _udatasel; } else { /* * Don't allow users to change privileged or reserved flags. */ /* * XXX do allow users to change the privileged flag PSL_RF. * The cpu sets PSL_RF in tf_eflags for faults. Debuggers * should sometimes set it there too. tf_eflags is kept in * the signal context during signal handling and there is no * other place to remember it, so the PSL_RF bit may be * corrupted by the signal handler without us knowing. * Corruption of the PSL_RF bit at worst causes one more or * one less debugger trap, so allowing it is fairly harmless. */ if (!EFL_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF)) { return (EINVAL); } /* * Don't allow users to load a valid privileged %cs. Let the * hardware check for invalid selectors, excess privilege in * other selectors, invalid %eip's and invalid %esp's. */ if (!CS_SECURE(scp->sc_cs)) { ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; ksi.ksi_trapno = T_PROTFLT; ksi.ksi_addr = (void *)regs->tf_eip; trapsignal(td, &ksi); return (EINVAL); } regs->tf_ds = scp->sc_ds; regs->tf_es = scp->sc_es; regs->tf_fs = scp->sc_fs; } /* Restore remaining registers. */ regs->tf_eax = scp->sc_eax; regs->tf_ebx = scp->sc_ebx; regs->tf_ecx = scp->sc_ecx; regs->tf_edx = scp->sc_edx; regs->tf_esi = scp->sc_esi; regs->tf_edi = scp->sc_edi; regs->tf_cs = scp->sc_cs; regs->tf_ss = scp->sc_ss; regs->tf_isp = scp->sc_isp; regs->tf_ebp = scp->sc_fp; regs->tf_esp = scp->sc_sp; regs->tf_eip = scp->sc_pc; regs->tf_eflags = eflags; PROC_LOCK(p); #if defined(COMPAT_43) if (scp->sc_onstack & 1) td->td_sigstk.ss_flags |= SS_ONSTACK; else td->td_sigstk.ss_flags &= ~SS_ONSTACK; #endif SIGSETOLD(td->td_sigmask, scp->sc_mask); SIG_CANTMASK(td->td_sigmask); signotify(td); PROC_UNLOCK(p); return (EJUSTRETURN); } #endif /* COMPAT_43 */ #ifdef COMPAT_FREEBSD4 /* * MPSAFE */ int freebsd4_sigreturn(td, uap) struct thread *td; struct freebsd4_sigreturn_args /* { const ucontext4 *sigcntxp; } */ *uap; { struct ucontext4 uc; struct proc *p = td->td_proc; struct trapframe *regs; const struct ucontext4 *ucp; int cs, eflags, error; ksiginfo_t ksi; error = copyin(uap->sigcntxp, &uc, sizeof(uc)); if (error != 0) return (error); ucp = &uc; regs = td->td_frame; eflags = ucp->uc_mcontext.mc_eflags; if (eflags & PSL_VM) { struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; struct vm86_kernel *vm86; /* * if pcb_ext == 0 or vm86_inited == 0, the user hasn't * set up the vm86 area, and we can't enter vm86 mode. */ if (td->td_pcb->pcb_ext == 0) return (EINVAL); vm86 = &td->td_pcb->pcb_ext->ext_vm86; if (vm86->vm86_inited == 0) return (EINVAL); /* Go back to user mode if both flags are set. */ if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) { ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; ksi.ksi_addr = (void *)regs->tf_eip; trapsignal(td, &ksi); } if (vm86->vm86_has_vme) { eflags = (tf->tf_eflags & ~VME_USERCHANGE) | (eflags & VME_USERCHANGE) | PSL_VM; } else { vm86->vm86_eflags = eflags; /* save VIF, VIP */ eflags = (tf->tf_eflags & ~VM_USERCHANGE) | (eflags & VM_USERCHANGE) | PSL_VM; } bcopy(&ucp->uc_mcontext.mc_fs, tf, sizeof(struct trapframe)); tf->tf_eflags = eflags; tf->tf_vm86_ds = tf->tf_ds; tf->tf_vm86_es = tf->tf_es; tf->tf_vm86_fs = tf->tf_fs; tf->tf_vm86_gs = ucp->uc_mcontext.mc_gs; tf->tf_ds = _udatasel; tf->tf_es = _udatasel; tf->tf_fs = _udatasel; } else { /* * Don't allow users to change privileged or reserved flags. */ /* * XXX do allow users to change the privileged flag PSL_RF. * The cpu sets PSL_RF in tf_eflags for faults. Debuggers * should sometimes set it there too. tf_eflags is kept in * the signal context during signal handling and there is no * other place to remember it, so the PSL_RF bit may be * corrupted by the signal handler without us knowing. * Corruption of the PSL_RF bit at worst causes one more or * one less debugger trap, so allowing it is fairly harmless. */ if (!EFL_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF)) { printf("freebsd4_sigreturn: eflags = 0x%x\n", eflags); return (EINVAL); } /* * Don't allow users to load a valid privileged %cs. Let the * hardware check for invalid selectors, excess privilege in * other selectors, invalid %eip's and invalid %esp's. */ cs = ucp->uc_mcontext.mc_cs; if (!CS_SECURE(cs)) { printf("freebsd4_sigreturn: cs = 0x%x\n", cs); ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; ksi.ksi_trapno = T_PROTFLT; ksi.ksi_addr = (void *)regs->tf_eip; trapsignal(td, &ksi); return (EINVAL); } bcopy(&ucp->uc_mcontext.mc_fs, regs, sizeof(*regs)); } PROC_LOCK(p); #if defined(COMPAT_43) if (ucp->uc_mcontext.mc_onstack & 1) td->td_sigstk.ss_flags |= SS_ONSTACK; else td->td_sigstk.ss_flags &= ~SS_ONSTACK; #endif td->td_sigmask = ucp->uc_sigmask; SIG_CANTMASK(td->td_sigmask); signotify(td); PROC_UNLOCK(p); return (EJUSTRETURN); } #endif /* COMPAT_FREEBSD4 */ /* * MPSAFE */ int sigreturn(td, uap) struct thread *td; struct sigreturn_args /* { const struct __ucontext *sigcntxp; } */ *uap; { ucontext_t uc; struct proc *p = td->td_proc; struct trapframe *regs; const ucontext_t *ucp; int cs, eflags, error, ret; ksiginfo_t ksi; error = copyin(uap->sigcntxp, &uc, sizeof(uc)); if (error != 0) return (error); ucp = &uc; regs = td->td_frame; eflags = ucp->uc_mcontext.mc_eflags; if (eflags & PSL_VM) { struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; struct vm86_kernel *vm86; /* * if pcb_ext == 0 or vm86_inited == 0, the user hasn't * set up the vm86 area, and we can't enter vm86 mode. */ if (td->td_pcb->pcb_ext == 0) return (EINVAL); vm86 = &td->td_pcb->pcb_ext->ext_vm86; if (vm86->vm86_inited == 0) return (EINVAL); /* Go back to user mode if both flags are set. */ if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) { ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; ksi.ksi_addr = (void *)regs->tf_eip; trapsignal(td, &ksi); } if (vm86->vm86_has_vme) { eflags = (tf->tf_eflags & ~VME_USERCHANGE) | (eflags & VME_USERCHANGE) | PSL_VM; } else { vm86->vm86_eflags = eflags; /* save VIF, VIP */ eflags = (tf->tf_eflags & ~VM_USERCHANGE) | (eflags & VM_USERCHANGE) | PSL_VM; } bcopy(&ucp->uc_mcontext.mc_fs, tf, sizeof(struct trapframe)); tf->tf_eflags = eflags; tf->tf_vm86_ds = tf->tf_ds; tf->tf_vm86_es = tf->tf_es; tf->tf_vm86_fs = tf->tf_fs; tf->tf_vm86_gs = ucp->uc_mcontext.mc_gs; tf->tf_ds = _udatasel; tf->tf_es = _udatasel; tf->tf_fs = _udatasel; } else { /* * Don't allow users to change privileged or reserved flags. */ /* * XXX do allow users to change the privileged flag PSL_RF. * The cpu sets PSL_RF in tf_eflags for faults. Debuggers * should sometimes set it there too. tf_eflags is kept in * the signal context during signal handling and there is no * other place to remember it, so the PSL_RF bit may be * corrupted by the signal handler without us knowing. * Corruption of the PSL_RF bit at worst causes one more or * one less debugger trap, so allowing it is fairly harmless. */ if (!EFL_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF)) { printf("sigreturn: eflags = 0x%x\n", eflags); return (EINVAL); } /* * Don't allow users to load a valid privileged %cs. Let the * hardware check for invalid selectors, excess privilege in * other selectors, invalid %eip's and invalid %esp's. */ cs = ucp->uc_mcontext.mc_cs; if (!CS_SECURE(cs)) { printf("sigreturn: cs = 0x%x\n", cs); ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; ksi.ksi_trapno = T_PROTFLT; ksi.ksi_addr = (void *)regs->tf_eip; trapsignal(td, &ksi); return (EINVAL); } ret = set_fpcontext(td, &ucp->uc_mcontext); if (ret != 0) return (ret); bcopy(&ucp->uc_mcontext.mc_fs, regs, sizeof(*regs)); } PROC_LOCK(p); #if defined(COMPAT_43) if (ucp->uc_mcontext.mc_onstack & 1) td->td_sigstk.ss_flags |= SS_ONSTACK; else td->td_sigstk.ss_flags &= ~SS_ONSTACK; #endif td->td_sigmask = ucp->uc_sigmask; SIG_CANTMASK(td->td_sigmask); signotify(td); PROC_UNLOCK(p); return (EJUSTRETURN); } /* * Machine dependent boot() routine * * I haven't seen anything to put here yet * Possibly some stuff might be grafted back here from boot() */ void cpu_boot(int howto) { } /* Get current clock frequency for the given cpu id. */ int cpu_est_clockrate(int cpu_id, uint64_t *rate) { register_t reg; uint64_t tsc1, tsc2; if (pcpu_find(cpu_id) == NULL || rate == NULL) return (EINVAL); if (!tsc_present) return (EOPNOTSUPP); /* If we're booting, trust the rate calibrated moments ago. */ if (cold) { *rate = tsc_freq; return (0); } #ifdef SMP /* Schedule ourselves on the indicated cpu. */ thread_lock(curthread); sched_bind(curthread, cpu_id); thread_unlock(curthread); #endif /* Calibrate by measuring a short delay. */ reg = intr_disable(); tsc1 = rdtsc(); DELAY(1000); tsc2 = rdtsc(); intr_restore(reg); #ifdef SMP thread_lock(curthread); sched_unbind(curthread); thread_unlock(curthread); #endif /* * Calculate the difference in readings, convert to Mhz, and * subtract 0.5% of the total. Empirical testing has shown that * overhead in DELAY() works out to approximately this value. */ tsc2 -= tsc1; *rate = tsc2 * 1000 - tsc2 * 5; return (0); } /* * Shutdown the CPU as much as possible */ void cpu_halt(void) { for (;;) __asm__ ("hlt"); } /* * Hook to idle the CPU when possible. In the SMP case we default to * off because a halted cpu will not currently pick up a new thread in the * run queue until the next timer tick. If turned on this will result in * approximately a 4.2% loss in real time performance in buildworld tests * (but improves user and sys times oddly enough), and saves approximately * 5% in power consumption on an idle machine (tests w/2xCPU 1.1GHz P3). * * XXX we need to have a cpu mask of idle cpus and generate an IPI or * otherwise generate some sort of interrupt to wake up cpus sitting in HLT. * Then we can have our cake and eat it too. * * XXX I'm turning it on for SMP as well by default for now. It seems to * help lock contention somewhat, and this is critical for HTT. -Peter */ static int cpu_idle_hlt = 1; TUNABLE_INT("machdep.cpu_idle_hlt", &cpu_idle_hlt); SYSCTL_INT(_machdep, OID_AUTO, cpu_idle_hlt, CTLFLAG_RW, &cpu_idle_hlt, 0, "Idle loop HLT enable"); static void cpu_idle_default(void) { /* * we must absolutely guarentee that hlt is the * absolute next instruction after sti or we * introduce a timing window. */ __asm __volatile("sti; hlt"); } /* * Note that we have to be careful here to avoid a race between checking * sched_runnable() and actually halting. If we don't do this, we may waste * the time between calling hlt and the next interrupt even though there * is a runnable process. */ void cpu_idle(void) { #ifdef SMP if (mp_grab_cpu_hlt()) return; #endif if (cpu_idle_hlt) { disable_intr(); if (sched_runnable()) enable_intr(); else (*cpu_idle_hook)(); } } /* Other subsystems (e.g., ACPI) can hook this later. */ void (*cpu_idle_hook)(void) = cpu_idle_default; /* * Clear registers on exec */ void exec_setregs(td, entry, stack, ps_strings) struct thread *td; u_long entry; u_long stack; u_long ps_strings; { struct trapframe *regs = td->td_frame; struct pcb *pcb = td->td_pcb; /* Reset pc->pcb_gs and %gs before possibly invalidating it. */ pcb->pcb_gs = _udatasel; load_gs(_udatasel); mtx_lock_spin(&dt_lock); if (td->td_proc->p_md.md_ldt) user_ldt_free(td); else mtx_unlock_spin(&dt_lock); bzero((char *)regs, sizeof(struct trapframe)); regs->tf_eip = entry; regs->tf_esp = stack; regs->tf_eflags = PSL_USER | (regs->tf_eflags & PSL_T); regs->tf_ss = _udatasel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _udatasel; regs->tf_cs = _ucodesel; /* PS_STRINGS value for BSD/OS binaries. It is 0 for non-BSD/OS. */ regs->tf_ebx = ps_strings; /* * Reset the hardware debug registers if they were in use. * They won't have any meaning for the newly exec'd process. */ if (pcb->pcb_flags & PCB_DBREGS) { pcb->pcb_dr0 = 0; pcb->pcb_dr1 = 0; pcb->pcb_dr2 = 0; pcb->pcb_dr3 = 0; pcb->pcb_dr6 = 0; pcb->pcb_dr7 = 0; if (pcb == PCPU_GET(curpcb)) { /* * Clear the debug registers on the running * CPU, otherwise they will end up affecting * the next process we switch to. */ reset_dbregs(); } pcb->pcb_flags &= ~PCB_DBREGS; } /* * Initialize the math emulator (if any) for the current process. * Actually, just clear the bit that says that the emulator has * been initialized. Initialization is delayed until the process * traps to the emulator (if it is done at all) mainly because * emulators don't provide an entry point for initialization. */ td->td_pcb->pcb_flags &= ~FP_SOFTFP; /* * Drop the FP state if we hold it, so that the process gets a * clean FP state if it uses the FPU again. */ fpstate_drop(td); /* * XXX - Linux emulator * Make sure sure edx is 0x0 on entry. Linux binaries depend * on it. */ td->td_retval[1] = 0; } void cpu_setregs(void) { unsigned int cr0; cr0 = rcr0(); /* * CR0_MP, CR0_NE and CR0_TS are set for NPX (FPU) support: * * Prepare to trap all ESC (i.e., NPX) instructions and all WAIT * instructions. We must set the CR0_MP bit and use the CR0_TS * bit to control the trap, because setting the CR0_EM bit does * not cause WAIT instructions to trap. It's important to trap * WAIT instructions - otherwise the "wait" variants of no-wait * control instructions would degenerate to the "no-wait" variants * after FP context switches but work correctly otherwise. It's * particularly important to trap WAITs when there is no NPX - * otherwise the "wait" variants would always degenerate. * * Try setting CR0_NE to get correct error reporting on 486DX's. * Setting it should fail or do nothing on lesser processors. */ cr0 |= CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM; load_cr0(cr0); load_gs(_udatasel); } u_long bootdev; /* not a struct cdev *- encoding is different */ SYSCTL_ULONG(_machdep, OID_AUTO, guessed_bootdev, CTLFLAG_RD, &bootdev, 0, "Maybe the Boot device (not in struct cdev *format)"); /* * Initialize 386 and configure to run kernel */ /* * Initialize segments & interrupt table */ int _default_ldt; union descriptor gdt[NGDT * MAXCPU]; /* global descriptor table */ static struct gate_descriptor idt0[NIDT]; struct gate_descriptor *idt = &idt0[0]; /* interrupt descriptor table */ union descriptor ldt[NLDT]; /* local descriptor table */ struct region_descriptor r_gdt, r_idt; /* table descriptors */ struct mtx dt_lock; /* lock for GDT and LDT */ #if defined(I586_CPU) && !defined(NO_F00F_HACK) extern int has_f00f_bug; #endif static struct i386tss dblfault_tss; static char dblfault_stack[PAGE_SIZE]; extern vm_offset_t proc0kstack; /* * software prototypes -- in more palatable form. * * GCODE_SEL through GUDATA_SEL must be in this order for syscall/sysret * GUFS_SEL and GUGS_SEL must be in this order (swtch.s knows it) */ struct soft_segment_descriptor gdt_segs[] = { /* GNULL_SEL 0 Null Descriptor */ { 0x0, /* segment base address */ 0x0, /* length */ 0, /* segment type */ 0, /* segment descriptor priority level */ 0, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GPRIV_SEL 1 SMP Per-Processor Private Data Descriptor */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMRWA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GUFS_SEL 2 %fs Descriptor for user */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMRWA, /* segment type */ SEL_UPL, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GUGS_SEL 3 %gs Descriptor for user */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMRWA, /* segment type */ SEL_UPL, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GCODE_SEL 4 Code Descriptor for kernel */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMERA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GDATA_SEL 5 Data Descriptor for kernel */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMRWA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GUCODE_SEL 6 Code Descriptor for user */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMERA, /* segment type */ SEL_UPL, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GUDATA_SEL 7 Data Descriptor for user */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMRWA, /* segment type */ SEL_UPL, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GBIOSLOWMEM_SEL 8 BIOS access to realmode segment 0x40, must be #8 in GDT */ { 0x400, /* segment base address */ 0xfffff, /* length */ SDT_MEMRWA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GPROC0_SEL 9 Proc 0 Tss Descriptor */ { 0x0, /* segment base address */ sizeof(struct i386tss)-1,/* length */ SDT_SYS386TSS, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* unused - default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GLDT_SEL 10 LDT Descriptor */ { (int) ldt, /* segment base address */ sizeof(ldt)-1, /* length - all address space */ SDT_SYSLDT, /* segment type */ SEL_UPL, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* unused - default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GUSERLDT_SEL 11 User LDT Descriptor per process */ { (int) ldt, /* segment base address */ (512 * sizeof(union descriptor)-1), /* length */ SDT_SYSLDT, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* unused - default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GPANIC_SEL 12 Panic Tss Descriptor */ { (int) &dblfault_tss, /* segment base address */ sizeof(struct i386tss)-1,/* length - all address space */ SDT_SYS386TSS, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* unused - default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GBIOSCODE32_SEL 13 BIOS 32-bit interface (32bit Code) */ { 0, /* segment base address (overwritten) */ 0xfffff, /* length */ SDT_MEMERA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GBIOSCODE16_SEL 14 BIOS 32-bit interface (16bit Code) */ { 0, /* segment base address (overwritten) */ 0xfffff, /* length */ SDT_MEMERA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GBIOSDATA_SEL 15 BIOS 32-bit interface (Data) */ { 0, /* segment base address (overwritten) */ 0xfffff, /* length */ SDT_MEMRWA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GBIOSUTIL_SEL 16 BIOS 16-bit interface (Utility) */ { 0, /* segment base address (overwritten) */ 0xfffff, /* length */ SDT_MEMRWA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GBIOSARGS_SEL 17 BIOS 16-bit interface (Arguments) */ { 0, /* segment base address (overwritten) */ 0xfffff, /* length */ SDT_MEMRWA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GNDIS_SEL 18 NDIS Descriptor */ { 0x0, /* segment base address */ 0x0, /* length */ 0, /* segment type */ 0, /* segment descriptor priority level */ 0, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, }; static struct soft_segment_descriptor ldt_segs[] = { /* Null Descriptor - overwritten by call gate */ { 0x0, /* segment base address */ 0x0, /* length - all address space */ 0, /* segment type */ 0, /* segment descriptor priority level */ 0, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* Null Descriptor - overwritten by call gate */ { 0x0, /* segment base address */ 0x0, /* length - all address space */ 0, /* segment type */ 0, /* segment descriptor priority level */ 0, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* Null Descriptor - overwritten by call gate */ { 0x0, /* segment base address */ 0x0, /* length - all address space */ 0, /* segment type */ 0, /* segment descriptor priority level */ 0, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* Code Descriptor for user */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMERA, /* segment type */ SEL_UPL, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* Null Descriptor - overwritten by call gate */ { 0x0, /* segment base address */ 0x0, /* length - all address space */ 0, /* segment type */ 0, /* segment descriptor priority level */ 0, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* Data Descriptor for user */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMRWA, /* segment type */ SEL_UPL, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, }; void setidt(idx, func, typ, dpl, selec) int idx; inthand_t *func; int typ; int dpl; int selec; { struct gate_descriptor *ip; ip = idt + idx; ip->gd_looffset = (int)func; ip->gd_selector = selec; ip->gd_stkcpy = 0; ip->gd_xx = 0; ip->gd_type = typ; ip->gd_dpl = dpl; ip->gd_p = 1; ip->gd_hioffset = ((int)func)>>16 ; } extern inthand_t IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl), IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm), IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot), IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align), IDTVEC(xmm), IDTVEC(lcall_syscall), IDTVEC(int0x80_syscall); #ifdef DDB /* * Display the index and function name of any IDT entries that don't use * the default 'rsvd' entry point. */ DB_SHOW_COMMAND(idt, db_show_idt) { struct gate_descriptor *ip; int idx; uintptr_t func; ip = idt; for (idx = 0; idx < NIDT && !db_pager_quit; idx++) { func = (ip->gd_hioffset << 16 | ip->gd_looffset); if (func != (uintptr_t)&IDTVEC(rsvd)) { db_printf("%3d\t", idx); db_printsym(func, DB_STGY_PROC); db_printf("\n"); } ip++; } } #endif void sdtossd(sd, ssd) struct segment_descriptor *sd; struct soft_segment_descriptor *ssd; { ssd->ssd_base = (sd->sd_hibase << 24) | sd->sd_lobase; ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit; ssd->ssd_type = sd->sd_type; ssd->ssd_dpl = sd->sd_dpl; ssd->ssd_p = sd->sd_p; ssd->ssd_def32 = sd->sd_def32; ssd->ssd_gran = sd->sd_gran; } /* * Populate the (physmap) array with base/bound pairs describing the * available physical memory in the system, then test this memory and * build the phys_avail array describing the actually-available memory. * * If we cannot accurately determine the physical memory map, then use * value from the 0xE801 call, and failing that, the RTC. * * Total memory size may be set by the kernel environment variable * hw.physmem or the compile-time define MAXMEM. * * XXX first should be vm_paddr_t. */ static void getmemsize(int first) { int i, off, physmap_idx, pa_indx, da_indx; int pg_n; u_long physmem_tunable; u_int extmem, under16; vm_paddr_t pa, physmap[PHYSMAP_SIZE]; pt_entry_t *pte; quad_t dcons_addr, dcons_size; bzero(physmap, sizeof(physmap)); /* XXX - some of EPSON machines can't use PG_N */ pg_n = PG_N; if (pc98_machine_type & M_EPSON_PC98) { switch (epson_machine_id) { #ifdef WB_CACHE default: #endif case EPSON_PC486_HX: case EPSON_PC486_HG: case EPSON_PC486_HA: pg_n = 0; break; } } /* * Perform "base memory" related probes & setup */ under16 = pc98_getmemsize(&basemem, &extmem); if (basemem > 640) { printf("Preposterous BIOS basemem of %uK, truncating to 640K\n", basemem); basemem = 640; } /* * XXX if biosbasemem is now < 640, there is a `hole' * between the end of base memory and the start of * ISA memory. The hole may be empty or it may * contain BIOS code or data. Map it read/write so * that the BIOS can write to it. (Memory from 0 to * the physical end of the kernel is mapped read-only * to begin with and then parts of it are remapped. * The parts that aren't remapped form holes that * remain read-only and are unused by the kernel. * The base memory area is below the physical end of * the kernel and right now forms a read-only hole. * The part of it from PAGE_SIZE to * (trunc_page(biosbasemem * 1024) - 1) will be * remapped and used by the kernel later.) * * This code is similar to the code used in * pmap_mapdev, but since no memory needs to be * allocated we simply change the mapping. */ for (pa = trunc_page(basemem * 1024); pa < ISA_HOLE_START; pa += PAGE_SIZE) pmap_kenter(KERNBASE + pa, pa); /* * if basemem != 640, map pages r/w into vm86 page table so * that the bios can scribble on it. */ pte = (pt_entry_t *)vm86paddr; for (i = basemem / 4; i < 160; i++) pte[i] = (i << PAGE_SHIFT) | PG_V | PG_RW | PG_U; physmap[0] = 0; physmap[1] = basemem * 1024; physmap_idx = 2; physmap[physmap_idx] = 0x100000; physmap[physmap_idx + 1] = physmap[physmap_idx] + extmem * 1024; /* * Now, physmap contains a map of physical memory. */ #ifdef SMP /* make hole for AP bootstrap code */ physmap[1] = mp_bootaddress(physmap[1]); #endif /* * Maxmem isn't the "maximum memory", it's one larger than the * highest page of the physical address space. It should be * called something like "Maxphyspage". We may adjust this * based on ``hw.physmem'' and the results of the memory test. */ Maxmem = atop(physmap[physmap_idx + 1]); #ifdef MAXMEM Maxmem = MAXMEM / 4; #endif if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable)) Maxmem = atop(physmem_tunable); if (atop(physmap[physmap_idx + 1]) != Maxmem && (boothowto & RB_VERBOSE)) printf("Physical memory use set to %ldK\n", Maxmem * 4); /* * If Maxmem has been increased beyond what the system has detected, * extend the last memory segment to the new limit. */ if (atop(physmap[physmap_idx + 1]) < Maxmem) physmap[physmap_idx + 1] = ptoa((vm_paddr_t)Maxmem); /* * We need to divide chunk if Maxmem is larger than 16MB and * under 16MB area is not full of memory. * (1) system area (15-16MB region) is cut off * (2) extended memory is only over 16MB area (ex. Melco "HYPERMEMORY") */ if ((under16 != 16 * 1024) && (extmem > 15 * 1024)) { /* 15M - 16M region is cut off, so need to divide chunk */ physmap[physmap_idx + 1] = under16 * 1024; physmap_idx += 2; physmap[physmap_idx] = 0x1000000; physmap[physmap_idx + 1] = physmap[2] + extmem * 1024; } /* call pmap initialization to make new kernel address space */ pmap_bootstrap(first); /* * Size up each available chunk of physical memory. */ physmap[0] = PAGE_SIZE; /* mask off page 0 */ pa_indx = 0; da_indx = 1; phys_avail[pa_indx++] = physmap[0]; phys_avail[pa_indx] = physmap[0]; dump_avail[da_indx] = physmap[0]; pte = CMAP1; /* * Get dcons buffer address */ if (getenv_quad("dcons.addr", &dcons_addr) == 0 || getenv_quad("dcons.size", &dcons_size) == 0) dcons_addr = 0; /* * physmap is in bytes, so when converting to page boundaries, * round up the start address and round down the end address. */ for (i = 0; i <= physmap_idx; i += 2) { vm_paddr_t end; end = ptoa((vm_paddr_t)Maxmem); if (physmap[i + 1] < end) end = trunc_page(physmap[i + 1]); for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) { int tmp, page_bad, full; int *ptr = (int *)CADDR1; full = FALSE; /* * block out kernel memory as not available. */ if (pa >= KERNLOAD && pa < first) goto do_dump_avail; /* * block out dcons buffer */ if (dcons_addr > 0 && pa >= trunc_page(dcons_addr) && pa < dcons_addr + dcons_size) goto do_dump_avail; page_bad = FALSE; /* * map page into kernel: valid, read/write,non-cacheable */ *pte = pa | PG_V | PG_RW | pg_n; invltlb(); tmp = *(int *)ptr; /* * Test for alternating 1's and 0's */ *(volatile int *)ptr = 0xaaaaaaaa; if (*(volatile int *)ptr != 0xaaaaaaaa) page_bad = TRUE; /* * Test for alternating 0's and 1's */ *(volatile int *)ptr = 0x55555555; if (*(volatile int *)ptr != 0x55555555) page_bad = TRUE; /* * Test for all 1's */ *(volatile int *)ptr = 0xffffffff; if (*(volatile int *)ptr != 0xffffffff) page_bad = TRUE; /* * Test for all 0's */ *(volatile int *)ptr = 0x0; if (*(volatile int *)ptr != 0x0) page_bad = TRUE; /* * Restore original value. */ *(int *)ptr = tmp; /* * Adjust array of valid/good pages. */ if (page_bad == TRUE) continue; /* * If this good page is a continuation of the * previous set of good pages, then just increase * the end pointer. Otherwise start a new chunk. * Note that "end" points one higher than end, * making the range >= start and < end. * If we're also doing a speculative memory * test and we at or past the end, bump up Maxmem * so that we keep going. The first bad page * will terminate the loop. */ if (phys_avail[pa_indx] == pa) { phys_avail[pa_indx] += PAGE_SIZE; } else { pa_indx++; if (pa_indx == PHYS_AVAIL_ARRAY_END) { printf( "Too many holes in the physical address space, giving up\n"); pa_indx--; full = TRUE; goto do_dump_avail; } phys_avail[pa_indx++] = pa; /* start */ phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */ } physmem++; do_dump_avail: if (dump_avail[da_indx] == pa) { dump_avail[da_indx] += PAGE_SIZE; } else { da_indx++; if (da_indx == DUMP_AVAIL_ARRAY_END) { da_indx--; goto do_next; } dump_avail[da_indx++] = pa; /* start */ dump_avail[da_indx] = pa + PAGE_SIZE; /* end */ } do_next: if (full) break; } } *pte = 0; invltlb(); /* * XXX * The last chunk must contain at least one page plus the message * buffer to avoid complicating other code (message buffer address * calculation, etc.). */ while (phys_avail[pa_indx - 1] + PAGE_SIZE + round_page(MSGBUF_SIZE) >= phys_avail[pa_indx]) { physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]); phys_avail[pa_indx--] = 0; phys_avail[pa_indx--] = 0; } Maxmem = atop(phys_avail[pa_indx]); /* Trim off space for the message buffer. */ phys_avail[pa_indx] -= round_page(MSGBUF_SIZE); /* Map the message buffer. */ for (off = 0; off < round_page(MSGBUF_SIZE); off += PAGE_SIZE) pmap_kenter((vm_offset_t)msgbufp + off, phys_avail[pa_indx] + off); } void init386(first) int first; { struct gate_descriptor *gdp; int gsel_tss, metadata_missing, x; struct pcpu *pc; thread0.td_kstack = proc0kstack; thread0.td_pcb = (struct pcb *) (thread0.td_kstack + KSTACK_PAGES * PAGE_SIZE) - 1; /* * This may be done better later if it gets more high level * components in it. If so just link td->td_proc here. */ - proc_linkup(&proc0, &thread0); + proc_linkup0(&proc0, &thread0); /* * Initialize DMAC */ pc98_init_dmac(); metadata_missing = 0; if (bootinfo.bi_modulep) { preload_metadata = (caddr_t)bootinfo.bi_modulep + KERNBASE; preload_bootstrap_relocate(KERNBASE); } else { metadata_missing = 1; } if (envmode == 1) kern_envp = static_env; else if (bootinfo.bi_envp) kern_envp = (caddr_t)bootinfo.bi_envp + KERNBASE; /* Init basic tunables, hz etc */ init_param1(); /* * Make gdt memory segments. All segments cover the full 4GB * of address space and permissions are enforced at page level. */ gdt_segs[GCODE_SEL].ssd_limit = atop(0 - 1); gdt_segs[GDATA_SEL].ssd_limit = atop(0 - 1); gdt_segs[GUCODE_SEL].ssd_limit = atop(0 - 1); gdt_segs[GUDATA_SEL].ssd_limit = atop(0 - 1); gdt_segs[GUFS_SEL].ssd_limit = atop(0 - 1); gdt_segs[GUGS_SEL].ssd_limit = atop(0 - 1); #ifdef SMP pc = &SMP_prvspace[0].pcpu; #else pc = &__pcpu; #endif gdt_segs[GPRIV_SEL].ssd_limit = atop(0 - 1); gdt_segs[GPRIV_SEL].ssd_base = (int) pc; gdt_segs[GPROC0_SEL].ssd_base = (int) &pc->pc_common_tss; for (x = 0; x < NGDT; x++) ssdtosd(&gdt_segs[x], &gdt[x].sd); r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1; r_gdt.rd_base = (int) gdt; mtx_init(&dt_lock, "descriptor tables", NULL, MTX_SPIN); lgdt(&r_gdt); pcpu_init(pc, 0, sizeof(struct pcpu)); PCPU_SET(prvspace, pc); PCPU_SET(curthread, &thread0); PCPU_SET(curpcb, thread0.td_pcb); /* * Initialize mutexes. * * icu_lock: in order to allow an interrupt to occur in a critical * section, to set pcpu->ipending (etc...) properly, we * must be able to get the icu lock, so it can't be * under witness. */ mutex_init(); mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS | MTX_NOPROFILE); /* make ldt memory segments */ ldt_segs[LUCODE_SEL].ssd_limit = atop(0 - 1); ldt_segs[LUDATA_SEL].ssd_limit = atop(0 - 1); for (x = 0; x < sizeof ldt_segs / sizeof ldt_segs[0]; x++) ssdtosd(&ldt_segs[x], &ldt[x].sd); _default_ldt = GSEL(GLDT_SEL, SEL_KPL); lldt(_default_ldt); PCPU_SET(currentldt, _default_ldt); /* exceptions */ for (x = 0; x < NIDT; x++) setidt(x, &IDTVEC(rsvd), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_DE, &IDTVEC(div), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_DB, &IDTVEC(dbg), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_NMI, &IDTVEC(nmi), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_BP, &IDTVEC(bpt), SDT_SYS386IGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_OF, &IDTVEC(ofl), SDT_SYS386TGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_BR, &IDTVEC(bnd), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_UD, &IDTVEC(ill), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_NM, &IDTVEC(dna), SDT_SYS386TGT, SEL_KPL , GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_DF, 0, SDT_SYSTASKGT, SEL_KPL, GSEL(GPANIC_SEL, SEL_KPL)); setidt(IDT_FPUGP, &IDTVEC(fpusegm), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_TS, &IDTVEC(tss), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_NP, &IDTVEC(missing), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_SS, &IDTVEC(stk), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_GP, &IDTVEC(prot), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_PF, &IDTVEC(page), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_MF, &IDTVEC(fpu), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_AC, &IDTVEC(align), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_MC, &IDTVEC(mchk), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_XF, &IDTVEC(xmm), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_SYSCALL, &IDTVEC(int0x80_syscall), SDT_SYS386TGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL)); r_idt.rd_limit = sizeof(idt0) - 1; r_idt.rd_base = (int) idt; lidt(&r_idt); /* * Initialize the i8254 before the console so that console * initialization can use DELAY(). */ i8254_init(); /* * Initialize the console before we print anything out. */ cninit(); if (metadata_missing) printf("WARNING: loader(8) metadata is missing!\n"); #ifdef DEV_ISA atpic_startup(); #endif #ifdef DDB ksym_start = bootinfo.bi_symtab; ksym_end = bootinfo.bi_esymtab; #endif kdb_init(); #ifdef KDB if (boothowto & RB_KDB) kdb_enter("Boot flags requested debugger"); #endif finishidentcpu(); /* Final stage of CPU initialization */ setidt(IDT_UD, &IDTVEC(ill), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_GP, &IDTVEC(prot), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); initializecpu(); /* Initialize CPU registers */ /* make an initial tss so cpu can get interrupt stack on syscall! */ /* Note: -16 is so we can grow the trapframe if we came from vm86 */ PCPU_SET(common_tss.tss_esp0, thread0.td_kstack + KSTACK_PAGES * PAGE_SIZE - sizeof(struct pcb) - 16); PCPU_SET(common_tss.tss_ss0, GSEL(GDATA_SEL, SEL_KPL)); gsel_tss = GSEL(GPROC0_SEL, SEL_KPL); PCPU_SET(tss_gdt, &gdt[GPROC0_SEL].sd); PCPU_SET(common_tssd, *PCPU_GET(tss_gdt)); PCPU_SET(common_tss.tss_ioopt, (sizeof (struct i386tss)) << 16); ltr(gsel_tss); /* pointer to selector slot for %fs/%gs */ PCPU_SET(fsgs_gdt, &gdt[GUFS_SEL].sd); dblfault_tss.tss_esp = dblfault_tss.tss_esp0 = dblfault_tss.tss_esp1 = dblfault_tss.tss_esp2 = (int)&dblfault_stack[sizeof(dblfault_stack)]; dblfault_tss.tss_ss = dblfault_tss.tss_ss0 = dblfault_tss.tss_ss1 = dblfault_tss.tss_ss2 = GSEL(GDATA_SEL, SEL_KPL); dblfault_tss.tss_cr3 = (int)IdlePTD; dblfault_tss.tss_eip = (int)dblfault_handler; dblfault_tss.tss_eflags = PSL_KERNEL; dblfault_tss.tss_ds = dblfault_tss.tss_es = dblfault_tss.tss_gs = GSEL(GDATA_SEL, SEL_KPL); dblfault_tss.tss_fs = GSEL(GPRIV_SEL, SEL_KPL); dblfault_tss.tss_cs = GSEL(GCODE_SEL, SEL_KPL); dblfault_tss.tss_ldt = GSEL(GLDT_SEL, SEL_KPL); vm86_initialize(); getmemsize(first); init_param2(physmem); /* now running on new page tables, configured,and u/iom is accessible */ msgbufinit(msgbufp, MSGBUF_SIZE); /* make a call gate to reenter kernel with */ gdp = &ldt[LSYS5CALLS_SEL].gd; x = (int) &IDTVEC(lcall_syscall); gdp->gd_looffset = x; gdp->gd_selector = GSEL(GCODE_SEL,SEL_KPL); gdp->gd_stkcpy = 1; gdp->gd_type = SDT_SYS386CGT; gdp->gd_dpl = SEL_UPL; gdp->gd_p = 1; gdp->gd_hioffset = x >> 16; /* XXX does this work? */ /* XXX yes! */ ldt[LBSDICALLS_SEL] = ldt[LSYS5CALLS_SEL]; ldt[LSOL26CALLS_SEL] = ldt[LSYS5CALLS_SEL]; /* transfer to user mode */ _ucodesel = GSEL(GUCODE_SEL, SEL_UPL); _udatasel = GSEL(GUDATA_SEL, SEL_UPL); /* setup proc 0's pcb */ thread0.td_pcb->pcb_flags = 0; /* XXXKSE */ thread0.td_pcb->pcb_cr3 = (int)IdlePTD; thread0.td_pcb->pcb_ext = 0; thread0.td_frame = &proc0_tf; } void cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size) { } void spinlock_enter(void) { struct thread *td; td = curthread; if (td->td_md.md_spinlock_count == 0) td->td_md.md_saved_flags = intr_disable(); td->td_md.md_spinlock_count++; critical_enter(); } void spinlock_exit(void) { struct thread *td; td = curthread; critical_exit(); td->td_md.md_spinlock_count--; if (td->td_md.md_spinlock_count == 0) intr_restore(td->td_md.md_saved_flags); } #if defined(I586_CPU) && !defined(NO_F00F_HACK) static void f00f_hack(void *unused); SYSINIT(f00f_hack, SI_SUB_INTRINSIC, SI_ORDER_FIRST, f00f_hack, NULL) static void f00f_hack(void *unused) { struct gate_descriptor *new_idt; vm_offset_t tmp; if (!has_f00f_bug) return; GIANT_REQUIRED; printf("Intel Pentium detected, installing workaround for F00F bug\n"); tmp = kmem_alloc(kernel_map, PAGE_SIZE * 2); if (tmp == 0) panic("kmem_alloc returned 0"); /* Put the problematic entry (#6) at the end of the lower page. */ new_idt = (struct gate_descriptor*) (tmp + PAGE_SIZE - 7 * sizeof(struct gate_descriptor)); bcopy(idt, new_idt, sizeof(idt0)); r_idt.rd_base = (u_int)new_idt; lidt(&r_idt); idt = new_idt; if (vm_map_protect(kernel_map, tmp, tmp + PAGE_SIZE, VM_PROT_READ, FALSE) != KERN_SUCCESS) panic("vm_map_protect failed"); } #endif /* defined(I586_CPU) && !NO_F00F_HACK */ /* * Construct a PCB from a trapframe. This is called from kdb_trap() where * we want to start a backtrace from the function that caused us to enter * the debugger. We have the context in the trapframe, but base the trace * on the PCB. The PCB doesn't have to be perfect, as long as it contains * enough for a backtrace. */ void makectx(struct trapframe *tf, struct pcb *pcb) { pcb->pcb_edi = tf->tf_edi; pcb->pcb_esi = tf->tf_esi; pcb->pcb_ebp = tf->tf_ebp; pcb->pcb_ebx = tf->tf_ebx; pcb->pcb_eip = tf->tf_eip; pcb->pcb_esp = (ISPL(tf->tf_cs)) ? tf->tf_esp : (int)(tf + 1) - 8; } int ptrace_set_pc(struct thread *td, u_long addr) { td->td_frame->tf_eip = addr; return (0); } int ptrace_single_step(struct thread *td) { td->td_frame->tf_eflags |= PSL_T; return (0); } int ptrace_clear_single_step(struct thread *td) { td->td_frame->tf_eflags &= ~PSL_T; return (0); } int fill_regs(struct thread *td, struct reg *regs) { struct pcb *pcb; struct trapframe *tp; tp = td->td_frame; pcb = td->td_pcb; regs->r_fs = tp->tf_fs; regs->r_es = tp->tf_es; regs->r_ds = tp->tf_ds; regs->r_edi = tp->tf_edi; regs->r_esi = tp->tf_esi; regs->r_ebp = tp->tf_ebp; regs->r_ebx = tp->tf_ebx; regs->r_edx = tp->tf_edx; regs->r_ecx = tp->tf_ecx; regs->r_eax = tp->tf_eax; regs->r_eip = tp->tf_eip; regs->r_cs = tp->tf_cs; regs->r_eflags = tp->tf_eflags; regs->r_esp = tp->tf_esp; regs->r_ss = tp->tf_ss; regs->r_gs = pcb->pcb_gs; return (0); } int set_regs(struct thread *td, struct reg *regs) { struct pcb *pcb; struct trapframe *tp; tp = td->td_frame; if (!EFL_SECURE(regs->r_eflags, tp->tf_eflags) || !CS_SECURE(regs->r_cs)) return (EINVAL); pcb = td->td_pcb; tp->tf_fs = regs->r_fs; tp->tf_es = regs->r_es; tp->tf_ds = regs->r_ds; tp->tf_edi = regs->r_edi; tp->tf_esi = regs->r_esi; tp->tf_ebp = regs->r_ebp; tp->tf_ebx = regs->r_ebx; tp->tf_edx = regs->r_edx; tp->tf_ecx = regs->r_ecx; tp->tf_eax = regs->r_eax; tp->tf_eip = regs->r_eip; tp->tf_cs = regs->r_cs; tp->tf_eflags = regs->r_eflags; tp->tf_esp = regs->r_esp; tp->tf_ss = regs->r_ss; pcb->pcb_gs = regs->r_gs; return (0); } #ifdef CPU_ENABLE_SSE static void fill_fpregs_xmm(sv_xmm, sv_87) struct savexmm *sv_xmm; struct save87 *sv_87; { register struct env87 *penv_87 = &sv_87->sv_env; register struct envxmm *penv_xmm = &sv_xmm->sv_env; int i; bzero(sv_87, sizeof(*sv_87)); /* FPU control/status */ penv_87->en_cw = penv_xmm->en_cw; penv_87->en_sw = penv_xmm->en_sw; penv_87->en_tw = penv_xmm->en_tw; penv_87->en_fip = penv_xmm->en_fip; penv_87->en_fcs = penv_xmm->en_fcs; penv_87->en_opcode = penv_xmm->en_opcode; penv_87->en_foo = penv_xmm->en_foo; penv_87->en_fos = penv_xmm->en_fos; /* FPU registers */ for (i = 0; i < 8; ++i) sv_87->sv_ac[i] = sv_xmm->sv_fp[i].fp_acc; } static void set_fpregs_xmm(sv_87, sv_xmm) struct save87 *sv_87; struct savexmm *sv_xmm; { register struct env87 *penv_87 = &sv_87->sv_env; register struct envxmm *penv_xmm = &sv_xmm->sv_env; int i; /* FPU control/status */ penv_xmm->en_cw = penv_87->en_cw; penv_xmm->en_sw = penv_87->en_sw; penv_xmm->en_tw = penv_87->en_tw; penv_xmm->en_fip = penv_87->en_fip; penv_xmm->en_fcs = penv_87->en_fcs; penv_xmm->en_opcode = penv_87->en_opcode; penv_xmm->en_foo = penv_87->en_foo; penv_xmm->en_fos = penv_87->en_fos; /* FPU registers */ for (i = 0; i < 8; ++i) sv_xmm->sv_fp[i].fp_acc = sv_87->sv_ac[i]; } #endif /* CPU_ENABLE_SSE */ int fill_fpregs(struct thread *td, struct fpreg *fpregs) { #ifdef CPU_ENABLE_SSE if (cpu_fxsr) { fill_fpregs_xmm(&td->td_pcb->pcb_save.sv_xmm, (struct save87 *)fpregs); return (0); } #endif /* CPU_ENABLE_SSE */ bcopy(&td->td_pcb->pcb_save.sv_87, fpregs, sizeof *fpregs); return (0); } int set_fpregs(struct thread *td, struct fpreg *fpregs) { #ifdef CPU_ENABLE_SSE if (cpu_fxsr) { set_fpregs_xmm((struct save87 *)fpregs, &td->td_pcb->pcb_save.sv_xmm); return (0); } #endif /* CPU_ENABLE_SSE */ bcopy(fpregs, &td->td_pcb->pcb_save.sv_87, sizeof *fpregs); return (0); } /* * Get machine context. */ int get_mcontext(struct thread *td, mcontext_t *mcp, int flags) { struct trapframe *tp; tp = td->td_frame; PROC_LOCK(curthread->td_proc); mcp->mc_onstack = sigonstack(tp->tf_esp); PROC_UNLOCK(curthread->td_proc); mcp->mc_gs = td->td_pcb->pcb_gs; mcp->mc_fs = tp->tf_fs; mcp->mc_es = tp->tf_es; mcp->mc_ds = tp->tf_ds; mcp->mc_edi = tp->tf_edi; mcp->mc_esi = tp->tf_esi; mcp->mc_ebp = tp->tf_ebp; mcp->mc_isp = tp->tf_isp; mcp->mc_eflags = tp->tf_eflags; if (flags & GET_MC_CLEAR_RET) { mcp->mc_eax = 0; mcp->mc_edx = 0; mcp->mc_eflags &= ~PSL_C; } else { mcp->mc_eax = tp->tf_eax; mcp->mc_edx = tp->tf_edx; } mcp->mc_ebx = tp->tf_ebx; mcp->mc_ecx = tp->tf_ecx; mcp->mc_eip = tp->tf_eip; mcp->mc_cs = tp->tf_cs; mcp->mc_esp = tp->tf_esp; mcp->mc_ss = tp->tf_ss; mcp->mc_len = sizeof(*mcp); get_fpcontext(td, mcp); return (0); } /* * Set machine context. * * However, we don't set any but the user modifiable flags, and we won't * touch the cs selector. */ int set_mcontext(struct thread *td, const mcontext_t *mcp) { struct trapframe *tp; int eflags, ret; tp = td->td_frame; if (mcp->mc_len != sizeof(*mcp)) return (EINVAL); eflags = (mcp->mc_eflags & PSL_USERCHANGE) | (tp->tf_eflags & ~PSL_USERCHANGE); if ((ret = set_fpcontext(td, mcp)) == 0) { tp->tf_fs = mcp->mc_fs; tp->tf_es = mcp->mc_es; tp->tf_ds = mcp->mc_ds; tp->tf_edi = mcp->mc_edi; tp->tf_esi = mcp->mc_esi; tp->tf_ebp = mcp->mc_ebp; tp->tf_ebx = mcp->mc_ebx; tp->tf_edx = mcp->mc_edx; tp->tf_ecx = mcp->mc_ecx; tp->tf_eax = mcp->mc_eax; tp->tf_eip = mcp->mc_eip; tp->tf_eflags = eflags; tp->tf_esp = mcp->mc_esp; tp->tf_ss = mcp->mc_ss; td->td_pcb->pcb_gs = mcp->mc_gs; ret = 0; } return (ret); } static void get_fpcontext(struct thread *td, mcontext_t *mcp) { #ifndef DEV_NPX mcp->mc_fpformat = _MC_FPFMT_NODEV; mcp->mc_ownedfp = _MC_FPOWNED_NONE; #else union savefpu *addr; /* * XXX mc_fpstate might be misaligned, since its declaration is not * unportabilized using __attribute__((aligned(16))) like the * declaration of struct savemm, and anyway, alignment doesn't work * for auto variables since we don't use gcc's pessimal stack * alignment. Work around this by abusing the spare fields after * mcp->mc_fpstate. * * XXX unpessimize most cases by only aligning when fxsave might be * called, although this requires knowing too much about * npxgetregs()'s internals. */ addr = (union savefpu *)&mcp->mc_fpstate; if (td == PCPU_GET(fpcurthread) && #ifdef CPU_ENABLE_SSE cpu_fxsr && #endif ((uintptr_t)(void *)addr & 0xF)) { do addr = (void *)((char *)addr + 4); while ((uintptr_t)(void *)addr & 0xF); } mcp->mc_ownedfp = npxgetregs(td, addr); if (addr != (union savefpu *)&mcp->mc_fpstate) { bcopy(addr, &mcp->mc_fpstate, sizeof(mcp->mc_fpstate)); bzero(&mcp->mc_spare2, sizeof(mcp->mc_spare2)); } mcp->mc_fpformat = npxformat(); #endif } static int set_fpcontext(struct thread *td, const mcontext_t *mcp) { union savefpu *addr; if (mcp->mc_fpformat == _MC_FPFMT_NODEV) return (0); else if (mcp->mc_fpformat != _MC_FPFMT_387 && mcp->mc_fpformat != _MC_FPFMT_XMM) return (EINVAL); else if (mcp->mc_ownedfp == _MC_FPOWNED_NONE) /* We don't care what state is left in the FPU or PCB. */ fpstate_drop(td); else if (mcp->mc_ownedfp == _MC_FPOWNED_FPU || mcp->mc_ownedfp == _MC_FPOWNED_PCB) { /* XXX align as above. */ addr = (union savefpu *)&mcp->mc_fpstate; if (td == PCPU_GET(fpcurthread) && #ifdef CPU_ENABLE_SSE cpu_fxsr && #endif ((uintptr_t)(void *)addr & 0xF)) { do addr = (void *)((char *)addr + 4); while ((uintptr_t)(void *)addr & 0xF); bcopy(&mcp->mc_fpstate, addr, sizeof(mcp->mc_fpstate)); } #ifdef DEV_NPX #ifdef CPU_ENABLE_SSE if (cpu_fxsr) addr->sv_xmm.sv_env.en_mxcsr &= cpu_mxcsr_mask; #endif /* * XXX we violate the dubious requirement that npxsetregs() * be called with interrupts disabled. */ npxsetregs(td, addr); #endif /* * Don't bother putting things back where they were in the * misaligned case, since we know that the caller won't use * them again. */ } else return (EINVAL); return (0); } static void fpstate_drop(struct thread *td) { register_t s; s = intr_disable(); #ifdef DEV_NPX if (PCPU_GET(fpcurthread) == td) npxdrop(); #endif /* * XXX force a full drop of the npx. The above only drops it if we * owned it. npxgetregs() has the same bug in the !cpu_fxsr case. * * XXX I don't much like npxgetregs()'s semantics of doing a full * drop. Dropping only to the pcb matches fnsave's behaviour. * We only need to drop to !PCB_INITDONE in sendsig(). But * sendsig() is the only caller of npxgetregs()... perhaps we just * have too many layers. */ curthread->td_pcb->pcb_flags &= ~PCB_NPXINITDONE; intr_restore(s); } int fill_dbregs(struct thread *td, struct dbreg *dbregs) { struct pcb *pcb; if (td == NULL) { dbregs->dr[0] = rdr0(); dbregs->dr[1] = rdr1(); dbregs->dr[2] = rdr2(); dbregs->dr[3] = rdr3(); dbregs->dr[4] = rdr4(); dbregs->dr[5] = rdr5(); dbregs->dr[6] = rdr6(); dbregs->dr[7] = rdr7(); } else { pcb = td->td_pcb; dbregs->dr[0] = pcb->pcb_dr0; dbregs->dr[1] = pcb->pcb_dr1; dbregs->dr[2] = pcb->pcb_dr2; dbregs->dr[3] = pcb->pcb_dr3; dbregs->dr[4] = 0; dbregs->dr[5] = 0; dbregs->dr[6] = pcb->pcb_dr6; dbregs->dr[7] = pcb->pcb_dr7; } return (0); } int set_dbregs(struct thread *td, struct dbreg *dbregs) { struct pcb *pcb; int i; if (td == NULL) { load_dr0(dbregs->dr[0]); load_dr1(dbregs->dr[1]); load_dr2(dbregs->dr[2]); load_dr3(dbregs->dr[3]); load_dr4(dbregs->dr[4]); load_dr5(dbregs->dr[5]); load_dr6(dbregs->dr[6]); load_dr7(dbregs->dr[7]); } else { /* * Don't let an illegal value for dr7 get set. Specifically, * check for undefined settings. Setting these bit patterns * result in undefined behaviour and can lead to an unexpected * TRCTRAP. */ for (i = 0; i < 4; i++) { if (DBREG_DR7_ACCESS(dbregs->dr[7], i) == 0x02) return (EINVAL); if (DBREG_DR7_LEN(dbregs->dr[7], i) == 0x02) return (EINVAL); } pcb = td->td_pcb; /* * Don't let a process set a breakpoint that is not within the * process's address space. If a process could do this, it * could halt the system by setting a breakpoint in the kernel * (if ddb was enabled). Thus, we need to check to make sure * that no breakpoints are being enabled for addresses outside * process's address space. * * XXX - what about when the watched area of the user's * address space is written into from within the kernel * ... wouldn't that still cause a breakpoint to be generated * from within kernel mode? */ if (DBREG_DR7_ENABLED(dbregs->dr[7], 0)) { /* dr0 is enabled */ if (dbregs->dr[0] >= VM_MAXUSER_ADDRESS) return (EINVAL); } if (DBREG_DR7_ENABLED(dbregs->dr[7], 1)) { /* dr1 is enabled */ if (dbregs->dr[1] >= VM_MAXUSER_ADDRESS) return (EINVAL); } if (DBREG_DR7_ENABLED(dbregs->dr[7], 2)) { /* dr2 is enabled */ if (dbregs->dr[2] >= VM_MAXUSER_ADDRESS) return (EINVAL); } if (DBREG_DR7_ENABLED(dbregs->dr[7], 3)) { /* dr3 is enabled */ if (dbregs->dr[3] >= VM_MAXUSER_ADDRESS) return (EINVAL); } pcb->pcb_dr0 = dbregs->dr[0]; pcb->pcb_dr1 = dbregs->dr[1]; pcb->pcb_dr2 = dbregs->dr[2]; pcb->pcb_dr3 = dbregs->dr[3]; pcb->pcb_dr6 = dbregs->dr[6]; pcb->pcb_dr7 = dbregs->dr[7]; pcb->pcb_flags |= PCB_DBREGS; } return (0); } /* * Return > 0 if a hardware breakpoint has been hit, and the * breakpoint was in user space. Return 0, otherwise. */ int user_dbreg_trap(void) { u_int32_t dr7, dr6; /* debug registers dr6 and dr7 */ u_int32_t bp; /* breakpoint bits extracted from dr6 */ int nbp; /* number of breakpoints that triggered */ caddr_t addr[4]; /* breakpoint addresses */ int i; dr7 = rdr7(); if ((dr7 & 0x000000ff) == 0) { /* * all GE and LE bits in the dr7 register are zero, * thus the trap couldn't have been caused by the * hardware debug registers */ return 0; } nbp = 0; dr6 = rdr6(); bp = dr6 & 0x0000000f; if (!bp) { /* * None of the breakpoint bits are set meaning this * trap was not caused by any of the debug registers */ return 0; } /* * at least one of the breakpoints were hit, check to see * which ones and if any of them are user space addresses */ if (bp & 0x01) { addr[nbp++] = (caddr_t)rdr0(); } if (bp & 0x02) { addr[nbp++] = (caddr_t)rdr1(); } if (bp & 0x04) { addr[nbp++] = (caddr_t)rdr2(); } if (bp & 0x08) { addr[nbp++] = (caddr_t)rdr3(); } for (i = 0; i < nbp; i++) { if (addr[i] < (caddr_t)VM_MAXUSER_ADDRESS) { /* * addr[i] is in user space */ return nbp; } } /* * None of the breakpoints are in user space. */ return 0; } #ifdef KDB /* * Provide inb() and outb() as functions. They are normally only * available as macros calling inlined functions, thus cannot be * called from the debugger. * * The actual code is stolen from , and de-inlined. */ #undef inb #undef outb /* silence compiler warnings */ u_char inb(u_int); void outb(u_int, u_char); u_char inb(u_int port) { u_char data; /* * We use %%dx and not %1 here because i/o is done at %dx and not at * %edx, while gcc generates inferior code (movw instead of movl) * if we tell it to load (u_short) port. */ __asm __volatile("inb %%dx,%0" : "=a" (data) : "d" (port)); return (data); } void outb(u_int port, u_char data) { u_char al; /* * Use an unnecessary assignment to help gcc's register allocator. * This make a large difference for gcc-1.40 and a tiny difference * for gcc-2.6.0. For gcc-1.40, al had to be ``asm("ax")'' for * best results. gcc-2.6.0 can't handle this. */ al = data; __asm __volatile("outb %0,%%dx" : : "a" (al), "d" (port)); } #endif /* KDB */ Index: head/sys/powerpc/aim/machdep.c =================================================================== --- head/sys/powerpc/aim/machdep.c (revision 173360) +++ head/sys/powerpc/aim/machdep.c (revision 173361) @@ -1,990 +1,990 @@ /*- * Copyright (C) 1995, 1996 Wolfgang Solfrank. * Copyright (C) 1995, 1996 TooLs GmbH. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by TooLs GmbH. * 4. The name of TooLs GmbH may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY TOOLS GMBH ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /*- * Copyright (C) 2001 Benno Rice * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY Benno Rice ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * $NetBSD: machdep.c,v 1.74.2.1 2000/11/01 16:13:48 tv Exp $ */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_ddb.h" #include "opt_kstack_pages.h" #include "opt_msgbuf.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB extern vm_offset_t ksym_start, ksym_end; #endif int cold = 1; struct pcpu __pcpu[MAXCPU]; struct trapframe frame0; vm_offset_t kstack0; vm_offset_t kstack0_phys; char machine[] = "powerpc"; SYSCTL_STRING(_hw, HW_MACHINE, machine, CTLFLAG_RD, machine, 0, ""); static int cacheline_size = CACHELINESIZE; SYSCTL_INT(_machdep, CPU_CACHELINE, cacheline_size, CTLFLAG_RD, &cacheline_size, 0, ""); static void cpu_startup(void *); SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL) void powerpc_init(u_int, u_int, u_int, void *); int save_ofw_mapping(void); int restore_ofw_mapping(void); void install_extint(void (*)(void)); int setfault(faultbuf); /* defined in locore.S */ static int grab_mcontext(struct thread *, mcontext_t *, int); void asm_panic(char *); long Maxmem = 0; long realmem = 0; struct pmap ofw_pmap; extern int ofmsr; struct bat battable[16]; struct kva_md_info kmi; void setPQL2(int *const size, int *const ways); void setPQL2(int *const size, int *const ways) { return; } static void powerpc_ofw_shutdown(void *junk, int howto) { if (howto & RB_HALT) { OF_halt(); } OF_reboot(); } static void cpu_startup(void *dummy) { /* * Good {morning,afternoon,evening,night}. */ cpu_setup(PCPU_GET(cpuid)); /* startrtclock(); */ #ifdef PERFMON perfmon_init(); #endif printf("real memory = %ld (%ld MB)\n", ptoa(physmem), ptoa(physmem) / 1048576); realmem = physmem; /* * Display any holes after the first chunk of extended memory. */ if (bootverbose) { int indx; printf("Physical memory chunk(s):\n"); for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) { int size1 = phys_avail[indx + 1] - phys_avail[indx]; printf("0x%08x - 0x%08x, %d bytes (%d pages)\n", phys_avail[indx], phys_avail[indx + 1] - 1, size1, size1 / PAGE_SIZE); } } vm_ksubmap_init(&kmi); printf("avail memory = %ld (%ld MB)\n", ptoa(cnt.v_free_count), ptoa(cnt.v_free_count) / 1048576); /* * Set up buffers, so they can be used to read disk labels. */ bufinit(); vm_pager_bufferinit(); EVENTHANDLER_REGISTER(shutdown_final, powerpc_ofw_shutdown, 0, SHUTDOWN_PRI_LAST); #ifdef SMP /* * OK, enough kmem_alloc/malloc state should be up, lets get on with it! */ mp_start(); /* fire up the secondaries */ mp_announce(); #endif /* SMP */ } extern char kernel_text[], _end[]; extern void *trapcode, *trapsize; extern void *alitrap, *alisize; extern void *dsitrap, *dsisize; extern void *decrint, *decrsize; extern void *extint, *extsize; extern void *dblow, *dbsize; extern void *vectrap, *vectrapsize; void powerpc_init(u_int startkernel, u_int endkernel, u_int basekernel, void *mdp) { struct pcpu *pc; vm_offset_t end, off; void *kmdp; char *env; end = 0; kmdp = NULL; /* * Parse metadata if present and fetch parameters. Must be done * before console is inited so cninit gets the right value of * boothowto. */ if (mdp != NULL) { preload_metadata = mdp; kmdp = preload_search_by_type("elf kernel"); if (kmdp != NULL) { boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int); kern_envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *); end = MD_FETCH(kmdp, MODINFOMD_KERNEND, vm_offset_t); #ifdef DDB ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t); ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t); #endif } } /* * Init params/tunables that can be overridden by the loader */ init_param1(); /* * Start initializing proc0 and thread0. */ - proc_linkup(&proc0, &thread0); + proc_linkup0(&proc0, &thread0); thread0.td_frame = &frame0; /* * Set up per-cpu data. */ pc = &__pcpu[0]; pcpu_init(pc, 0, sizeof(struct pcpu)); pc->pc_curthread = &thread0; pc->pc_curpcb = thread0.td_pcb; pc->pc_cpuid = 0; __asm __volatile("mtsprg 0, %0" :: "r"(pc)); mutex_init(); /* * Initialize the console before printing anything. */ cninit(); /* * Complain if there is no metadata. */ if (mdp == NULL || kmdp == NULL) { printf("powerpc_init: no loader metadata.\n"); } kdb_init(); kobj_machdep_init(); /* * XXX: Initialize the interrupt tables. * Disable translation in case the vector area * hasn't been mapped (G5) */ mtmsr(mfmsr() & ~(PSL_IR | PSL_DR)); isync(); bcopy(&trapcode, (void *)EXC_RST, (size_t)&trapsize); bcopy(&trapcode, (void *)EXC_MCHK, (size_t)&trapsize); bcopy(&dsitrap, (void *)EXC_DSI, (size_t)&dsisize); bcopy(&trapcode, (void *)EXC_ISI, (size_t)&trapsize); bcopy(&trapcode, (void *)EXC_EXI, (size_t)&trapsize); bcopy(&alitrap, (void *)EXC_ALI, (size_t)&alisize); bcopy(&trapcode, (void *)EXC_PGM, (size_t)&trapsize); bcopy(&trapcode, (void *)EXC_FPU, (size_t)&trapsize); bcopy(&trapcode, (void *)EXC_DECR, (size_t)&trapsize); bcopy(&trapcode, (void *)EXC_SC, (size_t)&trapsize); bcopy(&trapcode, (void *)EXC_TRC, (size_t)&trapsize); bcopy(&trapcode, (void *)EXC_FPA, (size_t)&trapsize); bcopy(&vectrap, (void *)EXC_VEC, (size_t)&vectrapsize); bcopy(&trapcode, (void *)EXC_VECAST, (size_t)&trapsize); bcopy(&trapcode, (void *)EXC_THRM, (size_t)&trapsize); bcopy(&trapcode, (void *)EXC_BPT, (size_t)&trapsize); #ifdef KDB bcopy(&dblow, (void *)EXC_RST, (size_t)&dbsize); bcopy(&dblow, (void *)EXC_MCHK, (size_t)&dbsize); bcopy(&dblow, (void *)EXC_PGM, (size_t)&dbsize); bcopy(&dblow, (void *)EXC_TRC, (size_t)&dbsize); bcopy(&dblow, (void *)EXC_BPT, (size_t)&dbsize); #endif __syncicache(EXC_RSVD, EXC_LAST - EXC_RSVD); /* * Make sure translation has been enabled */ mtmsr(mfmsr() | PSL_IR|PSL_DR|PSL_ME|PSL_RI); isync(); /* * Initialise virtual memory. */ pmap_mmu_install(MMU_TYPE_OEA, 0); /* XXX temporary */ pmap_bootstrap(startkernel, endkernel); /* * Initialize params/tunables that are derived from memsize */ init_param2(physmem); /* * Grab booted kernel's name */ env = getenv("kernelname"); if (env != NULL) { strlcpy(kernelname, env, sizeof(kernelname)); freeenv(env); } /* * Finish setting up thread0. */ thread0.td_kstack = kstack0; thread0.td_pcb = (struct pcb *) (thread0.td_kstack + KSTACK_PAGES * PAGE_SIZE) - 1; /* * Map and initialise the message buffer. */ for (off = 0; off < round_page(MSGBUF_SIZE); off += PAGE_SIZE) pmap_kenter((vm_offset_t)msgbufp + off, msgbuf_phys + off); msgbufinit(msgbufp, MSGBUF_SIZE); #ifdef KDB if (boothowto & RB_KDB) kdb_enter("Boot flags requested debugger"); #endif } void bzero(void *buf, size_t len) { caddr_t p; p = buf; while (((vm_offset_t) p & (sizeof(u_long) - 1)) && len) { *p++ = 0; len--; } while (len >= sizeof(u_long) * 8) { *(u_long*) p = 0; *((u_long*) p + 1) = 0; *((u_long*) p + 2) = 0; *((u_long*) p + 3) = 0; len -= sizeof(u_long) * 8; *((u_long*) p + 4) = 0; *((u_long*) p + 5) = 0; *((u_long*) p + 6) = 0; *((u_long*) p + 7) = 0; p += sizeof(u_long) * 8; } while (len >= sizeof(u_long)) { *(u_long*) p = 0; len -= sizeof(u_long); p += sizeof(u_long); } while (len) { *p++ = 0; len--; } } void sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) { struct trapframe *tf; struct sigframe *sfp; struct sigacts *psp; struct sigframe sf; struct thread *td; struct proc *p; int oonstack, rndfsize; int sig; int code; td = curthread; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); sig = ksi->ksi_signo; code = ksi->ksi_code; psp = p->p_sigacts; mtx_assert(&psp->ps_mtx, MA_OWNED); tf = td->td_frame; oonstack = sigonstack(tf->fixreg[1]); rndfsize = ((sizeof(sf) + 15) / 16) * 16; CTR4(KTR_SIG, "sendsig: td=%p (%s) catcher=%p sig=%d", td, p->p_comm, catcher, sig); /* * Save user context */ memset(&sf, 0, sizeof(sf)); grab_mcontext(td, &sf.sf_uc.uc_mcontext, 0); sf.sf_uc.uc_sigmask = *mask; sf.sf_uc.uc_stack = td->td_sigstk; sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE; sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0; /* * Allocate and validate space for the signal handler context. */ if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { sfp = (struct sigframe *)(td->td_sigstk.ss_sp + td->td_sigstk.ss_size - rndfsize); } else { sfp = (struct sigframe *)(tf->fixreg[1] - rndfsize); } /* * Translate the signal if appropriate (Linux emu ?) */ if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize) sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)]; /* * Save the floating-point state, if necessary, then copy it. */ /* XXX */ /* * Set up the registers to return to sigcode. * * r1/sp - sigframe ptr * lr - sig function, dispatched to by blrl in trampoline * r3 - sig number * r4 - SIGINFO ? &siginfo : exception code * r5 - user context * srr0 - trampoline function addr */ tf->lr = (register_t)catcher; tf->fixreg[1] = (register_t)sfp; tf->fixreg[FIRSTARG] = sig; tf->fixreg[FIRSTARG+2] = (register_t)&sfp->sf_uc; if (SIGISMEMBER(psp->ps_siginfo, sig)) { /* * Signal handler installed with SA_SIGINFO. */ tf->fixreg[FIRSTARG+1] = (register_t)&sfp->sf_si; /* * Fill siginfo structure. */ sf.sf_si = ksi->ksi_info; sf.sf_si.si_signo = sig; sf.sf_si.si_addr = (void *) ((tf->exc == EXC_DSI) ? tf->dar : tf->srr0); } else { /* Old FreeBSD-style arguments. */ tf->fixreg[FIRSTARG+1] = code; tf->fixreg[FIRSTARG+3] = (tf->exc == EXC_DSI) ? tf->dar : tf->srr0; } mtx_unlock(&psp->ps_mtx); PROC_UNLOCK(p); tf->srr0 = (register_t)(PS_STRINGS - *(p->p_sysent->sv_szsigcode)); /* * copy the frame out to userland. */ if (copyout(&sf, sfp, sizeof(*sfp)) != 0) { /* * Process has trashed its stack. Kill it. */ CTR2(KTR_SIG, "sendsig: sigexit td=%p sfp=%p", td, sfp); PROC_LOCK(p); sigexit(td, SIGILL); } CTR3(KTR_SIG, "sendsig: return td=%p pc=%#x sp=%#x", td, tf->srr0, tf->fixreg[1]); PROC_LOCK(p); mtx_lock(&psp->ps_mtx); } int sigreturn(struct thread *td, struct sigreturn_args *uap) { struct proc *p; ucontext_t uc; int error; CTR2(KTR_SIG, "sigreturn: td=%p ucp=%p", td, uap->sigcntxp); if (copyin(uap->sigcntxp, &uc, sizeof(uc)) != 0) { CTR1(KTR_SIG, "sigreturn: efault td=%p", td); return (EFAULT); } error = set_mcontext(td, &uc.uc_mcontext); if (error != 0) return (error); p = td->td_proc; PROC_LOCK(p); td->td_sigmask = uc.uc_sigmask; SIG_CANTMASK(td->td_sigmask); signotify(td); PROC_UNLOCK(p); CTR3(KTR_SIG, "sigreturn: return td=%p pc=%#x sp=%#x", td, uc.uc_mcontext.mc_srr0, uc.uc_mcontext.mc_gpr[1]); return (EJUSTRETURN); } #ifdef COMPAT_FREEBSD4 int freebsd4_sigreturn(struct thread *td, struct freebsd4_sigreturn_args *uap) { return sigreturn(td, (struct sigreturn_args *)uap); } #endif /* * Construct a PCB from a trapframe. This is called from kdb_trap() where * we want to start a backtrace from the function that caused us to enter * the debugger. We have the context in the trapframe, but base the trace * on the PCB. The PCB doesn't have to be perfect, as long as it contains * enough for a backtrace. */ void makectx(struct trapframe *tf, struct pcb *pcb) { pcb->pcb_lr = tf->srr0; pcb->pcb_sp = tf->fixreg[1]; } /* * get_mcontext/sendsig helper routine that doesn't touch the * proc lock */ static int grab_mcontext(struct thread *td, mcontext_t *mcp, int flags) { struct pcb *pcb; pcb = td->td_pcb; memset(mcp, 0, sizeof(mcontext_t)); mcp->mc_vers = _MC_VERSION; mcp->mc_flags = 0; memcpy(&mcp->mc_frame, td->td_frame, sizeof(struct trapframe)); if (flags & GET_MC_CLEAR_RET) { mcp->mc_gpr[3] = 0; mcp->mc_gpr[4] = 0; } /* * This assumes that floating-point context is *not* lazy, * so if the thread has used FP there would have been a * FP-unavailable exception that would have set things up * correctly. */ if (pcb->pcb_flags & PCB_FPU) { KASSERT(td == curthread, ("get_mcontext: fp save not curthread")); critical_enter(); save_fpu(td); critical_exit(); mcp->mc_flags |= _MC_FP_VALID; memcpy(&mcp->mc_fpscr, &pcb->pcb_fpu.fpscr, sizeof(double)); memcpy(mcp->mc_fpreg, pcb->pcb_fpu.fpr, 32*sizeof(double)); } /* XXX Altivec context ? */ mcp->mc_len = sizeof(*mcp); return (0); } int get_mcontext(struct thread *td, mcontext_t *mcp, int flags) { int error; error = grab_mcontext(td, mcp, flags); if (error == 0) { PROC_LOCK(curthread->td_proc); mcp->mc_onstack = sigonstack(td->td_frame->fixreg[1]); PROC_UNLOCK(curthread->td_proc); } return (error); } int set_mcontext(struct thread *td, const mcontext_t *mcp) { struct pcb *pcb; struct trapframe *tf; pcb = td->td_pcb; tf = td->td_frame; if (mcp->mc_vers != _MC_VERSION || mcp->mc_len != sizeof(*mcp)) return (EINVAL); /* * Don't let the user set privileged MSR bits */ if ((mcp->mc_srr1 & PSL_USERSTATIC) != (tf->srr1 & PSL_USERSTATIC)) { return (EINVAL); } memcpy(tf, mcp->mc_frame, sizeof(mcp->mc_frame)); if (mcp->mc_flags & _MC_FP_VALID) { if ((pcb->pcb_flags & PCB_FPU) != PCB_FPU) { critical_enter(); enable_fpu(td); critical_exit(); } memcpy(&pcb->pcb_fpu.fpscr, &mcp->mc_fpscr, sizeof(double)); memcpy(pcb->pcb_fpu.fpr, mcp->mc_fpreg, 32*sizeof(double)); } /* XXX Altivec context? */ return (0); } void cpu_boot(int howto) { } void cpu_initclocks(void) { decr_init(); } /* Get current clock frequency for the given cpu id. */ int cpu_est_clockrate(int cpu_id, uint64_t *rate) { return (ENXIO); } /* * Shutdown the CPU as much as possible. */ void cpu_halt(void) { OF_exit(); } void cpu_idle(void) { /* TODO: Insert code to halt (until next interrupt) */ #ifdef INVARIANTS if ((mfmsr() & PSL_EE) != PSL_EE) { struct thread *td = curthread; printf("td msr %x\n", td->td_md.md_saved_msr); panic("ints disabled in idleproc!"); } #endif } /* * Set set up registers on exec. */ void exec_setregs(struct thread *td, u_long entry, u_long stack, u_long ps_strings) { struct trapframe *tf; struct ps_strings arginfo; tf = trapframe(td); bzero(tf, sizeof *tf); tf->fixreg[1] = -roundup(-stack + 8, 16); /* * XXX Machine-independent code has already copied arguments and * XXX environment to userland. Get them back here. */ (void)copyin((char *)PS_STRINGS, &arginfo, sizeof(arginfo)); /* * Set up arguments for _start(): * _start(argc, argv, envp, obj, cleanup, ps_strings); * * Notes: * - obj and cleanup are the auxilliary and termination * vectors. They are fixed up by ld.elf_so. * - ps_strings is a NetBSD extention, and will be * ignored by executables which are strictly * compliant with the SVR4 ABI. * * XXX We have to set both regs and retval here due to different * XXX calling convention in trap.c and init_main.c. */ /* * XXX PG: these get overwritten in the syscall return code. * execve() should return EJUSTRETURN, like it does on NetBSD. * Emulate by setting the syscall return value cells. The * registers still have to be set for init's fork trampoline. */ td->td_retval[0] = arginfo.ps_nargvstr; td->td_retval[1] = (register_t)arginfo.ps_argvstr; tf->fixreg[3] = arginfo.ps_nargvstr; tf->fixreg[4] = (register_t)arginfo.ps_argvstr; tf->fixreg[5] = (register_t)arginfo.ps_envstr; tf->fixreg[6] = 0; /* auxillary vector */ tf->fixreg[7] = 0; /* termination vector */ tf->fixreg[8] = (register_t)PS_STRINGS; /* NetBSD extension */ tf->srr0 = entry; tf->srr1 = PSL_MBO | PSL_USERSET | PSL_FE_DFLT; td->td_pcb->pcb_flags = 0; } int fill_regs(struct thread *td, struct reg *regs) { struct trapframe *tf; tf = td->td_frame; memcpy(regs, tf, sizeof(struct reg)); return (0); } int fill_dbregs(struct thread *td, struct dbreg *dbregs) { /* No debug registers on PowerPC */ return (ENOSYS); } int fill_fpregs(struct thread *td, struct fpreg *fpregs) { struct pcb *pcb; pcb = td->td_pcb; if ((pcb->pcb_flags & PCB_FPU) == 0) memset(fpregs, 0, sizeof(struct fpreg)); else memcpy(fpregs, &pcb->pcb_fpu, sizeof(struct fpreg)); return (0); } int set_regs(struct thread *td, struct reg *regs) { struct trapframe *tf; tf = td->td_frame; memcpy(tf, regs, sizeof(struct reg)); return (0); } int set_dbregs(struct thread *td, struct dbreg *dbregs) { /* No debug registers on PowerPC */ return (ENOSYS); } int set_fpregs(struct thread *td, struct fpreg *fpregs) { struct pcb *pcb; pcb = td->td_pcb; if ((pcb->pcb_flags & PCB_FPU) == 0) enable_fpu(td); memcpy(&pcb->pcb_fpu, fpregs, sizeof(struct fpreg)); return (0); } int ptrace_set_pc(struct thread *td, unsigned long addr) { struct trapframe *tf; tf = td->td_frame; tf->srr0 = (register_t)addr; return (0); } int ptrace_single_step(struct thread *td) { struct trapframe *tf; tf = td->td_frame; tf->srr1 |= PSL_SE; return (0); } int ptrace_clear_single_step(struct thread *td) { struct trapframe *tf; tf = td->td_frame; tf->srr1 &= ~PSL_SE; return (0); } /* * Initialise a struct pcpu. */ void cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t sz) { } void spinlock_enter(void) { struct thread *td; td = curthread; if (td->td_md.md_spinlock_count == 0) td->td_md.md_saved_msr = intr_disable(); td->td_md.md_spinlock_count++; critical_enter(); } void spinlock_exit(void) { struct thread *td; td = curthread; critical_exit(); td->td_md.md_spinlock_count--; if (td->td_md.md_spinlock_count == 0) intr_restore(td->td_md.md_saved_msr); } /* * kcopy(const void *src, void *dst, size_t len); * * Copy len bytes from src to dst, aborting if we encounter a fatal * page fault. * * kcopy() _must_ save and restore the old fault handler since it is * called by uiomove(), which may be in the path of servicing a non-fatal * page fault. */ int kcopy(const void *src, void *dst, size_t len) { struct thread *td; faultbuf env, *oldfault; int rv; td = PCPU_GET(curthread); oldfault = td->td_pcb->pcb_onfault; if ((rv = setfault(env)) != 0) { td->td_pcb->pcb_onfault = oldfault; return rv; } memcpy(dst, src, len); td->td_pcb->pcb_onfault = oldfault; return (0); } void asm_panic(char *pstr) { panic(pstr); } int db_trap_glue(struct trapframe *); /* Called from trap_subr.S */ int db_trap_glue(struct trapframe *frame) { if (!(frame->srr1 & PSL_PR) && (frame->exc == EXC_TRC || frame->exc == EXC_RUNMODETRC || (frame->exc == EXC_PGM && (frame->srr1 & 0x20000)) || frame->exc == EXC_BPT || frame->exc == EXC_DSI)) { int type = frame->exc; if (type == EXC_PGM && (frame->srr1 & 0x20000)) { type = T_BREAKPOINT; } return (kdb_trap(type, 0, frame)); } return (0); } Index: head/sys/powerpc/powerpc/machdep.c =================================================================== --- head/sys/powerpc/powerpc/machdep.c (revision 173360) +++ head/sys/powerpc/powerpc/machdep.c (revision 173361) @@ -1,990 +1,990 @@ /*- * Copyright (C) 1995, 1996 Wolfgang Solfrank. * Copyright (C) 1995, 1996 TooLs GmbH. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by TooLs GmbH. * 4. The name of TooLs GmbH may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY TOOLS GMBH ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /*- * Copyright (C) 2001 Benno Rice * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY Benno Rice ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * $NetBSD: machdep.c,v 1.74.2.1 2000/11/01 16:13:48 tv Exp $ */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_ddb.h" #include "opt_kstack_pages.h" #include "opt_msgbuf.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB extern vm_offset_t ksym_start, ksym_end; #endif int cold = 1; struct pcpu __pcpu[MAXCPU]; struct trapframe frame0; vm_offset_t kstack0; vm_offset_t kstack0_phys; char machine[] = "powerpc"; SYSCTL_STRING(_hw, HW_MACHINE, machine, CTLFLAG_RD, machine, 0, ""); static int cacheline_size = CACHELINESIZE; SYSCTL_INT(_machdep, CPU_CACHELINE, cacheline_size, CTLFLAG_RD, &cacheline_size, 0, ""); static void cpu_startup(void *); SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL) void powerpc_init(u_int, u_int, u_int, void *); int save_ofw_mapping(void); int restore_ofw_mapping(void); void install_extint(void (*)(void)); int setfault(faultbuf); /* defined in locore.S */ static int grab_mcontext(struct thread *, mcontext_t *, int); void asm_panic(char *); long Maxmem = 0; long realmem = 0; struct pmap ofw_pmap; extern int ofmsr; struct bat battable[16]; struct kva_md_info kmi; void setPQL2(int *const size, int *const ways); void setPQL2(int *const size, int *const ways) { return; } static void powerpc_ofw_shutdown(void *junk, int howto) { if (howto & RB_HALT) { OF_halt(); } OF_reboot(); } static void cpu_startup(void *dummy) { /* * Good {morning,afternoon,evening,night}. */ cpu_setup(PCPU_GET(cpuid)); /* startrtclock(); */ #ifdef PERFMON perfmon_init(); #endif printf("real memory = %ld (%ld MB)\n", ptoa(physmem), ptoa(physmem) / 1048576); realmem = physmem; /* * Display any holes after the first chunk of extended memory. */ if (bootverbose) { int indx; printf("Physical memory chunk(s):\n"); for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) { int size1 = phys_avail[indx + 1] - phys_avail[indx]; printf("0x%08x - 0x%08x, %d bytes (%d pages)\n", phys_avail[indx], phys_avail[indx + 1] - 1, size1, size1 / PAGE_SIZE); } } vm_ksubmap_init(&kmi); printf("avail memory = %ld (%ld MB)\n", ptoa(cnt.v_free_count), ptoa(cnt.v_free_count) / 1048576); /* * Set up buffers, so they can be used to read disk labels. */ bufinit(); vm_pager_bufferinit(); EVENTHANDLER_REGISTER(shutdown_final, powerpc_ofw_shutdown, 0, SHUTDOWN_PRI_LAST); #ifdef SMP /* * OK, enough kmem_alloc/malloc state should be up, lets get on with it! */ mp_start(); /* fire up the secondaries */ mp_announce(); #endif /* SMP */ } extern char kernel_text[], _end[]; extern void *trapcode, *trapsize; extern void *alitrap, *alisize; extern void *dsitrap, *dsisize; extern void *decrint, *decrsize; extern void *extint, *extsize; extern void *dblow, *dbsize; extern void *vectrap, *vectrapsize; void powerpc_init(u_int startkernel, u_int endkernel, u_int basekernel, void *mdp) { struct pcpu *pc; vm_offset_t end, off; void *kmdp; char *env; end = 0; kmdp = NULL; /* * Parse metadata if present and fetch parameters. Must be done * before console is inited so cninit gets the right value of * boothowto. */ if (mdp != NULL) { preload_metadata = mdp; kmdp = preload_search_by_type("elf kernel"); if (kmdp != NULL) { boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int); kern_envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *); end = MD_FETCH(kmdp, MODINFOMD_KERNEND, vm_offset_t); #ifdef DDB ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t); ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t); #endif } } /* * Init params/tunables that can be overridden by the loader */ init_param1(); /* * Start initializing proc0 and thread0. */ - proc_linkup(&proc0, &thread0); + proc_linkup0(&proc0, &thread0); thread0.td_frame = &frame0; /* * Set up per-cpu data. */ pc = &__pcpu[0]; pcpu_init(pc, 0, sizeof(struct pcpu)); pc->pc_curthread = &thread0; pc->pc_curpcb = thread0.td_pcb; pc->pc_cpuid = 0; __asm __volatile("mtsprg 0, %0" :: "r"(pc)); mutex_init(); /* * Initialize the console before printing anything. */ cninit(); /* * Complain if there is no metadata. */ if (mdp == NULL || kmdp == NULL) { printf("powerpc_init: no loader metadata.\n"); } kdb_init(); kobj_machdep_init(); /* * XXX: Initialize the interrupt tables. * Disable translation in case the vector area * hasn't been mapped (G5) */ mtmsr(mfmsr() & ~(PSL_IR | PSL_DR)); isync(); bcopy(&trapcode, (void *)EXC_RST, (size_t)&trapsize); bcopy(&trapcode, (void *)EXC_MCHK, (size_t)&trapsize); bcopy(&dsitrap, (void *)EXC_DSI, (size_t)&dsisize); bcopy(&trapcode, (void *)EXC_ISI, (size_t)&trapsize); bcopy(&trapcode, (void *)EXC_EXI, (size_t)&trapsize); bcopy(&alitrap, (void *)EXC_ALI, (size_t)&alisize); bcopy(&trapcode, (void *)EXC_PGM, (size_t)&trapsize); bcopy(&trapcode, (void *)EXC_FPU, (size_t)&trapsize); bcopy(&trapcode, (void *)EXC_DECR, (size_t)&trapsize); bcopy(&trapcode, (void *)EXC_SC, (size_t)&trapsize); bcopy(&trapcode, (void *)EXC_TRC, (size_t)&trapsize); bcopy(&trapcode, (void *)EXC_FPA, (size_t)&trapsize); bcopy(&vectrap, (void *)EXC_VEC, (size_t)&vectrapsize); bcopy(&trapcode, (void *)EXC_VECAST, (size_t)&trapsize); bcopy(&trapcode, (void *)EXC_THRM, (size_t)&trapsize); bcopy(&trapcode, (void *)EXC_BPT, (size_t)&trapsize); #ifdef KDB bcopy(&dblow, (void *)EXC_RST, (size_t)&dbsize); bcopy(&dblow, (void *)EXC_MCHK, (size_t)&dbsize); bcopy(&dblow, (void *)EXC_PGM, (size_t)&dbsize); bcopy(&dblow, (void *)EXC_TRC, (size_t)&dbsize); bcopy(&dblow, (void *)EXC_BPT, (size_t)&dbsize); #endif __syncicache(EXC_RSVD, EXC_LAST - EXC_RSVD); /* * Make sure translation has been enabled */ mtmsr(mfmsr() | PSL_IR|PSL_DR|PSL_ME|PSL_RI); isync(); /* * Initialise virtual memory. */ pmap_mmu_install(MMU_TYPE_OEA, 0); /* XXX temporary */ pmap_bootstrap(startkernel, endkernel); /* * Initialize params/tunables that are derived from memsize */ init_param2(physmem); /* * Grab booted kernel's name */ env = getenv("kernelname"); if (env != NULL) { strlcpy(kernelname, env, sizeof(kernelname)); freeenv(env); } /* * Finish setting up thread0. */ thread0.td_kstack = kstack0; thread0.td_pcb = (struct pcb *) (thread0.td_kstack + KSTACK_PAGES * PAGE_SIZE) - 1; /* * Map and initialise the message buffer. */ for (off = 0; off < round_page(MSGBUF_SIZE); off += PAGE_SIZE) pmap_kenter((vm_offset_t)msgbufp + off, msgbuf_phys + off); msgbufinit(msgbufp, MSGBUF_SIZE); #ifdef KDB if (boothowto & RB_KDB) kdb_enter("Boot flags requested debugger"); #endif } void bzero(void *buf, size_t len) { caddr_t p; p = buf; while (((vm_offset_t) p & (sizeof(u_long) - 1)) && len) { *p++ = 0; len--; } while (len >= sizeof(u_long) * 8) { *(u_long*) p = 0; *((u_long*) p + 1) = 0; *((u_long*) p + 2) = 0; *((u_long*) p + 3) = 0; len -= sizeof(u_long) * 8; *((u_long*) p + 4) = 0; *((u_long*) p + 5) = 0; *((u_long*) p + 6) = 0; *((u_long*) p + 7) = 0; p += sizeof(u_long) * 8; } while (len >= sizeof(u_long)) { *(u_long*) p = 0; len -= sizeof(u_long); p += sizeof(u_long); } while (len) { *p++ = 0; len--; } } void sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) { struct trapframe *tf; struct sigframe *sfp; struct sigacts *psp; struct sigframe sf; struct thread *td; struct proc *p; int oonstack, rndfsize; int sig; int code; td = curthread; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); sig = ksi->ksi_signo; code = ksi->ksi_code; psp = p->p_sigacts; mtx_assert(&psp->ps_mtx, MA_OWNED); tf = td->td_frame; oonstack = sigonstack(tf->fixreg[1]); rndfsize = ((sizeof(sf) + 15) / 16) * 16; CTR4(KTR_SIG, "sendsig: td=%p (%s) catcher=%p sig=%d", td, p->p_comm, catcher, sig); /* * Save user context */ memset(&sf, 0, sizeof(sf)); grab_mcontext(td, &sf.sf_uc.uc_mcontext, 0); sf.sf_uc.uc_sigmask = *mask; sf.sf_uc.uc_stack = td->td_sigstk; sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE; sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0; /* * Allocate and validate space for the signal handler context. */ if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { sfp = (struct sigframe *)(td->td_sigstk.ss_sp + td->td_sigstk.ss_size - rndfsize); } else { sfp = (struct sigframe *)(tf->fixreg[1] - rndfsize); } /* * Translate the signal if appropriate (Linux emu ?) */ if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize) sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)]; /* * Save the floating-point state, if necessary, then copy it. */ /* XXX */ /* * Set up the registers to return to sigcode. * * r1/sp - sigframe ptr * lr - sig function, dispatched to by blrl in trampoline * r3 - sig number * r4 - SIGINFO ? &siginfo : exception code * r5 - user context * srr0 - trampoline function addr */ tf->lr = (register_t)catcher; tf->fixreg[1] = (register_t)sfp; tf->fixreg[FIRSTARG] = sig; tf->fixreg[FIRSTARG+2] = (register_t)&sfp->sf_uc; if (SIGISMEMBER(psp->ps_siginfo, sig)) { /* * Signal handler installed with SA_SIGINFO. */ tf->fixreg[FIRSTARG+1] = (register_t)&sfp->sf_si; /* * Fill siginfo structure. */ sf.sf_si = ksi->ksi_info; sf.sf_si.si_signo = sig; sf.sf_si.si_addr = (void *) ((tf->exc == EXC_DSI) ? tf->dar : tf->srr0); } else { /* Old FreeBSD-style arguments. */ tf->fixreg[FIRSTARG+1] = code; tf->fixreg[FIRSTARG+3] = (tf->exc == EXC_DSI) ? tf->dar : tf->srr0; } mtx_unlock(&psp->ps_mtx); PROC_UNLOCK(p); tf->srr0 = (register_t)(PS_STRINGS - *(p->p_sysent->sv_szsigcode)); /* * copy the frame out to userland. */ if (copyout(&sf, sfp, sizeof(*sfp)) != 0) { /* * Process has trashed its stack. Kill it. */ CTR2(KTR_SIG, "sendsig: sigexit td=%p sfp=%p", td, sfp); PROC_LOCK(p); sigexit(td, SIGILL); } CTR3(KTR_SIG, "sendsig: return td=%p pc=%#x sp=%#x", td, tf->srr0, tf->fixreg[1]); PROC_LOCK(p); mtx_lock(&psp->ps_mtx); } int sigreturn(struct thread *td, struct sigreturn_args *uap) { struct proc *p; ucontext_t uc; int error; CTR2(KTR_SIG, "sigreturn: td=%p ucp=%p", td, uap->sigcntxp); if (copyin(uap->sigcntxp, &uc, sizeof(uc)) != 0) { CTR1(KTR_SIG, "sigreturn: efault td=%p", td); return (EFAULT); } error = set_mcontext(td, &uc.uc_mcontext); if (error != 0) return (error); p = td->td_proc; PROC_LOCK(p); td->td_sigmask = uc.uc_sigmask; SIG_CANTMASK(td->td_sigmask); signotify(td); PROC_UNLOCK(p); CTR3(KTR_SIG, "sigreturn: return td=%p pc=%#x sp=%#x", td, uc.uc_mcontext.mc_srr0, uc.uc_mcontext.mc_gpr[1]); return (EJUSTRETURN); } #ifdef COMPAT_FREEBSD4 int freebsd4_sigreturn(struct thread *td, struct freebsd4_sigreturn_args *uap) { return sigreturn(td, (struct sigreturn_args *)uap); } #endif /* * Construct a PCB from a trapframe. This is called from kdb_trap() where * we want to start a backtrace from the function that caused us to enter * the debugger. We have the context in the trapframe, but base the trace * on the PCB. The PCB doesn't have to be perfect, as long as it contains * enough for a backtrace. */ void makectx(struct trapframe *tf, struct pcb *pcb) { pcb->pcb_lr = tf->srr0; pcb->pcb_sp = tf->fixreg[1]; } /* * get_mcontext/sendsig helper routine that doesn't touch the * proc lock */ static int grab_mcontext(struct thread *td, mcontext_t *mcp, int flags) { struct pcb *pcb; pcb = td->td_pcb; memset(mcp, 0, sizeof(mcontext_t)); mcp->mc_vers = _MC_VERSION; mcp->mc_flags = 0; memcpy(&mcp->mc_frame, td->td_frame, sizeof(struct trapframe)); if (flags & GET_MC_CLEAR_RET) { mcp->mc_gpr[3] = 0; mcp->mc_gpr[4] = 0; } /* * This assumes that floating-point context is *not* lazy, * so if the thread has used FP there would have been a * FP-unavailable exception that would have set things up * correctly. */ if (pcb->pcb_flags & PCB_FPU) { KASSERT(td == curthread, ("get_mcontext: fp save not curthread")); critical_enter(); save_fpu(td); critical_exit(); mcp->mc_flags |= _MC_FP_VALID; memcpy(&mcp->mc_fpscr, &pcb->pcb_fpu.fpscr, sizeof(double)); memcpy(mcp->mc_fpreg, pcb->pcb_fpu.fpr, 32*sizeof(double)); } /* XXX Altivec context ? */ mcp->mc_len = sizeof(*mcp); return (0); } int get_mcontext(struct thread *td, mcontext_t *mcp, int flags) { int error; error = grab_mcontext(td, mcp, flags); if (error == 0) { PROC_LOCK(curthread->td_proc); mcp->mc_onstack = sigonstack(td->td_frame->fixreg[1]); PROC_UNLOCK(curthread->td_proc); } return (error); } int set_mcontext(struct thread *td, const mcontext_t *mcp) { struct pcb *pcb; struct trapframe *tf; pcb = td->td_pcb; tf = td->td_frame; if (mcp->mc_vers != _MC_VERSION || mcp->mc_len != sizeof(*mcp)) return (EINVAL); /* * Don't let the user set privileged MSR bits */ if ((mcp->mc_srr1 & PSL_USERSTATIC) != (tf->srr1 & PSL_USERSTATIC)) { return (EINVAL); } memcpy(tf, mcp->mc_frame, sizeof(mcp->mc_frame)); if (mcp->mc_flags & _MC_FP_VALID) { if ((pcb->pcb_flags & PCB_FPU) != PCB_FPU) { critical_enter(); enable_fpu(td); critical_exit(); } memcpy(&pcb->pcb_fpu.fpscr, &mcp->mc_fpscr, sizeof(double)); memcpy(pcb->pcb_fpu.fpr, mcp->mc_fpreg, 32*sizeof(double)); } /* XXX Altivec context? */ return (0); } void cpu_boot(int howto) { } void cpu_initclocks(void) { decr_init(); } /* Get current clock frequency for the given cpu id. */ int cpu_est_clockrate(int cpu_id, uint64_t *rate) { return (ENXIO); } /* * Shutdown the CPU as much as possible. */ void cpu_halt(void) { OF_exit(); } void cpu_idle(void) { /* TODO: Insert code to halt (until next interrupt) */ #ifdef INVARIANTS if ((mfmsr() & PSL_EE) != PSL_EE) { struct thread *td = curthread; printf("td msr %x\n", td->td_md.md_saved_msr); panic("ints disabled in idleproc!"); } #endif } /* * Set set up registers on exec. */ void exec_setregs(struct thread *td, u_long entry, u_long stack, u_long ps_strings) { struct trapframe *tf; struct ps_strings arginfo; tf = trapframe(td); bzero(tf, sizeof *tf); tf->fixreg[1] = -roundup(-stack + 8, 16); /* * XXX Machine-independent code has already copied arguments and * XXX environment to userland. Get them back here. */ (void)copyin((char *)PS_STRINGS, &arginfo, sizeof(arginfo)); /* * Set up arguments for _start(): * _start(argc, argv, envp, obj, cleanup, ps_strings); * * Notes: * - obj and cleanup are the auxilliary and termination * vectors. They are fixed up by ld.elf_so. * - ps_strings is a NetBSD extention, and will be * ignored by executables which are strictly * compliant with the SVR4 ABI. * * XXX We have to set both regs and retval here due to different * XXX calling convention in trap.c and init_main.c. */ /* * XXX PG: these get overwritten in the syscall return code. * execve() should return EJUSTRETURN, like it does on NetBSD. * Emulate by setting the syscall return value cells. The * registers still have to be set for init's fork trampoline. */ td->td_retval[0] = arginfo.ps_nargvstr; td->td_retval[1] = (register_t)arginfo.ps_argvstr; tf->fixreg[3] = arginfo.ps_nargvstr; tf->fixreg[4] = (register_t)arginfo.ps_argvstr; tf->fixreg[5] = (register_t)arginfo.ps_envstr; tf->fixreg[6] = 0; /* auxillary vector */ tf->fixreg[7] = 0; /* termination vector */ tf->fixreg[8] = (register_t)PS_STRINGS; /* NetBSD extension */ tf->srr0 = entry; tf->srr1 = PSL_MBO | PSL_USERSET | PSL_FE_DFLT; td->td_pcb->pcb_flags = 0; } int fill_regs(struct thread *td, struct reg *regs) { struct trapframe *tf; tf = td->td_frame; memcpy(regs, tf, sizeof(struct reg)); return (0); } int fill_dbregs(struct thread *td, struct dbreg *dbregs) { /* No debug registers on PowerPC */ return (ENOSYS); } int fill_fpregs(struct thread *td, struct fpreg *fpregs) { struct pcb *pcb; pcb = td->td_pcb; if ((pcb->pcb_flags & PCB_FPU) == 0) memset(fpregs, 0, sizeof(struct fpreg)); else memcpy(fpregs, &pcb->pcb_fpu, sizeof(struct fpreg)); return (0); } int set_regs(struct thread *td, struct reg *regs) { struct trapframe *tf; tf = td->td_frame; memcpy(tf, regs, sizeof(struct reg)); return (0); } int set_dbregs(struct thread *td, struct dbreg *dbregs) { /* No debug registers on PowerPC */ return (ENOSYS); } int set_fpregs(struct thread *td, struct fpreg *fpregs) { struct pcb *pcb; pcb = td->td_pcb; if ((pcb->pcb_flags & PCB_FPU) == 0) enable_fpu(td); memcpy(&pcb->pcb_fpu, fpregs, sizeof(struct fpreg)); return (0); } int ptrace_set_pc(struct thread *td, unsigned long addr) { struct trapframe *tf; tf = td->td_frame; tf->srr0 = (register_t)addr; return (0); } int ptrace_single_step(struct thread *td) { struct trapframe *tf; tf = td->td_frame; tf->srr1 |= PSL_SE; return (0); } int ptrace_clear_single_step(struct thread *td) { struct trapframe *tf; tf = td->td_frame; tf->srr1 &= ~PSL_SE; return (0); } /* * Initialise a struct pcpu. */ void cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t sz) { } void spinlock_enter(void) { struct thread *td; td = curthread; if (td->td_md.md_spinlock_count == 0) td->td_md.md_saved_msr = intr_disable(); td->td_md.md_spinlock_count++; critical_enter(); } void spinlock_exit(void) { struct thread *td; td = curthread; critical_exit(); td->td_md.md_spinlock_count--; if (td->td_md.md_spinlock_count == 0) intr_restore(td->td_md.md_saved_msr); } /* * kcopy(const void *src, void *dst, size_t len); * * Copy len bytes from src to dst, aborting if we encounter a fatal * page fault. * * kcopy() _must_ save and restore the old fault handler since it is * called by uiomove(), which may be in the path of servicing a non-fatal * page fault. */ int kcopy(const void *src, void *dst, size_t len) { struct thread *td; faultbuf env, *oldfault; int rv; td = PCPU_GET(curthread); oldfault = td->td_pcb->pcb_onfault; if ((rv = setfault(env)) != 0) { td->td_pcb->pcb_onfault = oldfault; return rv; } memcpy(dst, src, len); td->td_pcb->pcb_onfault = oldfault; return (0); } void asm_panic(char *pstr) { panic(pstr); } int db_trap_glue(struct trapframe *); /* Called from trap_subr.S */ int db_trap_glue(struct trapframe *frame) { if (!(frame->srr1 & PSL_PR) && (frame->exc == EXC_TRC || frame->exc == EXC_RUNMODETRC || (frame->exc == EXC_PGM && (frame->srr1 & 0x20000)) || frame->exc == EXC_BPT || frame->exc == EXC_DSI)) { int type = frame->exc; if (type == EXC_PGM && (frame->srr1 & 0x20000)) { type = T_BREAKPOINT; } return (kdb_trap(type, 0, frame)); } return (0); } Index: head/sys/powerpc/powerpc/pmap_dispatch.c =================================================================== --- head/sys/powerpc/powerpc/pmap_dispatch.c (revision 173360) +++ head/sys/powerpc/powerpc/pmap_dispatch.c (revision 173361) @@ -1,382 +1,383 @@ /*- * Copyright (c) 2005 Peter Grehan * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ #include __FBSDID("$FreeBSD$"); /* * Dispatch MI pmap calls to the appropriate MMU implementation * through a previously registered kernel object. * * Before pmap_bootstrap() can be called, a CPU module must have * called pmap_mmu_install(). This may be called multiple times: * the highest priority call will be installed as the default * MMU handler when pmap_bootstrap() is called. * * It is required that kobj_machdep_init() be called before * pmap_bootstrap() to allow the kobj subsystem to initialise. This * in turn requires that mutex_init() has been called. */ #include #include #include #include #include #include #include #include #include "mmu_if.h" static mmu_def_t *mmu_def_impl; static mmu_t mmu_obj; static struct mmu_kobj mmu_kernel_obj; static struct kobj_ops mmu_kernel_kops; /* * pmap globals */ struct pmap kernel_pmap_store; struct msgbuf *msgbufp; vm_offset_t msgbuf_phys; vm_offset_t kernel_vm_end; vm_offset_t phys_avail[PHYS_AVAIL_SZ]; vm_offset_t virtual_avail; vm_offset_t virtual_end; int pmap_bootstrapped; void pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired) { MMU_CHANGE_WIRING(mmu_obj, pmap, va, wired); } void pmap_clear_modify(vm_page_t m) { MMU_CLEAR_MODIFY(mmu_obj, m); } void pmap_clear_reference(vm_page_t m) { MMU_CLEAR_REFERENCE(mmu_obj, m); } void pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, vm_offset_t src_addr) { MMU_COPY(mmu_obj, dst_pmap, src_pmap, dst_addr, len, src_addr); } void pmap_copy_page(vm_page_t src, vm_page_t dst) { MMU_COPY_PAGE(mmu_obj, src, dst); } void pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t p, vm_prot_t prot, boolean_t wired) { MMU_ENTER(mmu_obj, pmap, va, p, prot, wired); } void pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, vm_page_t m_start, vm_prot_t prot) { MMU_ENTER_OBJECT(mmu_obj, pmap, start, end, m_start, prot); } void pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) { MMU_ENTER_QUICK(mmu_obj, pmap, va, m, prot); } vm_paddr_t pmap_extract(pmap_t pmap, vm_offset_t va) { return (MMU_EXTRACT(mmu_obj, pmap, va)); } vm_page_t pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) { return (MMU_EXTRACT_AND_HOLD(mmu_obj, pmap, va, prot)); } void pmap_growkernel(vm_offset_t va) { MMU_GROWKERNEL(mmu_obj, va); } void pmap_init(void) { MMU_INIT(mmu_obj); } boolean_t pmap_is_modified(vm_page_t m) { return (MMU_IS_MODIFIED(mmu_obj, m)); } boolean_t pmap_is_prefaultable(pmap_t pmap, vm_offset_t va) { return (MMU_IS_PREFAULTABLE(mmu_obj, pmap, va)); } boolean_t pmap_ts_referenced(vm_page_t m) { return (MMU_TS_REFERENCED(mmu_obj, m)); } vm_offset_t pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) { return (MMU_MAP(mmu_obj, virt, start, end, prot)); } void pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, vm_pindex_t pindex, vm_size_t size) { MMU_OBJECT_INIT_PT(mmu_obj, pmap, addr, object, pindex, size); } boolean_t pmap_page_exists_quick(pmap_t pmap, vm_page_t m) { return (MMU_PAGE_EXISTS_QUICK(mmu_obj, pmap, m)); } void pmap_page_init(vm_page_t m) { MMU_PAGE_INIT(mmu_obj, m); } -void +int pmap_pinit(pmap_t pmap) { MMU_PINIT(mmu_obj, pmap); + return (1); } void pmap_pinit0(pmap_t pmap) { MMU_PINIT0(mmu_obj, pmap); } void pmap_protect(pmap_t pmap, vm_offset_t start, vm_offset_t end, vm_prot_t prot) { MMU_PROTECT(mmu_obj, pmap, start, end, prot); } void pmap_qenter(vm_offset_t start, vm_page_t *m, int count) { MMU_QENTER(mmu_obj, start, m, count); } void pmap_qremove(vm_offset_t start, int count) { MMU_QREMOVE(mmu_obj, start, count); } void pmap_release(pmap_t pmap) { MMU_RELEASE(mmu_obj, pmap); } void pmap_remove(pmap_t pmap, vm_offset_t start, vm_offset_t end) { MMU_REMOVE(mmu_obj, pmap, start, end); } void pmap_remove_all(vm_page_t m) { MMU_REMOVE_ALL(mmu_obj, m); } void pmap_remove_pages(pmap_t pmap) { MMU_REMOVE_PAGES(mmu_obj, pmap); } void pmap_remove_write(vm_page_t m) { MMU_REMOVE_WRITE(mmu_obj, m); } void pmap_zero_page(vm_page_t m) { MMU_ZERO_PAGE(mmu_obj, m); } void pmap_zero_page_area(vm_page_t m, int off, int size) { MMU_ZERO_PAGE_AREA(mmu_obj, m, off, size); } void pmap_zero_page_idle(vm_page_t m) { MMU_ZERO_PAGE_IDLE(mmu_obj, m); } int pmap_mincore(pmap_t pmap, vm_offset_t addr) { return (MMU_MINCORE(mmu_obj, pmap, addr)); } void pmap_activate(struct thread *td) { MMU_ACTIVATE(mmu_obj, td); } void pmap_deactivate(struct thread *td) { MMU_DEACTIVATE(mmu_obj, td); } vm_offset_t pmap_addr_hint(vm_object_t obj, vm_offset_t addr, vm_size_t size) { return (MMU_ADDR_HINT(mmu_obj, obj, addr, size)); } /* * Routines used in machine-dependent code */ void pmap_bootstrap(vm_offset_t start, vm_offset_t end) { mmu_obj = &mmu_kernel_obj; /* * Take care of compiling the selected class, and * then statically initialise the MMU object */ kobj_class_compile_static(mmu_def_impl, &mmu_kernel_kops); kobj_init((kobj_t)mmu_obj, mmu_def_impl); MMU_BOOTSTRAP(mmu_obj, start, end); } void * pmap_mapdev(vm_offset_t pa, vm_size_t size) { return (MMU_MAPDEV(mmu_obj, pa, size)); } void pmap_unmapdev(vm_offset_t va, vm_size_t size) { MMU_UNMAPDEV(mmu_obj, va, size); } vm_offset_t pmap_kextract(vm_offset_t va) { return (MMU_KEXTRACT(mmu_obj, va)); } void pmap_kenter(vm_offset_t va, vm_offset_t pa) { MMU_KENTER(mmu_obj, va, pa); } boolean_t pmap_dev_direct_mapped(vm_offset_t pa, vm_size_t size) { return (MMU_DEV_DIRECT_MAPPED(mmu_obj, pa, size)); } boolean_t pmap_page_executable(vm_page_t pg) { return (MMU_PAGE_EXECUTABLE(mmu_obj, pg)); } /* * MMU install routines. Highest priority wins, equal priority also * overrides allowing last-set to win. */ SET_DECLARE(mmu_set, mmu_def_t); boolean_t pmap_mmu_install(char *name, int prio) { mmu_def_t **mmupp, *mmup; static int curr_prio = 0; /* * Try and locate the MMU kobj corresponding to the name */ SET_FOREACH(mmupp, mmu_set) { mmup = *mmupp; if (mmup->name && !strcmp(mmup->name, name) && prio >= curr_prio) { curr_prio = prio; mmu_def_impl = mmup; return (TRUE); } } return (FALSE); } Index: head/sys/sparc64/sparc64/machdep.c =================================================================== --- head/sys/sparc64/sparc64/machdep.c (revision 173360) +++ head/sys/sparc64/sparc64/machdep.c (revision 173361) @@ -1,912 +1,912 @@ /*- * Copyright (c) 2001 Jake Burkholder. * Copyright (c) 1992 Terrence R. Lambert. * Copyright (c) 1982, 1987, 1990 The Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91 * from: FreeBSD: src/sys/i386/i386/machdep.c,v 1.477 2001/08/27 */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_ddb.h" #include "opt_kstack_pages.h" #include "opt_msgbuf.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include typedef int ofw_vec_t(void *); #ifdef DDB extern vm_offset_t ksym_start, ksym_end; #endif struct tlb_entry *kernel_tlbs; int kernel_tlb_slots; int cold = 1; long Maxmem; long realmem; char pcpu0[PCPU_PAGES * PAGE_SIZE]; struct trapframe frame0; vm_offset_t kstack0; vm_paddr_t kstack0_phys; struct kva_md_info kmi; u_long ofw_vec; u_long ofw_tba; /* * Note: timer quality for CPU's is set low to try and prevent them from * being chosen as the primary timecounter. The CPU counters are not * synchronized among the CPU's so in MP machines this causes problems * when calculating the time. With this value the CPU's should only be * chosen as the primary timecounter as a last resort. */ #define UP_TICK_QUALITY 1000 #define MP_TICK_QUALITY -100 static struct timecounter tick_tc; char sparc64_model[32]; static int cpu_use_vis = 1; cpu_block_copy_t *cpu_block_copy; cpu_block_zero_t *cpu_block_zero; static timecounter_get_t tick_get_timecount; void sparc64_init(caddr_t mdp, u_long o1, u_long o2, u_long o3, ofw_vec_t *vec); void sparc64_shutdown_final(void *dummy, int howto); static void cpu_startup(void *); SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL); CTASSERT((1 << INT_SHIFT) == sizeof(int)); CTASSERT((1 << PTR_SHIFT) == sizeof(char *)); CTASSERT(sizeof(struct reg) == 256); CTASSERT(sizeof(struct fpreg) == 272); CTASSERT(sizeof(struct __mcontext) == 512); CTASSERT((sizeof(struct pcb) & (64 - 1)) == 0); CTASSERT((offsetof(struct pcb, pcb_kfp) & (64 - 1)) == 0); CTASSERT((offsetof(struct pcb, pcb_ufp) & (64 - 1)) == 0); CTASSERT(sizeof(struct pcb) <= ((KSTACK_PAGES * PAGE_SIZE) / 8)); CTASSERT(sizeof(struct pcpu) <= ((PCPU_PAGES * PAGE_SIZE) / 2)); static void cpu_startup(void *arg) { vm_paddr_t physsz; int i; tick_tc.tc_get_timecount = tick_get_timecount; tick_tc.tc_poll_pps = NULL; tick_tc.tc_counter_mask = ~0u; tick_tc.tc_frequency = tick_freq; tick_tc.tc_name = "tick"; tick_tc.tc_quality = UP_TICK_QUALITY; #ifdef SMP /* * We do not know if each CPU's tick counter is synchronized. */ if (cpu_mp_probe()) tick_tc.tc_quality = MP_TICK_QUALITY; #endif tc_init(&tick_tc); physsz = 0; for (i = 0; i < sparc64_nmemreg; i++) physsz += sparc64_memreg[i].mr_size; printf("real memory = %lu (%lu MB)\n", physsz, physsz / (1024 * 1024)); realmem = (long)physsz / PAGE_SIZE; vm_ksubmap_init(&kmi); bufinit(); vm_pager_bufferinit(); EVENTHANDLER_REGISTER(shutdown_final, sparc64_shutdown_final, NULL, SHUTDOWN_PRI_LAST); printf("avail memory = %lu (%lu MB)\n", cnt.v_free_count * PAGE_SIZE, cnt.v_free_count / ((1024 * 1024) / PAGE_SIZE)); if (bootverbose) printf("machine: %s\n", sparc64_model); cpu_identify(rdpr(ver), tick_freq, PCPU_GET(cpuid)); } void cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size) { struct intr_request *ir; int i; pcpu->pc_irtail = &pcpu->pc_irhead; for (i = 0; i < IR_FREE; i++) { ir = &pcpu->pc_irpool[i]; ir->ir_next = pcpu->pc_irfree; pcpu->pc_irfree = ir; } } void spinlock_enter(void) { struct thread *td; register_t pil; td = curthread; if (td->td_md.md_spinlock_count == 0) { pil = rdpr(pil); wrpr(pil, 0, PIL_TICK); td->td_md.md_saved_pil = pil; } td->td_md.md_spinlock_count++; critical_enter(); } void spinlock_exit(void) { struct thread *td; td = curthread; critical_exit(); td->td_md.md_spinlock_count--; if (td->td_md.md_spinlock_count == 0) wrpr(pil, td->td_md.md_saved_pil, 0); } unsigned tick_get_timecount(struct timecounter *tc) { return ((unsigned)rd(tick)); } void sparc64_init(caddr_t mdp, u_long o1, u_long o2, u_long o3, ofw_vec_t *vec) { phandle_t child; phandle_t root; struct pcpu *pc; vm_offset_t end; caddr_t kmdp; u_int clock; char *env; char type[8]; end = 0; kmdp = NULL; /* * Find out what kind of cpu we have first, for anything that changes * behaviour. */ cpu_impl = VER_IMPL(rdpr(ver)); /* * Initialize Open Firmware (needed for console). */ OF_init(vec); /* * Parse metadata if present and fetch parameters. Must be before the * console is inited so cninit gets the right value of boothowto. */ if (mdp != NULL) { preload_metadata = mdp; kmdp = preload_search_by_type("elf kernel"); if (kmdp != NULL) { boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int); kern_envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *); end = MD_FETCH(kmdp, MODINFOMD_KERNEND, vm_offset_t); kernel_tlb_slots = MD_FETCH(kmdp, MODINFOMD_DTLB_SLOTS, int); kernel_tlbs = (void *)preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_DTLB); } } init_param1(); root = OF_peer(0); for (child = OF_child(root); child != 0; child = OF_peer(child)) { OF_getprop(child, "device_type", type, sizeof(type)); if (strcmp(type, "cpu") == 0) break; } /* * Initialize the tick counter. Must be before the console is inited * in order to provide the low-level console drivers with a working * DELAY(). */ OF_getprop(child, "clock-frequency", &clock, sizeof(clock)); tick_init(clock); /* * Initialize the console before printing anything. */ cninit(); /* * Panic if there is no metadata. Most likely the kernel was booted * directly, instead of through loader(8). */ if (mdp == NULL || kmdp == NULL) { printf("sparc64_init: no loader metadata.\n" "This probably means you are not using loader(8).\n"); panic("sparc64_init"); } /* * Sanity check the kernel end, which is important. */ if (end == 0) { printf("sparc64_init: warning, kernel end not specified.\n" "Attempting to continue anyway.\n"); end = (vm_offset_t)_end; } cache_init(child); uma_set_align(cache.dc_linesize - 1); cpu_block_copy = bcopy; cpu_block_zero = bzero; getenv_int("machdep.use_vis", &cpu_use_vis); if (cpu_use_vis) { switch (cpu_impl) { case CPU_IMPL_SPARC64: case CPU_IMPL_ULTRASPARCI: case CPU_IMPL_ULTRASPARCII: case CPU_IMPL_ULTRASPARCIIi: case CPU_IMPL_ULTRASPARCIIe: cpu_block_copy = spitfire_block_copy; cpu_block_zero = spitfire_block_zero; break; } } #ifdef SMP mp_init(); #endif /* * Initialize virtual memory and calculate physmem. */ pmap_bootstrap(end); /* * Initialize tunables. */ init_param2(physmem); env = getenv("kernelname"); if (env != NULL) { strlcpy(kernelname, env, sizeof(kernelname)); freeenv(env); } /* * Initialize the interrupt tables. */ intr_init1(); /* * Initialize proc0 stuff (p_contested needs to be done early). */ - proc_linkup(&proc0, &thread0); + proc_linkup0(&proc0, &thread0); proc0.p_md.md_sigtramp = NULL; proc0.p_md.md_utrap = NULL; thread0.td_kstack = kstack0; thread0.td_pcb = (struct pcb *) (thread0.td_kstack + KSTACK_PAGES * PAGE_SIZE) - 1; frame0.tf_tstate = TSTATE_IE | TSTATE_PEF | TSTATE_PRIV; thread0.td_frame = &frame0; /* * Prime our per-cpu data page for use. Note, we are using it for our * stack, so don't pass the real size (PAGE_SIZE) to pcpu_init or * it'll zero it out from under us. */ pc = (struct pcpu *)(pcpu0 + (PCPU_PAGES * PAGE_SIZE)) - 1; pcpu_init(pc, 0, sizeof(struct pcpu)); pc->pc_curthread = &thread0; pc->pc_curpcb = thread0.td_pcb; pc->pc_mid = UPA_CR_GET_MID(ldxa(0, ASI_UPA_CONFIG_REG)); pc->pc_addr = (vm_offset_t)pcpu0; pc->pc_node = child; pc->pc_tlb_ctx = TLB_CTX_USER_MIN; pc->pc_tlb_ctx_min = TLB_CTX_USER_MIN; pc->pc_tlb_ctx_max = TLB_CTX_USER_MAX; /* * Initialize global registers. */ cpu_setregs(pc); /* * Initialize the message buffer (after setting trap table). */ msgbufinit(msgbufp, MSGBUF_SIZE); mutex_init(); intr_init2(); /* * Finish pmap initialization now that we're ready for mutexes. */ PMAP_LOCK_INIT(kernel_pmap); OF_getprop(root, "name", sparc64_model, sizeof(sparc64_model) - 1); kdb_init(); #ifdef KDB if (boothowto & RB_KDB) kdb_enter("Boot flags requested debugger"); #endif } void set_openfirm_callback(ofw_vec_t *vec) { ofw_tba = rdpr(tba); ofw_vec = (u_long)vec; } void sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) { struct trapframe *tf; struct sigframe *sfp; struct sigacts *psp; struct sigframe sf; struct thread *td; struct frame *fp; struct proc *p; int oonstack; u_long sp; int sig; oonstack = 0; td = curthread; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); sig = ksi->ksi_signo; psp = p->p_sigacts; mtx_assert(&psp->ps_mtx, MA_OWNED); tf = td->td_frame; sp = tf->tf_sp + SPOFF; oonstack = sigonstack(sp); CTR4(KTR_SIG, "sendsig: td=%p (%s) catcher=%p sig=%d", td, p->p_comm, catcher, sig); /* Make sure we have a signal trampoline to return to. */ if (p->p_md.md_sigtramp == NULL) { /* * No signal trampoline... kill the process. */ CTR0(KTR_SIG, "sendsig: no sigtramp"); printf("sendsig: %s is too old, rebuild it\n", p->p_comm); sigexit(td, sig); /* NOTREACHED */ } /* Save user context. */ bzero(&sf, sizeof(sf)); get_mcontext(td, &sf.sf_uc.uc_mcontext, 0); sf.sf_uc.uc_sigmask = *mask; sf.sf_uc.uc_stack = td->td_sigstk; sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE; /* Allocate and validate space for the signal handler context. */ if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { sfp = (struct sigframe *)(td->td_sigstk.ss_sp + td->td_sigstk.ss_size - sizeof(struct sigframe)); } else sfp = (struct sigframe *)sp - 1; mtx_unlock(&psp->ps_mtx); PROC_UNLOCK(p); fp = (struct frame *)sfp - 1; /* Translate the signal if appropriate. */ if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize) sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)]; /* Build the argument list for the signal handler. */ tf->tf_out[0] = sig; tf->tf_out[2] = (register_t)&sfp->sf_uc; tf->tf_out[4] = (register_t)catcher; if (SIGISMEMBER(psp->ps_siginfo, sig)) { /* Signal handler installed with SA_SIGINFO. */ tf->tf_out[1] = (register_t)&sfp->sf_si; /* Fill in POSIX parts. */ sf.sf_si = ksi->ksi_info; sf.sf_si.si_signo = sig; /* maybe a translated signal */ } else { /* Old FreeBSD-style arguments. */ tf->tf_out[1] = ksi->ksi_code; tf->tf_out[3] = (register_t)ksi->ksi_addr; } /* Copy the sigframe out to the user's stack. */ if (rwindow_save(td) != 0 || copyout(&sf, sfp, sizeof(*sfp)) != 0 || suword(&fp->fr_in[6], tf->tf_out[6]) != 0) { /* * Something is wrong with the stack pointer. * ...Kill the process. */ CTR2(KTR_SIG, "sendsig: sigexit td=%p sfp=%p", td, sfp); PROC_LOCK(p); sigexit(td, SIGILL); /* NOTREACHED */ } tf->tf_tpc = (u_long)p->p_md.md_sigtramp; tf->tf_tnpc = tf->tf_tpc + 4; tf->tf_sp = (u_long)fp - SPOFF; CTR3(KTR_SIG, "sendsig: return td=%p pc=%#lx sp=%#lx", td, tf->tf_tpc, tf->tf_sp); PROC_LOCK(p); mtx_lock(&psp->ps_mtx); } #ifndef _SYS_SYSPROTO_H_ struct sigreturn_args { ucontext_t *ucp; }; #endif /* * MPSAFE */ int sigreturn(struct thread *td, struct sigreturn_args *uap) { struct proc *p; mcontext_t *mc; ucontext_t uc; int error; p = td->td_proc; if (rwindow_save(td)) { PROC_LOCK(p); sigexit(td, SIGILL); } CTR2(KTR_SIG, "sigreturn: td=%p ucp=%p", td, uap->sigcntxp); if (copyin(uap->sigcntxp, &uc, sizeof(uc)) != 0) { CTR1(KTR_SIG, "sigreturn: efault td=%p", td); return (EFAULT); } mc = &uc.uc_mcontext; error = set_mcontext(td, mc); if (error != 0) return (error); PROC_LOCK(p); td->td_sigmask = uc.uc_sigmask; SIG_CANTMASK(td->td_sigmask); signotify(td); PROC_UNLOCK(p); CTR4(KTR_SIG, "sigreturn: return td=%p pc=%#lx sp=%#lx tstate=%#lx", td, mc->mc_tpc, mc->mc_sp, mc->mc_tstate); return (EJUSTRETURN); } #ifdef COMPAT_FREEBSD4 int freebsd4_sigreturn(struct thread *td, struct freebsd4_sigreturn_args *uap) { return sigreturn(td, (struct sigreturn_args *)uap); } #endif /* * Construct a PCB from a trapframe. This is called from kdb_trap() where * we want to start a backtrace from the function that caused us to enter * the debugger. We have the context in the trapframe, but base the trace * on the PCB. The PCB doesn't have to be perfect, as long as it contains * enough for a backtrace. */ void makectx(struct trapframe *tf, struct pcb *pcb) { pcb->pcb_pc = tf->tf_tpc; pcb->pcb_sp = tf->tf_sp; } int get_mcontext(struct thread *td, mcontext_t *mc, int flags) { struct trapframe *tf; struct pcb *pcb; tf = td->td_frame; pcb = td->td_pcb; bcopy(tf, mc, sizeof(*tf)); if (flags & GET_MC_CLEAR_RET) { mc->mc_out[0] = 0; mc->mc_out[1] = 0; } mc->mc_flags = _MC_VERSION; critical_enter(); if ((tf->tf_fprs & FPRS_FEF) != 0) { savefpctx(pcb->pcb_ufp); tf->tf_fprs &= ~FPRS_FEF; pcb->pcb_flags |= PCB_FEF; } if ((pcb->pcb_flags & PCB_FEF) != 0) { bcopy(pcb->pcb_ufp, mc->mc_fp, sizeof(mc->mc_fp)); mc->mc_fprs |= FPRS_FEF; } critical_exit(); return (0); } int set_mcontext(struct thread *td, const mcontext_t *mc) { struct trapframe *tf; struct pcb *pcb; uint64_t wstate; if (!TSTATE_SECURE(mc->mc_tstate) || (mc->mc_flags & ((1L << _MC_VERSION_BITS) - 1)) != _MC_VERSION) return (EINVAL); tf = td->td_frame; pcb = td->td_pcb; /* Make sure the windows are spilled first. */ flushw(); wstate = tf->tf_wstate; bcopy(mc, tf, sizeof(*tf)); tf->tf_wstate = wstate; if ((mc->mc_fprs & FPRS_FEF) != 0) { tf->tf_fprs = 0; bcopy(mc->mc_fp, pcb->pcb_ufp, sizeof(pcb->pcb_ufp)); pcb->pcb_flags |= PCB_FEF; } return (0); } /* * Exit the kernel and execute a firmware call that will not return, as * specified by the arguments. */ void cpu_shutdown(void *args) { #ifdef SMP cpu_mp_shutdown(); #endif openfirmware_exit(args); } /* Get current clock frequency for the given cpu id. */ int cpu_est_clockrate(int cpu_id, uint64_t *rate) { return (ENXIO); } /* * Duplicate OF_exit() with a different firmware call function that restores * the trap table, otherwise a RED state exception is triggered in at least * some firmware versions. */ void cpu_halt(void) { static struct { cell_t name; cell_t nargs; cell_t nreturns; } args = { (cell_t)"exit", 0, 0 }; cpu_shutdown(&args); } void sparc64_shutdown_final(void *dummy, int howto) { static struct { cell_t name; cell_t nargs; cell_t nreturns; } args = { (cell_t)"SUNW,power-off", 0, 0 }; /* Turn the power off? */ if ((howto & RB_POWEROFF) != 0) cpu_shutdown(&args); /* In case of halt, return to the firmware */ if ((howto & RB_HALT) != 0) cpu_halt(); } void cpu_idle(void) { /* Insert code to halt (until next interrupt) for the idle loop */ } int ptrace_set_pc(struct thread *td, u_long addr) { td->td_frame->tf_tpc = addr; td->td_frame->tf_tnpc = addr + 4; return (0); } int ptrace_single_step(struct thread *td) { /* TODO; */ return (0); } int ptrace_clear_single_step(struct thread *td) { /* TODO; */ return (0); } void exec_setregs(struct thread *td, u_long entry, u_long stack, u_long ps_strings) { struct trapframe *tf; struct pcb *pcb; struct proc *p; u_long sp; /* XXX no cpu_exec */ p = td->td_proc; p->p_md.md_sigtramp = NULL; if (p->p_md.md_utrap != NULL) { utrap_free(p->p_md.md_utrap); p->p_md.md_utrap = NULL; } pcb = td->td_pcb; tf = td->td_frame; sp = rounddown(stack, 16); bzero(pcb, sizeof(*pcb)); bzero(tf, sizeof(*tf)); tf->tf_out[0] = stack; tf->tf_out[3] = p->p_sysent->sv_psstrings; tf->tf_out[6] = sp - SPOFF - sizeof(struct frame); tf->tf_tnpc = entry + 4; tf->tf_tpc = entry; tf->tf_tstate = TSTATE_IE | TSTATE_PEF | TSTATE_MM_TSO; td->td_retval[0] = tf->tf_out[0]; td->td_retval[1] = tf->tf_out[1]; } int fill_regs(struct thread *td, struct reg *regs) { bcopy(td->td_frame, regs, sizeof(*regs)); return (0); } int set_regs(struct thread *td, struct reg *regs) { struct trapframe *tf; if (!TSTATE_SECURE(regs->r_tstate)) return (EINVAL); tf = td->td_frame; regs->r_wstate = tf->tf_wstate; bcopy(regs, tf, sizeof(*regs)); return (0); } int fill_dbregs(struct thread *td, struct dbreg *dbregs) { return (ENOSYS); } int set_dbregs(struct thread *td, struct dbreg *dbregs) { return (ENOSYS); } int fill_fpregs(struct thread *td, struct fpreg *fpregs) { struct trapframe *tf; struct pcb *pcb; pcb = td->td_pcb; tf = td->td_frame; bcopy(pcb->pcb_ufp, fpregs->fr_regs, sizeof(fpregs->fr_regs)); fpregs->fr_fsr = tf->tf_fsr; fpregs->fr_gsr = tf->tf_gsr; return (0); } int set_fpregs(struct thread *td, struct fpreg *fpregs) { struct trapframe *tf; struct pcb *pcb; pcb = td->td_pcb; tf = td->td_frame; tf->tf_fprs &= ~FPRS_FEF; bcopy(fpregs->fr_regs, pcb->pcb_ufp, sizeof(pcb->pcb_ufp)); tf->tf_fsr = fpregs->fr_fsr; tf->tf_gsr = fpregs->fr_gsr; return (0); } struct md_utrap * utrap_alloc(void) { struct md_utrap *ut; ut = malloc(sizeof(struct md_utrap), M_SUBPROC, M_WAITOK | M_ZERO); ut->ut_refcnt = 1; return (ut); } void utrap_free(struct md_utrap *ut) { int refcnt; if (ut == NULL) return; mtx_pool_lock(mtxpool_sleep, ut); ut->ut_refcnt--; refcnt = ut->ut_refcnt; mtx_pool_unlock(mtxpool_sleep, ut); if (refcnt == 0) free(ut, M_SUBPROC); } struct md_utrap * utrap_hold(struct md_utrap *ut) { if (ut == NULL) return (NULL); mtx_pool_lock(mtxpool_sleep, ut); ut->ut_refcnt++; mtx_pool_unlock(mtxpool_sleep, ut); return (ut); } Index: head/sys/sparc64/sparc64/pmap.c =================================================================== --- head/sys/sparc64/sparc64/pmap.c (revision 173360) +++ head/sys/sparc64/sparc64/pmap.c (revision 173361) @@ -1,1955 +1,1960 @@ /*- * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. * Copyright (c) 1994 David Greenman * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department and William Jolitz of UUNET Technologies Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 * $FreeBSD$ */ /* * Manages physical address maps. * * In addition to hardware address maps, this module is called upon to * provide software-use-only maps which may or may not be stored in the * same form as hardware maps. These pseudo-maps are used to store * intermediate results from copy operations to and from address spaces. * * Since the information managed by this module is also stored by the * logical address mapping module, this module may throw away valid virtual * to physical mappings at almost any time. However, invalidations of * mappings must be done as requested. * * In order to cope with hardware architectures which make virtual to * physical map invalidates expensive, this module may delay invalidate * reduced protection operations until such time as they are actually * necessary. This module is given full information as to which processors * are currently using which maps, and to when physical maps must be made * correct. */ #include "opt_kstack_pages.h" #include "opt_msgbuf.h" #include "opt_pmap.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define PMAP_DEBUG #ifndef PMAP_SHPGPERPROC #define PMAP_SHPGPERPROC 200 #endif /* XXX */ #include "opt_sched.h" #ifndef SCHED_4BSD #error "sparc64 only works with SCHED_4BSD which uses a global scheduler lock." #endif extern struct mtx sched_lock; /* * Virtual and physical address of message buffer. */ struct msgbuf *msgbufp; vm_paddr_t msgbuf_phys; /* * Map of physical memory reagions. */ vm_paddr_t phys_avail[128]; static struct ofw_mem_region mra[128]; struct ofw_mem_region sparc64_memreg[128]; int sparc64_nmemreg; static struct ofw_map translations[128]; static int translations_size; static vm_offset_t pmap_idle_map; static vm_offset_t pmap_temp_map_1; static vm_offset_t pmap_temp_map_2; /* * First and last available kernel virtual addresses. */ vm_offset_t virtual_avail; vm_offset_t virtual_end; vm_offset_t kernel_vm_end; vm_offset_t vm_max_kernel_address; /* * Kernel pmap. */ struct pmap kernel_pmap_store; /* * Allocate physical memory for use in pmap_bootstrap. */ static vm_paddr_t pmap_bootstrap_alloc(vm_size_t size); /* * Map the given physical page at the specified virtual address in the * target pmap with the protection requested. If specified the page * will be wired down. * * The page queues and pmap must be locked. */ static void pmap_enter_locked(pmap_t pm, vm_offset_t va, vm_page_t m, vm_prot_t prot, boolean_t wired); extern int tl1_immu_miss_patch_1[]; extern int tl1_immu_miss_patch_2[]; extern int tl1_dmmu_miss_patch_1[]; extern int tl1_dmmu_miss_patch_2[]; extern int tl1_dmmu_prot_patch_1[]; extern int tl1_dmmu_prot_patch_2[]; /* * If user pmap is processed with pmap_remove and with pmap_remove and the * resident count drops to 0, there are no more pages to remove, so we * need not continue. */ #define PMAP_REMOVE_DONE(pm) \ ((pm) != kernel_pmap && (pm)->pm_stats.resident_count == 0) /* * The threshold (in bytes) above which tsb_foreach() is used in pmap_remove() * and pmap_protect() instead of trying each virtual address. */ #define PMAP_TSB_THRESH ((TSB_SIZE / 2) * PAGE_SIZE) SYSCTL_NODE(_debug, OID_AUTO, pmap_stats, CTLFLAG_RD, 0, ""); PMAP_STATS_VAR(pmap_nenter); PMAP_STATS_VAR(pmap_nenter_update); PMAP_STATS_VAR(pmap_nenter_replace); PMAP_STATS_VAR(pmap_nenter_new); PMAP_STATS_VAR(pmap_nkenter); PMAP_STATS_VAR(pmap_nkenter_oc); PMAP_STATS_VAR(pmap_nkenter_stupid); PMAP_STATS_VAR(pmap_nkremove); PMAP_STATS_VAR(pmap_nqenter); PMAP_STATS_VAR(pmap_nqremove); PMAP_STATS_VAR(pmap_ncache_enter); PMAP_STATS_VAR(pmap_ncache_enter_c); PMAP_STATS_VAR(pmap_ncache_enter_oc); PMAP_STATS_VAR(pmap_ncache_enter_cc); PMAP_STATS_VAR(pmap_ncache_enter_coc); PMAP_STATS_VAR(pmap_ncache_enter_nc); PMAP_STATS_VAR(pmap_ncache_enter_cnc); PMAP_STATS_VAR(pmap_ncache_remove); PMAP_STATS_VAR(pmap_ncache_remove_c); PMAP_STATS_VAR(pmap_ncache_remove_oc); PMAP_STATS_VAR(pmap_ncache_remove_cc); PMAP_STATS_VAR(pmap_ncache_remove_coc); PMAP_STATS_VAR(pmap_ncache_remove_nc); PMAP_STATS_VAR(pmap_nzero_page); PMAP_STATS_VAR(pmap_nzero_page_c); PMAP_STATS_VAR(pmap_nzero_page_oc); PMAP_STATS_VAR(pmap_nzero_page_nc); PMAP_STATS_VAR(pmap_nzero_page_area); PMAP_STATS_VAR(pmap_nzero_page_area_c); PMAP_STATS_VAR(pmap_nzero_page_area_oc); PMAP_STATS_VAR(pmap_nzero_page_area_nc); PMAP_STATS_VAR(pmap_nzero_page_idle); PMAP_STATS_VAR(pmap_nzero_page_idle_c); PMAP_STATS_VAR(pmap_nzero_page_idle_oc); PMAP_STATS_VAR(pmap_nzero_page_idle_nc); PMAP_STATS_VAR(pmap_ncopy_page); PMAP_STATS_VAR(pmap_ncopy_page_c); PMAP_STATS_VAR(pmap_ncopy_page_oc); PMAP_STATS_VAR(pmap_ncopy_page_nc); PMAP_STATS_VAR(pmap_ncopy_page_dc); PMAP_STATS_VAR(pmap_ncopy_page_doc); PMAP_STATS_VAR(pmap_ncopy_page_sc); PMAP_STATS_VAR(pmap_ncopy_page_soc); PMAP_STATS_VAR(pmap_nnew_thread); PMAP_STATS_VAR(pmap_nnew_thread_oc); /* * Quick sort callout for comparing memory regions. */ static int mr_cmp(const void *a, const void *b); static int om_cmp(const void *a, const void *b); static int mr_cmp(const void *a, const void *b) { const struct ofw_mem_region *mra; const struct ofw_mem_region *mrb; mra = a; mrb = b; if (mra->mr_start < mrb->mr_start) return (-1); else if (mra->mr_start > mrb->mr_start) return (1); else return (0); } static int om_cmp(const void *a, const void *b) { const struct ofw_map *oma; const struct ofw_map *omb; oma = a; omb = b; if (oma->om_start < omb->om_start) return (-1); else if (oma->om_start > omb->om_start) return (1); else return (0); } /* * Bootstrap the system enough to run with virtual memory. */ void pmap_bootstrap(vm_offset_t ekva) { struct pmap *pm; struct tte *tp; vm_offset_t off; vm_offset_t va; vm_paddr_t pa; vm_size_t physsz; vm_size_t virtsz; ihandle_t pmem; ihandle_t vmem; int sz; int i; int j; /* * Find out what physical memory is available from the prom and * initialize the phys_avail array. This must be done before * pmap_bootstrap_alloc is called. */ if ((pmem = OF_finddevice("/memory")) == -1) panic("pmap_bootstrap: finddevice /memory"); if ((sz = OF_getproplen(pmem, "available")) == -1) panic("pmap_bootstrap: getproplen /memory/available"); if (sizeof(phys_avail) < sz) panic("pmap_bootstrap: phys_avail too small"); if (sizeof(mra) < sz) panic("pmap_bootstrap: mra too small"); bzero(mra, sz); if (OF_getprop(pmem, "available", mra, sz) == -1) panic("pmap_bootstrap: getprop /memory/available"); sz /= sizeof(*mra); CTR0(KTR_PMAP, "pmap_bootstrap: physical memory"); qsort(mra, sz, sizeof (*mra), mr_cmp); physsz = 0; getenv_quad("hw.physmem", &physmem); physmem = btoc(physmem); for (i = 0, j = 0; i < sz; i++, j += 2) { CTR2(KTR_PMAP, "start=%#lx size=%#lx", mra[i].mr_start, mra[i].mr_size); if (physmem != 0 && btoc(physsz + mra[i].mr_size) >= physmem) { if (btoc(physsz) < physmem) { phys_avail[j] = mra[i].mr_start; phys_avail[j + 1] = mra[i].mr_start + (ctob(physmem) - physsz); physsz = ctob(physmem); } break; } phys_avail[j] = mra[i].mr_start; phys_avail[j + 1] = mra[i].mr_start + mra[i].mr_size; physsz += mra[i].mr_size; } physmem = btoc(physsz); /* * Calculate the size of kernel virtual memory, and the size and mask * for the kernel tsb. */ virtsz = roundup(physsz, PAGE_SIZE_4M << (PAGE_SHIFT - TTE_SHIFT)); vm_max_kernel_address = VM_MIN_KERNEL_ADDRESS + virtsz; tsb_kernel_size = virtsz >> (PAGE_SHIFT - TTE_SHIFT); tsb_kernel_mask = (tsb_kernel_size >> TTE_SHIFT) - 1; /* * Allocate the kernel tsb and lock it in the tlb. */ pa = pmap_bootstrap_alloc(tsb_kernel_size); if (pa & PAGE_MASK_4M) panic("pmap_bootstrap: tsb unaligned\n"); tsb_kernel_phys = pa; tsb_kernel = (struct tte *)(VM_MIN_KERNEL_ADDRESS - tsb_kernel_size); pmap_map_tsb(); bzero(tsb_kernel, tsb_kernel_size); /* * Allocate and map the message buffer. */ msgbuf_phys = pmap_bootstrap_alloc(MSGBUF_SIZE); msgbufp = (struct msgbuf *)TLB_PHYS_TO_DIRECT(msgbuf_phys); /* * Patch the virtual address and the tsb mask into the trap table. */ #define SETHI(rd, imm22) \ (EIF_OP(IOP_FORM2) | EIF_F2_RD(rd) | EIF_F2_OP2(INS0_SETHI) | \ EIF_IMM((imm22) >> 10, 22)) #define OR_R_I_R(rd, imm13, rs1) \ (EIF_OP(IOP_MISC) | EIF_F3_RD(rd) | EIF_F3_OP3(INS2_OR) | \ EIF_F3_RS1(rs1) | EIF_F3_I(1) | EIF_IMM(imm13, 13)) #define PATCH(addr) do { \ if (addr[0] != SETHI(IF_F2_RD(addr[0]), 0x0) || \ addr[1] != OR_R_I_R(IF_F3_RD(addr[1]), 0x0, IF_F3_RS1(addr[1])) || \ addr[2] != SETHI(IF_F2_RD(addr[2]), 0x0)) \ panic("pmap_boostrap: patched instructions have changed"); \ addr[0] |= EIF_IMM((tsb_kernel_mask) >> 10, 22); \ addr[1] |= EIF_IMM(tsb_kernel_mask, 10); \ addr[2] |= EIF_IMM(((vm_offset_t)tsb_kernel) >> 10, 22); \ flush(addr); \ flush(addr + 1); \ flush(addr + 2); \ } while (0) PATCH(tl1_immu_miss_patch_1); PATCH(tl1_immu_miss_patch_2); PATCH(tl1_dmmu_miss_patch_1); PATCH(tl1_dmmu_miss_patch_2); PATCH(tl1_dmmu_prot_patch_1); PATCH(tl1_dmmu_prot_patch_2); /* * Enter fake 8k pages for the 4MB kernel pages, so that * pmap_kextract() will work for them. */ for (i = 0; i < kernel_tlb_slots; i++) { pa = kernel_tlbs[i].te_pa; va = kernel_tlbs[i].te_va; for (off = 0; off < PAGE_SIZE_4M; off += PAGE_SIZE) { tp = tsb_kvtotte(va + off); tp->tte_vpn = TV_VPN(va + off, TS_8K); tp->tte_data = TD_V | TD_8K | TD_PA(pa + off) | TD_REF | TD_SW | TD_CP | TD_CV | TD_P | TD_W; } } /* * Set the start and end of kva. The kernel is loaded at the first * available 4 meg super page, so round up to the end of the page. */ virtual_avail = roundup2(ekva, PAGE_SIZE_4M); virtual_end = vm_max_kernel_address; kernel_vm_end = vm_max_kernel_address; /* * Allocate kva space for temporary mappings. */ pmap_idle_map = virtual_avail; virtual_avail += PAGE_SIZE * DCACHE_COLORS; pmap_temp_map_1 = virtual_avail; virtual_avail += PAGE_SIZE * DCACHE_COLORS; pmap_temp_map_2 = virtual_avail; virtual_avail += PAGE_SIZE * DCACHE_COLORS; /* * Allocate a kernel stack with guard page for thread0 and map it into * the kernel tsb. We must ensure that the virtual address is coloured * properly, since we're allocating from phys_avail so the memory won't * have an associated vm_page_t. */ pa = pmap_bootstrap_alloc(roundup(KSTACK_PAGES, DCACHE_COLORS) * PAGE_SIZE); kstack0_phys = pa; virtual_avail += roundup(KSTACK_GUARD_PAGES, DCACHE_COLORS) * PAGE_SIZE; kstack0 = virtual_avail; virtual_avail += roundup(KSTACK_PAGES, DCACHE_COLORS) * PAGE_SIZE; KASSERT(DCACHE_COLOR(kstack0) == DCACHE_COLOR(kstack0_phys), ("pmap_bootstrap: kstack0 miscoloured")); for (i = 0; i < KSTACK_PAGES; i++) { pa = kstack0_phys + i * PAGE_SIZE; va = kstack0 + i * PAGE_SIZE; tp = tsb_kvtotte(va); tp->tte_vpn = TV_VPN(va, TS_8K); tp->tte_data = TD_V | TD_8K | TD_PA(pa) | TD_REF | TD_SW | TD_CP | TD_CV | TD_P | TD_W; } /* * Calculate the last available physical address. */ for (i = 0; phys_avail[i + 2] != 0; i += 2) ; Maxmem = sparc64_btop(phys_avail[i + 1]); /* * Add the prom mappings to the kernel tsb. */ if ((vmem = OF_finddevice("/virtual-memory")) == -1) panic("pmap_bootstrap: finddevice /virtual-memory"); if ((sz = OF_getproplen(vmem, "translations")) == -1) panic("pmap_bootstrap: getproplen translations"); if (sizeof(translations) < sz) panic("pmap_bootstrap: translations too small"); bzero(translations, sz); if (OF_getprop(vmem, "translations", translations, sz) == -1) panic("pmap_bootstrap: getprop /virtual-memory/translations"); sz /= sizeof(*translations); translations_size = sz; CTR0(KTR_PMAP, "pmap_bootstrap: translations"); qsort(translations, sz, sizeof (*translations), om_cmp); for (i = 0; i < sz; i++) { CTR3(KTR_PMAP, "translation: start=%#lx size=%#lx tte=%#lx", translations[i].om_start, translations[i].om_size, translations[i].om_tte); if (translations[i].om_start < VM_MIN_PROM_ADDRESS || translations[i].om_start > VM_MAX_PROM_ADDRESS) continue; for (off = 0; off < translations[i].om_size; off += PAGE_SIZE) { va = translations[i].om_start + off; tp = tsb_kvtotte(va); tp->tte_vpn = TV_VPN(va, TS_8K); tp->tte_data = ((translations[i].om_tte & ~(TD_SOFT_MASK << TD_SOFT_SHIFT)) | TD_EXEC) + off; } } /* * Get the available physical memory ranges from /memory/reg. These * are only used for kernel dumps, but it may not be wise to do prom * calls in that situation. */ if ((sz = OF_getproplen(pmem, "reg")) == -1) panic("pmap_bootstrap: getproplen /memory/reg"); if (sizeof(sparc64_memreg) < sz) panic("pmap_bootstrap: sparc64_memreg too small"); if (OF_getprop(pmem, "reg", sparc64_memreg, sz) == -1) panic("pmap_bootstrap: getprop /memory/reg"); sparc64_nmemreg = sz / sizeof(*sparc64_memreg); /* * Initialize the kernel pmap (which is statically allocated). * NOTE: PMAP_LOCK_INIT() is needed as part of the initialization * but sparc64 start up is not ready to initialize mutexes yet. * It is called in machdep.c. */ pm = kernel_pmap; for (i = 0; i < MAXCPU; i++) pm->pm_context[i] = TLB_CTX_KERNEL; pm->pm_active = ~0; /* XXX flush all non-locked tlb entries */ } void pmap_map_tsb(void) { vm_offset_t va; vm_paddr_t pa; u_long data; u_long s; int i; s = intr_disable(); /* * Map the 4mb tsb pages. */ for (i = 0; i < tsb_kernel_size; i += PAGE_SIZE_4M) { va = (vm_offset_t)tsb_kernel + i; pa = tsb_kernel_phys + i; data = TD_V | TD_4M | TD_PA(pa) | TD_L | TD_CP | TD_CV | TD_P | TD_W; /* XXX - cheetah */ stxa(AA_DMMU_TAR, ASI_DMMU, TLB_TAR_VA(va) | TLB_TAR_CTX(TLB_CTX_KERNEL)); stxa_sync(0, ASI_DTLB_DATA_IN_REG, data); } /* * Set the secondary context to be the kernel context (needed for * fp block operations in the kernel and the cache code). */ stxa(AA_DMMU_SCXR, ASI_DMMU, TLB_CTX_KERNEL); membar(Sync); intr_restore(s); } /* * Allocate a physical page of memory directly from the phys_avail map. * Can only be called from pmap_bootstrap before avail start and end are * calculated. */ static vm_paddr_t pmap_bootstrap_alloc(vm_size_t size) { vm_paddr_t pa; int i; size = round_page(size); for (i = 0; phys_avail[i + 1] != 0; i += 2) { if (phys_avail[i + 1] - phys_avail[i] < size) continue; pa = phys_avail[i]; phys_avail[i] += size; return (pa); } panic("pmap_bootstrap_alloc"); } /* * Initialize a vm_page's machine-dependent fields. */ void pmap_page_init(vm_page_t m) { TAILQ_INIT(&m->md.tte_list); m->md.color = DCACHE_COLOR(VM_PAGE_TO_PHYS(m)); m->md.flags = 0; m->md.pmap = NULL; } /* * Initialize the pmap module. */ void pmap_init(void) { vm_offset_t addr; vm_size_t size; int result; int i; for (i = 0; i < translations_size; i++) { addr = translations[i].om_start; size = translations[i].om_size; if (addr < VM_MIN_PROM_ADDRESS || addr > VM_MAX_PROM_ADDRESS) continue; result = vm_map_find(kernel_map, NULL, 0, &addr, size, FALSE, VM_PROT_ALL, VM_PROT_ALL, 0); if (result != KERN_SUCCESS || addr != translations[i].om_start) panic("pmap_init: vm_map_find"); } } /* * Extract the physical page address associated with the given * map/virtual_address pair. */ vm_paddr_t pmap_extract(pmap_t pm, vm_offset_t va) { struct tte *tp; vm_paddr_t pa; if (pm == kernel_pmap) return (pmap_kextract(va)); PMAP_LOCK(pm); tp = tsb_tte_lookup(pm, va); if (tp == NULL) pa = 0; else pa = TTE_GET_PA(tp) | (va & TTE_GET_PAGE_MASK(tp)); PMAP_UNLOCK(pm); return (pa); } /* * Atomically extract and hold the physical page with the given * pmap and virtual address pair if that mapping permits the given * protection. */ vm_page_t pmap_extract_and_hold(pmap_t pm, vm_offset_t va, vm_prot_t prot) { struct tte *tp; vm_page_t m; m = NULL; vm_page_lock_queues(); if (pm == kernel_pmap) { if (va >= VM_MIN_DIRECT_ADDRESS) { tp = NULL; m = PHYS_TO_VM_PAGE(TLB_DIRECT_TO_PHYS(va)); vm_page_hold(m); } else { tp = tsb_kvtotte(va); if ((tp->tte_data & TD_V) == 0) tp = NULL; } } else { PMAP_LOCK(pm); tp = tsb_tte_lookup(pm, va); } if (tp != NULL && ((tp->tte_data & TD_SW) || (prot & VM_PROT_WRITE) == 0)) { m = PHYS_TO_VM_PAGE(TTE_GET_PA(tp)); vm_page_hold(m); } vm_page_unlock_queues(); if (pm != kernel_pmap) PMAP_UNLOCK(pm); return (m); } /* * Extract the physical page address associated with the given kernel virtual * address. */ vm_paddr_t pmap_kextract(vm_offset_t va) { struct tte *tp; if (va >= VM_MIN_DIRECT_ADDRESS) return (TLB_DIRECT_TO_PHYS(va)); tp = tsb_kvtotte(va); if ((tp->tte_data & TD_V) == 0) return (0); return (TTE_GET_PA(tp) | (va & TTE_GET_PAGE_MASK(tp))); } int pmap_cache_enter(vm_page_t m, vm_offset_t va) { struct tte *tp; int color; mtx_assert(&vm_page_queue_mtx, MA_OWNED); KASSERT((m->flags & PG_FICTITIOUS) == 0, ("pmap_cache_enter: fake page")); PMAP_STATS_INC(pmap_ncache_enter); /* * Find the color for this virtual address and note the added mapping. */ color = DCACHE_COLOR(va); m->md.colors[color]++; /* * If all existing mappings have the same color, the mapping is * cacheable. */ if (m->md.color == color) { KASSERT(m->md.colors[DCACHE_OTHER_COLOR(color)] == 0, ("pmap_cache_enter: cacheable, mappings of other color")); if (m->md.color == DCACHE_COLOR(VM_PAGE_TO_PHYS(m))) PMAP_STATS_INC(pmap_ncache_enter_c); else PMAP_STATS_INC(pmap_ncache_enter_oc); return (1); } /* * If there are no mappings of the other color, and the page still has * the wrong color, this must be a new mapping. Change the color to * match the new mapping, which is cacheable. We must flush the page * from the cache now. */ if (m->md.colors[DCACHE_OTHER_COLOR(color)] == 0) { KASSERT(m->md.colors[color] == 1, ("pmap_cache_enter: changing color, not new mapping")); dcache_page_inval(VM_PAGE_TO_PHYS(m)); m->md.color = color; if (m->md.color == DCACHE_COLOR(VM_PAGE_TO_PHYS(m))) PMAP_STATS_INC(pmap_ncache_enter_cc); else PMAP_STATS_INC(pmap_ncache_enter_coc); return (1); } /* * If the mapping is already non-cacheable, just return. */ if (m->md.color == -1) { PMAP_STATS_INC(pmap_ncache_enter_nc); return (0); } PMAP_STATS_INC(pmap_ncache_enter_cnc); /* * Mark all mappings as uncacheable, flush any lines with the other * color out of the dcache, and set the color to none (-1). */ TAILQ_FOREACH(tp, &m->md.tte_list, tte_link) { atomic_clear_long(&tp->tte_data, TD_CV); tlb_page_demap(TTE_GET_PMAP(tp), TTE_GET_VA(tp)); } dcache_page_inval(VM_PAGE_TO_PHYS(m)); m->md.color = -1; return (0); } void pmap_cache_remove(vm_page_t m, vm_offset_t va) { struct tte *tp; int color; mtx_assert(&vm_page_queue_mtx, MA_OWNED); CTR3(KTR_PMAP, "pmap_cache_remove: m=%p va=%#lx c=%d", m, va, m->md.colors[DCACHE_COLOR(va)]); KASSERT((m->flags & PG_FICTITIOUS) == 0, ("pmap_cache_remove: fake page")); KASSERT(m->md.colors[DCACHE_COLOR(va)] > 0, ("pmap_cache_remove: no mappings %d <= 0", m->md.colors[DCACHE_COLOR(va)])); PMAP_STATS_INC(pmap_ncache_remove); /* * Find the color for this virtual address and note the removal of * the mapping. */ color = DCACHE_COLOR(va); m->md.colors[color]--; /* * If the page is cacheable, just return and keep the same color, even * if there are no longer any mappings. */ if (m->md.color != -1) { if (m->md.color == DCACHE_COLOR(VM_PAGE_TO_PHYS(m))) PMAP_STATS_INC(pmap_ncache_remove_c); else PMAP_STATS_INC(pmap_ncache_remove_oc); return; } KASSERT(m->md.colors[DCACHE_OTHER_COLOR(color)] != 0, ("pmap_cache_remove: uncacheable, no mappings of other color")); /* * If the page is not cacheable (color is -1), and the number of * mappings for this color is not zero, just return. There are * mappings of the other color still, so remain non-cacheable. */ if (m->md.colors[color] != 0) { PMAP_STATS_INC(pmap_ncache_remove_nc); return; } /* * The number of mappings for this color is now zero. Recache the * other colored mappings, and change the page color to the other * color. There should be no lines in the data cache for this page, * so flushing should not be needed. */ TAILQ_FOREACH(tp, &m->md.tte_list, tte_link) { atomic_set_long(&tp->tte_data, TD_CV); tlb_page_demap(TTE_GET_PMAP(tp), TTE_GET_VA(tp)); } m->md.color = DCACHE_OTHER_COLOR(color); if (m->md.color == DCACHE_COLOR(VM_PAGE_TO_PHYS(m))) PMAP_STATS_INC(pmap_ncache_remove_cc); else PMAP_STATS_INC(pmap_ncache_remove_coc); } /* * Map a wired page into kernel virtual address space. */ void pmap_kenter(vm_offset_t va, vm_page_t m) { vm_offset_t ova; struct tte *tp; vm_page_t om; u_long data; mtx_assert(&vm_page_queue_mtx, MA_OWNED); PMAP_STATS_INC(pmap_nkenter); tp = tsb_kvtotte(va); CTR4(KTR_PMAP, "pmap_kenter: va=%#lx pa=%#lx tp=%p data=%#lx", va, VM_PAGE_TO_PHYS(m), tp, tp->tte_data); if (DCACHE_COLOR(VM_PAGE_TO_PHYS(m)) != DCACHE_COLOR(va)) { CTR6(KTR_CT2, "pmap_kenter: off colour va=%#lx pa=%#lx o=%p oc=%#lx ot=%d pi=%#lx", va, VM_PAGE_TO_PHYS(m), m->object, m->object ? m->object->pg_color : -1, m->object ? m->object->type : -1, m->pindex); PMAP_STATS_INC(pmap_nkenter_oc); } if ((tp->tte_data & TD_V) != 0) { om = PHYS_TO_VM_PAGE(TTE_GET_PA(tp)); ova = TTE_GET_VA(tp); if (m == om && va == ova) { PMAP_STATS_INC(pmap_nkenter_stupid); return; } TAILQ_REMOVE(&om->md.tte_list, tp, tte_link); pmap_cache_remove(om, ova); if (va != ova) tlb_page_demap(kernel_pmap, ova); } data = TD_V | TD_8K | VM_PAGE_TO_PHYS(m) | TD_REF | TD_SW | TD_CP | TD_P | TD_W; if (pmap_cache_enter(m, va) != 0) data |= TD_CV; tp->tte_vpn = TV_VPN(va, TS_8K); tp->tte_data = data; TAILQ_INSERT_TAIL(&m->md.tte_list, tp, tte_link); } /* * Map a wired page into kernel virtual address space. This additionally * takes a flag argument wich is or'ed to the TTE data. This is used by * bus_space_map(). * NOTE: if the mapping is non-cacheable, it's the caller's responsibility * to flush entries that might still be in the cache, if applicable. */ void pmap_kenter_flags(vm_offset_t va, vm_paddr_t pa, u_long flags) { struct tte *tp; tp = tsb_kvtotte(va); CTR4(KTR_PMAP, "pmap_kenter_flags: va=%#lx pa=%#lx tp=%p data=%#lx", va, pa, tp, tp->tte_data); tp->tte_vpn = TV_VPN(va, TS_8K); tp->tte_data = TD_V | TD_8K | TD_PA(pa) | TD_REF | TD_P | flags; } /* * Remove a wired page from kernel virtual address space. */ void pmap_kremove(vm_offset_t va) { struct tte *tp; vm_page_t m; mtx_assert(&vm_page_queue_mtx, MA_OWNED); PMAP_STATS_INC(pmap_nkremove); tp = tsb_kvtotte(va); CTR3(KTR_PMAP, "pmap_kremove: va=%#lx tp=%p data=%#lx", va, tp, tp->tte_data); if ((tp->tte_data & TD_V) == 0) return; m = PHYS_TO_VM_PAGE(TTE_GET_PA(tp)); TAILQ_REMOVE(&m->md.tte_list, tp, tte_link); pmap_cache_remove(m, va); TTE_ZERO(tp); } /* * Inverse of pmap_kenter_flags, used by bus_space_unmap(). */ void pmap_kremove_flags(vm_offset_t va) { struct tte *tp; tp = tsb_kvtotte(va); CTR3(KTR_PMAP, "pmap_kremove: va=%#lx tp=%p data=%#lx", va, tp, tp->tte_data); TTE_ZERO(tp); } /* * Map a range of physical addresses into kernel virtual address space. * * The value passed in *virt is a suggested virtual address for the mapping. * Architectures which can support a direct-mapped physical to virtual region * can return the appropriate address within that region, leaving '*virt' * unchanged. */ vm_offset_t pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) { return (TLB_PHYS_TO_DIRECT(start)); } /* * Map a list of wired pages into kernel virtual address space. This is * intended for temporary mappings which do not need page modification or * references recorded. Existing mappings in the region are overwritten. */ void pmap_qenter(vm_offset_t sva, vm_page_t *m, int count) { vm_offset_t va; int locked; PMAP_STATS_INC(pmap_nqenter); va = sva; if (!(locked = mtx_owned(&vm_page_queue_mtx))) vm_page_lock_queues(); while (count-- > 0) { pmap_kenter(va, *m); va += PAGE_SIZE; m++; } if (!locked) vm_page_unlock_queues(); tlb_range_demap(kernel_pmap, sva, va); } /* * Remove page mappings from kernel virtual address space. Intended for * temporary mappings entered by pmap_qenter. */ void pmap_qremove(vm_offset_t sva, int count) { vm_offset_t va; int locked; PMAP_STATS_INC(pmap_nqremove); va = sva; if (!(locked = mtx_owned(&vm_page_queue_mtx))) vm_page_lock_queues(); while (count-- > 0) { pmap_kremove(va); va += PAGE_SIZE; } if (!locked) vm_page_unlock_queues(); tlb_range_demap(kernel_pmap, sva, va); } /* * Initialize the pmap associated with process 0. */ void pmap_pinit0(pmap_t pm) { int i; PMAP_LOCK_INIT(pm); for (i = 0; i < MAXCPU; i++) pm->pm_context[i] = 0; pm->pm_active = 0; pm->pm_tsb = NULL; pm->pm_tsb_obj = NULL; bzero(&pm->pm_stats, sizeof(pm->pm_stats)); } /* * Initialize a preallocated and zeroed pmap structure, such as one in a * vmspace structure. */ -void +int pmap_pinit(pmap_t pm) { vm_page_t ma[TSB_PAGES]; vm_page_t m; int i; PMAP_LOCK_INIT(pm); /* * Allocate kva space for the tsb. */ if (pm->pm_tsb == NULL) { pm->pm_tsb = (struct tte *)kmem_alloc_nofault(kernel_map, TSB_BSIZE); + if (pm->pm_tsb == NULL) { + PMAP_LOCK_DESTROY(pm); + return (0); + } } /* * Allocate an object for it. */ if (pm->pm_tsb_obj == NULL) pm->pm_tsb_obj = vm_object_allocate(OBJT_DEFAULT, TSB_PAGES); VM_OBJECT_LOCK(pm->pm_tsb_obj); for (i = 0; i < TSB_PAGES; i++) { m = vm_page_grab(pm->pm_tsb_obj, i, VM_ALLOC_NOBUSY | VM_ALLOC_RETRY | VM_ALLOC_WIRED | VM_ALLOC_ZERO); m->valid = VM_PAGE_BITS_ALL; m->md.pmap = pm; ma[i] = m; } VM_OBJECT_UNLOCK(pm->pm_tsb_obj); pmap_qenter((vm_offset_t)pm->pm_tsb, ma, TSB_PAGES); for (i = 0; i < MAXCPU; i++) pm->pm_context[i] = -1; pm->pm_active = 0; bzero(&pm->pm_stats, sizeof(pm->pm_stats)); + return (1); } /* * Release any resources held by the given physical map. * Called when a pmap initialized by pmap_pinit is being released. * Should only be called if the map contains no valid mappings. */ void pmap_release(pmap_t pm) { vm_object_t obj; vm_page_t m; struct pcpu *pc; CTR2(KTR_PMAP, "pmap_release: ctx=%#x tsb=%p", pm->pm_context[PCPU_GET(cpuid)], pm->pm_tsb); KASSERT(pmap_resident_count(pm) == 0, ("pmap_release: resident pages %ld != 0", pmap_resident_count(pm))); /* * After the pmap was freed, it might be reallocated to a new process. * When switching, this might lead us to wrongly assume that we need * not switch contexts because old and new pmap pointer are equal. * Therefore, make sure that this pmap is not referenced by any PCPU * pointer any more. This could happen in two cases: * - A process that referenced the pmap is currently exiting on a CPU. * However, it is guaranteed to not switch in any more after setting * its state to PRS_ZOMBIE. * - A process that referenced this pmap ran on a CPU, but we switched * to a kernel thread, leaving the pmap pointer unchanged. */ mtx_lock_spin(&sched_lock); SLIST_FOREACH(pc, &cpuhead, pc_allcpu) { if (pc->pc_pmap == pm) pc->pc_pmap = NULL; } mtx_unlock_spin(&sched_lock); obj = pm->pm_tsb_obj; VM_OBJECT_LOCK(obj); KASSERT(obj->ref_count == 1, ("pmap_release: tsbobj ref count != 1")); while (!TAILQ_EMPTY(&obj->memq)) { m = TAILQ_FIRST(&obj->memq); vm_page_lock_queues(); if (vm_page_sleep_if_busy(m, FALSE, "pmaprl")) continue; KASSERT(m->hold_count == 0, ("pmap_release: freeing held tsb page")); m->md.pmap = NULL; m->wire_count--; atomic_subtract_int(&cnt.v_wire_count, 1); vm_page_free_zero(m); vm_page_unlock_queues(); } VM_OBJECT_UNLOCK(obj); pmap_qremove((vm_offset_t)pm->pm_tsb, TSB_PAGES); PMAP_LOCK_DESTROY(pm); } /* * Grow the number of kernel page table entries. Unneeded. */ void pmap_growkernel(vm_offset_t addr) { panic("pmap_growkernel: can't grow kernel"); } int pmap_remove_tte(struct pmap *pm, struct pmap *pm2, struct tte *tp, vm_offset_t va) { vm_page_t m; u_long data; mtx_assert(&vm_page_queue_mtx, MA_OWNED); data = atomic_readandclear_long(&tp->tte_data); if ((data & TD_FAKE) == 0) { m = PHYS_TO_VM_PAGE(TD_PA(data)); TAILQ_REMOVE(&m->md.tte_list, tp, tte_link); if ((data & TD_WIRED) != 0) pm->pm_stats.wired_count--; if ((data & TD_PV) != 0) { if ((data & TD_W) != 0) vm_page_dirty(m); if ((data & TD_REF) != 0) vm_page_flag_set(m, PG_REFERENCED); if (TAILQ_EMPTY(&m->md.tte_list)) vm_page_flag_clear(m, PG_WRITEABLE); pm->pm_stats.resident_count--; } pmap_cache_remove(m, va); } TTE_ZERO(tp); if (PMAP_REMOVE_DONE(pm)) return (0); return (1); } /* * Remove the given range of addresses from the specified map. */ void pmap_remove(pmap_t pm, vm_offset_t start, vm_offset_t end) { struct tte *tp; vm_offset_t va; CTR3(KTR_PMAP, "pmap_remove: ctx=%#lx start=%#lx end=%#lx", pm->pm_context[PCPU_GET(cpuid)], start, end); if (PMAP_REMOVE_DONE(pm)) return; vm_page_lock_queues(); PMAP_LOCK(pm); if (end - start > PMAP_TSB_THRESH) { tsb_foreach(pm, NULL, start, end, pmap_remove_tte); tlb_context_demap(pm); } else { for (va = start; va < end; va += PAGE_SIZE) { if ((tp = tsb_tte_lookup(pm, va)) != NULL) { if (!pmap_remove_tte(pm, NULL, tp, va)) break; } } tlb_range_demap(pm, start, end - 1); } PMAP_UNLOCK(pm); vm_page_unlock_queues(); } void pmap_remove_all(vm_page_t m) { struct pmap *pm; struct tte *tpn; struct tte *tp; vm_offset_t va; mtx_assert(&vm_page_queue_mtx, MA_OWNED); for (tp = TAILQ_FIRST(&m->md.tte_list); tp != NULL; tp = tpn) { tpn = TAILQ_NEXT(tp, tte_link); if ((tp->tte_data & TD_PV) == 0) continue; pm = TTE_GET_PMAP(tp); va = TTE_GET_VA(tp); PMAP_LOCK(pm); if ((tp->tte_data & TD_WIRED) != 0) pm->pm_stats.wired_count--; if ((tp->tte_data & TD_REF) != 0) vm_page_flag_set(m, PG_REFERENCED); if ((tp->tte_data & TD_W) != 0) vm_page_dirty(m); tp->tte_data &= ~TD_V; tlb_page_demap(pm, va); TAILQ_REMOVE(&m->md.tte_list, tp, tte_link); pm->pm_stats.resident_count--; pmap_cache_remove(m, va); TTE_ZERO(tp); PMAP_UNLOCK(pm); } vm_page_flag_clear(m, PG_WRITEABLE); } int pmap_protect_tte(struct pmap *pm, struct pmap *pm2, struct tte *tp, vm_offset_t va) { u_long data; vm_page_t m; data = atomic_clear_long(&tp->tte_data, TD_REF | TD_SW | TD_W); if ((data & TD_PV) != 0) { m = PHYS_TO_VM_PAGE(TD_PA(data)); if ((data & TD_REF) != 0) vm_page_flag_set(m, PG_REFERENCED); if ((data & TD_W) != 0) vm_page_dirty(m); } return (1); } /* * Set the physical protection on the specified range of this map as requested. */ void pmap_protect(pmap_t pm, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) { vm_offset_t va; struct tte *tp; CTR4(KTR_PMAP, "pmap_protect: ctx=%#lx sva=%#lx eva=%#lx prot=%#lx", pm->pm_context[PCPU_GET(cpuid)], sva, eva, prot); if ((prot & VM_PROT_READ) == VM_PROT_NONE) { pmap_remove(pm, sva, eva); return; } if (prot & VM_PROT_WRITE) return; vm_page_lock_queues(); PMAP_LOCK(pm); if (eva - sva > PMAP_TSB_THRESH) { tsb_foreach(pm, NULL, sva, eva, pmap_protect_tte); tlb_context_demap(pm); } else { for (va = sva; va < eva; va += PAGE_SIZE) { if ((tp = tsb_tte_lookup(pm, va)) != NULL) pmap_protect_tte(pm, NULL, tp, va); } tlb_range_demap(pm, sva, eva - 1); } PMAP_UNLOCK(pm); vm_page_unlock_queues(); } /* * Map the given physical page at the specified virtual address in the * target pmap with the protection requested. If specified the page * will be wired down. */ void pmap_enter(pmap_t pm, vm_offset_t va, vm_page_t m, vm_prot_t prot, boolean_t wired) { vm_page_lock_queues(); PMAP_LOCK(pm); pmap_enter_locked(pm, va, m, prot, wired); vm_page_unlock_queues(); PMAP_UNLOCK(pm); } /* * Map the given physical page at the specified virtual address in the * target pmap with the protection requested. If specified the page * will be wired down. * * The page queues and pmap must be locked. */ static void pmap_enter_locked(pmap_t pm, vm_offset_t va, vm_page_t m, vm_prot_t prot, boolean_t wired) { struct tte *tp; vm_paddr_t pa; u_long data; int i; mtx_assert(&vm_page_queue_mtx, MA_OWNED); PMAP_LOCK_ASSERT(pm, MA_OWNED); PMAP_STATS_INC(pmap_nenter); pa = VM_PAGE_TO_PHYS(m); /* * If this is a fake page from the device_pager, but it covers actual * physical memory, convert to the real backing page. */ if ((m->flags & PG_FICTITIOUS) != 0) { for (i = 0; phys_avail[i + 1] != 0; i += 2) { if (pa >= phys_avail[i] && pa <= phys_avail[i + 1]) { m = PHYS_TO_VM_PAGE(pa); break; } } } CTR6(KTR_PMAP, "pmap_enter: ctx=%p m=%p va=%#lx pa=%#lx prot=%#x wired=%d", pm->pm_context[PCPU_GET(cpuid)], m, va, pa, prot, wired); /* * If there is an existing mapping, and the physical address has not * changed, must be protection or wiring change. */ if ((tp = tsb_tte_lookup(pm, va)) != NULL && TTE_GET_PA(tp) == pa) { CTR0(KTR_PMAP, "pmap_enter: update"); PMAP_STATS_INC(pmap_nenter_update); /* * Wiring change, just update stats. */ if (wired) { if ((tp->tte_data & TD_WIRED) == 0) { tp->tte_data |= TD_WIRED; pm->pm_stats.wired_count++; } } else { if ((tp->tte_data & TD_WIRED) != 0) { tp->tte_data &= ~TD_WIRED; pm->pm_stats.wired_count--; } } /* * Save the old bits and clear the ones we're interested in. */ data = tp->tte_data; tp->tte_data &= ~(TD_EXEC | TD_SW | TD_W); /* * If we're turning off write permissions, sense modify status. */ if ((prot & VM_PROT_WRITE) != 0) { tp->tte_data |= TD_SW; if (wired) { tp->tte_data |= TD_W; } vm_page_flag_set(m, PG_WRITEABLE); } else if ((data & TD_W) != 0) { vm_page_dirty(m); } /* * If we're turning on execute permissions, flush the icache. */ if ((prot & VM_PROT_EXECUTE) != 0) { if ((data & TD_EXEC) == 0) { icache_page_inval(pa); } tp->tte_data |= TD_EXEC; } /* * Delete the old mapping. */ tlb_page_demap(pm, TTE_GET_VA(tp)); } else { /* * If there is an existing mapping, but its for a different * phsyical address, delete the old mapping. */ if (tp != NULL) { CTR0(KTR_PMAP, "pmap_enter: replace"); PMAP_STATS_INC(pmap_nenter_replace); pmap_remove_tte(pm, NULL, tp, va); tlb_page_demap(pm, va); } else { CTR0(KTR_PMAP, "pmap_enter: new"); PMAP_STATS_INC(pmap_nenter_new); } /* * Now set up the data and install the new mapping. */ data = TD_V | TD_8K | TD_PA(pa); if (pm == kernel_pmap) data |= TD_P; if ((prot & VM_PROT_WRITE) != 0) { data |= TD_SW; vm_page_flag_set(m, PG_WRITEABLE); } if (prot & VM_PROT_EXECUTE) { data |= TD_EXEC; icache_page_inval(pa); } /* * If its wired update stats. We also don't need reference or * modify tracking for wired mappings, so set the bits now. */ if (wired) { pm->pm_stats.wired_count++; data |= TD_REF | TD_WIRED; if ((prot & VM_PROT_WRITE) != 0) data |= TD_W; } tsb_tte_enter(pm, m, va, TS_8K, data); } } /* * Maps a sequence of resident pages belonging to the same object. * The sequence begins with the given page m_start. This page is * mapped at the given virtual address start. Each subsequent page is * mapped at a virtual address that is offset from start by the same * amount as the page is offset from m_start within the object. The * last page in the sequence is the page with the largest offset from * m_start that can be mapped at a virtual address less than the given * virtual address end. Not every virtual page between start and end * is mapped; only those for which a resident page exists with the * corresponding offset from m_start are mapped. */ void pmap_enter_object(pmap_t pm, vm_offset_t start, vm_offset_t end, vm_page_t m_start, vm_prot_t prot) { vm_page_t m; vm_pindex_t diff, psize; psize = atop(end - start); m = m_start; PMAP_LOCK(pm); while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { pmap_enter_locked(pm, start + ptoa(diff), m, prot & (VM_PROT_READ | VM_PROT_EXECUTE), FALSE); m = TAILQ_NEXT(m, listq); } PMAP_UNLOCK(pm); } void pmap_enter_quick(pmap_t pm, vm_offset_t va, vm_page_t m, vm_prot_t prot) { PMAP_LOCK(pm); pmap_enter_locked(pm, va, m, prot & (VM_PROT_READ | VM_PROT_EXECUTE), FALSE); PMAP_UNLOCK(pm); } void pmap_object_init_pt(pmap_t pm, vm_offset_t addr, vm_object_t object, vm_pindex_t pindex, vm_size_t size) { VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); KASSERT(object->type == OBJT_DEVICE, ("pmap_object_init_pt: non-device object")); } /* * Change the wiring attribute for a map/virtual-address pair. * The mapping must already exist in the pmap. */ void pmap_change_wiring(pmap_t pm, vm_offset_t va, boolean_t wired) { struct tte *tp; u_long data; PMAP_LOCK(pm); if ((tp = tsb_tte_lookup(pm, va)) != NULL) { if (wired) { data = atomic_set_long(&tp->tte_data, TD_WIRED); if ((data & TD_WIRED) == 0) pm->pm_stats.wired_count++; } else { data = atomic_clear_long(&tp->tte_data, TD_WIRED); if ((data & TD_WIRED) != 0) pm->pm_stats.wired_count--; } } PMAP_UNLOCK(pm); } static int pmap_copy_tte(pmap_t src_pmap, pmap_t dst_pmap, struct tte *tp, vm_offset_t va) { vm_page_t m; u_long data; if ((tp->tte_data & TD_FAKE) != 0) return (1); if (tsb_tte_lookup(dst_pmap, va) == NULL) { data = tp->tte_data & ~(TD_PV | TD_REF | TD_SW | TD_CV | TD_W); m = PHYS_TO_VM_PAGE(TTE_GET_PA(tp)); tsb_tte_enter(dst_pmap, m, va, TS_8K, data); } return (1); } void pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, vm_offset_t src_addr) { struct tte *tp; vm_offset_t va; if (dst_addr != src_addr) return; vm_page_lock_queues(); if (dst_pmap < src_pmap) { PMAP_LOCK(dst_pmap); PMAP_LOCK(src_pmap); } else { PMAP_LOCK(src_pmap); PMAP_LOCK(dst_pmap); } if (len > PMAP_TSB_THRESH) { tsb_foreach(src_pmap, dst_pmap, src_addr, src_addr + len, pmap_copy_tte); tlb_context_demap(dst_pmap); } else { for (va = src_addr; va < src_addr + len; va += PAGE_SIZE) { if ((tp = tsb_tte_lookup(src_pmap, va)) != NULL) pmap_copy_tte(src_pmap, dst_pmap, tp, va); } tlb_range_demap(dst_pmap, src_addr, src_addr + len - 1); } vm_page_unlock_queues(); PMAP_UNLOCK(src_pmap); PMAP_UNLOCK(dst_pmap); } void pmap_zero_page(vm_page_t m) { struct tte *tp; vm_offset_t va; vm_paddr_t pa; KASSERT((m->flags & PG_FICTITIOUS) == 0, ("pmap_zero_page: fake page")); PMAP_STATS_INC(pmap_nzero_page); pa = VM_PAGE_TO_PHYS(m); if (m->md.color == -1) { PMAP_STATS_INC(pmap_nzero_page_nc); aszero(ASI_PHYS_USE_EC, pa, PAGE_SIZE); } else if (m->md.color == DCACHE_COLOR(pa)) { PMAP_STATS_INC(pmap_nzero_page_c); va = TLB_PHYS_TO_DIRECT(pa); cpu_block_zero((void *)va, PAGE_SIZE); } else { PMAP_STATS_INC(pmap_nzero_page_oc); PMAP_LOCK(kernel_pmap); va = pmap_temp_map_1 + (m->md.color * PAGE_SIZE); tp = tsb_kvtotte(va); tp->tte_data = TD_V | TD_8K | TD_PA(pa) | TD_CP | TD_CV | TD_W; tp->tte_vpn = TV_VPN(va, TS_8K); cpu_block_zero((void *)va, PAGE_SIZE); tlb_page_demap(kernel_pmap, va); PMAP_UNLOCK(kernel_pmap); } } void pmap_zero_page_area(vm_page_t m, int off, int size) { struct tte *tp; vm_offset_t va; vm_paddr_t pa; KASSERT((m->flags & PG_FICTITIOUS) == 0, ("pmap_zero_page_area: fake page")); KASSERT(off + size <= PAGE_SIZE, ("pmap_zero_page_area: bad off/size")); PMAP_STATS_INC(pmap_nzero_page_area); pa = VM_PAGE_TO_PHYS(m); if (m->md.color == -1) { PMAP_STATS_INC(pmap_nzero_page_area_nc); aszero(ASI_PHYS_USE_EC, pa + off, size); } else if (m->md.color == DCACHE_COLOR(pa)) { PMAP_STATS_INC(pmap_nzero_page_area_c); va = TLB_PHYS_TO_DIRECT(pa); bzero((void *)(va + off), size); } else { PMAP_STATS_INC(pmap_nzero_page_area_oc); PMAP_LOCK(kernel_pmap); va = pmap_temp_map_1 + (m->md.color * PAGE_SIZE); tp = tsb_kvtotte(va); tp->tte_data = TD_V | TD_8K | TD_PA(pa) | TD_CP | TD_CV | TD_W; tp->tte_vpn = TV_VPN(va, TS_8K); bzero((void *)(va + off), size); tlb_page_demap(kernel_pmap, va); PMAP_UNLOCK(kernel_pmap); } } void pmap_zero_page_idle(vm_page_t m) { struct tte *tp; vm_offset_t va; vm_paddr_t pa; KASSERT((m->flags & PG_FICTITIOUS) == 0, ("pmap_zero_page_idle: fake page")); PMAP_STATS_INC(pmap_nzero_page_idle); pa = VM_PAGE_TO_PHYS(m); if (m->md.color == -1) { PMAP_STATS_INC(pmap_nzero_page_idle_nc); aszero(ASI_PHYS_USE_EC, pa, PAGE_SIZE); } else if (m->md.color == DCACHE_COLOR(pa)) { PMAP_STATS_INC(pmap_nzero_page_idle_c); va = TLB_PHYS_TO_DIRECT(pa); cpu_block_zero((void *)va, PAGE_SIZE); } else { PMAP_STATS_INC(pmap_nzero_page_idle_oc); va = pmap_idle_map + (m->md.color * PAGE_SIZE); tp = tsb_kvtotte(va); tp->tte_data = TD_V | TD_8K | TD_PA(pa) | TD_CP | TD_CV | TD_W; tp->tte_vpn = TV_VPN(va, TS_8K); cpu_block_zero((void *)va, PAGE_SIZE); tlb_page_demap(kernel_pmap, va); } } void pmap_copy_page(vm_page_t msrc, vm_page_t mdst) { vm_offset_t vdst; vm_offset_t vsrc; vm_paddr_t pdst; vm_paddr_t psrc; struct tte *tp; KASSERT((mdst->flags & PG_FICTITIOUS) == 0, ("pmap_copy_page: fake dst page")); KASSERT((msrc->flags & PG_FICTITIOUS) == 0, ("pmap_copy_page: fake src page")); PMAP_STATS_INC(pmap_ncopy_page); pdst = VM_PAGE_TO_PHYS(mdst); psrc = VM_PAGE_TO_PHYS(msrc); if (msrc->md.color == -1 && mdst->md.color == -1) { PMAP_STATS_INC(pmap_ncopy_page_nc); ascopy(ASI_PHYS_USE_EC, psrc, pdst, PAGE_SIZE); } else if (msrc->md.color == DCACHE_COLOR(psrc) && mdst->md.color == DCACHE_COLOR(pdst)) { PMAP_STATS_INC(pmap_ncopy_page_c); vdst = TLB_PHYS_TO_DIRECT(pdst); vsrc = TLB_PHYS_TO_DIRECT(psrc); cpu_block_copy((void *)vsrc, (void *)vdst, PAGE_SIZE); } else if (msrc->md.color == -1) { if (mdst->md.color == DCACHE_COLOR(pdst)) { PMAP_STATS_INC(pmap_ncopy_page_dc); vdst = TLB_PHYS_TO_DIRECT(pdst); ascopyfrom(ASI_PHYS_USE_EC, psrc, (void *)vdst, PAGE_SIZE); } else { PMAP_STATS_INC(pmap_ncopy_page_doc); PMAP_LOCK(kernel_pmap); vdst = pmap_temp_map_1 + (mdst->md.color * PAGE_SIZE); tp = tsb_kvtotte(vdst); tp->tte_data = TD_V | TD_8K | TD_PA(pdst) | TD_CP | TD_CV | TD_W; tp->tte_vpn = TV_VPN(vdst, TS_8K); ascopyfrom(ASI_PHYS_USE_EC, psrc, (void *)vdst, PAGE_SIZE); tlb_page_demap(kernel_pmap, vdst); PMAP_UNLOCK(kernel_pmap); } } else if (mdst->md.color == -1) { if (msrc->md.color == DCACHE_COLOR(psrc)) { PMAP_STATS_INC(pmap_ncopy_page_sc); vsrc = TLB_PHYS_TO_DIRECT(psrc); ascopyto((void *)vsrc, ASI_PHYS_USE_EC, pdst, PAGE_SIZE); } else { PMAP_STATS_INC(pmap_ncopy_page_soc); PMAP_LOCK(kernel_pmap); vsrc = pmap_temp_map_1 + (msrc->md.color * PAGE_SIZE); tp = tsb_kvtotte(vsrc); tp->tte_data = TD_V | TD_8K | TD_PA(psrc) | TD_CP | TD_CV | TD_W; tp->tte_vpn = TV_VPN(vsrc, TS_8K); ascopyto((void *)vsrc, ASI_PHYS_USE_EC, pdst, PAGE_SIZE); tlb_page_demap(kernel_pmap, vsrc); PMAP_UNLOCK(kernel_pmap); } } else { PMAP_STATS_INC(pmap_ncopy_page_oc); PMAP_LOCK(kernel_pmap); vdst = pmap_temp_map_1 + (mdst->md.color * PAGE_SIZE); tp = tsb_kvtotte(vdst); tp->tte_data = TD_V | TD_8K | TD_PA(pdst) | TD_CP | TD_CV | TD_W; tp->tte_vpn = TV_VPN(vdst, TS_8K); vsrc = pmap_temp_map_2 + (msrc->md.color * PAGE_SIZE); tp = tsb_kvtotte(vsrc); tp->tte_data = TD_V | TD_8K | TD_PA(psrc) | TD_CP | TD_CV | TD_W; tp->tte_vpn = TV_VPN(vsrc, TS_8K); cpu_block_copy((void *)vsrc, (void *)vdst, PAGE_SIZE); tlb_page_demap(kernel_pmap, vdst); tlb_page_demap(kernel_pmap, vsrc); PMAP_UNLOCK(kernel_pmap); } } /* * Returns true if the pmap's pv is one of the first * 16 pvs linked to from this page. This count may * be changed upwards or downwards in the future; it * is only necessary that true be returned for a small * subset of pmaps for proper page aging. */ boolean_t pmap_page_exists_quick(pmap_t pm, vm_page_t m) { struct tte *tp; int loops; mtx_assert(&vm_page_queue_mtx, MA_OWNED); if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) != 0) return (FALSE); loops = 0; TAILQ_FOREACH(tp, &m->md.tte_list, tte_link) { if ((tp->tte_data & TD_PV) == 0) continue; if (TTE_GET_PMAP(tp) == pm) return (TRUE); if (++loops >= 16) break; } return (FALSE); } /* * Remove all pages from specified address space, this aids process exit * speeds. This is much faster than pmap_remove n the case of running down * an entire address space. Only works for the current pmap. */ void pmap_remove_pages(pmap_t pm) { } /* * Returns TRUE if the given page has a managed mapping. */ boolean_t pmap_page_is_mapped(vm_page_t m) { struct tte *tp; if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) != 0) return (FALSE); mtx_assert(&vm_page_queue_mtx, MA_OWNED); TAILQ_FOREACH(tp, &m->md.tte_list, tte_link) { if ((tp->tte_data & TD_PV) != 0) return (TRUE); } return (FALSE); } /* * pmap_ts_referenced: * * Return a count of reference bits for a page, clearing those bits. * It is not necessary for every reference bit to be cleared, but it * is necessary that 0 only be returned when there are truly no * reference bits set. * * XXX: The exact number of bits to check and clear is a matter that * should be tested and standardized at some point in the future for * optimal aging of shared pages. */ int pmap_ts_referenced(vm_page_t m) { struct tte *tpf; struct tte *tpn; struct tte *tp; u_long data; int count; mtx_assert(&vm_page_queue_mtx, MA_OWNED); if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) != 0) return (0); count = 0; if ((tp = TAILQ_FIRST(&m->md.tte_list)) != NULL) { tpf = tp; do { tpn = TAILQ_NEXT(tp, tte_link); TAILQ_REMOVE(&m->md.tte_list, tp, tte_link); TAILQ_INSERT_TAIL(&m->md.tte_list, tp, tte_link); if ((tp->tte_data & TD_PV) == 0) continue; data = atomic_clear_long(&tp->tte_data, TD_REF); if ((data & TD_REF) != 0 && ++count > 4) break; } while ((tp = tpn) != NULL && tp != tpf); } return (count); } boolean_t pmap_is_modified(vm_page_t m) { struct tte *tp; mtx_assert(&vm_page_queue_mtx, MA_OWNED); if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) != 0) return (FALSE); TAILQ_FOREACH(tp, &m->md.tte_list, tte_link) { if ((tp->tte_data & TD_PV) == 0) continue; if ((tp->tte_data & TD_W) != 0) return (TRUE); } return (FALSE); } /* * pmap_is_prefaultable: * * Return whether or not the specified virtual address is elgible * for prefault. */ boolean_t pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) { return (FALSE); } void pmap_clear_modify(vm_page_t m) { struct tte *tp; u_long data; mtx_assert(&vm_page_queue_mtx, MA_OWNED); if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) != 0) return; TAILQ_FOREACH(tp, &m->md.tte_list, tte_link) { if ((tp->tte_data & TD_PV) == 0) continue; data = atomic_clear_long(&tp->tte_data, TD_W); if ((data & TD_W) != 0) tlb_page_demap(TTE_GET_PMAP(tp), TTE_GET_VA(tp)); } } void pmap_clear_reference(vm_page_t m) { struct tte *tp; u_long data; mtx_assert(&vm_page_queue_mtx, MA_OWNED); if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) != 0) return; TAILQ_FOREACH(tp, &m->md.tte_list, tte_link) { if ((tp->tte_data & TD_PV) == 0) continue; data = atomic_clear_long(&tp->tte_data, TD_REF); if ((data & TD_REF) != 0) tlb_page_demap(TTE_GET_PMAP(tp), TTE_GET_VA(tp)); } } void pmap_remove_write(vm_page_t m) { struct tte *tp; u_long data; mtx_assert(&vm_page_queue_mtx, MA_OWNED); if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) != 0 || (m->flags & PG_WRITEABLE) == 0) return; TAILQ_FOREACH(tp, &m->md.tte_list, tte_link) { if ((tp->tte_data & TD_PV) == 0) continue; data = atomic_clear_long(&tp->tte_data, TD_SW | TD_W); if ((data & TD_W) != 0) { vm_page_dirty(m); tlb_page_demap(TTE_GET_PMAP(tp), TTE_GET_VA(tp)); } } vm_page_flag_clear(m, PG_WRITEABLE); } int pmap_mincore(pmap_t pm, vm_offset_t addr) { /* TODO; */ return (0); } /* * Activate a user pmap. The pmap must be activated before its address space * can be accessed in any way. */ void pmap_activate(struct thread *td) { struct vmspace *vm; struct pmap *pm; int context; vm = td->td_proc->p_vmspace; pm = vmspace_pmap(vm); mtx_lock_spin(&sched_lock); context = PCPU_GET(tlb_ctx); if (context == PCPU_GET(tlb_ctx_max)) { tlb_flush_user(); context = PCPU_GET(tlb_ctx_min); } PCPU_SET(tlb_ctx, context + 1); pm->pm_context[PCPU_GET(cpuid)] = context; pm->pm_active |= PCPU_GET(cpumask); PCPU_SET(pmap, pm); stxa(AA_DMMU_TSB, ASI_DMMU, pm->pm_tsb); stxa(AA_IMMU_TSB, ASI_IMMU, pm->pm_tsb); stxa(AA_DMMU_PCXR, ASI_DMMU, context); membar(Sync); mtx_unlock_spin(&sched_lock); } vm_offset_t pmap_addr_hint(vm_object_t object, vm_offset_t va, vm_size_t size) { return (va); } Index: head/sys/sun4v/sun4v/machdep.c =================================================================== --- head/sys/sun4v/sun4v/machdep.c (revision 173360) +++ head/sys/sun4v/sun4v/machdep.c (revision 173361) @@ -1,1001 +1,1001 @@ /*- * Copyright (c) 2001 Jake Burkholder. * Copyright (c) 1992 Terrence R. Lambert. * Copyright (c) 1982, 1987, 1990 The Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91 * from: FreeBSD: src/sys/i386/i386/machdep.c,v 1.477 2001/08/27 */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_ddb.h" #include "opt_kstack_pages.h" #include "opt_msgbuf.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* XXX move this to a header */ extern void mdesc_init(void); typedef int ofw_vec_t(void *); #ifdef DDB extern vm_offset_t ksym_start, ksym_end; #endif struct tlb_entry *kernel_tlbs; int kernel_tlb_slots; int cold = 1; long Maxmem; long realmem; char pcpu0[PCPU_PAGES * PAGE_SIZE]; struct trapframe frame0; int trap_conversion[256]; vm_paddr_t mmu_fault_status_area; vm_offset_t kstack0; vm_paddr_t kstack0_phys; struct kva_md_info kmi; u_long ofw_vec; u_long ofw_tba; /* * Note: timer quality for CPU's is set low to try and prevent them from * being chosen as the primary timecounter. The CPU counters are not * synchronized among the CPU's so in MP machines this causes problems * when calculating the time. With this value the CPU's should only be * chosen as the primary timecounter as a last resort. */ #define UP_TICK_QUALITY 1000 #ifdef SUN4V #define MP_TICK_QUALITY 1000 #else #define MP_TICK_QUALITY -100 #endif static struct timecounter tick_tc; char sparc64_model[32]; cpu_block_copy_t *cpu_block_copy; cpu_block_zero_t *cpu_block_zero; static timecounter_get_t tick_get_timecount; void sparc64_init(caddr_t mdp, u_long o1, u_long o2, u_long o3, ofw_vec_t *vec); void sparc64_shutdown_final(void *dummy, int howto); static void cpu_startup(void *); SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL); CTASSERT((1 << INT_SHIFT) == sizeof(int)); CTASSERT((1 << PTR_SHIFT) == sizeof(char *)); CTASSERT(sizeof(struct reg) == 256); CTASSERT(sizeof(struct fpreg) == 272); CTASSERT(sizeof(struct __mcontext) == 512); CTASSERT((sizeof(struct pcb) & (64 - 1)) == 0); CTASSERT((offsetof(struct pcb, pcb_kfp) & (64 - 1)) == 0); CTASSERT((offsetof(struct pcb, pcb_ufp) & (64 - 1)) == 0); CTASSERT(sizeof(struct pcb) <= ((KSTACK_PAGES * PAGE_SIZE) / 8)); CTASSERT(sizeof(struct pcpu) <= ((PCPU_PAGES * PAGE_SIZE) / 2)); CTASSERT((sizeof(struct pcpu) & ((1<<6)-1)) == 0); #define BVPRINTF(x) \ if (bootverbose) \ printf(x); static void cpu_startup(void *arg) { vm_paddr_t physsz; int i; tick_tc.tc_get_timecount = tick_get_timecount; tick_tc.tc_poll_pps = NULL; tick_tc.tc_counter_mask = ~0u; tick_tc.tc_frequency = tick_freq; tick_tc.tc_name = "tick"; tick_tc.tc_quality = UP_TICK_QUALITY; #ifdef SMP /* * We do not know if each CPU's tick counter is synchronized. */ if (cpu_mp_probe()) tick_tc.tc_quality = MP_TICK_QUALITY; #endif tc_init(&tick_tc); physsz = 0; for (i = 0; i < sparc64_nmemreg; i++) physsz += sparc64_memreg[i].mr_size; printf("real memory = %lu (%lu MB)\n", physsz, physsz / (1024 * 1024)); realmem = (long)physsz; vm_ksubmap_init(&kmi); bufinit(); vm_pager_bufferinit(); EVENTHANDLER_REGISTER(shutdown_final, sparc64_shutdown_final, NULL, SHUTDOWN_PRI_LAST); printf("avail memory = %lu (%lu MB)\n", cnt.v_free_count * PAGE_SIZE, cnt.v_free_count / ((1024 * 1024) / PAGE_SIZE)); if (bootverbose) printf("machine: %s\n", sparc64_model); #ifdef notyet cpu_identify(rdpr(ver), tick_freq, PCPU_GET(cpuid)); #endif } void cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size) { struct intr_request *ir; int i; pcpu->pc_irtail = &pcpu->pc_irhead; for (i = 0; i < IR_FREE; i++) { ir = &pcpu->pc_irpool[i]; ir->ir_next = pcpu->pc_irfree; pcpu->pc_irfree = ir; } } void spinlock_enter(void) { struct thread *td; register_t pil; td = curthread; if (td->td_md.md_spinlock_count == 0) { pil = intr_disable(); td->td_md.md_saved_pil = pil; } td->td_md.md_spinlock_count++; critical_enter(); } void spinlock_exit(void) { struct thread *td; td = curthread; critical_exit(); td->td_md.md_spinlock_count--; if (td->td_md.md_spinlock_count == 0) { intr_restore(td->td_md.md_saved_pil); } } unsigned tick_get_timecount(struct timecounter *tc) { return ((unsigned)rd(tick)); } void sparc64_init(caddr_t mdp, u_long o1, u_long o2, u_long o3, ofw_vec_t *vec) { phandle_t child; phandle_t root; struct pcpu *pc; vm_offset_t end; caddr_t kmdp; u_int clock; char *env; char type[8]; vm_paddr_t mmfsa; int i; end = 0; kmdp = NULL; /* * Initialize Open Firmware (needed for console). */ OF_init(vec); /* * XXX */ bootverbose = 1; /* * Parse metadata if present and fetch parameters. Must be before the * console is inited so cninit gets the right value of boothowto. */ if (mdp != NULL) { preload_metadata = mdp; kmdp = preload_search_by_type("elf kernel"); if (kmdp != NULL) { boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int); kern_envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *); end = MD_FETCH(kmdp, MODINFOMD_KERNEND, vm_offset_t); kernel_tlb_slots = MD_FETCH(kmdp, MODINFOMD_DTLB_SLOTS, int); kernel_tlbs = (void *)preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_DTLB); } } if (boothowto & RB_VERBOSE) bootverbose = 1; init_param1(); root = OF_peer(0); for (child = OF_child(root); child != 0; child = OF_peer(child)) { OF_getprop(child, "device_type", type, sizeof(type)); if (strcmp(type, "cpu") == 0) break; } OF_getprop(child, "clock-frequency", &clock, sizeof(clock)); /* * Initialize the console before printing anything. * console uses the pcpu area for serialization */ pc = (struct pcpu *)(pcpu0 + (PCPU_PAGES * PAGE_SIZE)) - 1; cpu_setregs(pc); /* * Initialize proc0 stuff (p_contested needs to be done early). */ - proc_linkup(&proc0, &thread0); + proc_linkup0(&proc0, &thread0); proc0.p_md.md_sigtramp = NULL; proc0.p_md.md_utrap = NULL; frame0.tf_tstate = TSTATE_IE | TSTATE_PEF | TSTATE_PRIV; thread0.td_frame = &frame0; if ((u_long)thread0.td_frame & 0x3f) { panic("unaligned frame0"); } /* * Prime our per-cpu data page for use. Note, we are using it for our * stack, so don't pass the real size (PAGE_SIZE) to pcpu_init or * it'll zero it out from under us. */ pc = (struct pcpu *)(pcpu0 + (PCPU_PAGES * PAGE_SIZE)) - 1; pcpu_init(pc, 0, sizeof(struct pcpu)); pc->pc_curthread = &thread0; pc->pc_addr = (vm_offset_t)pcpu0; cninit(); tick_init(clock); printf("cpu0: UltraSparc T1 Processor (%d.%02d MHz CPU)\n", (clock + 4999) / 1000000, ((clock + 4999) / 10000) % 100); /* * Panic is there is no metadata. Most likely the kernel was booted * directly, instead of through loader(8). */ if (mdp == NULL || kmdp == NULL) { printf("sparc64_init: no loader metadata.\n" "This probably means you are not using loader(8).\n"); panic("sparc64_init"); } /* * Sanity check the kernel end, which is important. */ if (end == 0) { printf("sparc64_init: warning, kernel end not specified.\n" "Attempting to continue anyway.\n"); end = (vm_offset_t)_end; } cpu_block_copy = bcopy; cpu_block_zero = bzero; #ifdef SMP mp_tramp = mp_tramp_alloc(); #endif env = getenv("kernelname"); if (env != NULL) { strlcpy(kernelname, env, sizeof(kernelname)); freeenv(env); } /* * Initialize global registers. * needed for curthread to work */ cpu_setregs(pc); /* * Initialize virtual memory and calculate physmem. */ pmap_bootstrap(end); thread0.td_kstack = kstack0; thread0.td_md.md_saved_pil = 0; thread0.td_pcb = (struct pcb *) (thread0.td_kstack + KSTACK_PAGES * PAGE_SIZE) - 1; thread0.td_pcb->pcb_kstack = (uint64_t)(((char *)thread0.td_pcb) - (CCFSZ + SPOFF)); thread0.td_pcb = (struct pcb *)TLB_PHYS_TO_DIRECT(vtophys((vm_offset_t)thread0.td_pcb)); pc->pc_curpcb = thread0.td_pcb; if (((thread0.td_pcb->pcb_kstack + SPOFF) & 0x3f) != 0) { printf("unaligned stack pcb_kstack & 0x3f == 0x%lx\n", ((thread0.td_pcb->pcb_kstack + SPOFF) & 0x3f)); } /* * Update PCPU_REG to point to direct address * to support easy phys <-> virt translation in trap handler */ pc = (struct pcpu *)TLB_PHYS_TO_DIRECT(vtophys(pc)); BVPRINTF("initializing cpu regs\n"); cpu_setregs(pc); /* * Initialize tunables. */ BVPRINTF("initialize tunables\n"); init_param2(physmem); /* * setup trap table and fault status area */ BVPRINTF("initialize trap tables\n"); mmfsa = mmu_fault_status_area + MMFSA_SIZE; BVPRINTF("setwstate\n"); set_wstate(WSTATE_KERN); BVPRINTF("set_mmfsa_scratchpad\n"); set_mmfsa_scratchpad(mmfsa); BVPRINTF("init_mondo_queue\n"); init_mondo_queue(); BVPRINTF("set_mmfsa_traptable\n"); set_mmfsa_traptable(&tl0_base, mmfsa); BVPRINTF("trap conversion\n"); for (i = 0; i < 256; i++) trap_conversion[i] = 0; trap_conversion[TT_INSTRUCTION_EXCEPTION] = T_INSTRUCTION_EXCEPTION; trap_conversion[TT_INSTRUCTION_MISS] = T_INSTRUCTION_MISS; trap_conversion[TT_ILLEGAL_INSTRUCTION] = T_ILLEGAL_INSTRUCTION; trap_conversion[TT_PRIVILEGED_OPCODE] = T_PRIVILEGED_OPCODE; trap_conversion[TT_FP_EXCEPTION_IEEE_754] = T_FP_EXCEPTION_IEEE_754; trap_conversion[TT_TAG_OVERFLOW] = T_TAG_OVERFLOW; trap_conversion[TT_DIVISION_BY_ZERO] = T_DIVISION_BY_ZERO; trap_conversion[TT_DATA_EXCEPTION] = T_DATA_EXCEPTION; trap_conversion[TT_DATA_MISS] = T_DATA_MISS; trap_conversion[TT_ALIGNMENT] = T_ALIGNMENT; trap_conversion[TT_DATA_PROTECTION] = T_DATA_PROTECTION; /* * Initialize the message buffer (after setting trap table). */ BVPRINTF("initialize msgbuf\n"); msgbufinit(msgbufp, MSGBUF_SIZE); BVPRINTF("initialize mutexes\n"); mutex_init(); BVPRINTF("initialize machine descriptor table\n"); mdesc_init(); BVPRINTF("initialize get model name\n"); OF_getprop(root, "name", sparc64_model, sizeof(sparc64_model) - 1); BVPRINTF("initialize kdb\n"); kdb_init(); #ifdef KDB if (boothowto & RB_KDB) kdb_enter("Boot flags requested debugger"); #endif BVPRINTF("sparc64_init done\n"); } void set_openfirm_callback(ofw_vec_t *vec) { ofw_tba = rdpr(tba); ofw_vec = (u_long)vec; } void sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) { struct trapframe *tf; struct sigframe *sfp; struct sigacts *psp; struct sigframe sf; struct thread *td; struct frame *fp; struct proc *p; int oonstack; u_long sp; int sig; int code; oonstack = 0; td = curthread; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); sig = ksi->ksi_signo; code = ksi->ksi_code; psp = p->p_sigacts; mtx_assert(&psp->ps_mtx, MA_OWNED); tf = td->td_frame; sp = tf->tf_sp + SPOFF; oonstack = sigonstack(sp); CTR4(KTR_SIG, "sendsig: td=%p (%s) catcher=%p sig=%d", td, p->p_comm, catcher, sig); /* Make sure we have a signal trampoline to return to. */ if (p->p_md.md_sigtramp == NULL) { /* * No signal tramoline... kill the process. */ CTR0(KTR_SIG, "sendsig: no sigtramp"); printf("sendsig: %s is too old, rebuild it\n", p->p_comm); sigexit(td, sig); /* NOTREACHED */ } /* Save user context. */ bzero(&sf, sizeof(sf)); get_mcontext(td, &sf.sf_uc.uc_mcontext, 0); sf.sf_uc.uc_sigmask = *mask; sf.sf_uc.uc_stack = td->td_sigstk; sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE; /* Allocate and validate space for the signal handler context. */ if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { sfp = (struct sigframe *)(td->td_sigstk.ss_sp + td->td_sigstk.ss_size - sizeof(struct sigframe)); } else sfp = (struct sigframe *)sp - 1; mtx_unlock(&psp->ps_mtx); PROC_UNLOCK(p); fp = (struct frame *)sfp - 1; /* Translate the signal if appropriate. */ if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize) sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)]; /* Build the argument list for the signal handler. */ tf->tf_out[0] = sig; tf->tf_out[2] = (register_t)&sfp->sf_uc; tf->tf_out[4] = (register_t)catcher; /* Fill siginfo structure. */ sf.sf_si = ksi->ksi_info; sf.sf_si.si_addr = (void *)tf->tf_tpc; if (SIGISMEMBER(psp->ps_siginfo, sig)) { /* Signal handler installed with SA_SIGINFO. */ tf->tf_out[1] = (register_t)&sfp->sf_si; /* Fill in POSIX parts. */ sf.sf_si = ksi->ksi_info; sf.sf_si.si_signo = sig; /* maybe a translated signal */ } else { /* Old FreeBSD-style arguments. */ tf->tf_out[1] = ksi->ksi_code; tf->tf_out[3] = (register_t)ksi->ksi_addr; } /* Copy the sigframe out to the user's stack. */ if (rwindow_save(td) != 0 || copyout(&sf, sfp, sizeof(*sfp)) != 0 || suword(&fp->fr_in[6], tf->tf_out[6]) != 0) { /* * Something is wrong with the stack pointer. * ...Kill the process. */ CTR2(KTR_SIG, "sendsig: sigexit td=%p sfp=%p", td, sfp); PROC_LOCK(p); sigexit(td, SIGILL); /* NOTREACHED */ } tf->tf_tpc = (u_long)p->p_md.md_sigtramp; tf->tf_tnpc = tf->tf_tpc + 4; tf->tf_sp = (u_long)fp - SPOFF; CTR3(KTR_SIG, "sendsig: return td=%p pc=%#lx sp=%#lx", td, tf->tf_tpc, tf->tf_sp); PROC_LOCK(p); mtx_lock(&psp->ps_mtx); } #ifndef _SYS_SYSPROTO_H_ struct sigreturn_args { ucontext_t *ucp; }; #endif /* * MPSAFE */ int sigreturn(struct thread *td, struct sigreturn_args *uap) { struct proc *p; mcontext_t *mc; ucontext_t uc; int error; p = td->td_proc; if (rwindow_save(td)) { PROC_LOCK(p); sigexit(td, SIGILL); } CTR2(KTR_SIG, "sigreturn: td=%p ucp=%p", td, uap->sigcntxp); if (copyin(uap->sigcntxp, &uc, sizeof(uc)) != 0) { CTR1(KTR_SIG, "sigreturn: efault td=%p", td); return (EFAULT); } mc = &uc.uc_mcontext; error = set_mcontext(td, mc); if (error != 0) return (error); PROC_LOCK(p); td->td_sigmask = uc.uc_sigmask; SIG_CANTMASK(td->td_sigmask); signotify(td); PROC_UNLOCK(p); CTR4(KTR_SIG, "sigreturn: return td=%p pc=%#lx sp=%#lx tstate=%#lx", td, mc->mc_tpc, mc->mc_sp, mc->mc_tstate); return (EJUSTRETURN); } #ifdef COMPAT_FREEBSD4 int freebsd4_sigreturn(struct thread *td, struct freebsd4_sigreturn_args *uap) { return sigreturn(td, (struct sigreturn_args *)uap); } #endif /* * Construct a PCB from a trapframe. This is called from kdb_trap() where * we want to start a backtrace from the function that caused us to enter * the debugger. We have the context in the trapframe, but base the trace * on the PCB. The PCB doesn't have to be perfect, as long as it contains * enough for a backtrace. */ void makectx(struct trapframe *tf, struct pcb *pcb) { pcb->pcb_pc = tf->tf_tpc; pcb->pcb_sp = tf->tf_sp; } int get_mcontext(struct thread *td, mcontext_t *mc, int flags) { struct trapframe *tf; struct pcb *pcb; tf = td->td_frame; pcb = td->td_pcb; bcopy(tf, mc, sizeof(*tf)); if (flags & GET_MC_CLEAR_RET) { mc->mc_out[0] = 0; mc->mc_out[1] = 0; } mc->mc_flags = _MC_VERSION; critical_enter(); if ((tf->tf_fprs & FPRS_FEF) != 0) { savefpctx(pcb->pcb_ufp); pcb->pcb_flags |= PCB_FEF; tf->tf_fprs &= ~FPRS_FEF; } if ((pcb->pcb_flags & PCB_FEF) != 0) { bcopy(pcb->pcb_ufp, mc->mc_fp, sizeof(mc->mc_fp)); mc->mc_fprs |= FPRS_FEF; } critical_exit(); return (0); } int set_mcontext(struct thread *td, const mcontext_t *mc) { struct trapframe *tf; struct pcb *pcb; uint64_t wstate; if (!TSTATE_SECURE(mc->mc_tstate) || (mc->mc_flags & ((1L << _MC_VERSION_BITS) - 1)) != _MC_VERSION) return (EINVAL); tf = td->td_frame; pcb = td->td_pcb; /* Make sure the windows are spilled first. */ flushw(); wstate = tf->tf_wstate; bcopy(mc, tf, sizeof(*tf)); tf->tf_wstate = wstate; if ((mc->mc_fprs & FPRS_FEF) != 0) { tf->tf_fprs = 0; bcopy(mc->mc_fp, pcb->pcb_ufp, sizeof(pcb->pcb_ufp)); pcb->pcb_flags |= PCB_FEF; } return (0); } /* * Exit the kernel and execute a firmware call that will not return, as * specified by the arguments. */ void cpu_shutdown(void *args) { #ifdef SMP cpu_mp_shutdown(); #endif hv_mach_exit(0); } /* Get current clock frequency for the given cpu id. */ int cpu_est_clockrate(int cpu_id, uint64_t *rate) { return (ENXIO); } /* * Duplicate OF_exit() with a different firmware call function that restores * the trap table, otherwise a RED state exception is triggered in at least * some firmware versions. */ void cpu_halt(void) { static struct { cell_t name; cell_t nargs; cell_t nreturns; } args = { (cell_t)"exit", 0, 0 }; cpu_shutdown(&args); } void sparc64_shutdown_final(void *dummy, int howto) { static struct { cell_t name; cell_t nargs; cell_t nreturns; } args = { (cell_t)"SUNW,power-off", 0, 0 }; /* Turn the power off? */ if ((howto & RB_POWEROFF) != 0) cpu_shutdown(&args); /* In case of halt, return to the firmware */ if ((howto & RB_HALT) != 0) cpu_halt(); } void cpu_idle(void) { if (rdpr(pil) != 0) panic("pil in cpu_idle not 0 - %ld", rdpr(pil)); if (rdpr(pstate) != 0x16) panic("interrupts disabled in cpu_idle 0x%lx", rdpr(pstate)); /* XXX heinous hack begin*/ cpu_yield(); } int ptrace_set_pc(struct thread *td, u_long addr) { td->td_frame->tf_tpc = addr; td->td_frame->tf_tnpc = addr + 4; return (0); } int ptrace_single_step(struct thread *td) { /* TODO; */ return (0); } int ptrace_clear_single_step(struct thread *td) { /* TODO; */ return (0); } void exec_setregs(struct thread *td, u_long entry, u_long stack, u_long ps_strings) { struct trapframe *tf; struct pcb *pcb; struct proc *p; uint64_t kstack; u_long sp; /* XXX no cpu_exec */ p = td->td_proc; p->p_md.md_sigtramp = NULL; if (p->p_md.md_utrap != NULL) { utrap_free(p->p_md.md_utrap); p->p_md.md_utrap = NULL; } pcb = td->td_pcb; kstack = pcb->pcb_kstack; tf = td->td_frame; sp = rounddown(stack, 16); bzero(pcb, sizeof(*pcb)); bzero(tf, sizeof(*tf)); pcb->pcb_kstack = kstack; tf->tf_out[0] = stack; tf->tf_out[3] = p->p_sysent->sv_psstrings; tf->tf_out[6] = sp - SPOFF - sizeof(struct frame); tf->tf_tnpc = entry + 4; tf->tf_tpc = entry; tf->tf_tstate = TSTATE_IE | TSTATE_PEF | TSTATE_MM_TSO; td->td_retval[0] = tf->tf_out[0]; td->td_retval[1] = tf->tf_out[1]; } int fill_regs(struct thread *td, struct reg *regs) { bcopy(td->td_frame, regs, sizeof(*regs)); return (0); } int set_regs(struct thread *td, struct reg *regs) { struct trapframe *tf; if (!TSTATE_SECURE(regs->r_tstate)) return (EINVAL); tf = td->td_frame; regs->r_wstate = tf->tf_wstate; bcopy(regs, tf, sizeof(*regs)); return (0); } int fill_dbregs(struct thread *td, struct dbreg *dbregs) { return (ENOSYS); } int set_dbregs(struct thread *td, struct dbreg *dbregs) { return (ENOSYS); } int fill_fpregs(struct thread *td, struct fpreg *fpregs) { struct trapframe *tf; struct pcb *pcb; pcb = td->td_pcb; tf = td->td_frame; tf->tf_fprs = ~FPRS_FEF; bcopy(pcb->pcb_ufp, fpregs->fr_regs, sizeof(fpregs->fr_regs)); fpregs->fr_fsr = tf->tf_fsr; fpregs->fr_gsr = tf->tf_gsr; return (0); } int set_fpregs(struct thread *td, struct fpreg *fpregs) { struct trapframe *tf; struct pcb *pcb; pcb = td->td_pcb; tf = td->td_frame; tf->tf_fprs &= ~FPRS_FEF; bcopy(fpregs->fr_regs, pcb->pcb_ufp, sizeof(pcb->pcb_ufp)); tf->tf_fsr = fpregs->fr_fsr; tf->tf_gsr = fpregs->fr_gsr; return (0); } struct md_utrap * utrap_alloc(void) { struct md_utrap *ut; ut = malloc(sizeof(struct md_utrap), M_SUBPROC, M_WAITOK | M_ZERO); ut->ut_refcnt = 1; return (ut); } void utrap_free(struct md_utrap *ut) { int refcnt; if (ut == NULL) return; mtx_pool_lock(mtxpool_sleep, ut); ut->ut_refcnt--; refcnt = ut->ut_refcnt; mtx_pool_unlock(mtxpool_sleep, ut); if (refcnt == 0) free(ut, M_SUBPROC); } struct md_utrap * utrap_hold(struct md_utrap *ut) { if (ut == NULL) return (NULL); mtx_pool_lock(mtxpool_sleep, ut); ut->ut_refcnt++; mtx_pool_unlock(mtxpool_sleep, ut); return (ut); } void cpu_yield(void) { if (rdpr(pil) < PIL_TICK) hv_cpu_yield(); } Index: head/sys/sun4v/sun4v/pmap.c =================================================================== --- head/sys/sun4v/sun4v/pmap.c (revision 173360) +++ head/sys/sun4v/sun4v/pmap.c (revision 173361) @@ -1,2233 +1,2234 @@ /*- * Copyright (c) 2006 Kip Macy * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ #include __FBSDID("$FreeBSD$"); #include "opt_kstack_pages.h" #include "opt_msgbuf.h" #include "opt_pmap.h" #include "opt_trap_trace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef TRAP_TRACING void trap_trace_report(int); #endif #if 1 #define PMAP_DEBUG #endif #ifndef PMAP_SHPGPERPROC #define PMAP_SHPGPERPROC 200 #endif /* * Virtual and physical address of message buffer. */ struct msgbuf *msgbufp; vm_paddr_t msgbuf_phys; /* * Map of physical memory reagions. */ vm_paddr_t phys_avail[128]; vm_paddr_t phys_avail_tmp[128]; static struct ofw_mem_region mra[128]; static struct ofw_map translations[128]; static int translations_size; struct ofw_mem_region sparc64_memreg[128]; int sparc64_nmemreg; extern vm_paddr_t mmu_fault_status_area; /* * First and last available kernel virtual addresses. */ vm_offset_t virtual_avail; vm_offset_t virtual_end; vm_offset_t kernel_vm_end; vm_offset_t vm_max_kernel_address; #ifndef PMAP_SHPGPERPROC #define PMAP_SHPGPERPROC 200 #endif /* * Data for the pv entry allocation mechanism */ static uma_zone_t pvzone; static struct vm_object pvzone_obj; static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0; int pmap_debug = 0; static int pmap_debug_range = 1; static int use_256M_pages = 1; static struct mtx pmap_ctx_lock; static uint16_t ctx_stack[PMAP_CONTEXT_MAX]; static int ctx_stack_top; static int permanent_mappings = 0; static uint64_t nucleus_memory; static uint64_t nucleus_mappings[4]; /* * Kernel pmap. */ struct pmap kernel_pmap_store; hv_tsb_info_t kernel_td[MAX_TSB_INFO]; /* * This should be determined at boot time * with tiny TLBS it doesn't make sense to try and selectively * invalidate more than this */ #define MAX_INVALIDATES 32 #define MAX_TSB_CLEARS 128 /* * Allocate physical memory for use in pmap_bootstrap. */ static vm_paddr_t pmap_bootstrap_alloc(vm_size_t size); /* * If user pmap is processed with pmap_remove and with pmap_remove and the * resident count drops to 0, there are no more pages to remove, so we * need not continue. */ #define PMAP_REMOVE_DONE(pm) \ ((pm) != kernel_pmap && (pm)->pm_stats.resident_count == 0) /* * Kernel MMU interface */ #define curthread_pmap vmspace_pmap(curthread->td_proc->p_vmspace) #ifdef PMAP_DEBUG #define KDPRINTF if (pmap_debug) printf #define DPRINTF \ if (curthread_pmap && (curthread_pmap->pm_context != 0) && ((PCPU_GET(cpumask) & curthread_pmap->pm_active) == 0)) \ panic("cpumask(0x%x) & active (0x%x) == 0 pid == %d\n", \ PCPU_GET(cpumask), curthread_pmap->pm_active, curthread->td_proc->p_pid); \ if (pmap_debug) printf #else #define DPRINTF(...) #define KDPRINTF(...) #endif static void free_pv_entry(pv_entry_t pv); static pv_entry_t get_pv_entry(pmap_t locked_pmap); static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m); static void pmap_remove_entry(struct pmap *pmap, vm_page_t m, vm_offset_t va); static void pmap_remove_tte(pmap_t pmap, tte_t tte_data, vm_offset_t va); static void pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot); static void pmap_tsb_reset(pmap_t pmap); static void pmap_tsb_resize(pmap_t pmap); static void pmap_tte_hash_resize(pmap_t pmap); void pmap_set_ctx_panic(uint64_t error, vm_paddr_t tsb_ra, pmap_t pmap); struct tsb_resize_info { uint64_t tri_tsbscratch; uint64_t tri_tsb_ra; }; /* * Quick sort callout for comparing memory regions. */ static int mr_cmp(const void *a, const void *b); static int om_cmp(const void *a, const void *b); static int mr_cmp(const void *a, const void *b) { const struct ofw_mem_region *mra; const struct ofw_mem_region *mrb; mra = a; mrb = b; if (mra->mr_start < mrb->mr_start) return (-1); else if (mra->mr_start > mrb->mr_start) return (1); else return (0); } static int om_cmp(const void *a, const void *b) { const struct ofw_map *oma; const struct ofw_map *omb; oma = a; omb = b; if (oma->om_start < omb->om_start) return (-1); else if (oma->om_start > omb->om_start) return (1); else return (0); } static __inline void free_context(uint16_t ctx) { mtx_lock_spin(&pmap_ctx_lock); ctx_stack[ctx_stack_top++] = ctx; mtx_unlock_spin(&pmap_ctx_lock); KASSERT(ctx_stack_top < PMAP_CONTEXT_MAX, ("context stack overrun - system error")); } static __inline uint16_t get_context(void) { uint16_t ctx; mtx_lock_spin(&pmap_ctx_lock); ctx = ctx_stack[--ctx_stack_top]; mtx_unlock_spin(&pmap_ctx_lock); KASSERT(ctx_stack_top > 0, ("context stack underrun - need to implement context stealing")); return ctx; } static __inline void free_pv_entry(pv_entry_t pv) { pv_entry_count--; uma_zfree(pvzone, pv); } /* * get a new pv_entry, allocating a block from the system * when needed. */ static pv_entry_t get_pv_entry(pmap_t locked_pmap) { static const struct timeval printinterval = { 60, 0 }; static struct timeval lastprint; struct vpgqueues *vpq; uint64_t tte_data; pmap_t pmap; pv_entry_t allocated_pv, next_pv, pv; vm_offset_t va; vm_page_t m; PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED); mtx_assert(&vm_page_queue_mtx, MA_OWNED); allocated_pv = uma_zalloc(pvzone, M_NOWAIT); if (allocated_pv != NULL) { pv_entry_count++; if (pv_entry_count > pv_entry_high_water) pagedaemon_wakeup(); else return (allocated_pv); } /* * Reclaim pv entries: At first, destroy mappings to inactive * pages. After that, if a pv entry is still needed, destroy * mappings to active pages. */ if (ratecheck(&lastprint, &printinterval)) printf("Approaching the limit on PV entries, " "increase the vm.pmap.shpgperproc tunable.\n"); vpq = &vm_page_queues[PQ_INACTIVE]; retry: TAILQ_FOREACH(m, &vpq->pl, pageq) { if (m->hold_count || m->busy) continue; TAILQ_FOREACH_SAFE(pv, &m->md.pv_list, pv_list, next_pv) { va = pv->pv_va; pmap = pv->pv_pmap; /* Avoid deadlock and lock recursion. */ if (pmap > locked_pmap) PMAP_LOCK(pmap); else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap)) continue; pmap->pm_stats.resident_count--; tte_data = tte_hash_delete(pmap->pm_hash, va); KASSERT((tte_data & VTD_WIRED) == 0, ("get_pv_entry: wired pte %#jx", (uintmax_t)tte_data)); if (tte_data & VTD_REF) vm_page_flag_set(m, PG_REFERENCED); if (tte_data & VTD_W) { KASSERT((tte_data & VTD_SW_W), ("get_pv_entry: modified page not writable: va: %lx, tte: %lx", va, tte_data)); vm_page_dirty(m); } pmap_invalidate_page(pmap, va, TRUE); TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist); TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); if (TAILQ_EMPTY(&m->md.pv_list)) vm_page_flag_clear(m, PG_WRITEABLE); m->md.pv_list_count--; if (pmap != locked_pmap) PMAP_UNLOCK(pmap); if (allocated_pv == NULL) allocated_pv = pv; else free_pv_entry(pv); } } if (allocated_pv == NULL) { if (vpq == &vm_page_queues[PQ_INACTIVE]) { vpq = &vm_page_queues[PQ_ACTIVE]; goto retry; } panic("get_pv_entry: increase the vm.pmap.shpgperproc tunable"); } return (allocated_pv); } /* * Allocate a physical page of memory directly from the phys_avail map. * Can only be called from pmap_bootstrap before avail start and end are * calculated. */ static vm_paddr_t pmap_bootstrap_alloc(vm_size_t size) { vm_paddr_t pa; int i; size = round_page(size); for (i = 0; phys_avail[i + 1] != 0; i += 2) { if (phys_avail[i + 1] - phys_avail[i] < size) continue; pa = phys_avail[i]; phys_avail[i] += size; pmap_scrub_pages(pa, size); return (pa); } panic("pmap_bootstrap_alloc"); } /* * Activate a user pmap. The pmap must be activated before its address space * can be accessed in any way. */ void pmap_activate(struct thread *td) { pmap_t pmap, oldpmap; int err; critical_enter(); pmap = vmspace_pmap(td->td_proc->p_vmspace); oldpmap = PCPU_GET(curpmap); #if defined(SMP) atomic_clear_int(&oldpmap->pm_active, PCPU_GET(cpumask)); atomic_set_int(&pmap->pm_tlbactive, PCPU_GET(cpumask)); atomic_set_int(&pmap->pm_active, PCPU_GET(cpumask)); #else oldpmap->pm_active &= ~1; pmap->pm_active |= 1; pmap->pm_tlbactive |= 1; #endif pmap->pm_hashscratch = tte_hash_set_scratchpad_user(pmap->pm_hash, pmap->pm_context); pmap->pm_tsbscratch = tsb_set_scratchpad_user(&pmap->pm_tsb); pmap->pm_tsb_miss_count = pmap->pm_tsb_cap_miss_count = 0; PCPU_SET(curpmap, pmap); if (pmap->pm_context != 0) if ((err = hv_mmu_tsb_ctxnon0(1, pmap->pm_tsb_ra)) != H_EOK) panic("failed to set TSB 0x%lx - context == %ld\n", pmap->pm_tsb_ra, pmap->pm_context); stxa(MMU_CID_S, ASI_MMU_CONTEXTID, pmap->pm_context); membar(Sync); critical_exit(); } vm_offset_t pmap_addr_hint(vm_object_t object, vm_offset_t va, vm_size_t size) { return (va); } /* * Bootstrap the system enough to run with virtual memory. */ void pmap_bootstrap(vm_offset_t ekva) { struct pmap *pm; vm_offset_t off, va; vm_paddr_t pa, tsb_8k_pa, tsb_4m_pa, kernel_hash_pa, nucleus_memory_start; vm_size_t physsz, virtsz, kernel_hash_shift; ihandle_t pmem, vmem; int i, j, k, sz; uint64_t tsb_8k_size, tsb_4m_size, error, physmem_tunable, physmemstart_tunable; vm_paddr_t real_phys_avail[128], tmp_phys_avail[128], bounds; if ((vmem = OF_finddevice("/virtual-memory")) == -1) panic("pmap_bootstrap: finddevice /virtual-memory"); if ((sz = OF_getproplen(vmem, "translations")) == -1) panic("pmap_bootstrap: getproplen translations"); if (sizeof(translations) < sz) panic("pmap_bootstrap: translations too small"); bzero(translations, sz); if (OF_getprop(vmem, "translations", translations, sz) == -1) panic("pmap_bootstrap: getprop /virtual-memory/translations"); sz /= sizeof(*translations); translations_size = sz; nucleus_memory_start = 0; CTR0(KTR_PMAP, "pmap_bootstrap: translations"); qsort(translations, sz, sizeof (*translations), om_cmp); for (i = 0; i < sz; i++) { KDPRINTF("om_size=%ld om_start=%lx om_tte=%lx\n", translations[i].om_size, translations[i].om_start, translations[i].om_tte); if ((translations[i].om_start >= KERNBASE) && (translations[i].om_start <= KERNBASE + 3*PAGE_SIZE_4M)) { for (j = 0; j < translations[i].om_size; j += PAGE_SIZE_4M) { KDPRINTF("mapping permanent translation\n"); pa = TTE_GET_PA(translations[i].om_tte) + j; va = translations[i].om_start + j; error = hv_mmu_map_perm_addr(va, KCONTEXT, pa | TTE_KERNEL | VTD_4M, MAP_ITLB | MAP_DTLB); if (error != H_EOK) panic("map_perm_addr returned error=%ld", error); if ((nucleus_memory_start == 0) || (pa < nucleus_memory_start)) nucleus_memory_start = pa; printf("nucleus_mappings[%d] = 0x%lx\n", permanent_mappings, pa); nucleus_mappings[permanent_mappings++] = pa; nucleus_memory += PAGE_SIZE_4M; #ifdef SMP mp_add_nucleus_mapping(va, pa|TTE_KERNEL|VTD_4M); #endif } } } /* * Find out what physical memory is available from the prom and * initialize the phys_avail array. This must be done before * pmap_bootstrap_alloc is called. */ if ((pmem = OF_finddevice("/memory")) == -1) panic("pmap_bootstrap: finddevice /memory"); if ((sz = OF_getproplen(pmem, "available")) == -1) panic("pmap_bootstrap: getproplen /memory/available"); if (sizeof(vm_paddr_t)*128 < sz) /* FIXME */ panic("pmap_bootstrap: phys_avail too small"); if (sizeof(mra) < sz) panic("pmap_bootstrap: mra too small"); bzero(mra, sz); if (OF_getprop(pmem, "available", mra, sz) == -1) panic("pmap_bootstrap: getprop /memory/available"); sz /= sizeof(*mra); CTR0(KTR_PMAP, "pmap_bootstrap: physical memory"); qsort(mra, sz, sizeof (*mra), mr_cmp); physmemstart_tunable = physmem_tunable = physmem = physsz = 0; if (TUNABLE_ULONG_FETCH("hw.physmemstart", &physmemstart_tunable)) { KDPRINTF("desired physmemstart=0x%lx\n", physmemstart_tunable); } if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable)) { physmem = atop(physmem_tunable); KDPRINTF("desired physmem=0x%lx\n", physmem_tunable); } if ((physmem_tunable != 0) && (physmemstart_tunable != 0)) physmem_tunable += physmemstart_tunable; bzero(real_phys_avail, sizeof(real_phys_avail)); bzero(tmp_phys_avail, sizeof(tmp_phys_avail)); for (i = 0, j = 0; i < sz; i++) { uint64_t size; KDPRINTF("start=%#lx size=%#lx\n", mra[i].mr_start, mra[i].mr_size); if (mra[i].mr_size < PAGE_SIZE_4M) continue; if ((mra[i].mr_start & PAGE_MASK_4M) || (mra[i].mr_size & PAGE_MASK_4M)) { uint64_t newstart, roundup; newstart = ((mra[i].mr_start + (PAGE_MASK_4M)) & ~PAGE_MASK_4M); roundup = newstart - mra[i].mr_start; size = (mra[i].mr_size - roundup) & ~PAGE_MASK_4M; mra[i].mr_start = newstart; if (size < PAGE_SIZE_4M) continue; mra[i].mr_size = size; } real_phys_avail[j] = mra[i].mr_start; if (physmem_tunable != 0 && ((physsz + mra[i].mr_size) >= physmem_tunable)) { mra[i].mr_size = physmem_tunable - physsz; physsz = physmem_tunable; real_phys_avail[j + 1] = mra[i].mr_start + mra[i].mr_size; break; } physsz += mra[i].mr_size; real_phys_avail[j + 1] = mra[i].mr_start + mra[i].mr_size; j += 2; } physmem = btoc(physsz - physmemstart_tunable); /* * This is needed for versions of OFW that would allocate us memory * and then forget to remove it from the available ranges ... * as well as for compensating for the above move of nucleus pages */ for (i = 0, j = 0, bounds = (1UL<<32); real_phys_avail[i] != 0; i += 2) { vm_paddr_t start = real_phys_avail[i]; uint64_t end = real_phys_avail[i + 1]; CTR2(KTR_PMAP, "start=%#lx size=%#lx\n", start, end); KDPRINTF("real_phys start=%#lx end=%#lx\n", start, end); /* * Is kernel memory at the beginning of range? */ if (nucleus_memory_start == start) { start += nucleus_memory; } /* * Is kernel memory at the end of range? */ if (nucleus_memory_start == (end - nucleus_memory)) end -= nucleus_memory; if (physmemstart_tunable != 0 && (end < physmemstart_tunable)) continue; if (physmemstart_tunable != 0 && ((start < physmemstart_tunable))) { start = physmemstart_tunable; } /* * Is kernel memory in the middle somewhere? */ if ((nucleus_memory_start > start) && (nucleus_memory_start < end)) { phys_avail[j] = start; phys_avail[j+1] = nucleus_memory_start; start = nucleus_memory_start + nucleus_memory; j += 2; } /* * Break phys_avail up on 4GB boundaries to try * to work around PCI-e allocation bug * we rely on the fact that kernel memory is allocated * from the first 4GB of physical memory */ while (bounds < start) bounds += (1UL<<32); while (bounds < end) { phys_avail[j] = start; phys_avail[j + 1] = bounds; start = bounds; bounds += (1UL<<32); j += 2; } phys_avail[j] = start; phys_avail[j + 1] = end; j += 2; } /* * Merge nucleus memory in to real_phys_avail * */ for (i = 0; real_phys_avail[i] != 0; i += 2) { if (real_phys_avail[i] == nucleus_memory_start + nucleus_memory) real_phys_avail[i] -= nucleus_memory; if (real_phys_avail[i + 1] == nucleus_memory_start) real_phys_avail[i + 1] += nucleus_memory; if (real_phys_avail[i + 1] == real_phys_avail[i + 2]) { real_phys_avail[i + 1] = real_phys_avail[i + 3]; for (k = i + 2; real_phys_avail[k] != 0; k += 2) { real_phys_avail[k] = real_phys_avail[k + 2]; real_phys_avail[k + 1] = real_phys_avail[k + 3]; } } } for (i = 0; phys_avail[i] != 0; i += 2) if (pmap_debug_range || pmap_debug) printf("phys_avail[%d]=0x%lx phys_avail[%d]=0x%lx\n", i, phys_avail[i], i+1, phys_avail[i+1]); /* * Shuffle the memory range containing the 256MB page with * nucleus_memory to the beginning of the phys_avail array * so that physical memory from that page is preferentially * allocated first */ for (j = 0; phys_avail[j] != 0; j += 2) if (nucleus_memory_start < phys_avail[j]) break; /* * Don't shuffle unless we have a full 256M page in the range * our kernel malloc appears to be horribly brittle */ if ((phys_avail[j + 1] - phys_avail[j]) < (PAGE_SIZE_256M - nucleus_memory)) goto skipshuffle; for (i = j, k = 0; phys_avail[i] != 0; k++, i++) tmp_phys_avail[k] = phys_avail[i]; for (i = 0; i < j; i++) tmp_phys_avail[k + i] = phys_avail[i]; for (i = 0; i < 128; i++) phys_avail[i] = tmp_phys_avail[i]; skipshuffle: for (i = 0; real_phys_avail[i] != 0; i += 2) if (pmap_debug_range || pmap_debug) printf("real_phys_avail[%d]=0x%lx real_phys_avail[%d]=0x%lx\n", i, real_phys_avail[i], i+1, real_phys_avail[i+1]); for (i = 0; phys_avail[i] != 0; i += 2) if (pmap_debug_range || pmap_debug) printf("phys_avail[%d]=0x%lx phys_avail[%d]=0x%lx\n", i, phys_avail[i], i+1, phys_avail[i+1]); /* * Calculate the size of kernel virtual memory, and the size and mask * for the kernel tsb. */ virtsz = roundup(physsz, PAGE_SIZE_4M << (PAGE_SHIFT - TTE_SHIFT)); vm_max_kernel_address = VM_MIN_KERNEL_ADDRESS + virtsz; /* * Set the start and end of kva. The kernel is loaded at the first * available 4 meg super page, so round up to the end of the page. */ virtual_avail = roundup2(ekva, PAGE_SIZE_4M); virtual_end = vm_max_kernel_address; kernel_vm_end = vm_max_kernel_address; /* * Allocate and map a 4MB page for the kernel hashtable * */ #ifndef SIMULATOR kernel_hash_shift = 10; /* PAGE_SIZE_4M*2 */ #else kernel_hash_shift = 6; /* PAGE_SIZE_8K*64 */ #endif kernel_hash_pa = pmap_bootstrap_alloc((1<<(kernel_hash_shift + PAGE_SHIFT))); if (kernel_hash_pa & PAGE_MASK_4M) panic("pmap_bootstrap: hashtable pa unaligned\n"); /* * Set up TSB descriptors for the hypervisor * */ #ifdef notyet tsb_8k_size = virtsz >> (PAGE_SHIFT - TTE_SHIFT); #else /* avoid alignment complaints from the hypervisor */ tsb_8k_size = PAGE_SIZE_4M; #endif tsb_8k_pa = pmap_bootstrap_alloc(tsb_8k_size); if (tsb_8k_pa & PAGE_MASK_4M) panic("pmap_bootstrap: tsb unaligned\n"); KDPRINTF("tsb_8k_size is 0x%lx, tsb_8k_pa is 0x%lx\n", tsb_8k_size, tsb_8k_pa); tsb_4m_size = (virtsz >> (PAGE_SHIFT_4M - TTE_SHIFT)) << 3; tsb_4m_pa = pmap_bootstrap_alloc(tsb_4m_size); kernel_td[TSB8K_INDEX].hti_idxpgsz = TTE8K; kernel_td[TSB8K_INDEX].hti_assoc = 1; kernel_td[TSB8K_INDEX].hti_ntte = (tsb_8k_size >> TTE_SHIFT); kernel_td[TSB8K_INDEX].hti_ctx_index = 0; kernel_td[TSB8K_INDEX].hti_pgszs = TSB8K; kernel_td[TSB8K_INDEX].hti_rsvd = 0; kernel_td[TSB8K_INDEX].hti_ra = tsb_8k_pa; /* * Initialize kernel's private TSB from 8K page TSB * */ kernel_pmap->pm_tsb.hti_idxpgsz = TTE8K; kernel_pmap->pm_tsb.hti_assoc = 1; kernel_pmap->pm_tsb.hti_ntte = (tsb_8k_size >> TTE_SHIFT); kernel_pmap->pm_tsb.hti_ctx_index = 0; kernel_pmap->pm_tsb.hti_pgszs = TSB8K; kernel_pmap->pm_tsb.hti_rsvd = 0; kernel_pmap->pm_tsb.hti_ra = tsb_8k_pa; kernel_pmap->pm_tsb_ra = vtophys((vm_offset_t)&kernel_pmap->pm_tsb); tsb_set_scratchpad_kernel(&kernel_pmap->pm_tsb); /* * Initialize kernel TSB for 4M pages * currently (not by design) used for permanent mappings */ KDPRINTF("tsb_4m_pa is 0x%lx tsb_4m_size is 0x%lx\n", tsb_4m_pa, tsb_4m_size); kernel_td[TSB4M_INDEX].hti_idxpgsz = TTE4M; kernel_td[TSB4M_INDEX].hti_assoc = 1; kernel_td[TSB4M_INDEX].hti_ntte = (tsb_4m_size >> TTE_SHIFT); kernel_td[TSB4M_INDEX].hti_ctx_index = 0; kernel_td[TSB4M_INDEX].hti_pgszs = TSB4M|TSB256M; kernel_td[TSB4M_INDEX].hti_rsvd = 0; kernel_td[TSB4M_INDEX].hti_ra = tsb_4m_pa; /* * allocate MMU fault status areas for all CPUS */ mmu_fault_status_area = pmap_bootstrap_alloc(MMFSA_SIZE*MAXCPU); /* * Allocate and map the message buffer. */ msgbuf_phys = pmap_bootstrap_alloc(MSGBUF_SIZE); msgbufp = (struct msgbuf *)TLB_PHYS_TO_DIRECT(msgbuf_phys); /* * Allocate a kernel stack with guard page for thread0 and map it into * the kernel tsb. */ pa = pmap_bootstrap_alloc(KSTACK_PAGES*PAGE_SIZE); kstack0_phys = pa; virtual_avail += KSTACK_GUARD_PAGES * PAGE_SIZE; kstack0 = virtual_avail; virtual_avail += KSTACK_PAGES * PAGE_SIZE; for (i = 0; i < KSTACK_PAGES; i++) { pa = kstack0_phys + i * PAGE_SIZE; va = kstack0 + i * PAGE_SIZE; tsb_set_tte_real(&kernel_td[TSB8K_INDEX], va, va, pa | TTE_KERNEL | VTD_8K, 0); } /* * Calculate the last available physical address. */ for (i = 0; phys_avail[i + 2] != 0; i += 2) KDPRINTF("phys_avail[%d]=0x%lx phys_avail[%d]=0x%lx\n", i, phys_avail[i], i+1, phys_avail[i+1]); KDPRINTF("phys_avail[%d]=0x%lx phys_avail[%d]=0x%lx\n", i, phys_avail[i], i+1, phys_avail[i+1]); Maxmem = sparc64_btop(phys_avail[i + 1]); /* * Add the prom mappings to the kernel tsb. */ for (i = 0; i < sz; i++) { CTR3(KTR_PMAP, "translation: start=%#lx size=%#lx tte=%#lx", translations[i].om_start, translations[i].om_size, translations[i].om_tte); KDPRINTF("om_size=%ld om_start=%lx om_tte=%lx\n", translations[i].om_size, translations[i].om_start, translations[i].om_tte); if (translations[i].om_start < VM_MIN_PROM_ADDRESS || translations[i].om_start > VM_MAX_PROM_ADDRESS) continue; for (off = 0; off < translations[i].om_size; off += PAGE_SIZE) { va = translations[i].om_start + off; pa = TTE_GET_PA(translations[i].om_tte) + off; tsb_assert_invalid(&kernel_td[TSB8K_INDEX], va); tsb_set_tte_real(&kernel_td[TSB8K_INDEX], va, va, pa | TTE_KERNEL | VTD_8K, 0); } } if ((error = hv_mmu_tsb_ctx0(MAX_TSB_INFO, vtophys((vm_offset_t)kernel_td))) != H_EOK) panic("failed to set ctx0 TSBs error: %ld", error); #ifdef SMP mp_set_tsb_desc_ra(vtophys((vm_offset_t)&kernel_td)); #endif /* * setup direct mappings * */ for (i = 0, pa = real_phys_avail[i]; pa != 0; i += 2, pa = real_phys_avail[i]) { vm_paddr_t tag_pa = 0, next_pa = 0; uint64_t size_bits = VTD_4M; while (pa < real_phys_avail[i + 1]) { if (use_256M_pages && (pa & PAGE_MASK_256M) == 0 && ((pa + PAGE_SIZE_256M) <= real_phys_avail[i + 1])) { tag_pa = pa; size_bits = VTD_256M; next_pa = pa + PAGE_SIZE_256M; } else if (next_pa <= pa) { tag_pa = pa; size_bits = VTD_4M; } tsb_assert_invalid(&kernel_td[TSB4M_INDEX], TLB_PHYS_TO_DIRECT(pa)); tsb_set_tte_real(&kernel_td[TSB4M_INDEX], TLB_PHYS_TO_DIRECT(pa), TLB_PHYS_TO_DIRECT(pa), tag_pa | TTE_KERNEL | size_bits, 0); pa += PAGE_SIZE_4M; } } /* * Get the available physical memory ranges from /memory/reg. These * are only used for kernel dumps, but it may not be wise to do prom * calls in that situation. */ if ((sz = OF_getproplen(pmem, "reg")) == -1) panic("pmap_bootstrap: getproplen /memory/reg"); if (sizeof(sparc64_memreg) < sz) panic("pmap_bootstrap: sparc64_memreg too small"); if (OF_getprop(pmem, "reg", sparc64_memreg, sz) == -1) panic("pmap_bootstrap: getprop /memory/reg"); sparc64_nmemreg = sz / sizeof(*sparc64_memreg); pm = kernel_pmap; pm->pm_active = ~0; pm->pm_tlbactive = ~0; PMAP_LOCK_INIT(kernel_pmap); TAILQ_INIT(&kernel_pmap->pm_pvlist); /* * This could happen earlier - but I put it here to avoid * attempts to do updates until they're legal */ pm->pm_hash = tte_hash_kernel_create(TLB_PHYS_TO_DIRECT(kernel_hash_pa), kernel_hash_shift, pmap_bootstrap_alloc(PAGE_SIZE)); pm->pm_hashscratch = tte_hash_set_scratchpad_kernel(pm->pm_hash); for (i = 0; i < translations_size; i++) { KDPRINTF("om_size=%ld om_start=%lx om_tte=%lx\n", translations[i].om_size, translations[i].om_start, translations[i].om_tte); if (translations[i].om_start < VM_MIN_PROM_ADDRESS || translations[i].om_start > VM_MAX_PROM_ADDRESS) { KDPRINTF("skipping\n"); continue; } for (off = 0; off < translations[i].om_size; off += PAGE_SIZE) { va = translations[i].om_start + off; pa = TTE_GET_PA(translations[i].om_tte) + off; tte_hash_insert(pm->pm_hash, va, pa | TTE_KERNEL | VTD_8K); } KDPRINTF("set om_size=%ld om_start=%lx om_tte=%lx\n", translations[i].om_size, translations[i].om_start, translations[i].om_tte); } for (i = 0; i < KSTACK_PAGES; i++) { pa = kstack0_phys + i * PAGE_SIZE; va = kstack0 + i * PAGE_SIZE; tte_hash_insert(pm->pm_hash, va, pa | TTE_KERNEL | VTD_8K); } /* * Add direct mappings to hash * */ #ifdef notyet /* hash only supports 8k pages */ for (pa = PAGE_SIZE_4M; pa < phys_avail[2]; pa += PAGE_SIZE_4M) tte_hash_insert(pm->pm_hash, TLB_PHYS_TO_DIRECT(pa), pa | TTE_KERNEL | VTD_4M); #endif if (bootverbose) printf("pmap_bootstrap done\n"); } /* * Routine: pmap_change_wiring * Function: Change the wiring attribute for a map/virtual-address * pair. * In/out conditions: * The mapping must already exist in the pmap. */ void pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired) { boolean_t iswired; PMAP_LOCK(pmap); iswired = tte_get_virt_bit(pmap, va, VTD_WIRED); if (wired && !iswired) { pmap->pm_stats.wired_count++; tte_set_virt_bit(pmap, va, VTD_WIRED); } else if (!wired && iswired) { pmap->pm_stats.wired_count--; tte_clear_virt_bit(pmap, va, VTD_WIRED); } PMAP_UNLOCK(pmap); } void pmap_clear_modify(vm_page_t m) { KDPRINTF("pmap_clear_modify(0x%lx)\n", VM_PAGE_TO_PHYS(m)); tte_clear_phys_bit(m, VTD_W); } void pmap_clear_reference(vm_page_t m) { KDPRINTF("pmap_clear_reference(0x%lx)\n", VM_PAGE_TO_PHYS(m)); tte_clear_phys_bit(m, VTD_REF); } void pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, vm_offset_t src_addr) { vm_offset_t addr, end_addr; end_addr = src_addr + len; /* * Don't let optional prefaulting of pages make us go * way below the low water mark of free pages or way * above high water mark of used pv entries. */ if (cnt.v_free_count < cnt.v_free_reserved || pv_entry_count > pv_entry_high_water) return; vm_page_lock_queues(); if (dst_pmap < src_pmap) { PMAP_LOCK(dst_pmap); PMAP_LOCK(src_pmap); } else { PMAP_LOCK(src_pmap); PMAP_LOCK(dst_pmap); } for (addr = src_addr; addr < end_addr; addr += PAGE_SIZE) { tte_t tte_data; vm_page_t m; tte_data = tte_hash_lookup(src_pmap->pm_hash, addr); if ((tte_data & VTD_MANAGED) != 0) { if (tte_hash_lookup(dst_pmap->pm_hash, addr) == 0) { m = PHYS_TO_VM_PAGE(TTE_GET_PA(tte_data)); tte_hash_insert(dst_pmap->pm_hash, addr, tte_data & ~(VTD_W|VTD_REF|VTD_WIRED)); dst_pmap->pm_stats.resident_count++; pmap_insert_entry(dst_pmap, addr, m); } } } vm_page_unlock_queues(); PMAP_UNLOCK(src_pmap); PMAP_UNLOCK(dst_pmap); } void pmap_copy_page(vm_page_t src, vm_page_t dst) { vm_paddr_t srcpa, dstpa; srcpa = VM_PAGE_TO_PHYS(src); dstpa = VM_PAGE_TO_PHYS(dst); novbcopy((char *)TLB_PHYS_TO_DIRECT(srcpa), (char *)TLB_PHYS_TO_DIRECT(dstpa), PAGE_SIZE); } static __inline void pmap_add_tte(pmap_t pmap, vm_offset_t va, vm_page_t m, tte_t *tte_data, int wired) { if (wired) pmap->pm_stats.wired_count++; if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0) { pmap_insert_entry(pmap, va, m); *tte_data |= VTD_MANAGED; } } /* * Map the given physical page at the specified virtual address in the * target pmap with the protection requested. If specified the page * will be wired down. */ void pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, boolean_t wired) { vm_paddr_t pa, opa; uint64_t tte_data, otte_data; vm_page_t om; int invlva; if (pmap->pm_context) DPRINTF("pmap_enter(va=%lx, pa=0x%lx, prot=%x)\n", va, VM_PAGE_TO_PHYS(m), prot); om = NULL; vm_page_lock_queues(); PMAP_LOCK(pmap); tte_data = pa = VM_PAGE_TO_PHYS(m); otte_data = tte_hash_delete(pmap->pm_hash, va); opa = TTE_GET_PA(otte_data); if (opa == 0) { /* * This is a new mapping */ pmap->pm_stats.resident_count++; pmap_add_tte(pmap, va, m, &tte_data, wired); } else if (pa != opa) { /* * Mapping has changed, handle validating new mapping. * */ if (otte_data & VTD_WIRED) pmap->pm_stats.wired_count--; if (otte_data & VTD_MANAGED) { om = PHYS_TO_VM_PAGE(opa); pmap_remove_entry(pmap, om, va); } pmap_add_tte(pmap, va, m, &tte_data, wired); } else /* (pa == opa) */ { /* * Mapping has not changed, must be protection or wiring change. */ /* * Wiring change, just update stats. We don't worry about * wiring PT pages as they remain resident as long as there * are valid mappings in them. Hence, if a user page is wired, * the PT page will be also. */ if (wired && ((otte_data & VTD_WIRED) == 0)) pmap->pm_stats.wired_count++; else if (!wired && (otte_data & VTD_WIRED)) pmap->pm_stats.wired_count--; /* * We might be turning off write access to the page, * so we go ahead and sense modify status. */ if (otte_data & VTD_MANAGED) { om = m; tte_data |= VTD_MANAGED; } } /* * Now validate mapping with desired protection/wiring. */ if ((prot & VM_PROT_WRITE) != 0) { tte_data |= VTD_SW_W; vm_page_flag_set(m, PG_WRITEABLE); } if ((prot & VM_PROT_EXECUTE) != 0) tte_data |= VTD_X; if (wired) tte_data |= VTD_WIRED; if (pmap == kernel_pmap) tte_data |= VTD_P; invlva = FALSE; if ((otte_data & ~(VTD_W|VTD_REF)) != tte_data) { if (otte_data & VTD_V) { if (otte_data & VTD_REF) { if (otte_data & VTD_MANAGED) vm_page_flag_set(om, PG_REFERENCED); if ((opa != pa) || ((opa & VTD_X) != (pa & VTD_X))) invlva = TRUE; } if (otte_data & VTD_W) { if (otte_data & VTD_MANAGED) vm_page_dirty(om); if ((pa & VTD_SW_W) != 0) invlva = TRUE; } if (invlva) pmap_invalidate_page(pmap, va, TRUE); } } tte_hash_insert(pmap->pm_hash, va, tte_data|TTE_MINFLAGS|VTD_REF); /* * XXX this needs to be locked for the threaded / kernel case */ tsb_set_tte(&pmap->pm_tsb, va, tte_data|TTE_MINFLAGS|VTD_REF, pmap->pm_context); if (tte_hash_needs_resize(pmap->pm_hash)) pmap_tte_hash_resize(pmap); /* * 512 is an arbitrary number of tsb misses */ if (0 && pmap->pm_context != 0 && pmap->pm_tsb_miss_count > 512) pmap_tsb_resize(pmap); vm_page_unlock_queues(); PMAP_UNLOCK(pmap); } /* * Maps a sequence of resident pages belonging to the same object. * The sequence begins with the given page m_start. This page is * mapped at the given virtual address start. Each subsequent page is * mapped at a virtual address that is offset from start by the same * amount as the page is offset from m_start within the object. The * last page in the sequence is the page with the largest offset from * m_start that can be mapped at a virtual address less than the given * virtual address end. Not every virtual page between start and end * is mapped; only those for which a resident page exists with the * corresponding offset from m_start are mapped. */ void pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, vm_page_t m_start, vm_prot_t prot) { vm_page_t m; vm_pindex_t diff, psize; VM_OBJECT_LOCK_ASSERT(m_start->object, MA_OWNED); psize = atop(end - start); m = m_start; PMAP_LOCK(pmap); while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { pmap_enter_quick_locked(pmap, start + ptoa(diff), m, prot); m = TAILQ_NEXT(m, listq); } PMAP_UNLOCK(pmap); } void pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) { PMAP_LOCK(pmap); pmap_enter_quick_locked(pmap, va, m, prot); PMAP_UNLOCK(pmap); } static void pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) { tte_t tte_data; if (pmap->pm_context) KDPRINTF("pmap_enter_quick(ctx=0x%lx va=%lx, pa=0x%lx prot=%x)\n", pmap->pm_context, va, VM_PAGE_TO_PHYS(m), prot); PMAP_LOCK_ASSERT(pmap, MA_OWNED); if (tte_hash_lookup(pmap->pm_hash, va)) return; tte_data = VM_PAGE_TO_PHYS(m); /* * Enter on the PV list if part of our managed memory. Note that we * raise IPL while manipulating pv_table since pmap_enter can be * called at interrupt time. */ if ((m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) == 0) { pmap_insert_entry(pmap, va, m); tte_data |= VTD_MANAGED; } pmap->pm_stats.resident_count++; if ((prot & VM_PROT_EXECUTE) != 0) tte_data |= VTD_X; tte_hash_insert(pmap->pm_hash, va, tte_data | TTE_MINFLAGS); } /* * Extract the physical page address associated with the given * map/virtual_address pair. */ vm_paddr_t pmap_extract(pmap_t pmap, vm_offset_t va) { vm_paddr_t pa; tte_t tte_data; tte_data = tte_hash_lookup(pmap->pm_hash, va); pa = TTE_GET_PA(tte_data) | (va & TTE_GET_PAGE_MASK(tte_data)); return (pa); } /* * Atomically extract and hold the physical page with the given * pmap and virtual address pair if that mapping permits the given * protection. */ vm_page_t pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) { tte_t tte_data; vm_page_t m; m = NULL; vm_page_lock_queues(); PMAP_LOCK(pmap); tte_data = tte_hash_lookup(pmap->pm_hash, va); if (tte_data != 0 && ((tte_data & VTD_SW_W) || (prot & VM_PROT_WRITE) == 0)) { m = PHYS_TO_VM_PAGE(TTE_GET_PA(tte_data)); vm_page_hold(m); } vm_page_unlock_queues(); PMAP_UNLOCK(pmap); return (m); } void * pmap_alloc_zeroed_contig_pages(int npages, uint64_t alignment) { vm_page_t m, tm; int i; void *ptr; m = NULL; while (m == NULL) { for (i = 0; phys_avail[i + 1] != 0; i += 2) { m = vm_phys_alloc_contig(npages, phys_avail[i], phys_avail[i + 1], alignment, (1UL<<34)); if (m) goto found; } if (m == NULL) { printf("vm_phys_alloc_contig failed - waiting to retry\n"); VM_WAIT; } } found: for (i = 0, tm = m; i < npages; i++, tm++) { tm->wire_count++; if ((tm->flags & PG_ZERO) == 0) pmap_zero_page(tm); } ptr = (void *)TLB_PHYS_TO_DIRECT(VM_PAGE_TO_PHYS(m)); return (ptr); } void pmap_free_contig_pages(void *ptr, int npages) { int i; vm_page_t m; m = PHYS_TO_VM_PAGE(TLB_DIRECT_TO_PHYS((vm_offset_t)ptr)); for (i = 0; i < npages; i++, m++) { m->wire_count--; atomic_subtract_int(&cnt.v_wire_count, 1); vm_page_free(m); } } void pmap_growkernel(vm_offset_t addr) { return; } void pmap_init(void) { /* allocate pv_entry zones */ int shpgperproc = PMAP_SHPGPERPROC; for (ctx_stack_top = 1; ctx_stack_top < PMAP_CONTEXT_MAX; ctx_stack_top++) ctx_stack[ctx_stack_top] = ctx_stack_top; mtx_init(&pmap_ctx_lock, "ctx lock", NULL, MTX_SPIN); /* * Initialize the address space (zone) for the pv entries. Set a * high water mark so that the system can recover from excessive * numbers of pv entries. */ pvzone = uma_zcreate("PV ENTRY", sizeof(struct pv_entry), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM | UMA_ZONE_NOFREE); TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc); pv_entry_max = shpgperproc * maxproc + cnt.v_page_count; TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max); pv_entry_high_water = 9 * (pv_entry_max / 10); uma_zone_set_obj(pvzone, &pvzone_obj, pv_entry_max); tte_hash_init(); } /* * Create a pv entry for page at pa for * (pmap, va). */ static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m) { pv_entry_t pv; KDPRINTF("pmap_insert_entry(va=0x%lx, pa=0x%lx)\n", va, VM_PAGE_TO_PHYS(m)); pv = get_pv_entry(pmap); pv->pv_va = va; pv->pv_pmap = pmap; PMAP_LOCK_ASSERT(pmap, MA_OWNED); mtx_assert(&vm_page_queue_mtx, MA_OWNED); TAILQ_INSERT_TAIL(&pmap->pm_pvlist, pv, pv_plist); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); m->md.pv_list_count++; } #ifdef TRAP_TRACING static int trap_trace_report_done; #endif #ifdef SMP static cpumask_t pmap_ipi(pmap_t pmap, char *func, uint64_t arg1, uint64_t arg2) { int i, cpu_count, retried; u_int cpus; cpumask_t cpumask, active, curactive; cpumask_t active_total, ackmask; uint16_t *cpulist; retried = 0; if (!smp_started) return (0); cpumask = PCPU_GET(cpumask); cpulist = PCPU_GET(cpulist); curactive = 0; if (rdpr(pil) != 14) panic("pil %ld != 14", rdpr(pil)); #ifndef CPUMASK_NOT_BEING_ERRONEOUSLY_CHANGED /* by definition cpumask should have curcpu's bit set */ if (cpumask != (1 << curcpu)) panic("cpumask(0x%x) != (1 << curcpu) (0x%x)\n", cpumask, (1 << curcpu)); #endif #ifdef notyet if ((active_total = (pmap->pm_tlbactive & ~cpumask)) == 0) goto done; if (pmap->pm_context != 0) active_total = active = (pmap->pm_tlbactive & ~cpumask); else #endif active_total = active = PCPU_GET(other_cpus); if (active == 0) goto done; retry: for (i = curactive = cpu_count = 0, cpus = active; i < mp_ncpus && cpus; i++, cpus = (cpus>>1)) { if ((cpus & 0x1) == 0) continue; curactive |= (1 << i); cpulist[cpu_count] = (uint16_t)i; cpu_count++; } ackmask = 0; cpu_ipi_selected(cpu_count, cpulist, (uint64_t)func, (uint64_t)arg1, (uint64_t)arg2, (uint64_t *)&ackmask); while (ackmask != curactive) { membar(Sync); i++; if (i > 10000000) { #ifdef TRAP_TRACING int j; #endif uint64_t cpu_state; printf("cpu with cpumask=0x%x appears to not be responding to ipis\n", curactive & ~ackmask); #ifdef TRAP_TRACING if (!trap_trace_report_done) { trap_trace_report_done = 1; for (j = 0; j < MAXCPU; j++) if (((1 << j) & curactive & ~ackmask) != 0) { struct pcpu *pc = pcpu_find(j); printf("pcpu pad 0x%jx 0x%jx 0x%jx 0x%jx 0x%jx 0x%jx 0x%jx\n", pc->pad[0], pc->pad[1], pc->pad[2], pc->pad[3], pc->pad[4], pc->pad[5], pc->pad[6]); trap_trace_report(j); } } #endif hv_cpu_state((uint64_t)ffs64(curactive & ~ackmask), &cpu_state); printf("cpu_state of %ld is %ld\n", ffs64(curactive & ~ackmask), cpu_state); if (!retried) { printf("I'm going to send off another ipi just to confirm that it isn't a memory barrier bug\n" "and then I'm going to panic\n"); retried = 1; goto retry; } panic(" ackmask=0x%x active=0x%x\n", ackmask, curactive); } } active_total |= curactive; if ((active = ((pmap->pm_tlbactive & all_cpus) & ~(active_total|cpumask))) != 0) { printf("pmap_ipi: retrying"); goto retry; } done: return (active_total); } #endif void pmap_invalidate_page(pmap_t pmap, vm_offset_t va, int cleartsb) { if (cleartsb == TRUE) tsb_clear_tte(&pmap->pm_tsb, va); DPRINTF("pmap_invalidate_page(va=0x%lx)\n", va); spinlock_enter(); invlpg(va, pmap->pm_context); #ifdef SMP pmap_ipi(pmap, (void *)tl_invlpg, (uint64_t)va, (uint64_t)pmap->pm_context); #endif spinlock_exit(); } void pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int cleartsb) { vm_offset_t tva, invlrngva; char *func; #ifdef SMP cpumask_t active; #endif if ((eva - sva) == PAGE_SIZE) { pmap_invalidate_page(pmap, sva, cleartsb); return; } KASSERT(sva < eva, ("invalidating negative or zero range sva=0x%lx eva=0x%lx", sva, eva)); if (cleartsb == TRUE) tsb_clear_range(&pmap->pm_tsb, sva, eva); spinlock_enter(); if ((sva - eva) < PAGE_SIZE*64) { for (tva = sva; tva < eva; tva += PAGE_SIZE_8K) invlpg(tva, pmap->pm_context); func = tl_invlrng; } else if (pmap->pm_context) { func = tl_invlctx; invlctx(pmap->pm_context); } else { func = tl_invltlb; invltlb(); } #ifdef SMP invlrngva = sva | ((eva - sva) >> PAGE_SHIFT); active = pmap_ipi(pmap, (void *)func, pmap->pm_context, invlrngva); active &= ~pmap->pm_active; atomic_clear_int(&pmap->pm_tlbactive, active); #endif spinlock_exit(); } void pmap_invalidate_all(pmap_t pmap) { KASSERT(pmap != kernel_pmap, ("invalidate_all called on kernel_pmap")); tsb_clear(&pmap->pm_tsb); spinlock_enter(); invlctx(pmap->pm_context); #ifdef SMP pmap_ipi(pmap, tl_invlctx, pmap->pm_context, 0); pmap->pm_tlbactive = pmap->pm_active; #endif spinlock_exit(); } boolean_t pmap_is_modified(vm_page_t m) { return (tte_get_phys_bit(m, VTD_W)); } boolean_t pmap_is_prefaultable(pmap_t pmap, vm_offset_t va) { return (tte_hash_lookup(pmap->pm_hash, va) == 0); } /* * Extract the physical page address associated with the given kernel virtual * address. */ vm_paddr_t pmap_kextract(vm_offset_t va) { tte_t tte_data; vm_paddr_t pa; pa = 0; if (va > KERNBASE && va < KERNBASE + nucleus_memory) { uint64_t offset; offset = va - KERNBASE; pa = nucleus_mappings[offset >> 22] | (va & PAGE_MASK_4M); } if ((pa == 0) && (tte_data = tsb_lookup_tte(va, 0)) != 0) pa = TTE_GET_PA(tte_data) | (va & TTE_GET_PAGE_MASK(tte_data)); if ((pa == 0) && (tte_data = tte_hash_lookup(kernel_pmap->pm_hash, va)) != 0) pa = TTE_GET_PA(tte_data) | (va & TTE_GET_PAGE_MASK(tte_data)); return pa; } /* * Map a range of physical addresses into kernel virtual address space. * * The value passed in *virt is a suggested virtual address for the mapping. * Architectures which can support a direct-mapped physical to virtual region * can return the appropriate address within that region, leaving '*virt' * unchanged. */ vm_offset_t pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) { return TLB_PHYS_TO_DIRECT(start); } int pmap_mincore(pmap_t pmap, vm_offset_t addr) { return (0); } void pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, vm_pindex_t index, vm_size_t size) { printf("pmap_object_init_pt\n"); return; } /* * Returns true if the pmap's pv is one of the first * 16 pvs linked to from this page. This count may * be changed upwards or downwards in the future; it * is only necessary that true be returned for a small * subset of pmaps for proper page aging. */ boolean_t pmap_page_exists_quick(pmap_t pmap, vm_page_t m) { pv_entry_t pv; int loops = 0; if (m->flags & PG_FICTITIOUS) return FALSE; mtx_assert(&vm_page_queue_mtx, MA_OWNED); TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { if (pv->pv_pmap == pmap) { return TRUE; } loops++; if (loops >= 16) break; } return (FALSE); } /* * Initialize a vm_page's machine-dependent fields. */ void pmap_page_init(vm_page_t m) { TAILQ_INIT(&m->md.pv_list); m->md.pv_list_count = 0; } /* * Lower the permission for all mappings to a given page. */ void pmap_remove_write(vm_page_t m) { if ((m->flags & PG_WRITEABLE) == 0) return; mtx_assert(&vm_page_queue_mtx, MA_OWNED); tte_clear_phys_bit(m, VTD_SW_W|VTD_W); vm_page_flag_clear(m, PG_WRITEABLE); } /* * Initialize the pmap associated with process 0. */ void pmap_pinit0(pmap_t pmap) { PMAP_LOCK_INIT(pmap); pmap->pm_active = pmap->pm_tlbactive = ~0; pmap->pm_context = 0; pmap->pm_tsb_ra = kernel_pmap->pm_tsb_ra; pmap->pm_hash = kernel_pmap->pm_hash; critical_enter(); PCPU_SET(curpmap, pmap); critical_exit(); TAILQ_INIT(&pmap->pm_pvlist); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); } /* * Initialize a preallocated and zeroed pmap structure, such as one in a * vmspace structure. */ -void +int pmap_pinit(pmap_t pmap) { int i; pmap->pm_context = get_context(); pmap->pm_tsb_ra = vtophys(&pmap->pm_tsb); vm_page_lock_queues(); pmap->pm_hash = tte_hash_create(pmap->pm_context, &pmap->pm_hashscratch); tsb_init(&pmap->pm_tsb, &pmap->pm_tsbscratch, TSB_INIT_SHIFT); vm_page_unlock_queues(); pmap->pm_tsb_miss_count = pmap->pm_tsb_cap_miss_count = 0; pmap->pm_active = pmap->pm_tlbactive = 0; for (i = 0; i < TSB_MAX_RESIZE; i++) pmap->pm_old_tsb_ra[i] = 0; TAILQ_INIT(&pmap->pm_pvlist); PMAP_LOCK_INIT(pmap); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); + return (1); } /* * Set the physical protection on the specified range of this map as requested. */ void pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) { int anychanged; vm_offset_t tva; uint64_t clearbits; DPRINTF("pmap_protect(0x%lx, 0x%lx, %d)\n", sva, eva, prot); if ((prot & VM_PROT_READ) == VM_PROT_NONE) { pmap_remove(pmap, sva, eva); return; } if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) == (VM_PROT_WRITE|VM_PROT_EXECUTE)) return; clearbits = anychanged = 0; if ((prot & VM_PROT_WRITE) == 0) clearbits |= (VTD_W|VTD_SW_W); if ((prot & VM_PROT_EXECUTE) == 0) clearbits |= VTD_X; vm_page_lock_queues(); PMAP_LOCK(pmap); for (tva = sva; tva < eva; tva += PAGE_SIZE) { uint64_t otte_data; vm_page_t m; if ((otte_data = tte_hash_clear_bits(pmap->pm_hash, tva, clearbits)) == 0) continue; /* * XXX technically we should do a shootdown if it * was referenced and was executable - but is not now */ if (!anychanged && (otte_data & VTD_W)) anychanged = 1; if (otte_data & VTD_MANAGED) { m = NULL; if (otte_data & VTD_REF) { m = PHYS_TO_VM_PAGE(TTE_GET_PA(otte_data)); vm_page_flag_set(m, PG_REFERENCED); } if (otte_data & VTD_W) { m = PHYS_TO_VM_PAGE(TTE_GET_PA(otte_data)); vm_page_dirty(m); } } } vm_page_unlock_queues(); if (anychanged) pmap_invalidate_range(pmap, sva, eva, TRUE); PMAP_UNLOCK(pmap); } /* * Map a list of wired pages into kernel virtual address space. This is * intended for temporary mappings which do not need page modification or * references recorded. Existing mappings in the region are overwritten. */ void pmap_qenter(vm_offset_t sva, vm_page_t *m, int count) { vm_offset_t va; tte_t otte; otte = 0; va = sva; while (count-- > 0) { otte |= tte_hash_update(kernel_pmap->pm_hash, va, VM_PAGE_TO_PHYS(*m) | TTE_KERNEL | VTD_8K); va += PAGE_SIZE; m++; } if ((otte & VTD_REF) != 0) pmap_invalidate_range(kernel_pmap, sva, va, FALSE); } /* * Remove page mappings from kernel virtual address space. Intended for * temporary mappings entered by pmap_qenter. */ void pmap_qremove(vm_offset_t sva, int count) { vm_offset_t va; tte_t otte; va = sva; otte = 0; while (count-- > 0) { otte |= tte_hash_delete(kernel_pmap->pm_hash, va); va += PAGE_SIZE; } if ((otte & VTD_REF) != 0) pmap_invalidate_range(kernel_pmap, sva, va, TRUE); } /* * Release any resources held by the given physical map. * Called when a pmap initialized by pmap_pinit is being released. * Should only be called if the map contains no valid mappings. */ void pmap_release(pmap_t pmap) { KASSERT(pmap->pm_stats.resident_count == 0, ("pmap_release: pmap resident count %ld != 0", pmap->pm_stats.resident_count)); tsb_deinit(&pmap->pm_tsb); tte_hash_destroy(pmap->pm_hash); free_context(pmap->pm_context); PMAP_LOCK_DESTROY(pmap); } /* * Remove the given range of addresses from the specified map. */ void pmap_remove(pmap_t pmap, vm_offset_t start, vm_offset_t end) { int invlva; vm_offset_t tva; uint64_t tte_data; /* * Perform an unsynchronized read. This is, however, safe. */ if (pmap->pm_stats.resident_count == 0) return; DPRINTF("pmap_remove(start=0x%lx, end=0x%lx)\n", start, end); invlva = 0; vm_page_lock_queues(); PMAP_LOCK(pmap); for (tva = start; tva < end; tva += PAGE_SIZE) { if ((tte_data = tte_hash_delete(pmap->pm_hash, tva)) == 0) continue; pmap_remove_tte(pmap, tte_data, tva); if (tte_data & (VTD_REF|VTD_W)) invlva = 1; } vm_page_unlock_queues(); if (invlva) pmap_invalidate_range(pmap, start, end, TRUE); PMAP_UNLOCK(pmap); } /* * Routine: pmap_remove_all * Function: * Removes this physical page from * all physical maps in which it resides. * Reflects back modify bits to the pager. * * Notes: * Original versions of this routine were very * inefficient because they iteratively called * pmap_remove (slow...) */ void pmap_remove_all(vm_page_t m) { pv_entry_t pv; uint64_t tte_data; DPRINTF("pmap_remove_all 0x%lx\n", VM_PAGE_TO_PHYS(m)); mtx_assert(&vm_page_queue_mtx, MA_OWNED); while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { PMAP_LOCK(pv->pv_pmap); pv->pv_pmap->pm_stats.resident_count--; tte_data = tte_hash_delete(pv->pv_pmap->pm_hash, pv->pv_va); if (tte_data & VTD_WIRED) pv->pv_pmap->pm_stats.wired_count--; if (tte_data & VTD_REF) vm_page_flag_set(m, PG_REFERENCED); /* * Update the vm_page_t clean and reference bits. */ if (tte_data & VTD_W) { KASSERT((tte_data & VTD_SW_W), ("pmap_remove_all: modified page not writable: va: %lx, tte: %lx", pv->pv_va, tte_data)); vm_page_dirty(m); } pmap_invalidate_page(pv->pv_pmap, pv->pv_va, TRUE); TAILQ_REMOVE(&pv->pv_pmap->pm_pvlist, pv, pv_plist); TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); m->md.pv_list_count--; PMAP_UNLOCK(pv->pv_pmap); free_pv_entry(pv); } vm_page_flag_clear(m, PG_WRITEABLE); } static void pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va) { pv_entry_t pv; if (pmap != kernel_pmap) DPRINTF("pmap_remove_entry(va=0x%lx, pa=0x%lx)\n", va, VM_PAGE_TO_PHYS(m)); PMAP_LOCK_ASSERT(pmap, MA_OWNED); mtx_assert(&vm_page_queue_mtx, MA_OWNED); if (m->md.pv_list_count < pmap->pm_stats.resident_count) { TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { if (pmap == pv->pv_pmap && va == pv->pv_va) break; } } else { TAILQ_FOREACH(pv, &pmap->pm_pvlist, pv_plist) { if (va == pv->pv_va) break; } } KASSERT(pv != NULL, ("pmap_remove_entry: pv not found va=0x%lx pa=0x%lx", va, VM_PAGE_TO_PHYS(m))); TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); m->md.pv_list_count--; if (TAILQ_EMPTY(&m->md.pv_list)) vm_page_flag_clear(m, PG_WRITEABLE); TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist); free_pv_entry(pv); } void pmap_remove_pages(pmap_t pmap) { vm_page_t m; pv_entry_t pv, npv; tte_t tte_data; DPRINTF("pmap_remove_pages(ctx=0x%lx)\n", pmap->pm_context); vm_page_lock_queues(); PMAP_LOCK(pmap); for (pv = TAILQ_FIRST(&pmap->pm_pvlist); pv; pv = npv) { tte_data = tte_hash_delete(pmap->pm_hash, pv->pv_va); if (tte_data == 0) { printf("TTE IS ZERO @ VA %016lx\n", pv->pv_va); panic("bad tte"); } if (tte_data & VTD_WIRED) { panic("wired page in process not handled correctly"); pmap->pm_stats.wired_count--; } m = PHYS_TO_VM_PAGE(TTE_GET_PA(tte_data)); pmap->pm_stats.resident_count--; if (tte_data & VTD_W) { vm_page_dirty(m); } npv = TAILQ_NEXT(pv, pv_plist); TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist); m->md.pv_list_count--; TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); if (TAILQ_EMPTY(&m->md.pv_list)) vm_page_flag_clear(m, PG_WRITEABLE); free_pv_entry(pv); } pmap->pm_hash = tte_hash_reset(pmap->pm_hash, &pmap->pm_hashscratch); if (0) pmap_tsb_reset(pmap); vm_page_unlock_queues(); pmap_invalidate_all(pmap); PMAP_UNLOCK(pmap); } static void pmap_tsb_reset(pmap_t pmap) { int i; for (i = 1; i < TSB_MAX_RESIZE && pmap->pm_old_tsb_ra[i]; i++) { pmap_free_contig_pages((void *)TLB_PHYS_TO_DIRECT(pmap->pm_old_tsb_ra[i]), (1 << (TSB_INIT_SHIFT + i))); pmap->pm_old_tsb_ra[i] = 0; } if (pmap->pm_old_tsb_ra[0] != 0) { vm_paddr_t tsb_pa = pmap->pm_tsb.hti_ra; int size = tsb_size(&pmap->pm_tsb); pmap->pm_tsb.hti_ntte = (1 << (TSB_INIT_SHIFT + PAGE_SHIFT - TTE_SHIFT)); pmap->pm_tsb.hti_ra = pmap->pm_old_tsb_ra[0]; pmap_free_contig_pages((void *)TLB_PHYS_TO_DIRECT(tsb_pa), size); pmap->pm_tsbscratch = pmap->pm_tsb.hti_ra | (uint64_t)TSB_INIT_SHIFT; pmap->pm_old_tsb_ra[0] = 0; } } void pmap_scrub_pages(vm_paddr_t pa, int64_t size) { uint64_t bytes_zeroed; while (size > 0) { hv_mem_scrub(pa, size, &bytes_zeroed); pa += bytes_zeroed; size -= bytes_zeroed; } } static void pmap_remove_tte(pmap_t pmap, tte_t tte_data, vm_offset_t va) { vm_page_t m; if (pmap != kernel_pmap) DPRINTF("pmap_remove_tte(va=0x%lx, pa=0x%lx)\n", va, TTE_GET_PA(tte_data)); mtx_assert(&vm_page_queue_mtx, MA_OWNED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); if (tte_data & VTD_WIRED) pmap->pm_stats.wired_count--; pmap->pm_stats.resident_count--; if (tte_data & VTD_MANAGED) { m = PHYS_TO_VM_PAGE(TTE_GET_PA(tte_data)); if (tte_data & VTD_W) { vm_page_dirty(m); } if (tte_data & VTD_REF) vm_page_flag_set(m, PG_REFERENCED); pmap_remove_entry(pmap, m, va); } } /* resize the tsb if the number of capacity misses is greater than 1/4 of * the total */ static void pmap_tsb_resize(pmap_t pmap) { uint32_t miss_count; uint32_t cap_miss_count; struct tsb_resize_info info; hv_tsb_info_t hvtsb; uint64_t tsbscratch; KASSERT(pmap == curthread_pmap, ("operating on non-current pmap")); miss_count = pmap->pm_tsb_miss_count; cap_miss_count = pmap->pm_tsb_cap_miss_count; int npages_shift = tsb_page_shift(pmap); if (npages_shift < (TSB_INIT_SHIFT + TSB_MAX_RESIZE) && cap_miss_count > (miss_count >> 1)) { DPRINTF("resizing tsb for proc=%s pid=%d\n", curthread->td_proc->p_comm, curthread->td_proc->p_pid); pmap->pm_old_tsb_ra[npages_shift - TSB_INIT_SHIFT] = pmap->pm_tsb.hti_ra; /* double TSB size */ tsb_init(&hvtsb, &tsbscratch, npages_shift + 1); #ifdef SMP spinlock_enter(); /* reset tsb */ bcopy(&hvtsb, &pmap->pm_tsb, sizeof(hv_tsb_info_t)); pmap->pm_tsbscratch = tsb_set_scratchpad_user(&pmap->pm_tsb); if (hv_mmu_tsb_ctxnon0(1, pmap->pm_tsb_ra) != H_EOK) panic("failed to set TSB 0x%lx - context == %ld\n", pmap->pm_tsb_ra, pmap->pm_context); info.tri_tsbscratch = pmap->pm_tsbscratch; info.tri_tsb_ra = pmap->pm_tsb_ra; pmap_ipi(pmap, tl_tsbupdate, pmap->pm_context, vtophys(&info)); pmap->pm_tlbactive = pmap->pm_active; spinlock_exit(); #else bcopy(&hvtsb, &pmap->pm_tsb, sizeof(hvtsb)); if (hv_mmu_tsb_ctxnon0(1, pmap->pm_tsb_ra) != H_EOK) panic("failed to set TSB 0x%lx - context == %ld\n", pmap->pm_tsb_ra, pmap->pm_context); pmap->pm_tsbscratch = tsb_set_scratchpad_user(&pmap->pm_tsb); #endif } pmap->pm_tsb_miss_count = 0; pmap->pm_tsb_cap_miss_count = 0; } static void pmap_tte_hash_resize(pmap_t pmap) { tte_hash_t old_th = pmap->pm_hash; pmap->pm_hash = tte_hash_resize(pmap->pm_hash); spinlock_enter(); if (curthread->td_proc->p_numthreads != 1) pmap_ipi(pmap, tl_ttehashupdate, pmap->pm_context, pmap->pm_hashscratch); pmap->pm_hashscratch = tte_hash_set_scratchpad_user(pmap->pm_hash, pmap->pm_context); spinlock_exit(); tte_hash_destroy(old_th); } /* * pmap_ts_referenced: * * Return a count of reference bits for a page, clearing those bits. * It is not necessary for every reference bit to be cleared, but it * is necessary that 0 only be returned when there are truly no * reference bits set. * * XXX: The exact number of bits to check and clear is a matter that * should be tested and standardized at some point in the future for * optimal aging of shared pages. */ int pmap_ts_referenced(vm_page_t m) { int rv; pv_entry_t pv, pvf, pvn; pmap_t pmap; tte_t otte_data; rv = 0; if (m->flags & PG_FICTITIOUS) return (rv); mtx_assert(&vm_page_queue_mtx, MA_OWNED); if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { pvf = pv; do { pvn = TAILQ_NEXT(pv, pv_list); TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); pmap = pv->pv_pmap; PMAP_LOCK(pmap); otte_data = tte_hash_clear_bits(pmap->pm_hash, pv->pv_va, VTD_REF); if ((otte_data & VTD_REF) != 0) { pmap_invalidate_page(pmap, pv->pv_va, TRUE); rv++; if (rv > 4) { PMAP_UNLOCK(pmap); break; } } PMAP_UNLOCK(pmap); } while ((pv = pvn) != NULL && pv != pvf); } return (rv); } void pmap_zero_page(vm_page_t m) { hwblkclr((void *)TLB_PHYS_TO_DIRECT(VM_PAGE_TO_PHYS(m)), PAGE_SIZE); } void pmap_zero_page_area(vm_page_t m, int off, int size) { vm_paddr_t pa; vm_offset_t va; pa = VM_PAGE_TO_PHYS(m); va = TLB_PHYS_TO_DIRECT(pa); if (off == 0 && size == PAGE_SIZE) hwblkclr((void *)TLB_PHYS_TO_DIRECT(VM_PAGE_TO_PHYS(m)), PAGE_SIZE); else bzero((char *)(va + off), size); } void pmap_zero_page_idle(vm_page_t m) { hwblkclr((void *)TLB_PHYS_TO_DIRECT(VM_PAGE_TO_PHYS(m)), PAGE_SIZE); } void pmap_set_ctx_panic(uint64_t error, vm_paddr_t tsb_ra, pmap_t pmap) { panic("setting ctxnon0 failed ctx=0x%lx hvtsb_ra=0x%lx tsbscratch=0x%lx error=0x%lx", pmap->pm_context, tsb_ra, pmap->pm_tsbscratch, error); } Index: head/sys/sys/proc.h =================================================================== --- head/sys/sys/proc.h (revision 173360) +++ head/sys/sys/proc.h (revision 173361) @@ -1,916 +1,917 @@ /*- * Copyright (c) 1986, 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)proc.h 8.15 (Berkeley) 5/19/95 * $FreeBSD$ */ #ifndef _SYS_PROC_H_ #define _SYS_PROC_H_ #include /* For struct callout. */ #include /* For struct klist. */ #ifndef _KERNEL #include #endif #include #include #include #include #include /* XXX. */ #include #include #include #include #include #ifndef _KERNEL #include /* For structs itimerval, timeval. */ #else #include #endif #include #include #include /* Machine-dependent proc substruct. */ /* * One structure allocated per session. * * List of locks * (m) locked by s_mtx mtx * (e) locked by proctree_lock sx * (c) const until freeing */ struct session { int s_count; /* (m) Ref cnt; pgrps in session. */ struct proc *s_leader; /* (m + e) Session leader. */ struct vnode *s_ttyvp; /* (m) Vnode of controlling tty. */ struct tty *s_ttyp; /* (m) Controlling tty. */ pid_t s_sid; /* (c) Session ID. */ /* (m) Setlogin() name: */ char s_login[roundup(MAXLOGNAME, sizeof(long))]; struct mtx s_mtx; /* Mutex to protect members. */ }; /* * One structure allocated per process group. * * List of locks * (m) locked by pg_mtx mtx * (e) locked by proctree_lock sx * (c) const until freeing */ struct pgrp { LIST_ENTRY(pgrp) pg_hash; /* (e) Hash chain. */ LIST_HEAD(, proc) pg_members; /* (m + e) Pointer to pgrp members. */ struct session *pg_session; /* (c) Pointer to session. */ struct sigiolst pg_sigiolst; /* (m) List of sigio sources. */ pid_t pg_id; /* (c) Process group id. */ int pg_jobc; /* (m) Job control process count. */ struct mtx pg_mtx; /* Mutex to protect members */ }; /* * pargs, used to hold a copy of the command line, if it had a sane length. */ struct pargs { u_int ar_ref; /* Reference count. */ u_int ar_length; /* Length. */ u_char ar_args[1]; /* Arguments. */ }; /*- * Description of a process. * * This structure contains the information needed to manage a thread of * control, known in UN*X as a process; it has references to substructures * containing descriptions of things that the process uses, but may share * with related processes. The process structure and the substructures * are always addressable except for those marked "(CPU)" below, * which might be addressable only on a processor on which the process * is running. * * Below is a key of locks used to protect each member of struct proc. The * lock is indicated by a reference to a specific character in parens in the * associated comment. * * - not yet protected * a - only touched by curproc or parent during fork/wait * b - created at fork, never changes * (exception aiods switch vmspaces, but they are also * marked 'P_SYSTEM' so hopefully it will be left alone) * c - locked by proc mtx * d - locked by allproc_lock lock * e - locked by proctree_lock lock * f - session mtx * g - process group mtx * h - callout_lock mtx * i - by curproc or the master session mtx * j - locked by proc slock * k - only accessed by curthread * k*- only accessed by curthread and from an interrupt * l - the attaching proc or attaching proc parent * m - Giant * n - not locked, lazy * o - ktrace lock * p - select lock (sellock) * q - td_contested lock * r - p_peers lock * t - thread lock * x - created at fork, only changes during single threading in exec * z - zombie threads lock * * If the locking key specifies two identifiers (for example, p_pptr) then * either lock is sufficient for read access, but both locks must be held * for write access. */ struct kaudit_record; struct td_sched; struct nlminfo; struct kaioinfo; struct p_sched; struct proc; struct sleepqueue; struct thread; struct trapframe; struct turnstile; struct mqueue_notifier; /* * Here we define the two structures used for process information. * * The first is the thread. It might be thought of as a "Kernel * Schedulable Entity Context". * This structure contains all the information as to where a thread of * execution is now, or was when it was suspended, why it was suspended, * and anything else that will be needed to restart it when it is * rescheduled. It includes a scheduler specific substructure that is different * for each scheduler. * * M:N notes. * It is important to remember that when using M:N threading, * a particular thread structure may only exist as long as * the system call or kernel entrance (e.g. by pagefault) * which it is currently executing. It should therefore NEVER be referenced * by pointers in long lived structures that live longer than a single * request. If several threads complete their work at the same time, * they will all rewind their stacks to the user boundary, report their * completion state, and all but one will be freed. That last one will * be kept to provide a kernel stack and pcb for the NEXT syscall or kernel * entrance (basically to save freeing and then re-allocating it). The existing * thread keeps a cached spare thread available to allow it to quickly * get one when it needs a new one. There is also a system * cache of free threads. Threads have priority and partake in priority * inheritance schemes. * * The second is the proc (process) which owns all the resources of a process * other than CPU cycles, which are parceled out to the threads. */ /* * Kernel runnable context (thread). * This is what is put to sleep and reactivated. * Thread context. Processes may have multiple threads. */ struct thread { struct mtx *volatile td_lock; /* replaces sched lock */ struct proc *td_proc; /* (*) Associated process. */ TAILQ_ENTRY(thread) td_plist; /* (*) All threads in this proc. */ /* The two queues below should someday be merged. */ TAILQ_ENTRY(thread) td_slpq; /* (t) Sleep queue. */ TAILQ_ENTRY(thread) td_lockq; /* (t) Lock queue. */ TAILQ_HEAD(, selinfo) td_selq; /* (p) List of selinfos. */ struct sleepqueue *td_sleepqueue; /* (k) Associated sleep queue. */ struct turnstile *td_turnstile; /* (k) Associated turnstile. */ struct umtx_q *td_umtxq; /* (c?) Link for when we're blocked. */ lwpid_t td_tid; /* (b) Thread ID. */ sigqueue_t td_sigqueue; /* (c) Sigs arrived, not delivered. */ #define td_siglist td_sigqueue.sq_signals /* Cleared during fork1() or thread_schedule_upcall(). */ #define td_startzero td_flags int td_flags; /* (t) TDF_* flags. */ int td_inhibitors; /* (t) Why can not run. */ int td_pflags; /* (k) Private thread (TDP_*) flags. */ int td_dupfd; /* (k) Ret value from fdopen. XXX */ int td_sqqueue; /* (t) Sleepqueue queue blocked on. */ void *td_wchan; /* (t) Sleep address. */ const char *td_wmesg; /* (t) Reason for sleep. */ u_char td_lastcpu; /* (t) Last cpu we were on. */ u_char td_oncpu; /* (t) Which cpu we are on. */ volatile u_char td_owepreempt; /* (k*) Preempt on last critical_exit */ short td_locks; /* (k) Count of non-spin locks. */ u_char td_tsqueue; /* (t) Turnstile queue blocked on. */ struct turnstile *td_blocked; /* (t) Lock thread is blocked on. */ const char *td_lockname; /* (t) Name of lock blocked on. */ LIST_HEAD(, turnstile) td_contested; /* (q) Contested locks. */ struct lock_list_entry *td_sleeplocks; /* (k) Held sleep locks. */ int td_intr_nesting_level; /* (k) Interrupt recursion. */ int td_pinned; /* (k) Temporary cpu pin count. */ struct kse_thr_mailbox *td_mailbox; /* (*) Userland mailbox address. */ struct ucred *td_ucred; /* (k) Reference to credentials. */ struct thread *td_standin; /* (k + a) Use this for an upcall. */ struct kse_upcall *td_upcall; /* (k + t) Upcall structure. */ u_int td_estcpu; /* (t) estimated cpu utilization */ u_int td_slptick; /* (t) Time at sleep. */ struct rusage td_ru; /* (t) rusage information */ uint64_t td_runtime; /* (t) How many cpu ticks we've run. */ u_int td_pticks; /* (t) Statclock hits for profiling */ u_int td_sticks; /* (t) Statclock hits in system mode. */ u_int td_iticks; /* (t) Statclock hits in intr mode. */ u_int td_uticks; /* (t) Statclock hits in user mode. */ u_int td_uuticks; /* (k) Statclock hits (usr), for UTS. */ u_int td_usticks; /* (k) Statclock hits (sys), for UTS. */ int td_intrval; /* (t) Return value of TDF_INTERRUPT. */ sigset_t td_oldsigmask; /* (k) Saved mask from pre sigpause. */ sigset_t td_sigmask; /* (c) Current signal mask. */ volatile u_int td_generation; /* (k) For detection of preemption */ stack_t td_sigstk; /* (k) Stack ptr and on-stack flag. */ int td_kflags; /* (c) Flags for KSE threading. */ int td_xsig; /* (c) Signal for ptrace */ u_long td_profil_addr; /* (k) Temporary addr until AST. */ u_int td_profil_ticks; /* (k) Temporary ticks until AST. */ char td_name[MAXCOMLEN + 1]; /* (*) Thread name. */ #define td_endzero td_base_pri /* Copied during fork1() or thread_sched_upcall(). */ #define td_startcopy td_endzero u_char td_base_pri; /* (t) Thread base kernel priority. */ u_char td_priority; /* (t) Thread active priority. */ u_char td_pri_class; /* (t) Scheduling class. */ u_char td_user_pri; /* (t) User pri from estcpu and nice. */ u_char td_base_user_pri; /* (t) Base user pri */ #define td_endcopy td_pcb /* * Fields that must be manually set in fork1() or thread_sched_upcall() * or already have been set in the allocator, constructor, etc. */ struct pcb *td_pcb; /* (k) Kernel VA of pcb and kstack. */ enum { TDS_INACTIVE = 0x0, TDS_INHIBITED, TDS_CAN_RUN, TDS_RUNQ, TDS_RUNNING } td_state; /* (t) thread state */ register_t td_retval[2]; /* (k) Syscall aux returns. */ struct callout td_slpcallout; /* (h) Callout for sleep. */ struct trapframe *td_frame; /* (k) */ struct vm_object *td_kstack_obj;/* (a) Kstack object. */ vm_offset_t td_kstack; /* (a) Kernel VA of kstack. */ int td_kstack_pages; /* (a) Size of the kstack. */ struct vm_object *td_altkstack_obj;/* (a) Alternate kstack object. */ vm_offset_t td_altkstack; /* (a) Kernel VA of alternate kstack. */ int td_altkstack_pages; /* (a) Size of alternate kstack. */ volatile u_int td_critnest; /* (k*) Critical section nest level. */ struct mdthread td_md; /* (k) Any machine-dependent fields. */ struct td_sched *td_sched; /* (*) Scheduler-specific data. */ struct kaudit_record *td_ar; /* (k) Active audit record, if any. */ int td_syscalls; /* per-thread syscall count (used by NFS :)) */ }; struct mtx *thread_lock_block(struct thread *); void thread_lock_unblock(struct thread *, struct mtx *); void thread_lock_set(struct thread *, struct mtx *); #define THREAD_LOCK_ASSERT(td, type) \ do { \ struct mtx *__m = (td)->td_lock; \ if (__m != &blocked_lock) \ mtx_assert(__m, (type)); \ } while (0) /* * Flags kept in td_flags: * To change these you MUST have the scheduler lock. */ #define TDF_BORROWING 0x00000001 /* Thread is borrowing pri from another. */ #define TDF_INPANIC 0x00000002 /* Caused a panic, let it drive crashdump. */ #define TDF_INMEM 0x00000004 /* Thread's stack is in memory. */ #define TDF_SINTR 0x00000008 /* Sleep is interruptible. */ #define TDF_TIMEOUT 0x00000010 /* Timing out during sleep. */ #define TDF_IDLETD 0x00000020 /* This is a per-CPU idle thread. */ #define TDF_SELECT 0x00000040 /* Selecting; wakeup/waiting danger. */ #define TDF_SLEEPABORT 0x00000080 /* sleepq_abort was called. */ #define TDF_KTH_SUSP 0x00000100 /* kthread is suspended */ #define TDF_UBORROWING 0x00000200 /* Thread is borrowing user pri. */ #define TDF_BOUNDARY 0x00000400 /* Thread suspended at user boundary */ #define TDF_ASTPENDING 0x00000800 /* Thread has some asynchronous events. */ #define TDF_TIMOFAIL 0x00001000 /* Timeout from sleep after we were awake. */ #define TDF_INTERRUPT 0x00002000 /* Thread is marked as interrupted. */ #define TDF_UPIBLOCKED 0x00004000 /* Thread blocked on user PI mutex. */ #define TDF_UNUSED15 0x00008000 /* --available-- */ #define TDF_NEEDRESCHED 0x00010000 /* Thread needs to yield. */ #define TDF_NEEDSIGCHK 0x00020000 /* Thread may need signal delivery. */ #define TDF_XSIG 0x00040000 /* Thread is exchanging signal under trace */ #define TDF_UNUSED19 0x00080000 /* Thread is sleeping on a umtx. */ #define TDF_THRWAKEUP 0x00100000 /* Libthr thread must not suspend itself. */ #define TDF_DBSUSPEND 0x00200000 /* Thread is suspended by debugger */ #define TDF_SWAPINREQ 0x00400000 /* Swapin request due to wakeup. */ #define TDF_UNUSED23 0x00800000 /* --available-- */ #define TDF_SCHED0 0x01000000 /* Reserved for scheduler private use */ #define TDF_SCHED1 0x02000000 /* Reserved for scheduler private use */ #define TDF_SCHED2 0x04000000 /* Reserved for scheduler private use */ #define TDF_SCHED3 0x08000000 /* Reserved for scheduler private use */ #define TDF_ALRMPEND 0x10000000 /* Pending SIGVTALRM needs to be posted. */ #define TDF_PROFPEND 0x20000000 /* Pending SIGPROF needs to be posted. */ #define TDF_MACPEND 0x40000000 /* AST-based MAC event pending. */ /* * "Private" flags kept in td_pflags: * These are only written by curthread and thus need no locking. */ #define TDP_OLDMASK 0x00000001 /* Need to restore mask after suspend. */ #define TDP_INKTR 0x00000002 /* Thread is currently in KTR code. */ #define TDP_INKTRACE 0x00000004 /* Thread is currently in KTRACE code. */ #define TDP_UPCALLING 0x00000008 /* This thread is doing an upcall. */ #define TDP_COWINPROGRESS 0x00000010 /* Snapshot copy-on-write in progress. */ #define TDP_ALTSTACK 0x00000020 /* Have alternate signal stack. */ #define TDP_DEADLKTREAT 0x00000040 /* Lock aquisition - deadlock treatment. */ #define TDP_SA 0x00000080 /* A scheduler activation based thread. */ #define TDP_NOSLEEPING 0x00000100 /* Thread is not allowed to sleep on a sq. */ #define TDP_OWEUPC 0x00000200 /* Call addupc() at next AST. */ #define TDP_ITHREAD 0x00000400 /* Thread is an interrupt thread. */ #define TDP_CAN_UNBIND 0x00000800 /* Only temporarily bound. */ #define TDP_SCHED1 0x00001000 /* Reserved for scheduler private use */ #define TDP_SCHED2 0x00002000 /* Reserved for scheduler private use */ #define TDP_SCHED3 0x00004000 /* Reserved for scheduler private use */ #define TDP_SCHED4 0x00008000 /* Reserved for scheduler private use */ #define TDP_GEOM 0x00010000 /* Settle GEOM before finishing syscall */ #define TDP_SOFTDEP 0x00020000 /* Stuck processing softdep worklist */ #define TDP_NORUNNINGBUF 0x00040000 /* Ignore runningbufspace check */ #define TDP_WAKEUP 0x00080000 /* Don't sleep in umtx cond_wait */ #define TDP_INBDFLUSH 0x00100000 /* Already in BO_BDFLUSH, do not recurse */ #define TDP_KTHREAD 0x00200000 /* This is an official kernel thread */ /* * Reasons that the current thread can not be run yet. * More than one may apply. */ #define TDI_SUSPENDED 0x0001 /* On suspension queue. */ #define TDI_SLEEPING 0x0002 /* Actually asleep! (tricky). */ #define TDI_SWAPPED 0x0004 /* Stack not in mem. Bad juju if run. */ #define TDI_LOCK 0x0008 /* Stopped on a lock. */ #define TDI_IWAIT 0x0010 /* Awaiting interrupt. */ /* * flags (in kflags) related to M:N threading. */ #define TDK_KSEREL 0x0001 /* Blocked in msleep on p->p_completed. */ #define TDK_KSERELSIG 0x0002 /* Blocked in msleep on p->p_siglist. */ #define TDK_WAKEUP 0x0004 /* Thread has been woken by kse_wakeup. */ #define TD_CAN_UNBIND(td) \ (((td)->td_pflags & TDP_CAN_UNBIND) && \ ((td)->td_upcall != NULL)) #define TD_IS_SLEEPING(td) ((td)->td_inhibitors & TDI_SLEEPING) #define TD_ON_SLEEPQ(td) ((td)->td_wchan != NULL) #define TD_IS_SUSPENDED(td) ((td)->td_inhibitors & TDI_SUSPENDED) #define TD_IS_SWAPPED(td) ((td)->td_inhibitors & TDI_SWAPPED) #define TD_ON_LOCK(td) ((td)->td_inhibitors & TDI_LOCK) #define TD_AWAITING_INTR(td) ((td)->td_inhibitors & TDI_IWAIT) #define TD_IS_RUNNING(td) ((td)->td_state == TDS_RUNNING) #define TD_ON_RUNQ(td) ((td)->td_state == TDS_RUNQ) #define TD_CAN_RUN(td) ((td)->td_state == TDS_CAN_RUN) #define TD_IS_INHIBITED(td) ((td)->td_state == TDS_INHIBITED) #define TD_ON_UPILOCK(td) ((td)->td_flags & TDF_UPIBLOCKED) #if 0 #define TD_IS_IDLETHREAD(td) ((td) == pcpu(idlethread)) #else #define TD_IS_IDLETHREAD(td) ((td)->td_flags & TDF_IDLETD) #endif #define TD_SET_INHIB(td, inhib) do { \ (td)->td_state = TDS_INHIBITED; \ (td)->td_inhibitors |= (inhib); \ } while (0) #define TD_CLR_INHIB(td, inhib) do { \ if (((td)->td_inhibitors & (inhib)) && \ (((td)->td_inhibitors &= ~(inhib)) == 0)) \ (td)->td_state = TDS_CAN_RUN; \ } while (0) #define TD_SET_SLEEPING(td) TD_SET_INHIB((td), TDI_SLEEPING) #define TD_SET_SWAPPED(td) TD_SET_INHIB((td), TDI_SWAPPED) #define TD_SET_LOCK(td) TD_SET_INHIB((td), TDI_LOCK) #define TD_SET_SUSPENDED(td) TD_SET_INHIB((td), TDI_SUSPENDED) #define TD_SET_IWAIT(td) TD_SET_INHIB((td), TDI_IWAIT) #define TD_SET_EXITING(td) TD_SET_INHIB((td), TDI_EXITING) #define TD_CLR_SLEEPING(td) TD_CLR_INHIB((td), TDI_SLEEPING) #define TD_CLR_SWAPPED(td) TD_CLR_INHIB((td), TDI_SWAPPED) #define TD_CLR_LOCK(td) TD_CLR_INHIB((td), TDI_LOCK) #define TD_CLR_SUSPENDED(td) TD_CLR_INHIB((td), TDI_SUSPENDED) #define TD_CLR_IWAIT(td) TD_CLR_INHIB((td), TDI_IWAIT) #define TD_SET_RUNNING(td) (td)->td_state = TDS_RUNNING #define TD_SET_RUNQ(td) (td)->td_state = TDS_RUNQ #define TD_SET_CAN_RUN(td) (td)->td_state = TDS_CAN_RUN /* * An upcall is used when returning to userland. If a thread does not have * an upcall on return to userland the thread exports its context and exits. */ struct kse_upcall { TAILQ_ENTRY(kse_upcall) ku_link; /* List of upcalls in proc. */ struct proc *ku_proc; /* Associated proc. */ struct thread *ku_owner; /* Owning thread. */ int ku_flags; /* KUF_* flags. */ struct kse_mailbox *ku_mailbox; /* Userland mailbox address. */ stack_t ku_stack; /* Userland upcall stack. */ void *ku_func; /* Userland upcall function. */ unsigned int ku_mflags; /* Cached upcall mbox flags. */ }; #define KUF_DOUPCALL 0x00001 /* Do upcall now; don't wait. */ #define KUF_EXITING 0x00002 /* Upcall structure is exiting. */ /* * XXX: Does this belong in resource.h or resourcevar.h instead? * Resource usage extension. The times in rusage structs in the kernel are * never up to date. The actual times are kept as runtimes and tick counts * (with control info in the "previous" times), and are converted when * userland asks for rusage info. Backwards compatibility prevents putting * this directly in the user-visible rusage struct. * * Locking: (cj) means (j) for p_rux and (c) for p_crux. */ struct rusage_ext { u_int64_t rux_runtime; /* (cj) Real time. */ u_int64_t rux_uticks; /* (cj) Statclock hits in user mode. */ u_int64_t rux_sticks; /* (cj) Statclock hits in sys mode. */ u_int64_t rux_iticks; /* (cj) Statclock hits in intr mode. */ u_int64_t rux_uu; /* (c) Previous user time in usec. */ u_int64_t rux_su; /* (c) Previous sys time in usec. */ u_int64_t rux_tu; /* (c) Previous total time in usec. */ }; /* * The old fashionned process. May have multiple threads. * Starts off with a single embedded THREAD. */ struct proc { LIST_ENTRY(proc) p_list; /* (d) List of all processes. */ TAILQ_HEAD(, thread) p_threads; /* (j) all threads. */ TAILQ_HEAD(, kse_upcall) p_upcalls; /* (j) All upcalls in the proc. */ struct mtx p_slock; /* process spin lock */ struct ucred *p_ucred; /* (c) Process owner's identity. */ struct filedesc *p_fd; /* (b) Open files. */ struct filedesc_to_leader *p_fdtol; /* (b) Tracking node */ /* Accumulated stats for all threads? */ struct pstats *p_stats; /* (b) Accounting/statistics (CPU). */ struct plimit *p_limit; /* (c) Process limits. */ struct callout p_limco; /* (c) Limit callout handle */ struct sigacts *p_sigacts; /* (x) Signal actions, state (CPU). */ /* * The following don't make too much sense. * See the td_ or ke_ versions of the same flags. */ int p_flag; /* (c) P_* flags. */ enum { PRS_NEW = 0, /* In creation */ PRS_NORMAL, /* threads can be run. */ PRS_ZOMBIE } p_state; /* (j/c) S* process status. */ pid_t p_pid; /* (b) Process identifier. */ LIST_ENTRY(proc) p_hash; /* (d) Hash chain. */ LIST_ENTRY(proc) p_pglist; /* (g + e) List of processes in pgrp. */ struct proc *p_pptr; /* (c + e) Pointer to parent process. */ LIST_ENTRY(proc) p_sibling; /* (e) List of sibling processes. */ LIST_HEAD(, proc) p_children; /* (e) Pointer to list of children. */ struct mtx p_mtx; /* (n) Lock for this struct. */ struct ksiginfo *p_ksi; /* Locked by parent proc lock */ sigqueue_t p_sigqueue; /* (c) Sigs not delivered to a td. */ #define p_siglist p_sigqueue.sq_signals /* The following fields are all zeroed upon creation in fork. */ #define p_startzero p_oppid pid_t p_oppid; /* (c + e) Save ppid in ptrace. XXX */ struct vmspace *p_vmspace; /* (b) Address space. */ u_int p_swtick; /* (j) Tick when swapped in or out. */ struct itimerval p_realtimer; /* (c) Alarm timer. */ struct rusage p_ru; /* (a) Exit information. */ struct rusage_ext p_rux; /* (cj) Internal resource usage. */ struct rusage_ext p_crux; /* (c) Internal child resource usage. */ int p_profthreads; /* (c) Num threads in addupc_task. */ volatile int p_exitthreads; /* (j) Number of threads exiting */ int p_traceflag; /* (o) Kernel trace points. */ struct vnode *p_tracevp; /* (c + o) Trace to vnode. */ struct ucred *p_tracecred; /* (o) Credentials to trace with. */ struct vnode *p_textvp; /* (b) Vnode of executable. */ char p_lock; /* (c) Proclock (prevent swap) count. */ struct sigiolst p_sigiolst; /* (c) List of sigio sources. */ int p_sigparent; /* (c) Signal to parent on exit. */ int p_sig; /* (n) For core dump/debugger XXX. */ u_long p_code; /* (n) For core dump/debugger XXX. */ u_int p_stops; /* (c) Stop event bitmask. */ u_int p_stype; /* (c) Stop event type. */ char p_step; /* (c) Process is stopped. */ u_char p_pfsflags; /* (c) Procfs flags. */ struct nlminfo *p_nlminfo; /* (?) Only used by/for lockd. */ struct kaioinfo *p_aioinfo; /* (c) ASYNC I/O info. */ struct thread *p_singlethread;/* (c + j) If single threading this is it */ int p_suspcount; /* (j) Num threads in suspended mode. */ struct thread *p_xthread; /* (c) Trap thread */ int p_boundary_count;/* (c) Num threads at user boundary */ int p_pendingcnt; /* how many signals are pending */ struct itimers *p_itimers; /* (c) POSIX interval timers. */ int p_numupcalls; /* (j) Num upcalls. */ int p_upsleeps; /* (c) Num threads in kse_release(). */ struct kse_thr_mailbox *p_completed; /* (c) Completed thread mboxes. */ int p_nextupcall; /* (n) Next upcall time. */ int p_upquantum; /* (n) Quantum to schedule an upcall. */ /* End area that is zeroed on creation. */ #define p_endzero p_magic /* The following fields are all copied upon creation in fork. */ #define p_startcopy p_endzero u_int p_magic; /* (b) Magic number. */ char p_comm[MAXCOMLEN + 1]; /* (b) Process name. */ struct pgrp *p_pgrp; /* (c + e) Pointer to process group. */ struct sysentvec *p_sysent; /* (b) Syscall dispatch info. */ struct pargs *p_args; /* (c) Process arguments. */ rlim_t p_cpulimit; /* (c) Current CPU limit in seconds. */ signed char p_nice; /* (c + j) Process "nice" value. */ /* End area that is copied on creation. */ #define p_endcopy p_xstat u_short p_xstat; /* (c) Exit status; also stop sig. */ struct knlist p_klist; /* (c) Knotes attached to this proc. */ int p_numthreads; /* (j) Number of threads. */ struct mdproc p_md; /* Any machine-dependent fields. */ struct callout p_itcallout; /* (h + c) Interval timer callout. */ u_short p_acflag; /* (c) Accounting flags. */ struct proc *p_peers; /* (r) */ struct proc *p_leader; /* (b) */ void *p_emuldata; /* (c) Emulator state data. */ struct label *p_label; /* (*) Proc (not subject) MAC label. */ struct p_sched *p_sched; /* (*) Scheduler-specific data. */ STAILQ_HEAD(, ktr_request) p_ktr; /* (o) KTR event queue. */ LIST_HEAD(, mqueue_notifier) p_mqnotifier; /* (c) mqueue notifiers.*/ }; #define p_session p_pgrp->pg_session #define p_pgid p_pgrp->pg_id #define NOCPU 0xff /* For when we aren't on a CPU. */ #define PROC_SLOCK(p) mtx_lock_spin(&(p)->p_slock) #define PROC_SUNLOCK(p) mtx_unlock_spin(&(p)->p_slock) #define PROC_SLOCK_ASSERT(p, type) mtx_assert(&(p)->p_slock, (type)) /* These flags are kept in p_flag. */ #define P_ADVLOCK 0x00001 /* Process may hold a POSIX advisory lock. */ #define P_CONTROLT 0x00002 /* Has a controlling terminal. */ #define P_KTHREAD 0x00004 /* Kernel thread (*). */ #define P_NOLOAD 0x00008 /* Ignore during load avg calculations. */ #define P_PPWAIT 0x00010 /* Parent is waiting for child to exec/exit. */ #define P_PROFIL 0x00020 /* Has started profiling. */ #define P_STOPPROF 0x00040 /* Has thread requesting to stop profiling. */ #define P_HADTHREADS 0x00080 /* Has had threads (no cleanup shortcuts) */ #define P_SUGID 0x00100 /* Had set id privileges since last exec. */ #define P_SYSTEM 0x00200 /* System proc: no sigs, stats or swapping. */ #define P_SINGLE_EXIT 0x00400 /* Threads suspending should exit, not wait. */ #define P_TRACED 0x00800 /* Debugged process being traced. */ #define P_WAITED 0x01000 /* Someone is waiting for us. */ #define P_WEXIT 0x02000 /* Working on exiting. */ #define P_EXEC 0x04000 /* Process called exec. */ #define P_SA 0x08000 /* Using scheduler activations. */ #define P_CONTINUED 0x10000 /* Proc has continued from a stopped state. */ #define P_STOPPED_SIG 0x20000 /* Stopped due to SIGSTOP/SIGTSTP. */ #define P_STOPPED_TRACE 0x40000 /* Stopped because of tracing. */ #define P_STOPPED_SINGLE 0x80000 /* Only 1 thread can continue (not to user). */ #define P_PROTECTED 0x100000 /* Do not kill on memory overcommit. */ #define P_SIGEVENT 0x200000 /* Process pending signals changed. */ #define P_SINGLE_BOUNDARY 0x400000 /* Threads should suspend at user boundary. */ #define P_HWPMC 0x800000 /* Process is using HWPMCs */ #define P_JAILED 0x1000000 /* Process is in jail. */ #define P_INEXEC 0x4000000 /* Process is in execve(). */ #define P_STATCHILD 0x8000000 /* Child process stopped or exited. */ #define P_INMEM 0x10000000 /* Loaded into memory. */ #define P_SWAPPINGOUT 0x20000000 /* Process is being swapped out. */ #define P_SWAPPINGIN 0x40000000 /* Process is being swapped in. */ #define P_STOPPED (P_STOPPED_SIG|P_STOPPED_SINGLE|P_STOPPED_TRACE) #define P_SHOULDSTOP(p) ((p)->p_flag & P_STOPPED) /* * These were process status values (p_stat), now they are only used in * legacy conversion code. */ #define SIDL 1 /* Process being created by fork. */ #define SRUN 2 /* Currently runnable. */ #define SSLEEP 3 /* Sleeping on an address. */ #define SSTOP 4 /* Process debugging or suspension. */ #define SZOMB 5 /* Awaiting collection by parent. */ #define SWAIT 6 /* Waiting for interrupt. */ #define SLOCK 7 /* Blocked on a lock. */ #define P_MAGIC 0xbeefface #ifdef _KERNEL /* Flags for mi_switch(). */ #define SW_VOL 0x0001 /* Voluntary switch. */ #define SW_INVOL 0x0002 /* Involuntary switch. */ #define SW_PREEMPT 0x0004 /* The invol switch is a preemption */ /* How values for thread_single(). */ #define SINGLE_NO_EXIT 0 #define SINGLE_EXIT 1 #define SINGLE_BOUNDARY 2 /* XXXKSE: Missing values for thread_suspend_check(). */ #ifdef MALLOC_DECLARE MALLOC_DECLARE(M_PARGS); MALLOC_DECLARE(M_PGRP); MALLOC_DECLARE(M_SESSION); MALLOC_DECLARE(M_SUBPROC); MALLOC_DECLARE(M_ZOMBIE); #endif #define FOREACH_PROC_IN_SYSTEM(p) \ LIST_FOREACH((p), &allproc, p_list) #define FOREACH_THREAD_IN_PROC(p, td) \ TAILQ_FOREACH((td), &(p)->p_threads, td_plist) #define FOREACH_UPCALL_IN_PROC(p, ku) \ TAILQ_FOREACH((ku), &(p)->p_upcalls, ku_link) /* XXXKSE the following lines should probably only be used in 1:1 code: */ #define FIRST_THREAD_IN_PROC(p) TAILQ_FIRST(&(p)->p_threads) /* * We use process IDs <= PID_MAX; PID_MAX + 1 must also fit in a pid_t, * as it is used to represent "no process group". */ #define PID_MAX 99999 #define NO_PID 100000 #define SESS_LEADER(p) ((p)->p_session->s_leader == (p)) #define SESSHOLD(s) ((s)->s_count++) #define SESSRELE(s) sessrele(s) #define STOPEVENT(p, e, v) do { \ if ((p)->p_stops & (e)) { \ PROC_LOCK(p); \ stopevent((p), (e), (v)); \ PROC_UNLOCK(p); \ } \ } while (0) #define _STOPEVENT(p, e, v) do { \ PROC_LOCK_ASSERT(p, MA_OWNED); \ WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, &p->p_mtx.lock_object, \ "checking stopevent %d", (e)); \ if ((p)->p_stops & (e)) \ stopevent((p), (e), (v)); \ } while (0) /* Lock and unlock a process. */ #define PROC_LOCK(p) mtx_lock(&(p)->p_mtx) #define PROC_TRYLOCK(p) mtx_trylock(&(p)->p_mtx) #define PROC_UNLOCK(p) mtx_unlock(&(p)->p_mtx) #define PROC_LOCKED(p) mtx_owned(&(p)->p_mtx) #define PROC_LOCK_ASSERT(p, type) mtx_assert(&(p)->p_mtx, (type)) /* Lock and unlock a process group. */ #define PGRP_LOCK(pg) mtx_lock(&(pg)->pg_mtx) #define PGRP_UNLOCK(pg) mtx_unlock(&(pg)->pg_mtx) #define PGRP_LOCKED(pg) mtx_owned(&(pg)->pg_mtx) #define PGRP_LOCK_ASSERT(pg, type) mtx_assert(&(pg)->pg_mtx, (type)) #define PGRP_LOCK_PGSIGNAL(pg) do { \ if ((pg) != NULL) \ PGRP_LOCK(pg); \ } while (0) #define PGRP_UNLOCK_PGSIGNAL(pg) do { \ if ((pg) != NULL) \ PGRP_UNLOCK(pg); \ } while (0) /* Lock and unlock a session. */ #define SESS_LOCK(s) mtx_lock(&(s)->s_mtx) #define SESS_UNLOCK(s) mtx_unlock(&(s)->s_mtx) #define SESS_LOCKED(s) mtx_owned(&(s)->s_mtx) #define SESS_LOCK_ASSERT(s, type) mtx_assert(&(s)->s_mtx, (type)) /* Hold process U-area in memory, normally for ptrace/procfs work. */ #define PHOLD(p) do { \ PROC_LOCK(p); \ _PHOLD(p); \ PROC_UNLOCK(p); \ } while (0) #define _PHOLD(p) do { \ PROC_LOCK_ASSERT((p), MA_OWNED); \ KASSERT(!((p)->p_flag & P_WEXIT) || (p) == curproc, \ ("PHOLD of exiting process")); \ (p)->p_lock++; \ if (((p)->p_flag & P_INMEM) == 0) \ faultin((p)); \ } while (0) #define PROC_ASSERT_HELD(p) do { \ KASSERT((p)->p_lock > 0, ("process not held")); \ } while (0) #define PRELE(p) do { \ PROC_LOCK((p)); \ _PRELE((p)); \ PROC_UNLOCK((p)); \ } while (0) #define _PRELE(p) do { \ PROC_LOCK_ASSERT((p), MA_OWNED); \ (--(p)->p_lock); \ if (((p)->p_flag & P_WEXIT) && (p)->p_lock == 0) \ wakeup(&(p)->p_lock); \ } while (0) #define PROC_ASSERT_NOT_HELD(p) do { \ KASSERT((p)->p_lock == 0, ("process held")); \ } while (0) /* Check whether a thread is safe to be swapped out. */ #define thread_safetoswapout(td) (TD_IS_SLEEPING(td) || TD_IS_SUSPENDED(td)) /* Control whether or not it is safe for curthread to sleep. */ #define THREAD_NO_SLEEPING() do { \ KASSERT(!(curthread->td_pflags & TDP_NOSLEEPING), \ ("nested no sleeping")); \ curthread->td_pflags |= TDP_NOSLEEPING; \ } while (0) #define THREAD_SLEEPING_OK() do { \ KASSERT((curthread->td_pflags & TDP_NOSLEEPING), \ ("nested sleeping ok")); \ curthread->td_pflags &= ~TDP_NOSLEEPING; \ } while (0) #define PIDHASH(pid) (&pidhashtbl[(pid) & pidhash]) extern LIST_HEAD(pidhashhead, proc) *pidhashtbl; extern u_long pidhash; #define PGRPHASH(pgid) (&pgrphashtbl[(pgid) & pgrphash]) extern LIST_HEAD(pgrphashhead, pgrp) *pgrphashtbl; extern u_long pgrphash; extern struct sx allproc_lock; extern struct sx proctree_lock; extern struct mtx ppeers_lock; extern struct proc proc0; /* Process slot for swapper. */ extern struct thread thread0; /* Primary thread in proc0. */ extern struct vmspace vmspace0; /* VM space for proc0. */ extern int hogticks; /* Limit on kernel cpu hogs. */ extern int lastpid; extern int nprocs, maxproc; /* Current and max number of procs. */ extern int maxprocperuid; /* Max procs per uid. */ extern u_long ps_arg_cache_limit; LIST_HEAD(proclist, proc); TAILQ_HEAD(procqueue, proc); TAILQ_HEAD(threadqueue, thread); extern struct proclist allproc; /* List of all processes. */ extern struct proclist zombproc; /* List of zombie processes. */ extern struct proc *initproc, *pageproc; /* Process slots for init, pager. */ extern struct uma_zone *proc_zone; struct proc *pfind(pid_t); /* Find process by id. */ struct pgrp *pgfind(pid_t); /* Find process group by id. */ struct proc *zpfind(pid_t); /* Find zombie process by id. */ void ast(struct trapframe *framep); struct thread *choosethread(void); int cr_cansignal(struct ucred *cred, struct proc *proc, int signum); int enterpgrp(struct proc *p, pid_t pgid, struct pgrp *pgrp, struct session *sess); int enterthispgrp(struct proc *p, struct pgrp *pgrp); void faultin(struct proc *p); void fixjobc(struct proc *p, struct pgrp *pgrp, int entering); int fork1(struct thread *, int, int, struct proc **); void fork_exit(void (*)(void *, struct trapframe *), void *, struct trapframe *); void fork_return(struct thread *, struct trapframe *); int inferior(struct proc *p); void kick_proc0(void); int leavepgrp(struct proc *p); int maybe_preempt(struct thread *td); void mi_switch(int flags, struct thread *newtd); int p_candebug(struct thread *td, struct proc *p); int p_cansee(struct thread *td, struct proc *p); int p_cansched(struct thread *td, struct proc *p); int p_cansignal(struct thread *td, struct proc *p, int signum); int p_canwait(struct thread *td, struct proc *p); struct pargs *pargs_alloc(int len); void pargs_drop(struct pargs *pa); void pargs_free(struct pargs *pa); void pargs_hold(struct pargs *pa); void procinit(void); +void proc_linkup0(struct proc *p, struct thread *td); void proc_linkup(struct proc *p, struct thread *td); void proc_reparent(struct proc *child, struct proc *newparent); struct pstats *pstats_alloc(void); void pstats_fork(struct pstats *src, struct pstats *dst); void pstats_free(struct pstats *ps); int securelevel_ge(struct ucred *cr, int level); int securelevel_gt(struct ucred *cr, int level); void sessrele(struct session *); void setrunnable(struct thread *); void setsugid(struct proc *p); int sigonstack(size_t sp); void sleepinit(void); void stopevent(struct proc *, u_int, u_int); void threadinit(void); void cpu_idle(void); extern void (*cpu_idle_hook)(void); /* Hook to machdep CPU idler. */ void cpu_switch(struct thread *, struct thread *, struct mtx *); void cpu_throw(struct thread *, struct thread *) __dead2; void unsleep(struct thread *); void userret(struct thread *, struct trapframe *); void cpu_exit(struct thread *); void exit1(struct thread *, int) __dead2; void cpu_fork(struct thread *, struct proc *, struct thread *, int); void cpu_set_fork_handler(struct thread *, void (*)(void *), void *); /* New in KSE. */ #ifdef KSE void kse_unlink(struct thread *); void kseinit(void); void upcall_reap(void); void upcall_remove(struct thread *td); #endif void cpu_set_upcall(struct thread *td, struct thread *td0); void cpu_set_upcall_kse(struct thread *, void (*)(void *), void *, stack_t *); int cpu_set_user_tls(struct thread *, void *tls_base); void cpu_thread_clean(struct thread *); void cpu_thread_exit(struct thread *); void cpu_thread_setup(struct thread *td); void cpu_thread_swapin(struct thread *); void cpu_thread_swapout(struct thread *); struct thread *thread_alloc(void); void thread_continued(struct proc *p); void thread_exit(void) __dead2; int thread_export_context(struct thread *td, int willexit); void thread_free(struct thread *td); void thread_link(struct thread *td, struct proc *p); void thread_reap(void); void thread_signal_add(struct thread *td, ksiginfo_t *); int thread_single(int how); void thread_single_end(void); void thread_stash(struct thread *td); int thread_statclock(int user); void thread_stopped(struct proc *p); void childproc_stopped(struct proc *child, int reason); void childproc_continued(struct proc *child); void childproc_exited(struct proc *child); int thread_suspend_check(int how); void thread_suspend_switch(struct thread *); void thread_suspend_one(struct thread *td); struct thread *thread_switchout(struct thread *td, int flags, struct thread *newtd); void thread_unlink(struct thread *td); void thread_unsuspend(struct proc *p); void thread_unsuspend_one(struct thread *td); void thread_unthread(struct thread *td); int thread_userret(struct thread *td, struct trapframe *frame); void thread_user_enter(struct thread *td); void thread_wait(struct proc *p); struct thread *thread_find(struct proc *p, lwpid_t tid); void thr_exit1(void); #endif /* _KERNEL */ #endif /* !_SYS_PROC_H_ */ Index: head/sys/sys/signalvar.h =================================================================== --- head/sys/sys/signalvar.h (revision 173360) +++ head/sys/sys/signalvar.h (revision 173361) @@ -1,363 +1,364 @@ /*- * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)signalvar.h 8.6 (Berkeley) 2/19/95 * $FreeBSD$ */ #ifndef _SYS_SIGNALVAR_H_ #define _SYS_SIGNALVAR_H_ #include #include #include #include /* * Kernel signal definitions and data structures, * not exported to user programs. */ /* * Logical process signal actions and state, needed only within the process * The mapping between sigacts and proc structures is 1:1 except for rfork() * processes masquerading as threads which use one structure for the whole * group. All members are locked by the included mutex. The reference count * and mutex must be last for the bcopy in sigacts_copy() to work. */ struct sigacts { sig_t ps_sigact[_SIG_MAXSIG]; /* Disposition of signals. */ sigset_t ps_catchmask[_SIG_MAXSIG]; /* Signals to be blocked. */ sigset_t ps_sigonstack; /* Signals to take on sigstack. */ sigset_t ps_sigintr; /* Signals that interrupt syscalls. */ sigset_t ps_sigreset; /* Signals that reset when caught. */ sigset_t ps_signodefer; /* Signals not masked while handled. */ sigset_t ps_siginfo; /* Signals that want SA_SIGINFO args. */ sigset_t ps_sigignore; /* Signals being ignored. */ sigset_t ps_sigcatch; /* Signals being caught by user. */ sigset_t ps_freebsd4; /* signals using freebsd4 ucontext. */ sigset_t ps_osigset; /* Signals using <= 3.x osigset_t. */ sigset_t ps_usertramp; /* SunOS compat; libc sigtramp. XXX */ int ps_flag; int ps_refcnt; struct mtx ps_mtx; }; #define PS_NOCLDWAIT 0x0001 /* No zombies if child dies */ #define PS_NOCLDSTOP 0x0002 /* No SIGCHLD when children stop. */ #define PS_CLDSIGIGN 0x0004 /* The SIGCHLD handler is SIG_IGN. */ #if defined(_KERNEL) && defined(COMPAT_43) /* * Compatibility. */ typedef struct { struct osigcontext si_sc; int si_signo; int si_code; union sigval si_value; } osiginfo_t; struct osigaction { union { void (*__sa_handler)(int); void (*__sa_sigaction)(int, osiginfo_t *, void *); } __sigaction_u; /* signal handler */ osigset_t sa_mask; /* signal mask to apply */ int sa_flags; /* see signal options below */ }; typedef void __osiginfohandler_t(int, osiginfo_t *, void *); #endif /* _KERNEL && COMPAT_43 */ /* additional signal action values, used only temporarily/internally */ #define SIG_CATCH ((__sighandler_t *)2) #define SIG_HOLD ((__sighandler_t *)3) /* * get signal action for process and signal; currently only for current process */ #define SIGACTION(p, sig) (p->p_sigacts->ps_sigact[_SIG_IDX(sig)]) /* * sigset_t manipulation macros */ #define SIGADDSET(set, signo) \ ((set).__bits[_SIG_WORD(signo)] |= _SIG_BIT(signo)) #define SIGDELSET(set, signo) \ ((set).__bits[_SIG_WORD(signo)] &= ~_SIG_BIT(signo)) #define SIGEMPTYSET(set) \ do { \ int __i; \ for (__i = 0; __i < _SIG_WORDS; __i++) \ (set).__bits[__i] = 0; \ } while (0) #define SIGFILLSET(set) \ do { \ int __i; \ for (__i = 0; __i < _SIG_WORDS; __i++) \ (set).__bits[__i] = ~0U; \ } while (0) #define SIGISMEMBER(set, signo) \ ((set).__bits[_SIG_WORD(signo)] & _SIG_BIT(signo)) #define SIGISEMPTY(set) (__sigisempty(&(set))) #define SIGNOTEMPTY(set) (!__sigisempty(&(set))) #define SIGSETEQ(set1, set2) (__sigseteq(&(set1), &(set2))) #define SIGSETNEQ(set1, set2) (!__sigseteq(&(set1), &(set2))) #define SIGSETOR(set1, set2) \ do { \ int __i; \ for (__i = 0; __i < _SIG_WORDS; __i++) \ (set1).__bits[__i] |= (set2).__bits[__i]; \ } while (0) #define SIGSETAND(set1, set2) \ do { \ int __i; \ for (__i = 0; __i < _SIG_WORDS; __i++) \ (set1).__bits[__i] &= (set2).__bits[__i]; \ } while (0) #define SIGSETNAND(set1, set2) \ do { \ int __i; \ for (__i = 0; __i < _SIG_WORDS; __i++) \ (set1).__bits[__i] &= ~(set2).__bits[__i]; \ } while (0) #define SIGSETLO(set1, set2) ((set1).__bits[0] = (set2).__bits[0]) #define SIGSETOLD(set, oset) ((set).__bits[0] = (oset)) #define SIG_CANTMASK(set) \ SIGDELSET(set, SIGKILL), SIGDELSET(set, SIGSTOP) #define SIG_STOPSIGMASK(set) \ SIGDELSET(set, SIGSTOP), SIGDELSET(set, SIGTSTP), \ SIGDELSET(set, SIGTTIN), SIGDELSET(set, SIGTTOU) #define SIG_CONTSIGMASK(set) \ SIGDELSET(set, SIGCONT) #define sigcantmask (sigmask(SIGKILL) | sigmask(SIGSTOP)) #define SIG2OSIG(sig, osig) (osig = (sig).__bits[0]) #define OSIG2SIG(osig, sig) SIGEMPTYSET(sig); (sig).__bits[0] = osig static __inline int __sigisempty(sigset_t *set) { int i; for (i = 0; i < _SIG_WORDS; i++) { if (set->__bits[i]) return (0); } return (1); } static __inline int __sigseteq(sigset_t *set1, sigset_t *set2) { int i; for (i = 0; i < _SIG_WORDS; i++) { if (set1->__bits[i] != set2->__bits[i]) return (0); } return (1); } struct osigevent { int sigev_notify; /* Notification type */ union { int __sigev_signo; /* Signal number */ int __sigev_notify_kqueue; } __sigev_u; union sigval sigev_value; /* Signal value */ }; typedef struct ksiginfo { TAILQ_ENTRY(ksiginfo) ksi_link; siginfo_t ksi_info; int ksi_flags; struct sigqueue *ksi_sigq; } ksiginfo_t; #define ksi_signo ksi_info.si_signo #define ksi_errno ksi_info.si_errno #define ksi_code ksi_info.si_code #define ksi_pid ksi_info.si_pid #define ksi_uid ksi_info.si_uid #define ksi_status ksi_info.si_status #define ksi_addr ksi_info.si_addr #define ksi_value ksi_info.si_value #define ksi_band ksi_info.si_band #define ksi_trapno ksi_info.si_trapno #define ksi_overrun ksi_info.si_overrun #define ksi_timerid ksi_info.si_timerid #define ksi_mqd ksi_info.si_mqd /* bits for ksi_flags */ #define KSI_TRAP 0x01 /* Generated by trap. */ #define KSI_EXT 0x02 /* Externally managed ksi. */ #define KSI_INS 0x04 /* Directly insert ksi, not the copy */ #define KSI_COPYMASK KSI_TRAP #define KSI_ONQ(ksi) ((ksi)->ksi_sigq != NULL) typedef struct sigqueue { sigset_t sq_signals; /* All pending signals. */ sigset_t sq_kill; /* Legacy depth 1 queue. */ TAILQ_HEAD(, ksiginfo) sq_list;/* Queued signal info. */ struct proc *sq_proc; int sq_flags; } sigqueue_t; /* Flags for ksi_flags */ #define SQ_INIT 0x01 #ifdef _KERNEL /* Return nonzero if process p has an unmasked pending signal. */ #define SIGPENDING(td) \ (!SIGISEMPTY((td)->td_siglist) && \ !sigsetmasked(&(td)->td_siglist, &(td)->td_sigmask)) /* * Return the value of the pseudo-expression ((*set & ~*mask) != 0). This * is an optimized version of SIGISEMPTY() on a temporary variable * containing SIGSETNAND(*set, *mask). */ static __inline int sigsetmasked(sigset_t *set, sigset_t *mask) { int i; for (i = 0; i < _SIG_WORDS; i++) { if (set->__bits[i] & ~mask->__bits[i]) return (0); } return (1); } #define ksiginfo_init(ksi) \ do { \ bzero(ksi, sizeof(ksiginfo_t)); \ } while(0) #define ksiginfo_init_trap(ksi) \ do { \ ksiginfo_t *kp = ksi; \ bzero(kp, sizeof(ksiginfo_t)); \ kp->ksi_flags |= KSI_TRAP; \ } while(0) static __inline void ksiginfo_copy(ksiginfo_t *src, ksiginfo_t *dst) { (dst)->ksi_info = src->ksi_info; (dst)->ksi_flags = (src->ksi_flags & KSI_COPYMASK); } struct pgrp; struct thread; struct proc; struct sigio; struct mtx; extern int sugid_coredump; /* Sysctl variable kern.sugid_coredump */ extern struct mtx sigio_lock; +extern int kern_logsigexit; /* Sysctl variable kern.logsigexit */ /* * Lock the pointers for a sigio object in the underlying objects of * a file descriptor. */ #define SIGIO_LOCK() mtx_lock(&sigio_lock) #define SIGIO_TRYLOCK() mtx_trylock(&sigio_lock) #define SIGIO_UNLOCK() mtx_unlock(&sigio_lock) #define SIGIO_LOCKED() mtx_owned(&sigio_lock) #define SIGIO_ASSERT(type) mtx_assert(&sigio_lock, type) /* * Machine-independent functions: */ int cursig(struct thread *td); void execsigs(struct proc *p); void gsignal(int pgid, int sig); void killproc(struct proc *p, char *why); void pgsigio(struct sigio **, int signum, int checkctty); void pgsignal(struct pgrp *pgrp, int sig, int checkctty); void postsig(int sig); void psignal(struct proc *p, int sig); int psignal_event(struct proc *p, struct sigevent *, ksiginfo_t *); struct sigacts *sigacts_alloc(void); void sigacts_copy(struct sigacts *dest, struct sigacts *src); void sigacts_free(struct sigacts *ps); struct sigacts *sigacts_hold(struct sigacts *ps); int sigacts_shared(struct sigacts *ps); void sigexit(struct thread *td, int signum) __dead2; int sig_ffs(sigset_t *set); void siginit(struct proc *p); void signotify(struct thread *td); int tdsignal(struct proc *p, struct thread *td, int sig, ksiginfo_t *ksi); void trapsignal(struct thread *td, ksiginfo_t *); int ptracestop(struct thread *td, int sig); ksiginfo_t * ksiginfo_alloc(int); void ksiginfo_free(ksiginfo_t *); void sigqueue_init(struct sigqueue *queue, struct proc *p); void sigqueue_flush(struct sigqueue *queue); void sigqueue_delete_proc(struct proc *p, int sig); void sigqueue_delete_set(struct sigqueue *queue, sigset_t *set); void sigqueue_delete(struct sigqueue *queue, int sig); void sigqueue_move_set(struct sigqueue *src, sigqueue_t *dst, sigset_t *); int sigqueue_get(struct sigqueue *queue, int sig, ksiginfo_t *info); int sigqueue_add(struct sigqueue *queue, int sig, ksiginfo_t *info); void sigqueue_collect_set(struct sigqueue *queue, sigset_t *set); void sigqueue_move(struct sigqueue *, struct sigqueue *, int sig); void sigqueue_delete_set_proc(struct proc *, sigset_t *); void sigqueue_delete_stopmask_proc(struct proc *); void sigqueue_take(ksiginfo_t *ksi); int kern_sigtimedwait(struct thread *, sigset_t, ksiginfo_t *, struct timespec *); /* * Machine-dependent functions: */ void sendsig(sig_t, ksiginfo_t *, sigset_t *retmask); #endif /* _KERNEL */ #endif /* !_SYS_SIGNALVAR_H_ */ Index: head/sys/vm/pmap.h =================================================================== --- head/sys/vm/pmap.h (revision 173360) +++ head/sys/vm/pmap.h (revision 173361) @@ -1,138 +1,138 @@ /*- * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)pmap.h 8.1 (Berkeley) 6/11/93 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Author: Avadis Tevanian, Jr. * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. * * $FreeBSD$ */ /* * Machine address mapping definitions -- machine-independent * section. [For machine-dependent section, see "machine/pmap.h".] */ #ifndef _PMAP_VM_ #define _PMAP_VM_ /* * Each machine dependent implementation is expected to * keep certain statistics. They may do this anyway they * so choose, but are expected to return the statistics * in the following structure. */ struct pmap_statistics { long resident_count; /* # of pages mapped (total) */ long wired_count; /* # of pages wired */ }; typedef struct pmap_statistics *pmap_statistics_t; #include #ifdef _KERNEL struct proc; struct thread; /* * Updates to kernel_vm_end are synchronized by the kernel_map's system mutex. */ extern vm_offset_t kernel_vm_end; void pmap_change_wiring(pmap_t, vm_offset_t, boolean_t); void pmap_clear_modify(vm_page_t m); void pmap_clear_reference(vm_page_t m); void pmap_copy(pmap_t, pmap_t, vm_offset_t, vm_size_t, vm_offset_t); void pmap_copy_page(vm_page_t, vm_page_t); void pmap_enter(pmap_t, vm_offset_t, vm_page_t, vm_prot_t, boolean_t); void pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot); void pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, vm_page_t m_start, vm_prot_t prot); vm_paddr_t pmap_extract(pmap_t pmap, vm_offset_t va); vm_page_t pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot); void pmap_growkernel(vm_offset_t); void pmap_init(void); boolean_t pmap_is_modified(vm_page_t m); boolean_t pmap_is_prefaultable(pmap_t pmap, vm_offset_t va); boolean_t pmap_ts_referenced(vm_page_t m); vm_offset_t pmap_map(vm_offset_t *, vm_paddr_t, vm_paddr_t, int); void pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, vm_pindex_t pindex, vm_size_t size); boolean_t pmap_page_exists_quick(pmap_t pmap, vm_page_t m); void pmap_page_init(vm_page_t m); -void pmap_pinit(pmap_t); +int pmap_pinit(pmap_t); void pmap_pinit0(pmap_t); void pmap_protect(pmap_t, vm_offset_t, vm_offset_t, vm_prot_t); void pmap_qenter(vm_offset_t, vm_page_t *, int); void pmap_qremove(vm_offset_t, int); void pmap_release(pmap_t); void pmap_remove(pmap_t, vm_offset_t, vm_offset_t); void pmap_remove_all(vm_page_t m); void pmap_remove_pages(pmap_t); void pmap_remove_write(vm_page_t m); void pmap_zero_page(vm_page_t); void pmap_zero_page_area(vm_page_t, int off, int size); void pmap_zero_page_idle(vm_page_t); int pmap_mincore(pmap_t pmap, vm_offset_t addr); void pmap_activate(struct thread *td); vm_offset_t pmap_addr_hint(vm_object_t obj, vm_offset_t addr, vm_size_t size); #define pmap_resident_count(pm) ((pm)->pm_stats.resident_count) #define pmap_wired_count(pm) ((pm)->pm_stats.wired_count) #endif /* _KERNEL */ #endif /* _PMAP_VM_ */ Index: head/sys/vm/vm_extern.h =================================================================== --- head/sys/vm/vm_extern.h (revision 173360) +++ head/sys/vm/vm_extern.h (revision 173361) @@ -1,100 +1,100 @@ /*- * Copyright (c) 1992, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vm_extern.h 8.2 (Berkeley) 1/12/94 * $FreeBSD$ */ #ifndef _VM_EXTERN_H_ #define _VM_EXTERN_H_ struct buf; struct proc; struct vmspace; struct vmtotal; struct mount; struct vnode; #ifdef _KERNEL #ifdef TYPEDEF_FOR_UAP int getpagesize(struct thread *, void *, int *); int madvise(struct thread *, void *, int *); int mincore(struct thread *, void *, int *); int mprotect(struct thread *, void *, int *); int msync(struct thread *, void *, int *); int munmap(struct thread *, void *, int *); int obreak(struct thread *, void *, int *); int sbrk(struct thread *, void *, int *); int sstk(struct thread *, void *, int *); int swapon(struct thread *, void *, int *); #endif /* TYPEDEF_FOR_UAP */ int kernacc(void *, int, int); vm_offset_t kmem_alloc(vm_map_t, vm_size_t); vm_offset_t kmem_alloc_nofault(vm_map_t, vm_size_t); vm_offset_t kmem_alloc_wait(vm_map_t, vm_size_t); void kmem_free(vm_map_t, vm_offset_t, vm_size_t); void kmem_free_wakeup(vm_map_t, vm_offset_t, vm_size_t); void kmem_init(vm_offset_t, vm_offset_t); vm_offset_t kmem_malloc(vm_map_t, vm_size_t, boolean_t); vm_map_t kmem_suballoc(vm_map_t, vm_offset_t *, vm_offset_t *, vm_size_t); void swapout_procs(int); int useracc(void *, int, int); int vm_fault(vm_map_t, vm_offset_t, vm_prot_t, int); void vm_fault_copy_entry(vm_map_t, vm_map_t, vm_map_entry_t, vm_map_entry_t); void vm_fault_unwire(vm_map_t, vm_offset_t, vm_offset_t, boolean_t); int vm_fault_wire(vm_map_t, vm_offset_t, vm_offset_t, boolean_t, boolean_t); -void vm_forkproc(struct thread *, struct proc *, struct thread *, int); +int vm_forkproc(struct thread *, struct proc *, struct thread *, struct vmspace *, int); void vm_waitproc(struct proc *); int vm_mmap(vm_map_t, vm_offset_t *, vm_size_t, vm_prot_t, vm_prot_t, int, objtype_t, void *, vm_ooffset_t); void vm_set_page_size(void); struct vmspace *vmspace_alloc(vm_offset_t, vm_offset_t); struct vmspace *vmspace_fork(struct vmspace *); -void vmspace_exec(struct proc *, vm_offset_t, vm_offset_t); -void vmspace_unshare(struct proc *); +int vmspace_exec(struct proc *, vm_offset_t, vm_offset_t); +int vmspace_unshare(struct proc *); void vmspace_exit(struct thread *); struct vmspace *vmspace_acquire_ref(struct proc *); void vmspace_free(struct vmspace *); void vmspace_exitfree(struct proc *); void vnode_pager_setsize(struct vnode *, vm_ooffset_t); int vslock(void *, size_t); void vsunlock(void *, size_t); void vm_object_print(/* db_expr_t */ long, boolean_t, /* db_expr_t */ long, char *); int vm_fault_quick(caddr_t v, int prot); struct sf_buf *vm_imgact_map_page(vm_object_t object, vm_ooffset_t offset); void vm_imgact_unmap_page(struct sf_buf *sf); void vm_thread_dispose(struct thread *td); void vm_thread_dispose_altkstack(struct thread *td); -void vm_thread_new(struct thread *td, int pages); -void vm_thread_new_altkstack(struct thread *td, int pages); +int vm_thread_new(struct thread *td, int pages); +int vm_thread_new_altkstack(struct thread *td, int pages); void vm_thread_swapin(struct thread *td); void vm_thread_swapout(struct thread *td); #endif /* _KERNEL */ #endif /* !_VM_EXTERN_H_ */ Index: head/sys/vm/vm_glue.c =================================================================== --- head/sys/vm/vm_glue.c (revision 173360) +++ head/sys/vm/vm_glue.c (revision 173361) @@ -1,1049 +1,1060 @@ /*- * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_glue.c 8.6 (Berkeley) 1/5/94 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ #include __FBSDID("$FreeBSD$"); #include "opt_vm.h" #include "opt_kstack_pages.h" #include "opt_kstack_max_pages.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include extern int maxslp; /* * System initialization * * Note: proc0 from proc.h */ static void vm_init_limits(void *); SYSINIT(vm_limits, SI_SUB_VM_CONF, SI_ORDER_FIRST, vm_init_limits, &proc0) /* * THIS MUST BE THE LAST INITIALIZATION ITEM!!! * * Note: run scheduling should be divorced from the vm system. */ static void scheduler(void *); SYSINIT(scheduler, SI_SUB_RUN_SCHEDULER, SI_ORDER_ANY, scheduler, NULL) #ifndef NO_SWAPPING static int swapout(struct proc *); static void swapclear(struct proc *); #endif static volatile int proc0_rescan; /* * MPSAFE * * WARNING! This code calls vm_map_check_protection() which only checks * the associated vm_map_entry range. It does not determine whether the * contents of the memory is actually readable or writable. In most cases * just checking the vm_map_entry is sufficient within the kernel's address * space. */ int kernacc(addr, len, rw) void *addr; int len, rw; { boolean_t rv; vm_offset_t saddr, eaddr; vm_prot_t prot; KASSERT((rw & ~VM_PROT_ALL) == 0, ("illegal ``rw'' argument to kernacc (%x)\n", rw)); if ((vm_offset_t)addr + len > kernel_map->max_offset || (vm_offset_t)addr + len < (vm_offset_t)addr) return (FALSE); prot = rw; saddr = trunc_page((vm_offset_t)addr); eaddr = round_page((vm_offset_t)addr + len); vm_map_lock_read(kernel_map); rv = vm_map_check_protection(kernel_map, saddr, eaddr, prot); vm_map_unlock_read(kernel_map); return (rv == TRUE); } /* * MPSAFE * * WARNING! This code calls vm_map_check_protection() which only checks * the associated vm_map_entry range. It does not determine whether the * contents of the memory is actually readable or writable. vmapbuf(), * vm_fault_quick(), or copyin()/copout()/su*()/fu*() functions should be * used in conjuction with this call. */ int useracc(addr, len, rw) void *addr; int len, rw; { boolean_t rv; vm_prot_t prot; vm_map_t map; KASSERT((rw & ~VM_PROT_ALL) == 0, ("illegal ``rw'' argument to useracc (%x)\n", rw)); prot = rw; map = &curproc->p_vmspace->vm_map; if ((vm_offset_t)addr + len > vm_map_max(map) || (vm_offset_t)addr + len < (vm_offset_t)addr) { return (FALSE); } vm_map_lock_read(map); rv = vm_map_check_protection(map, trunc_page((vm_offset_t)addr), round_page((vm_offset_t)addr + len), prot); vm_map_unlock_read(map); return (rv == TRUE); } int vslock(void *addr, size_t len) { vm_offset_t end, last, start; vm_size_t npages; int error; last = (vm_offset_t)addr + len; start = trunc_page((vm_offset_t)addr); end = round_page(last); if (last < (vm_offset_t)addr || end < (vm_offset_t)addr) return (EINVAL); npages = atop(end - start); if (npages > vm_page_max_wired) return (ENOMEM); PROC_LOCK(curproc); if (ptoa(npages + pmap_wired_count(vm_map_pmap(&curproc->p_vmspace->vm_map))) > lim_cur(curproc, RLIMIT_MEMLOCK)) { PROC_UNLOCK(curproc); return (ENOMEM); } PROC_UNLOCK(curproc); #if 0 /* * XXX - not yet * * The limit for transient usage of wired pages should be * larger than for "permanent" wired pages (mlock()). * * Also, the sysctl code, which is the only present user * of vslock(), does a hard loop on EAGAIN. */ if (npages + cnt.v_wire_count > vm_page_max_wired) return (EAGAIN); #endif error = vm_map_wire(&curproc->p_vmspace->vm_map, start, end, VM_MAP_WIRE_SYSTEM | VM_MAP_WIRE_NOHOLES); /* * Return EFAULT on error to match copy{in,out}() behaviour * rather than returning ENOMEM like mlock() would. */ return (error == KERN_SUCCESS ? 0 : EFAULT); } void vsunlock(void *addr, size_t len) { /* Rely on the parameter sanity checks performed by vslock(). */ (void)vm_map_unwire(&curproc->p_vmspace->vm_map, trunc_page((vm_offset_t)addr), round_page((vm_offset_t)addr + len), VM_MAP_WIRE_SYSTEM | VM_MAP_WIRE_NOHOLES); } /* * Pin the page contained within the given object at the given offset. If the * page is not resident, allocate and load it using the given object's pager. * Return the pinned page if successful; otherwise, return NULL. */ static vm_page_t vm_imgact_hold_page(vm_object_t object, vm_ooffset_t offset) { vm_page_t m, ma[1]; vm_pindex_t pindex; int rv; VM_OBJECT_LOCK(object); pindex = OFF_TO_IDX(offset); m = vm_page_grab(object, pindex, VM_ALLOC_NORMAL | VM_ALLOC_RETRY); if ((m->valid & VM_PAGE_BITS_ALL) != VM_PAGE_BITS_ALL) { ma[0] = m; rv = vm_pager_get_pages(object, ma, 1, 0); m = vm_page_lookup(object, pindex); if (m == NULL) goto out; if (m->valid == 0 || rv != VM_PAGER_OK) { vm_page_lock_queues(); vm_page_free(m); vm_page_unlock_queues(); m = NULL; goto out; } } vm_page_lock_queues(); vm_page_hold(m); vm_page_unlock_queues(); vm_page_wakeup(m); out: VM_OBJECT_UNLOCK(object); return (m); } /* * Return a CPU private mapping to the page at the given offset within the * given object. The page is pinned before it is mapped. */ struct sf_buf * vm_imgact_map_page(vm_object_t object, vm_ooffset_t offset) { vm_page_t m; m = vm_imgact_hold_page(object, offset); if (m == NULL) return (NULL); sched_pin(); return (sf_buf_alloc(m, SFB_CPUPRIVATE)); } /* * Destroy the given CPU private mapping and unpin the page that it mapped. */ void vm_imgact_unmap_page(struct sf_buf *sf) { vm_page_t m; m = sf_buf_page(sf); sf_buf_free(sf); sched_unpin(); vm_page_lock_queues(); vm_page_unhold(m); vm_page_unlock_queues(); } #ifndef KSTACK_MAX_PAGES #define KSTACK_MAX_PAGES 32 #endif /* * Create the kernel stack (including pcb for i386) for a new thread. * This routine directly affects the fork perf for a process and * create performance for a thread. */ -void +int vm_thread_new(struct thread *td, int pages) { vm_object_t ksobj; vm_offset_t ks; vm_page_t m, ma[KSTACK_MAX_PAGES]; int i; /* Bounds check */ if (pages <= 1) pages = KSTACK_PAGES; else if (pages > KSTACK_MAX_PAGES) pages = KSTACK_MAX_PAGES; /* * Allocate an object for the kstack. */ ksobj = vm_object_allocate(OBJT_DEFAULT, pages); - td->td_kstack_obj = ksobj; /* * Get a kernel virtual address for this thread's kstack. */ ks = kmem_alloc_nofault(kernel_map, (pages + KSTACK_GUARD_PAGES) * PAGE_SIZE); - if (ks == 0) - panic("vm_thread_new: kstack allocation failed"); + if (ks == 0) { + printf("vm_thread_new: kstack allocation failed\n"); + vm_object_deallocate(ksobj); + return (0); + } + if (KSTACK_GUARD_PAGES != 0) { pmap_qremove(ks, KSTACK_GUARD_PAGES); ks += KSTACK_GUARD_PAGES * PAGE_SIZE; } + td->td_kstack_obj = ksobj; td->td_kstack = ks; /* * Knowing the number of pages allocated is useful when you * want to deallocate them. */ td->td_kstack_pages = pages; /* * For the length of the stack, link in a real page of ram for each * page of stack. */ VM_OBJECT_LOCK(ksobj); for (i = 0; i < pages; i++) { /* * Get a kernel stack page. */ m = vm_page_grab(ksobj, i, VM_ALLOC_NOBUSY | VM_ALLOC_NORMAL | VM_ALLOC_RETRY | VM_ALLOC_WIRED); ma[i] = m; m->valid = VM_PAGE_BITS_ALL; } VM_OBJECT_UNLOCK(ksobj); pmap_qenter(ks, ma, pages); + return (1); } /* * Dispose of a thread's kernel stack. */ void vm_thread_dispose(struct thread *td) { vm_object_t ksobj; vm_offset_t ks; vm_page_t m; int i, pages; pages = td->td_kstack_pages; ksobj = td->td_kstack_obj; ks = td->td_kstack; pmap_qremove(ks, pages); VM_OBJECT_LOCK(ksobj); for (i = 0; i < pages; i++) { m = vm_page_lookup(ksobj, i); if (m == NULL) panic("vm_thread_dispose: kstack already missing?"); vm_page_lock_queues(); vm_page_unwire(m, 0); vm_page_free(m); vm_page_unlock_queues(); } VM_OBJECT_UNLOCK(ksobj); vm_object_deallocate(ksobj); kmem_free(kernel_map, ks - (KSTACK_GUARD_PAGES * PAGE_SIZE), (pages + KSTACK_GUARD_PAGES) * PAGE_SIZE); + td->td_kstack = 0; } /* * Allow a thread's kernel stack to be paged out. */ void vm_thread_swapout(struct thread *td) { vm_object_t ksobj; vm_page_t m; int i, pages; cpu_thread_swapout(td); pages = td->td_kstack_pages; ksobj = td->td_kstack_obj; pmap_qremove(td->td_kstack, pages); VM_OBJECT_LOCK(ksobj); for (i = 0; i < pages; i++) { m = vm_page_lookup(ksobj, i); if (m == NULL) panic("vm_thread_swapout: kstack already missing?"); vm_page_lock_queues(); vm_page_dirty(m); vm_page_unwire(m, 0); vm_page_unlock_queues(); } VM_OBJECT_UNLOCK(ksobj); } /* * Bring the kernel stack for a specified thread back in. */ void vm_thread_swapin(struct thread *td) { vm_object_t ksobj; vm_page_t m, ma[KSTACK_MAX_PAGES]; int i, pages, rv; pages = td->td_kstack_pages; ksobj = td->td_kstack_obj; VM_OBJECT_LOCK(ksobj); for (i = 0; i < pages; i++) { m = vm_page_grab(ksobj, i, VM_ALLOC_NORMAL | VM_ALLOC_RETRY); if (m->valid != VM_PAGE_BITS_ALL) { rv = vm_pager_get_pages(ksobj, &m, 1, 0); if (rv != VM_PAGER_OK) panic("vm_thread_swapin: cannot get kstack for proc: %d", td->td_proc->p_pid); m = vm_page_lookup(ksobj, i); m->valid = VM_PAGE_BITS_ALL; } ma[i] = m; vm_page_lock_queues(); vm_page_wire(m); vm_page_unlock_queues(); vm_page_wakeup(m); } VM_OBJECT_UNLOCK(ksobj); pmap_qenter(td->td_kstack, ma, pages); cpu_thread_swapin(td); } /* * Set up a variable-sized alternate kstack. */ -void +int vm_thread_new_altkstack(struct thread *td, int pages) { td->td_altkstack = td->td_kstack; td->td_altkstack_obj = td->td_kstack_obj; td->td_altkstack_pages = td->td_kstack_pages; - vm_thread_new(td, pages); + return (vm_thread_new(td, pages)); } /* * Restore the original kstack. */ void vm_thread_dispose_altkstack(struct thread *td) { vm_thread_dispose(td); td->td_kstack = td->td_altkstack; td->td_kstack_obj = td->td_altkstack_obj; td->td_kstack_pages = td->td_altkstack_pages; td->td_altkstack = 0; td->td_altkstack_obj = NULL; td->td_altkstack_pages = 0; } /* * Implement fork's actions on an address space. * Here we arrange for the address space to be copied or referenced, * allocate a user struct (pcb and kernel stack), then call the * machine-dependent layer to fill those in and make the new process * ready to run. The new process is set up so that it returns directly * to user mode to avoid stack copying and relocation problems. */ -void -vm_forkproc(td, p2, td2, flags) +int +vm_forkproc(td, p2, td2, vm2, flags) struct thread *td; struct proc *p2; struct thread *td2; + struct vmspace *vm2; int flags; { struct proc *p1 = td->td_proc; + int error; if ((flags & RFPROC) == 0) { /* * Divorce the memory, if it is shared, essentially * this changes shared memory amongst threads, into * COW locally. */ if ((flags & RFMEM) == 0) { if (p1->p_vmspace->vm_refcnt > 1) { - vmspace_unshare(p1); + error = vmspace_unshare(p1); + if (error) + return (error); } } cpu_fork(td, p2, td2, flags); - return; + return (0); } if (flags & RFMEM) { p2->p_vmspace = p1->p_vmspace; atomic_add_int(&p1->p_vmspace->vm_refcnt, 1); } while (vm_page_count_severe()) { VM_WAIT; } if ((flags & RFMEM) == 0) { - p2->p_vmspace = vmspace_fork(p1->p_vmspace); + p2->p_vmspace = vm2; if (p1->p_vmspace->vm_shm) shmfork(p1, p2); } /* * cpu_fork will copy and update the pcb, set up the kernel stack, * and make the child ready to run. */ cpu_fork(td, p2, td2, flags); + return (0); } /* * Called after process has been wait(2)'ed apon and is being reaped. * The idea is to reclaim resources that we could not reclaim while * the process was still executing. */ void vm_waitproc(p) struct proc *p; { vmspace_exitfree(p); /* and clean-out the vmspace */ } /* * Set default limits for VM system. * Called for proc 0, and then inherited by all others. * * XXX should probably act directly on proc0. */ static void vm_init_limits(udata) void *udata; { struct proc *p = udata; struct plimit *limp; int rss_limit; /* * Set up the initial limits on process VM. Set the maximum resident * set size to be half of (reasonably) available memory. Since this * is a soft limit, it comes into effect only when the system is out * of memory - half of main memory helps to favor smaller processes, * and reduces thrashing of the object cache. */ limp = p->p_limit; limp->pl_rlimit[RLIMIT_STACK].rlim_cur = dflssiz; limp->pl_rlimit[RLIMIT_STACK].rlim_max = maxssiz; limp->pl_rlimit[RLIMIT_DATA].rlim_cur = dfldsiz; limp->pl_rlimit[RLIMIT_DATA].rlim_max = maxdsiz; /* limit the limit to no less than 2MB */ rss_limit = max(cnt.v_free_count, 512); limp->pl_rlimit[RLIMIT_RSS].rlim_cur = ptoa(rss_limit); limp->pl_rlimit[RLIMIT_RSS].rlim_max = RLIM_INFINITY; } void faultin(p) struct proc *p; { #ifdef NO_SWAPPING PROC_LOCK_ASSERT(p, MA_OWNED); if ((p->p_flag & P_INMEM) == 0) panic("faultin: proc swapped out with NO_SWAPPING!"); #else /* !NO_SWAPPING */ struct thread *td; PROC_LOCK_ASSERT(p, MA_OWNED); /* * If another process is swapping in this process, * just wait until it finishes. */ if (p->p_flag & P_SWAPPINGIN) { while (p->p_flag & P_SWAPPINGIN) msleep(&p->p_flag, &p->p_mtx, PVM, "faultin", 0); return; } if ((p->p_flag & P_INMEM) == 0) { /* * Don't let another thread swap process p out while we are * busy swapping it in. */ ++p->p_lock; p->p_flag |= P_SWAPPINGIN; PROC_UNLOCK(p); /* * We hold no lock here because the list of threads * can not change while all threads in the process are * swapped out. */ FOREACH_THREAD_IN_PROC(p, td) vm_thread_swapin(td); PROC_LOCK(p); PROC_SLOCK(p); swapclear(p); p->p_swtick = ticks; PROC_SUNLOCK(p); wakeup(&p->p_flag); /* Allow other threads to swap p out now. */ --p->p_lock; } #endif /* NO_SWAPPING */ } /* * This swapin algorithm attempts to swap-in processes only if there * is enough space for them. Of course, if a process waits for a long * time, it will be swapped in anyway. * * XXXKSE - process with the thread with highest priority counts.. * * Giant is held on entry. */ /* ARGSUSED*/ static void scheduler(dummy) void *dummy; { struct proc *p; struct thread *td; struct proc *pp; int slptime; int swtime; int ppri; int pri; mtx_assert(&Giant, MA_OWNED | MA_NOTRECURSED); mtx_unlock(&Giant); loop: if (vm_page_count_min()) { VM_WAIT; thread_lock(&thread0); proc0_rescan = 0; thread_unlock(&thread0); goto loop; } pp = NULL; ppri = INT_MIN; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); if (p->p_flag & (P_SWAPPINGOUT | P_SWAPPINGIN | P_INMEM)) { PROC_UNLOCK(p); continue; } swtime = (ticks - p->p_swtick) / hz; PROC_SLOCK(p); FOREACH_THREAD_IN_PROC(p, td) { /* * An otherwise runnable thread of a process * swapped out has only the TDI_SWAPPED bit set. * */ thread_lock(td); if (td->td_inhibitors == TDI_SWAPPED) { slptime = (ticks - td->td_slptick) / hz; pri = swtime + slptime; if ((td->td_flags & TDF_SWAPINREQ) == 0) pri -= p->p_nice * 8; /* * if this thread is higher priority * and there is enough space, then select * this process instead of the previous * selection. */ if (pri > ppri) { pp = p; ppri = pri; } } thread_unlock(td); } PROC_SUNLOCK(p); PROC_UNLOCK(p); } sx_sunlock(&allproc_lock); /* * Nothing to do, back to sleep. */ if ((p = pp) == NULL) { thread_lock(&thread0); if (!proc0_rescan) { TD_SET_IWAIT(&thread0); mi_switch(SW_VOL, NULL); } proc0_rescan = 0; thread_unlock(&thread0); goto loop; } PROC_LOCK(p); /* * Another process may be bringing or may have already * brought this process in while we traverse all threads. * Or, this process may even be being swapped out again. */ if (p->p_flag & (P_INMEM | P_SWAPPINGOUT | P_SWAPPINGIN)) { PROC_UNLOCK(p); thread_lock(&thread0); proc0_rescan = 0; thread_unlock(&thread0); goto loop; } /* * We would like to bring someone in. (only if there is space). * [What checks the space? ] */ faultin(p); PROC_UNLOCK(p); thread_lock(&thread0); proc0_rescan = 0; thread_unlock(&thread0); goto loop; } void kick_proc0(void) { struct thread *td = &thread0; /* XXX This will probably cause a LOR in some cases */ thread_lock(td); if (TD_AWAITING_INTR(td)) { CTR2(KTR_INTR, "%s: sched_add %d", __func__, 0); TD_CLR_IWAIT(td); sched_add(td, SRQ_INTR); } else { proc0_rescan = 1; CTR2(KTR_INTR, "%s: state %d", __func__, td->td_state); } thread_unlock(td); } #ifndef NO_SWAPPING /* * Swap_idle_threshold1 is the guaranteed swapped in time for a process */ static int swap_idle_threshold1 = 2; SYSCTL_INT(_vm, OID_AUTO, swap_idle_threshold1, CTLFLAG_RW, &swap_idle_threshold1, 0, "Guaranteed swapped in time for a process"); /* * Swap_idle_threshold2 is the time that a process can be idle before * it will be swapped out, if idle swapping is enabled. */ static int swap_idle_threshold2 = 10; SYSCTL_INT(_vm, OID_AUTO, swap_idle_threshold2, CTLFLAG_RW, &swap_idle_threshold2, 0, "Time before a process will be swapped out"); /* * Swapout is driven by the pageout daemon. Very simple, we find eligible * procs and swap out their stacks. We try to always "swap" at least one * process in case we need the room for a swapin. * If any procs have been sleeping/stopped for at least maxslp seconds, * they are swapped. Else, we swap the longest-sleeping or stopped process, * if any, otherwise the longest-resident process. */ void swapout_procs(action) int action; { struct proc *p; struct thread *td; int didswap = 0; retry: sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { struct vmspace *vm; int minslptime = 100000; int slptime; /* * Watch out for a process in * creation. It may have no * address space or lock yet. */ if (p->p_state == PRS_NEW) continue; /* * An aio daemon switches its * address space while running. * Perform a quick check whether * a process has P_SYSTEM. */ if ((p->p_flag & P_SYSTEM) != 0) continue; /* * Do not swapout a process that * is waiting for VM data * structures as there is a possible * deadlock. Test this first as * this may block. * * Lock the map until swapout * finishes, or a thread of this * process may attempt to alter * the map. */ vm = vmspace_acquire_ref(p); if (vm == NULL) continue; if (!vm_map_trylock(&vm->vm_map)) goto nextproc1; PROC_LOCK(p); if (p->p_lock != 0 || (p->p_flag & (P_STOPPED_SINGLE|P_TRACED|P_SYSTEM|P_WEXIT) ) != 0) { goto nextproc2; } /* * only aiod changes vmspace, however it will be * skipped because of the if statement above checking * for P_SYSTEM */ if ((p->p_flag & (P_INMEM|P_SWAPPINGOUT|P_SWAPPINGIN)) != P_INMEM) goto nextproc2; switch (p->p_state) { default: /* Don't swap out processes in any sort * of 'special' state. */ break; case PRS_NORMAL: PROC_SLOCK(p); /* * do not swapout a realtime process * Check all the thread groups.. */ FOREACH_THREAD_IN_PROC(p, td) { thread_lock(td); if (PRI_IS_REALTIME(td->td_pri_class)) { thread_unlock(td); goto nextproc; } slptime = (ticks - td->td_slptick) / hz; /* * Guarantee swap_idle_threshold1 * time in memory. */ if (slptime < swap_idle_threshold1) { thread_unlock(td); goto nextproc; } /* * Do not swapout a process if it is * waiting on a critical event of some * kind or there is a thread whose * pageable memory may be accessed. * * This could be refined to support * swapping out a thread. */ if ((td->td_priority) < PSOCK || !thread_safetoswapout(td)) { thread_unlock(td); goto nextproc; } /* * If the system is under memory stress, * or if we are swapping * idle processes >= swap_idle_threshold2, * then swap the process out. */ if (((action & VM_SWAP_NORMAL) == 0) && (((action & VM_SWAP_IDLE) == 0) || (slptime < swap_idle_threshold2))) { thread_unlock(td); goto nextproc; } if (minslptime > slptime) minslptime = slptime; thread_unlock(td); } /* * If the pageout daemon didn't free enough pages, * or if this process is idle and the system is * configured to swap proactively, swap it out. */ if ((action & VM_SWAP_NORMAL) || ((action & VM_SWAP_IDLE) && (minslptime > swap_idle_threshold2))) { if (swapout(p) == 0) didswap++; PROC_SUNLOCK(p); PROC_UNLOCK(p); vm_map_unlock(&vm->vm_map); vmspace_free(vm); sx_sunlock(&allproc_lock); goto retry; } nextproc: PROC_SUNLOCK(p); } nextproc2: PROC_UNLOCK(p); vm_map_unlock(&vm->vm_map); nextproc1: vmspace_free(vm); continue; } sx_sunlock(&allproc_lock); /* * If we swapped something out, and another process needed memory, * then wakeup the sched process. */ if (didswap) wakeup(&proc0); } static void swapclear(p) struct proc *p; { struct thread *td; PROC_LOCK_ASSERT(p, MA_OWNED); PROC_SLOCK_ASSERT(p, MA_OWNED); FOREACH_THREAD_IN_PROC(p, td) { thread_lock(td); td->td_flags |= TDF_INMEM; td->td_flags &= ~TDF_SWAPINREQ; TD_CLR_SWAPPED(td); if (TD_CAN_RUN(td)) setrunnable(td); thread_unlock(td); } p->p_flag &= ~(P_SWAPPINGIN|P_SWAPPINGOUT); p->p_flag |= P_INMEM; } static int swapout(p) struct proc *p; { struct thread *td; PROC_LOCK_ASSERT(p, MA_OWNED); PROC_SLOCK_ASSERT(p, MA_OWNED | MA_NOTRECURSED); #if defined(SWAP_DEBUG) printf("swapping out %d\n", p->p_pid); #endif /* * The states of this process and its threads may have changed * by now. Assuming that there is only one pageout daemon thread, * this process should still be in memory. */ KASSERT((p->p_flag & (P_INMEM|P_SWAPPINGOUT|P_SWAPPINGIN)) == P_INMEM, ("swapout: lost a swapout race?")); /* * remember the process resident count */ p->p_vmspace->vm_swrss = vmspace_resident_count(p->p_vmspace); /* * Check and mark all threads before we proceed. */ p->p_flag &= ~P_INMEM; p->p_flag |= P_SWAPPINGOUT; FOREACH_THREAD_IN_PROC(p, td) { thread_lock(td); if (!thread_safetoswapout(td)) { thread_unlock(td); swapclear(p); return (EBUSY); } td->td_flags &= ~TDF_INMEM; TD_SET_SWAPPED(td); thread_unlock(td); } td = FIRST_THREAD_IN_PROC(p); ++td->td_ru.ru_nswap; PROC_SUNLOCK(p); PROC_UNLOCK(p); /* * This list is stable because all threads are now prevented from * running. The list is only modified in the context of a running * thread in this process. */ FOREACH_THREAD_IN_PROC(p, td) vm_thread_swapout(td); PROC_LOCK(p); p->p_flag &= ~P_SWAPPINGOUT; PROC_SLOCK(p); p->p_swtick = ticks; return (0); } #endif /* !NO_SWAPPING */ Index: head/sys/vm/vm_map.c =================================================================== --- head/sys/vm/vm_map.c (revision 173360) +++ head/sys/vm/vm_map.c (revision 173361) @@ -1,3439 +1,3456 @@ /*- * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_map.c 8.3 (Berkeley) 1/12/94 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ /* * Virtual memory mapping module. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Virtual memory maps provide for the mapping, protection, * and sharing of virtual memory objects. In addition, * this module provides for an efficient virtual copy of * memory from one map to another. * * Synchronization is required prior to most operations. * * Maps consist of an ordered doubly-linked list of simple * entries; a single hint is used to speed up lookups. * * Since portions of maps are specified by start/end addresses, * which may not align with existing map entries, all * routines merely "clip" entries to these start/end values. * [That is, an entry is split into two, bordering at a * start or end value.] Note that these clippings may not * always be necessary (as the two resulting entries are then * not changed); however, the clipping is done for convenience. * * As mentioned above, virtual copy operations are performed * by copying VM object references from one map to * another, and then marking both regions as copy-on-write. */ /* * vm_map_startup: * * Initialize the vm_map module. Must be called before * any other vm_map routines. * * Map and entry structures are allocated from the general * purpose memory pool with some exceptions: * * - The kernel map and kmem submap are allocated statically. * - Kernel map entries are allocated out of a static pool. * * These restrictions are necessary since malloc() uses the * maps and requires map entries. */ static struct mtx map_sleep_mtx; static uma_zone_t mapentzone; static uma_zone_t kmapentzone; static uma_zone_t mapzone; static uma_zone_t vmspace_zone; static struct vm_object kmapentobj; static int vmspace_zinit(void *mem, int size, int flags); static void vmspace_zfini(void *mem, int size); static int vm_map_zinit(void *mem, int ize, int flags); static void vm_map_zfini(void *mem, int size); static void _vm_map_init(vm_map_t map, vm_offset_t min, vm_offset_t max); #ifdef INVARIANTS static void vm_map_zdtor(void *mem, int size, void *arg); static void vmspace_zdtor(void *mem, int size, void *arg); #endif /* * PROC_VMSPACE_{UN,}LOCK() can be a noop as long as vmspaces are type * stable. */ #define PROC_VMSPACE_LOCK(p) do { } while (0) #define PROC_VMSPACE_UNLOCK(p) do { } while (0) /* * VM_MAP_RANGE_CHECK: [ internal use only ] * * Asserts that the starting and ending region * addresses fall within the valid range of the map. */ #define VM_MAP_RANGE_CHECK(map, start, end) \ { \ if (start < vm_map_min(map)) \ start = vm_map_min(map); \ if (end > vm_map_max(map)) \ end = vm_map_max(map); \ if (start > end) \ start = end; \ } void vm_map_startup(void) { mtx_init(&map_sleep_mtx, "vm map sleep mutex", NULL, MTX_DEF); mapzone = uma_zcreate("MAP", sizeof(struct vm_map), NULL, #ifdef INVARIANTS vm_map_zdtor, #else NULL, #endif vm_map_zinit, vm_map_zfini, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); uma_prealloc(mapzone, MAX_KMAP); kmapentzone = uma_zcreate("KMAP ENTRY", sizeof(struct vm_map_entry), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_MTXCLASS | UMA_ZONE_VM); uma_prealloc(kmapentzone, MAX_KMAPENT); mapentzone = uma_zcreate("MAP ENTRY", sizeof(struct vm_map_entry), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); } static void vmspace_zfini(void *mem, int size) { struct vmspace *vm; vm = (struct vmspace *)mem; - pmap_release(vmspace_pmap(vm)); vm_map_zfini(&vm->vm_map, sizeof(vm->vm_map)); } static int vmspace_zinit(void *mem, int size, int flags) { struct vmspace *vm; vm = (struct vmspace *)mem; + vm->vm_map.pmap = NULL; (void)vm_map_zinit(&vm->vm_map, sizeof(vm->vm_map), flags); - pmap_pinit(vmspace_pmap(vm)); return (0); } static void vm_map_zfini(void *mem, int size) { vm_map_t map; map = (vm_map_t)mem; mtx_destroy(&map->system_mtx); sx_destroy(&map->lock); } static int vm_map_zinit(void *mem, int size, int flags) { vm_map_t map; map = (vm_map_t)mem; map->nentries = 0; map->size = 0; mtx_init(&map->system_mtx, "system map", NULL, MTX_DEF | MTX_DUPOK); sx_init(&map->lock, "user map"); return (0); } #ifdef INVARIANTS static void vmspace_zdtor(void *mem, int size, void *arg) { struct vmspace *vm; vm = (struct vmspace *)mem; vm_map_zdtor(&vm->vm_map, sizeof(vm->vm_map), arg); } static void vm_map_zdtor(void *mem, int size, void *arg) { vm_map_t map; map = (vm_map_t)mem; KASSERT(map->nentries == 0, ("map %p nentries == %d on free.", map, map->nentries)); KASSERT(map->size == 0, ("map %p size == %lu on free.", map, (unsigned long)map->size)); } #endif /* INVARIANTS */ /* * Allocate a vmspace structure, including a vm_map and pmap, * and initialize those structures. The refcnt is set to 1. */ struct vmspace * vmspace_alloc(min, max) vm_offset_t min, max; { struct vmspace *vm; vm = uma_zalloc(vmspace_zone, M_WAITOK); + if (vm->vm_map.pmap == NULL && !pmap_pinit(vmspace_pmap(vm))) { + uma_zfree(vmspace_zone, vm); + return (NULL); + } CTR1(KTR_VM, "vmspace_alloc: %p", vm); _vm_map_init(&vm->vm_map, min, max); vm->vm_map.pmap = vmspace_pmap(vm); /* XXX */ vm->vm_refcnt = 1; vm->vm_shm = NULL; vm->vm_swrss = 0; vm->vm_tsize = 0; vm->vm_dsize = 0; vm->vm_ssize = 0; vm->vm_taddr = 0; vm->vm_daddr = 0; vm->vm_maxsaddr = 0; return (vm); } void vm_init2(void) { uma_zone_set_obj(kmapentzone, &kmapentobj, lmin(cnt.v_page_count, (VM_MAX_KERNEL_ADDRESS - KERNBASE) / PAGE_SIZE) / 8 + maxproc * 2 + maxfiles); vmspace_zone = uma_zcreate("VMSPACE", sizeof(struct vmspace), NULL, #ifdef INVARIANTS vmspace_zdtor, #else NULL, #endif vmspace_zinit, vmspace_zfini, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); } static inline void vmspace_dofree(struct vmspace *vm) { CTR1(KTR_VM, "vmspace_free: %p", vm); /* * Make sure any SysV shm is freed, it might not have been in * exit1(). */ shmexit(vm); /* * Lock the map, to wait out all other references to it. * Delete all of the mappings and pages they hold, then call * the pmap module to reclaim anything left. */ (void)vm_map_remove(&vm->vm_map, vm->vm_map.min_offset, vm->vm_map.max_offset); + /* + * XXX Comment out the pmap_release call for now. The + * vmspace_zone is marked as UMA_ZONE_NOFREE, and bugs cause + * pmap.resident_count to be != 0 on exit sometimes. + */ +/* pmap_release(vmspace_pmap(vm)); */ uma_zfree(vmspace_zone, vm); } void vmspace_free(struct vmspace *vm) { int refcnt; if (vm->vm_refcnt == 0) panic("vmspace_free: attempt to free already freed vmspace"); do refcnt = vm->vm_refcnt; while (!atomic_cmpset_int(&vm->vm_refcnt, refcnt, refcnt - 1)); if (refcnt == 1) vmspace_dofree(vm); } void vmspace_exitfree(struct proc *p) { struct vmspace *vm; PROC_VMSPACE_LOCK(p); vm = p->p_vmspace; p->p_vmspace = NULL; PROC_VMSPACE_UNLOCK(p); KASSERT(vm == &vmspace0, ("vmspace_exitfree: wrong vmspace")); vmspace_free(vm); } void vmspace_exit(struct thread *td) { int refcnt; struct vmspace *vm; struct proc *p; /* * Release user portion of address space. * This releases references to vnodes, * which could cause I/O if the file has been unlinked. * Need to do this early enough that we can still sleep. * * The last exiting process to reach this point releases as * much of the environment as it can. vmspace_dofree() is the * slower fallback in case another process had a temporary * reference to the vmspace. */ p = td->td_proc; vm = p->p_vmspace; atomic_add_int(&vmspace0.vm_refcnt, 1); do { refcnt = vm->vm_refcnt; if (refcnt > 1 && p->p_vmspace != &vmspace0) { /* Switch now since other proc might free vmspace */ PROC_VMSPACE_LOCK(p); p->p_vmspace = &vmspace0; PROC_VMSPACE_UNLOCK(p); pmap_activate(td); } } while (!atomic_cmpset_int(&vm->vm_refcnt, refcnt, refcnt - 1)); if (refcnt == 1) { if (p->p_vmspace != vm) { /* vmspace not yet freed, switch back */ PROC_VMSPACE_LOCK(p); p->p_vmspace = vm; PROC_VMSPACE_UNLOCK(p); pmap_activate(td); } pmap_remove_pages(vmspace_pmap(vm)); /* Switch now since this proc will free vmspace */ PROC_VMSPACE_LOCK(p); p->p_vmspace = &vmspace0; PROC_VMSPACE_UNLOCK(p); pmap_activate(td); vmspace_dofree(vm); } } /* Acquire reference to vmspace owned by another process. */ struct vmspace * vmspace_acquire_ref(struct proc *p) { struct vmspace *vm; int refcnt; PROC_VMSPACE_LOCK(p); vm = p->p_vmspace; if (vm == NULL) { PROC_VMSPACE_UNLOCK(p); return (NULL); } do { refcnt = vm->vm_refcnt; if (refcnt <= 0) { /* Avoid 0->1 transition */ PROC_VMSPACE_UNLOCK(p); return (NULL); } } while (!atomic_cmpset_int(&vm->vm_refcnt, refcnt, refcnt + 1)); if (vm != p->p_vmspace) { PROC_VMSPACE_UNLOCK(p); vmspace_free(vm); return (NULL); } PROC_VMSPACE_UNLOCK(p); return (vm); } void _vm_map_lock(vm_map_t map, const char *file, int line) { if (map->system_map) _mtx_lock_flags(&map->system_mtx, 0, file, line); else (void)_sx_xlock(&map->lock, 0, file, line); map->timestamp++; } void _vm_map_unlock(vm_map_t map, const char *file, int line) { if (map->system_map) _mtx_unlock_flags(&map->system_mtx, 0, file, line); else _sx_xunlock(&map->lock, file, line); } void _vm_map_lock_read(vm_map_t map, const char *file, int line) { if (map->system_map) _mtx_lock_flags(&map->system_mtx, 0, file, line); else (void)_sx_xlock(&map->lock, 0, file, line); } void _vm_map_unlock_read(vm_map_t map, const char *file, int line) { if (map->system_map) _mtx_unlock_flags(&map->system_mtx, 0, file, line); else _sx_xunlock(&map->lock, file, line); } int _vm_map_trylock(vm_map_t map, const char *file, int line) { int error; error = map->system_map ? !_mtx_trylock(&map->system_mtx, 0, file, line) : !_sx_try_xlock(&map->lock, file, line); if (error == 0) map->timestamp++; return (error == 0); } int _vm_map_trylock_read(vm_map_t map, const char *file, int line) { int error; error = map->system_map ? !_mtx_trylock(&map->system_mtx, 0, file, line) : !_sx_try_xlock(&map->lock, file, line); return (error == 0); } int _vm_map_lock_upgrade(vm_map_t map, const char *file, int line) { #ifdef INVARIANTS if (map->system_map) { _mtx_assert(&map->system_mtx, MA_OWNED, file, line); } else _sx_assert(&map->lock, SX_XLOCKED, file, line); #endif map->timestamp++; return (0); } void _vm_map_lock_downgrade(vm_map_t map, const char *file, int line) { #ifdef INVARIANTS if (map->system_map) { _mtx_assert(&map->system_mtx, MA_OWNED, file, line); } else _sx_assert(&map->lock, SX_XLOCKED, file, line); #endif } /* * vm_map_unlock_and_wait: */ int vm_map_unlock_and_wait(vm_map_t map, boolean_t user_wait) { mtx_lock(&map_sleep_mtx); vm_map_unlock(map); return (msleep(&map->root, &map_sleep_mtx, PDROP | PVM, "vmmaps", 0)); } /* * vm_map_wakeup: */ void vm_map_wakeup(vm_map_t map) { /* * Acquire and release map_sleep_mtx to prevent a wakeup() * from being performed (and lost) between the vm_map_unlock() * and the msleep() in vm_map_unlock_and_wait(). */ mtx_lock(&map_sleep_mtx); mtx_unlock(&map_sleep_mtx); wakeup(&map->root); } long vmspace_resident_count(struct vmspace *vmspace) { return pmap_resident_count(vmspace_pmap(vmspace)); } long vmspace_wired_count(struct vmspace *vmspace) { return pmap_wired_count(vmspace_pmap(vmspace)); } /* * vm_map_create: * * Creates and returns a new empty VM map with * the given physical map structure, and having * the given lower and upper address bounds. */ vm_map_t vm_map_create(pmap_t pmap, vm_offset_t min, vm_offset_t max) { vm_map_t result; result = uma_zalloc(mapzone, M_WAITOK); CTR1(KTR_VM, "vm_map_create: %p", result); _vm_map_init(result, min, max); result->pmap = pmap; return (result); } /* * Initialize an existing vm_map structure * such as that in the vmspace structure. * The pmap is set elsewhere. */ static void _vm_map_init(vm_map_t map, vm_offset_t min, vm_offset_t max) { map->header.next = map->header.prev = &map->header; map->needs_wakeup = FALSE; map->system_map = 0; map->min_offset = min; map->max_offset = max; map->flags = 0; map->root = NULL; map->timestamp = 0; } void vm_map_init(vm_map_t map, vm_offset_t min, vm_offset_t max) { _vm_map_init(map, min, max); mtx_init(&map->system_mtx, "system map", NULL, MTX_DEF | MTX_DUPOK); sx_init(&map->lock, "user map"); } /* * vm_map_entry_dispose: [ internal use only ] * * Inverse of vm_map_entry_create. */ static void vm_map_entry_dispose(vm_map_t map, vm_map_entry_t entry) { uma_zfree(map->system_map ? kmapentzone : mapentzone, entry); } /* * vm_map_entry_create: [ internal use only ] * * Allocates a VM map entry for insertion. * No entry fields are filled in. */ static vm_map_entry_t vm_map_entry_create(vm_map_t map) { vm_map_entry_t new_entry; if (map->system_map) new_entry = uma_zalloc(kmapentzone, M_NOWAIT); else new_entry = uma_zalloc(mapentzone, M_WAITOK); if (new_entry == NULL) panic("vm_map_entry_create: kernel resources exhausted"); return (new_entry); } /* * vm_map_entry_set_behavior: * * Set the expected access behavior, either normal, random, or * sequential. */ static inline void vm_map_entry_set_behavior(vm_map_entry_t entry, u_char behavior) { entry->eflags = (entry->eflags & ~MAP_ENTRY_BEHAV_MASK) | (behavior & MAP_ENTRY_BEHAV_MASK); } /* * vm_map_entry_set_max_free: * * Set the max_free field in a vm_map_entry. */ static inline void vm_map_entry_set_max_free(vm_map_entry_t entry) { entry->max_free = entry->adj_free; if (entry->left != NULL && entry->left->max_free > entry->max_free) entry->max_free = entry->left->max_free; if (entry->right != NULL && entry->right->max_free > entry->max_free) entry->max_free = entry->right->max_free; } /* * vm_map_entry_splay: * * The Sleator and Tarjan top-down splay algorithm with the * following variation. Max_free must be computed bottom-up, so * on the downward pass, maintain the left and right spines in * reverse order. Then, make a second pass up each side to fix * the pointers and compute max_free. The time bound is O(log n) * amortized. * * The new root is the vm_map_entry containing "addr", or else an * adjacent entry (lower or higher) if addr is not in the tree. * * The map must be locked, and leaves it so. * * Returns: the new root. */ static vm_map_entry_t vm_map_entry_splay(vm_offset_t addr, vm_map_entry_t root) { vm_map_entry_t llist, rlist; vm_map_entry_t ltree, rtree; vm_map_entry_t y; /* Special case of empty tree. */ if (root == NULL) return (root); /* * Pass One: Splay down the tree until we find addr or a NULL * pointer where addr would go. llist and rlist are the two * sides in reverse order (bottom-up), with llist linked by * the right pointer and rlist linked by the left pointer in * the vm_map_entry. Wait until Pass Two to set max_free on * the two spines. */ llist = NULL; rlist = NULL; for (;;) { /* root is never NULL in here. */ if (addr < root->start) { y = root->left; if (y == NULL) break; if (addr < y->start && y->left != NULL) { /* Rotate right and put y on rlist. */ root->left = y->right; y->right = root; vm_map_entry_set_max_free(root); root = y->left; y->left = rlist; rlist = y; } else { /* Put root on rlist. */ root->left = rlist; rlist = root; root = y; } } else { y = root->right; if (addr < root->end || y == NULL) break; if (addr >= y->end && y->right != NULL) { /* Rotate left and put y on llist. */ root->right = y->left; y->left = root; vm_map_entry_set_max_free(root); root = y->right; y->right = llist; llist = y; } else { /* Put root on llist. */ root->right = llist; llist = root; root = y; } } } /* * Pass Two: Walk back up the two spines, flip the pointers * and set max_free. The subtrees of the root go at the * bottom of llist and rlist. */ ltree = root->left; while (llist != NULL) { y = llist->right; llist->right = ltree; vm_map_entry_set_max_free(llist); ltree = llist; llist = y; } rtree = root->right; while (rlist != NULL) { y = rlist->left; rlist->left = rtree; vm_map_entry_set_max_free(rlist); rtree = rlist; rlist = y; } /* * Final assembly: add ltree and rtree as subtrees of root. */ root->left = ltree; root->right = rtree; vm_map_entry_set_max_free(root); return (root); } /* * vm_map_entry_{un,}link: * * Insert/remove entries from maps. */ static void vm_map_entry_link(vm_map_t map, vm_map_entry_t after_where, vm_map_entry_t entry) { CTR4(KTR_VM, "vm_map_entry_link: map %p, nentries %d, entry %p, after %p", map, map->nentries, entry, after_where); map->nentries++; entry->prev = after_where; entry->next = after_where->next; entry->next->prev = entry; after_where->next = entry; if (after_where != &map->header) { if (after_where != map->root) vm_map_entry_splay(after_where->start, map->root); entry->right = after_where->right; entry->left = after_where; after_where->right = NULL; after_where->adj_free = entry->start - after_where->end; vm_map_entry_set_max_free(after_where); } else { entry->right = map->root; entry->left = NULL; } entry->adj_free = (entry->next == &map->header ? map->max_offset : entry->next->start) - entry->end; vm_map_entry_set_max_free(entry); map->root = entry; } static void vm_map_entry_unlink(vm_map_t map, vm_map_entry_t entry) { vm_map_entry_t next, prev, root; if (entry != map->root) vm_map_entry_splay(entry->start, map->root); if (entry->left == NULL) root = entry->right; else { root = vm_map_entry_splay(entry->start, entry->left); root->right = entry->right; root->adj_free = (entry->next == &map->header ? map->max_offset : entry->next->start) - root->end; vm_map_entry_set_max_free(root); } map->root = root; prev = entry->prev; next = entry->next; next->prev = prev; prev->next = next; map->nentries--; CTR3(KTR_VM, "vm_map_entry_unlink: map %p, nentries %d, entry %p", map, map->nentries, entry); } /* * vm_map_entry_resize_free: * * Recompute the amount of free space following a vm_map_entry * and propagate that value up the tree. Call this function after * resizing a map entry in-place, that is, without a call to * vm_map_entry_link() or _unlink(). * * The map must be locked, and leaves it so. */ static void vm_map_entry_resize_free(vm_map_t map, vm_map_entry_t entry) { /* * Using splay trees without parent pointers, propagating * max_free up the tree is done by moving the entry to the * root and making the change there. */ if (entry != map->root) map->root = vm_map_entry_splay(entry->start, map->root); entry->adj_free = (entry->next == &map->header ? map->max_offset : entry->next->start) - entry->end; vm_map_entry_set_max_free(entry); } /* * vm_map_lookup_entry: [ internal use only ] * * Finds the map entry containing (or * immediately preceding) the specified address * in the given map; the entry is returned * in the "entry" parameter. The boolean * result indicates whether the address is * actually contained in the map. */ boolean_t vm_map_lookup_entry( vm_map_t map, vm_offset_t address, vm_map_entry_t *entry) /* OUT */ { vm_map_entry_t cur; cur = vm_map_entry_splay(address, map->root); if (cur == NULL) *entry = &map->header; else { map->root = cur; if (address >= cur->start) { *entry = cur; if (cur->end > address) return (TRUE); } else *entry = cur->prev; } return (FALSE); } /* * vm_map_insert: * * Inserts the given whole VM object into the target * map at the specified address range. The object's * size should match that of the address range. * * Requires that the map be locked, and leaves it so. * * If object is non-NULL, ref count must be bumped by caller * prior to making call to account for the new entry. */ int vm_map_insert(vm_map_t map, vm_object_t object, vm_ooffset_t offset, vm_offset_t start, vm_offset_t end, vm_prot_t prot, vm_prot_t max, int cow) { vm_map_entry_t new_entry; vm_map_entry_t prev_entry; vm_map_entry_t temp_entry; vm_eflags_t protoeflags; /* * Check that the start and end points are not bogus. */ if ((start < map->min_offset) || (end > map->max_offset) || (start >= end)) return (KERN_INVALID_ADDRESS); /* * Find the entry prior to the proposed starting address; if it's part * of an existing entry, this range is bogus. */ if (vm_map_lookup_entry(map, start, &temp_entry)) return (KERN_NO_SPACE); prev_entry = temp_entry; /* * Assert that the next entry doesn't overlap the end point. */ if ((prev_entry->next != &map->header) && (prev_entry->next->start < end)) return (KERN_NO_SPACE); protoeflags = 0; if (cow & MAP_COPY_ON_WRITE) protoeflags |= MAP_ENTRY_COW|MAP_ENTRY_NEEDS_COPY; if (cow & MAP_NOFAULT) { protoeflags |= MAP_ENTRY_NOFAULT; KASSERT(object == NULL, ("vm_map_insert: paradoxical MAP_NOFAULT request")); } if (cow & MAP_DISABLE_SYNCER) protoeflags |= MAP_ENTRY_NOSYNC; if (cow & MAP_DISABLE_COREDUMP) protoeflags |= MAP_ENTRY_NOCOREDUMP; if (object != NULL) { /* * OBJ_ONEMAPPING must be cleared unless this mapping * is trivially proven to be the only mapping for any * of the object's pages. (Object granularity * reference counting is insufficient to recognize * aliases with precision.) */ VM_OBJECT_LOCK(object); if (object->ref_count > 1 || object->shadow_count != 0) vm_object_clear_flag(object, OBJ_ONEMAPPING); VM_OBJECT_UNLOCK(object); } else if ((prev_entry != &map->header) && (prev_entry->eflags == protoeflags) && (prev_entry->end == start) && (prev_entry->wired_count == 0) && ((prev_entry->object.vm_object == NULL) || vm_object_coalesce(prev_entry->object.vm_object, prev_entry->offset, (vm_size_t)(prev_entry->end - prev_entry->start), (vm_size_t)(end - prev_entry->end)))) { /* * We were able to extend the object. Determine if we * can extend the previous map entry to include the * new range as well. */ if ((prev_entry->inheritance == VM_INHERIT_DEFAULT) && (prev_entry->protection == prot) && (prev_entry->max_protection == max)) { map->size += (end - prev_entry->end); prev_entry->end = end; vm_map_entry_resize_free(map, prev_entry); vm_map_simplify_entry(map, prev_entry); return (KERN_SUCCESS); } /* * If we can extend the object but cannot extend the * map entry, we have to create a new map entry. We * must bump the ref count on the extended object to * account for it. object may be NULL. */ object = prev_entry->object.vm_object; offset = prev_entry->offset + (prev_entry->end - prev_entry->start); vm_object_reference(object); } /* * NOTE: if conditionals fail, object can be NULL here. This occurs * in things like the buffer map where we manage kva but do not manage * backing objects. */ /* * Create a new entry */ new_entry = vm_map_entry_create(map); new_entry->start = start; new_entry->end = end; new_entry->eflags = protoeflags; new_entry->object.vm_object = object; new_entry->offset = offset; new_entry->avail_ssize = 0; new_entry->inheritance = VM_INHERIT_DEFAULT; new_entry->protection = prot; new_entry->max_protection = max; new_entry->wired_count = 0; /* * Insert the new entry into the list */ vm_map_entry_link(map, prev_entry, new_entry); map->size += new_entry->end - new_entry->start; #if 0 /* * Temporarily removed to avoid MAP_STACK panic, due to * MAP_STACK being a huge hack. Will be added back in * when MAP_STACK (and the user stack mapping) is fixed. */ /* * It may be possible to simplify the entry */ vm_map_simplify_entry(map, new_entry); #endif if (cow & (MAP_PREFAULT|MAP_PREFAULT_PARTIAL)) { vm_map_pmap_enter(map, start, prot, object, OFF_TO_IDX(offset), end - start, cow & MAP_PREFAULT_PARTIAL); } return (KERN_SUCCESS); } /* * vm_map_findspace: * * Find the first fit (lowest VM address) for "length" free bytes * beginning at address >= start in the given map. * * In a vm_map_entry, "adj_free" is the amount of free space * adjacent (higher address) to this entry, and "max_free" is the * maximum amount of contiguous free space in its subtree. This * allows finding a free region in one path down the tree, so * O(log n) amortized with splay trees. * * The map must be locked, and leaves it so. * * Returns: 0 on success, and starting address in *addr, * 1 if insufficient space. */ int vm_map_findspace(vm_map_t map, vm_offset_t start, vm_size_t length, vm_offset_t *addr) /* OUT */ { vm_map_entry_t entry; vm_offset_t end, st; /* * Request must fit within min/max VM address and must avoid * address wrap. */ if (start < map->min_offset) start = map->min_offset; if (start + length > map->max_offset || start + length < start) return (1); /* Empty tree means wide open address space. */ if (map->root == NULL) { *addr = start; goto found; } /* * After splay, if start comes before root node, then there * must be a gap from start to the root. */ map->root = vm_map_entry_splay(start, map->root); if (start + length <= map->root->start) { *addr = start; goto found; } /* * Root is the last node that might begin its gap before * start, and this is the last comparison where address * wrap might be a problem. */ st = (start > map->root->end) ? start : map->root->end; if (length <= map->root->end + map->root->adj_free - st) { *addr = st; goto found; } /* With max_free, can immediately tell if no solution. */ entry = map->root->right; if (entry == NULL || length > entry->max_free) return (1); /* * Search the right subtree in the order: left subtree, root, * right subtree (first fit). The previous splay implies that * all regions in the right subtree have addresses > start. */ while (entry != NULL) { if (entry->left != NULL && entry->left->max_free >= length) entry = entry->left; else if (entry->adj_free >= length) { *addr = entry->end; goto found; } else entry = entry->right; } /* Can't get here, so panic if we do. */ panic("vm_map_findspace: max_free corrupt"); found: /* Expand the kernel pmap, if necessary. */ if (map == kernel_map) { end = round_page(*addr + length); if (end > kernel_vm_end) pmap_growkernel(end); } return (0); } int vm_map_fixed(vm_map_t map, vm_object_t object, vm_ooffset_t offset, vm_offset_t *addr /* IN/OUT */, vm_size_t length, vm_prot_t prot, vm_prot_t max, int cow) { vm_offset_t start, end; int result; start = *addr; vm_map_lock(map); end = start + length; VM_MAP_RANGE_CHECK(map, start, end); (void) vm_map_delete(map, start, end); result = vm_map_insert(map, object, offset, start, end, prot, max, cow); vm_map_unlock(map); return (result); } /* * vm_map_find finds an unallocated region in the target address * map with the given length. The search is defined to be * first-fit from the specified address; the region found is * returned in the same parameter. * * If object is non-NULL, ref count must be bumped by caller * prior to making call to account for the new entry. */ int vm_map_find(vm_map_t map, vm_object_t object, vm_ooffset_t offset, vm_offset_t *addr, /* IN/OUT */ vm_size_t length, boolean_t find_space, vm_prot_t prot, vm_prot_t max, int cow) { vm_offset_t start; int result; start = *addr; vm_map_lock(map); if (find_space) { if (vm_map_findspace(map, start, length, addr)) { vm_map_unlock(map); return (KERN_NO_SPACE); } start = *addr; } result = vm_map_insert(map, object, offset, start, start + length, prot, max, cow); vm_map_unlock(map); return (result); } /* * vm_map_simplify_entry: * * Simplify the given map entry by merging with either neighbor. This * routine also has the ability to merge with both neighbors. * * The map must be locked. * * This routine guarentees that the passed entry remains valid (though * possibly extended). When merging, this routine may delete one or * both neighbors. */ void vm_map_simplify_entry(vm_map_t map, vm_map_entry_t entry) { vm_map_entry_t next, prev; vm_size_t prevsize, esize; if (entry->eflags & (MAP_ENTRY_IN_TRANSITION | MAP_ENTRY_IS_SUB_MAP)) return; prev = entry->prev; if (prev != &map->header) { prevsize = prev->end - prev->start; if ( (prev->end == entry->start) && (prev->object.vm_object == entry->object.vm_object) && (!prev->object.vm_object || (prev->offset + prevsize == entry->offset)) && (prev->eflags == entry->eflags) && (prev->protection == entry->protection) && (prev->max_protection == entry->max_protection) && (prev->inheritance == entry->inheritance) && (prev->wired_count == entry->wired_count)) { vm_map_entry_unlink(map, prev); entry->start = prev->start; entry->offset = prev->offset; if (entry->prev != &map->header) vm_map_entry_resize_free(map, entry->prev); if (prev->object.vm_object) vm_object_deallocate(prev->object.vm_object); vm_map_entry_dispose(map, prev); } } next = entry->next; if (next != &map->header) { esize = entry->end - entry->start; if ((entry->end == next->start) && (next->object.vm_object == entry->object.vm_object) && (!entry->object.vm_object || (entry->offset + esize == next->offset)) && (next->eflags == entry->eflags) && (next->protection == entry->protection) && (next->max_protection == entry->max_protection) && (next->inheritance == entry->inheritance) && (next->wired_count == entry->wired_count)) { vm_map_entry_unlink(map, next); entry->end = next->end; vm_map_entry_resize_free(map, entry); if (next->object.vm_object) vm_object_deallocate(next->object.vm_object); vm_map_entry_dispose(map, next); } } } /* * vm_map_clip_start: [ internal use only ] * * Asserts that the given entry begins at or after * the specified address; if necessary, * it splits the entry into two. */ #define vm_map_clip_start(map, entry, startaddr) \ { \ if (startaddr > entry->start) \ _vm_map_clip_start(map, entry, startaddr); \ } /* * This routine is called only when it is known that * the entry must be split. */ static void _vm_map_clip_start(vm_map_t map, vm_map_entry_t entry, vm_offset_t start) { vm_map_entry_t new_entry; /* * Split off the front portion -- note that we must insert the new * entry BEFORE this one, so that this entry has the specified * starting address. */ vm_map_simplify_entry(map, entry); /* * If there is no object backing this entry, we might as well create * one now. If we defer it, an object can get created after the map * is clipped, and individual objects will be created for the split-up * map. This is a bit of a hack, but is also about the best place to * put this improvement. */ if (entry->object.vm_object == NULL && !map->system_map) { vm_object_t object; object = vm_object_allocate(OBJT_DEFAULT, atop(entry->end - entry->start)); entry->object.vm_object = object; entry->offset = 0; } new_entry = vm_map_entry_create(map); *new_entry = *entry; new_entry->end = start; entry->offset += (start - entry->start); entry->start = start; vm_map_entry_link(map, entry->prev, new_entry); if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) { vm_object_reference(new_entry->object.vm_object); } } /* * vm_map_clip_end: [ internal use only ] * * Asserts that the given entry ends at or before * the specified address; if necessary, * it splits the entry into two. */ #define vm_map_clip_end(map, entry, endaddr) \ { \ if ((endaddr) < (entry->end)) \ _vm_map_clip_end((map), (entry), (endaddr)); \ } /* * This routine is called only when it is known that * the entry must be split. */ static void _vm_map_clip_end(vm_map_t map, vm_map_entry_t entry, vm_offset_t end) { vm_map_entry_t new_entry; /* * If there is no object backing this entry, we might as well create * one now. If we defer it, an object can get created after the map * is clipped, and individual objects will be created for the split-up * map. This is a bit of a hack, but is also about the best place to * put this improvement. */ if (entry->object.vm_object == NULL && !map->system_map) { vm_object_t object; object = vm_object_allocate(OBJT_DEFAULT, atop(entry->end - entry->start)); entry->object.vm_object = object; entry->offset = 0; } /* * Create a new entry and insert it AFTER the specified entry */ new_entry = vm_map_entry_create(map); *new_entry = *entry; new_entry->start = entry->end = end; new_entry->offset += (end - entry->start); vm_map_entry_link(map, entry, new_entry); if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) { vm_object_reference(new_entry->object.vm_object); } } /* * vm_map_submap: [ kernel use only ] * * Mark the given range as handled by a subordinate map. * * This range must have been created with vm_map_find, * and no other operations may have been performed on this * range prior to calling vm_map_submap. * * Only a limited number of operations can be performed * within this rage after calling vm_map_submap: * vm_fault * [Don't try vm_map_copy!] * * To remove a submapping, one must first remove the * range from the superior map, and then destroy the * submap (if desired). [Better yet, don't try it.] */ int vm_map_submap( vm_map_t map, vm_offset_t start, vm_offset_t end, vm_map_t submap) { vm_map_entry_t entry; int result = KERN_INVALID_ARGUMENT; vm_map_lock(map); VM_MAP_RANGE_CHECK(map, start, end); if (vm_map_lookup_entry(map, start, &entry)) { vm_map_clip_start(map, entry, start); } else entry = entry->next; vm_map_clip_end(map, entry, end); if ((entry->start == start) && (entry->end == end) && ((entry->eflags & MAP_ENTRY_COW) == 0) && (entry->object.vm_object == NULL)) { entry->object.sub_map = submap; entry->eflags |= MAP_ENTRY_IS_SUB_MAP; result = KERN_SUCCESS; } vm_map_unlock(map); return (result); } /* * The maximum number of pages to map */ #define MAX_INIT_PT 96 /* * vm_map_pmap_enter: * * Preload read-only mappings for the given object's resident pages into * the given map. This eliminates the soft faults on process startup and * immediately after an mmap(2). Unless the given flags include * MAP_PREFAULT_MADVISE, cached pages are not reactivated and mapped. */ void vm_map_pmap_enter(vm_map_t map, vm_offset_t addr, vm_prot_t prot, vm_object_t object, vm_pindex_t pindex, vm_size_t size, int flags) { vm_offset_t start; vm_page_t p, p_start; vm_pindex_t psize, tmpidx; boolean_t are_queues_locked; if ((prot & (VM_PROT_READ | VM_PROT_EXECUTE)) == 0 || object == NULL) return; VM_OBJECT_LOCK(object); if (object->type == OBJT_DEVICE) { pmap_object_init_pt(map->pmap, addr, object, pindex, size); goto unlock_return; } psize = atop(size); if (object->type != OBJT_VNODE || ((flags & MAP_PREFAULT_PARTIAL) && (psize > MAX_INIT_PT) && (object->resident_page_count > MAX_INIT_PT))) { goto unlock_return; } if (psize + pindex > object->size) { if (object->size < pindex) goto unlock_return; psize = object->size - pindex; } are_queues_locked = FALSE; start = 0; p_start = NULL; if ((p = TAILQ_FIRST(&object->memq)) != NULL) { if (p->pindex < pindex) { p = vm_page_splay(pindex, object->root); if ((object->root = p)->pindex < pindex) p = TAILQ_NEXT(p, listq); } } /* * Assert: the variable p is either (1) the page with the * least pindex greater than or equal to the parameter pindex * or (2) NULL. */ for (; p != NULL && (tmpidx = p->pindex - pindex) < psize; p = TAILQ_NEXT(p, listq)) { /* * don't allow an madvise to blow away our really * free pages allocating pv entries. */ if ((flags & MAP_PREFAULT_MADVISE) && cnt.v_free_count < cnt.v_free_reserved) { psize = tmpidx; break; } if ((p->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL && (p->busy == 0)) { if (p_start == NULL) { start = addr + ptoa(tmpidx); p_start = p; } } else if (p_start != NULL) { if (!are_queues_locked) { are_queues_locked = TRUE; vm_page_lock_queues(); } pmap_enter_object(map->pmap, start, addr + ptoa(tmpidx), p_start, prot); p_start = NULL; } } if (p_start != NULL) { if (!are_queues_locked) { are_queues_locked = TRUE; vm_page_lock_queues(); } pmap_enter_object(map->pmap, start, addr + ptoa(psize), p_start, prot); } if (are_queues_locked) vm_page_unlock_queues(); unlock_return: VM_OBJECT_UNLOCK(object); } /* * vm_map_protect: * * Sets the protection of the specified address * region in the target map. If "set_max" is * specified, the maximum protection is to be set; * otherwise, only the current protection is affected. */ int vm_map_protect(vm_map_t map, vm_offset_t start, vm_offset_t end, vm_prot_t new_prot, boolean_t set_max) { vm_map_entry_t current; vm_map_entry_t entry; vm_map_lock(map); VM_MAP_RANGE_CHECK(map, start, end); if (vm_map_lookup_entry(map, start, &entry)) { vm_map_clip_start(map, entry, start); } else { entry = entry->next; } /* * Make a first pass to check for protection violations. */ current = entry; while ((current != &map->header) && (current->start < end)) { if (current->eflags & MAP_ENTRY_IS_SUB_MAP) { vm_map_unlock(map); return (KERN_INVALID_ARGUMENT); } if ((new_prot & current->max_protection) != new_prot) { vm_map_unlock(map); return (KERN_PROTECTION_FAILURE); } current = current->next; } /* * Go back and fix up protections. [Note that clipping is not * necessary the second time.] */ current = entry; while ((current != &map->header) && (current->start < end)) { vm_prot_t old_prot; vm_map_clip_end(map, current, end); old_prot = current->protection; if (set_max) current->protection = (current->max_protection = new_prot) & old_prot; else current->protection = new_prot; /* * Update physical map if necessary. Worry about copy-on-write * here -- CHECK THIS XXX */ if (current->protection != old_prot) { #define MASK(entry) (((entry)->eflags & MAP_ENTRY_COW) ? ~VM_PROT_WRITE : \ VM_PROT_ALL) pmap_protect(map->pmap, current->start, current->end, current->protection & MASK(current)); #undef MASK } vm_map_simplify_entry(map, current); current = current->next; } vm_map_unlock(map); return (KERN_SUCCESS); } /* * vm_map_madvise: * * This routine traverses a processes map handling the madvise * system call. Advisories are classified as either those effecting * the vm_map_entry structure, or those effecting the underlying * objects. */ int vm_map_madvise( vm_map_t map, vm_offset_t start, vm_offset_t end, int behav) { vm_map_entry_t current, entry; int modify_map = 0; /* * Some madvise calls directly modify the vm_map_entry, in which case * we need to use an exclusive lock on the map and we need to perform * various clipping operations. Otherwise we only need a read-lock * on the map. */ switch(behav) { case MADV_NORMAL: case MADV_SEQUENTIAL: case MADV_RANDOM: case MADV_NOSYNC: case MADV_AUTOSYNC: case MADV_NOCORE: case MADV_CORE: modify_map = 1; vm_map_lock(map); break; case MADV_WILLNEED: case MADV_DONTNEED: case MADV_FREE: vm_map_lock_read(map); break; default: return (KERN_INVALID_ARGUMENT); } /* * Locate starting entry and clip if necessary. */ VM_MAP_RANGE_CHECK(map, start, end); if (vm_map_lookup_entry(map, start, &entry)) { if (modify_map) vm_map_clip_start(map, entry, start); } else { entry = entry->next; } if (modify_map) { /* * madvise behaviors that are implemented in the vm_map_entry. * * We clip the vm_map_entry so that behavioral changes are * limited to the specified address range. */ for (current = entry; (current != &map->header) && (current->start < end); current = current->next ) { if (current->eflags & MAP_ENTRY_IS_SUB_MAP) continue; vm_map_clip_end(map, current, end); switch (behav) { case MADV_NORMAL: vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_NORMAL); break; case MADV_SEQUENTIAL: vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_SEQUENTIAL); break; case MADV_RANDOM: vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_RANDOM); break; case MADV_NOSYNC: current->eflags |= MAP_ENTRY_NOSYNC; break; case MADV_AUTOSYNC: current->eflags &= ~MAP_ENTRY_NOSYNC; break; case MADV_NOCORE: current->eflags |= MAP_ENTRY_NOCOREDUMP; break; case MADV_CORE: current->eflags &= ~MAP_ENTRY_NOCOREDUMP; break; default: break; } vm_map_simplify_entry(map, current); } vm_map_unlock(map); } else { vm_pindex_t pindex; int count; /* * madvise behaviors that are implemented in the underlying * vm_object. * * Since we don't clip the vm_map_entry, we have to clip * the vm_object pindex and count. */ for (current = entry; (current != &map->header) && (current->start < end); current = current->next ) { vm_offset_t useStart; if (current->eflags & MAP_ENTRY_IS_SUB_MAP) continue; pindex = OFF_TO_IDX(current->offset); count = atop(current->end - current->start); useStart = current->start; if (current->start < start) { pindex += atop(start - current->start); count -= atop(start - current->start); useStart = start; } if (current->end > end) count -= atop(current->end - end); if (count <= 0) continue; vm_object_madvise(current->object.vm_object, pindex, count, behav); if (behav == MADV_WILLNEED) { vm_map_pmap_enter(map, useStart, current->protection, current->object.vm_object, pindex, (count << PAGE_SHIFT), MAP_PREFAULT_MADVISE ); } } vm_map_unlock_read(map); } return (0); } /* * vm_map_inherit: * * Sets the inheritance of the specified address * range in the target map. Inheritance * affects how the map will be shared with * child maps at the time of vm_map_fork. */ int vm_map_inherit(vm_map_t map, vm_offset_t start, vm_offset_t end, vm_inherit_t new_inheritance) { vm_map_entry_t entry; vm_map_entry_t temp_entry; switch (new_inheritance) { case VM_INHERIT_NONE: case VM_INHERIT_COPY: case VM_INHERIT_SHARE: break; default: return (KERN_INVALID_ARGUMENT); } vm_map_lock(map); VM_MAP_RANGE_CHECK(map, start, end); if (vm_map_lookup_entry(map, start, &temp_entry)) { entry = temp_entry; vm_map_clip_start(map, entry, start); } else entry = temp_entry->next; while ((entry != &map->header) && (entry->start < end)) { vm_map_clip_end(map, entry, end); entry->inheritance = new_inheritance; vm_map_simplify_entry(map, entry); entry = entry->next; } vm_map_unlock(map); return (KERN_SUCCESS); } /* * vm_map_unwire: * * Implements both kernel and user unwiring. */ int vm_map_unwire(vm_map_t map, vm_offset_t start, vm_offset_t end, int flags) { vm_map_entry_t entry, first_entry, tmp_entry; vm_offset_t saved_start; unsigned int last_timestamp; int rv; boolean_t need_wakeup, result, user_unwire; user_unwire = (flags & VM_MAP_WIRE_USER) ? TRUE : FALSE; vm_map_lock(map); VM_MAP_RANGE_CHECK(map, start, end); if (!vm_map_lookup_entry(map, start, &first_entry)) { if (flags & VM_MAP_WIRE_HOLESOK) first_entry = first_entry->next; else { vm_map_unlock(map); return (KERN_INVALID_ADDRESS); } } last_timestamp = map->timestamp; entry = first_entry; while (entry != &map->header && entry->start < end) { if (entry->eflags & MAP_ENTRY_IN_TRANSITION) { /* * We have not yet clipped the entry. */ saved_start = (start >= entry->start) ? start : entry->start; entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP; if (vm_map_unlock_and_wait(map, user_unwire)) { /* * Allow interruption of user unwiring? */ } vm_map_lock(map); if (last_timestamp+1 != map->timestamp) { /* * Look again for the entry because the map was * modified while it was unlocked. * Specifically, the entry may have been * clipped, merged, or deleted. */ if (!vm_map_lookup_entry(map, saved_start, &tmp_entry)) { if (flags & VM_MAP_WIRE_HOLESOK) tmp_entry = tmp_entry->next; else { if (saved_start == start) { /* * First_entry has been deleted. */ vm_map_unlock(map); return (KERN_INVALID_ADDRESS); } end = saved_start; rv = KERN_INVALID_ADDRESS; goto done; } } if (entry == first_entry) first_entry = tmp_entry; else first_entry = NULL; entry = tmp_entry; } last_timestamp = map->timestamp; continue; } vm_map_clip_start(map, entry, start); vm_map_clip_end(map, entry, end); /* * Mark the entry in case the map lock is released. (See * above.) */ entry->eflags |= MAP_ENTRY_IN_TRANSITION; /* * Check the map for holes in the specified region. * If VM_MAP_WIRE_HOLESOK was specified, skip this check. */ if (((flags & VM_MAP_WIRE_HOLESOK) == 0) && (entry->end < end && (entry->next == &map->header || entry->next->start > entry->end))) { end = entry->end; rv = KERN_INVALID_ADDRESS; goto done; } /* * If system unwiring, require that the entry is system wired. */ if (!user_unwire && vm_map_entry_system_wired_count(entry) == 0) { end = entry->end; rv = KERN_INVALID_ARGUMENT; goto done; } entry = entry->next; } rv = KERN_SUCCESS; done: need_wakeup = FALSE; if (first_entry == NULL) { result = vm_map_lookup_entry(map, start, &first_entry); if (!result && (flags & VM_MAP_WIRE_HOLESOK)) first_entry = first_entry->next; else KASSERT(result, ("vm_map_unwire: lookup failed")); } entry = first_entry; while (entry != &map->header && entry->start < end) { if (rv == KERN_SUCCESS && (!user_unwire || (entry->eflags & MAP_ENTRY_USER_WIRED))) { if (user_unwire) entry->eflags &= ~MAP_ENTRY_USER_WIRED; entry->wired_count--; if (entry->wired_count == 0) { /* * Retain the map lock. */ vm_fault_unwire(map, entry->start, entry->end, entry->object.vm_object != NULL && entry->object.vm_object->type == OBJT_DEVICE); } } KASSERT(entry->eflags & MAP_ENTRY_IN_TRANSITION, ("vm_map_unwire: in-transition flag missing")); entry->eflags &= ~MAP_ENTRY_IN_TRANSITION; if (entry->eflags & MAP_ENTRY_NEEDS_WAKEUP) { entry->eflags &= ~MAP_ENTRY_NEEDS_WAKEUP; need_wakeup = TRUE; } vm_map_simplify_entry(map, entry); entry = entry->next; } vm_map_unlock(map); if (need_wakeup) vm_map_wakeup(map); return (rv); } /* * vm_map_wire: * * Implements both kernel and user wiring. */ int vm_map_wire(vm_map_t map, vm_offset_t start, vm_offset_t end, int flags) { vm_map_entry_t entry, first_entry, tmp_entry; vm_offset_t saved_end, saved_start; unsigned int last_timestamp; int rv; boolean_t fictitious, need_wakeup, result, user_wire; user_wire = (flags & VM_MAP_WIRE_USER) ? TRUE : FALSE; vm_map_lock(map); VM_MAP_RANGE_CHECK(map, start, end); if (!vm_map_lookup_entry(map, start, &first_entry)) { if (flags & VM_MAP_WIRE_HOLESOK) first_entry = first_entry->next; else { vm_map_unlock(map); return (KERN_INVALID_ADDRESS); } } last_timestamp = map->timestamp; entry = first_entry; while (entry != &map->header && entry->start < end) { if (entry->eflags & MAP_ENTRY_IN_TRANSITION) { /* * We have not yet clipped the entry. */ saved_start = (start >= entry->start) ? start : entry->start; entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP; if (vm_map_unlock_and_wait(map, user_wire)) { /* * Allow interruption of user wiring? */ } vm_map_lock(map); if (last_timestamp + 1 != map->timestamp) { /* * Look again for the entry because the map was * modified while it was unlocked. * Specifically, the entry may have been * clipped, merged, or deleted. */ if (!vm_map_lookup_entry(map, saved_start, &tmp_entry)) { if (flags & VM_MAP_WIRE_HOLESOK) tmp_entry = tmp_entry->next; else { if (saved_start == start) { /* * first_entry has been deleted. */ vm_map_unlock(map); return (KERN_INVALID_ADDRESS); } end = saved_start; rv = KERN_INVALID_ADDRESS; goto done; } } if (entry == first_entry) first_entry = tmp_entry; else first_entry = NULL; entry = tmp_entry; } last_timestamp = map->timestamp; continue; } vm_map_clip_start(map, entry, start); vm_map_clip_end(map, entry, end); /* * Mark the entry in case the map lock is released. (See * above.) */ entry->eflags |= MAP_ENTRY_IN_TRANSITION; /* * */ if (entry->wired_count == 0) { entry->wired_count++; saved_start = entry->start; saved_end = entry->end; fictitious = entry->object.vm_object != NULL && entry->object.vm_object->type == OBJT_DEVICE; /* * Release the map lock, relying on the in-transition * mark. */ vm_map_unlock(map); rv = vm_fault_wire(map, saved_start, saved_end, user_wire, fictitious); vm_map_lock(map); if (last_timestamp + 1 != map->timestamp) { /* * Look again for the entry because the map was * modified while it was unlocked. The entry * may have been clipped, but NOT merged or * deleted. */ result = vm_map_lookup_entry(map, saved_start, &tmp_entry); KASSERT(result, ("vm_map_wire: lookup failed")); if (entry == first_entry) first_entry = tmp_entry; else first_entry = NULL; entry = tmp_entry; while (entry->end < saved_end) { if (rv != KERN_SUCCESS) { KASSERT(entry->wired_count == 1, ("vm_map_wire: bad count")); entry->wired_count = -1; } entry = entry->next; } } last_timestamp = map->timestamp; if (rv != KERN_SUCCESS) { KASSERT(entry->wired_count == 1, ("vm_map_wire: bad count")); /* * Assign an out-of-range value to represent * the failure to wire this entry. */ entry->wired_count = -1; end = entry->end; goto done; } } else if (!user_wire || (entry->eflags & MAP_ENTRY_USER_WIRED) == 0) { entry->wired_count++; } /* * Check the map for holes in the specified region. * If VM_MAP_WIRE_HOLESOK was specified, skip this check. */ if (((flags & VM_MAP_WIRE_HOLESOK) == 0) && (entry->end < end && (entry->next == &map->header || entry->next->start > entry->end))) { end = entry->end; rv = KERN_INVALID_ADDRESS; goto done; } entry = entry->next; } rv = KERN_SUCCESS; done: need_wakeup = FALSE; if (first_entry == NULL) { result = vm_map_lookup_entry(map, start, &first_entry); if (!result && (flags & VM_MAP_WIRE_HOLESOK)) first_entry = first_entry->next; else KASSERT(result, ("vm_map_wire: lookup failed")); } entry = first_entry; while (entry != &map->header && entry->start < end) { if (rv == KERN_SUCCESS) { if (user_wire) entry->eflags |= MAP_ENTRY_USER_WIRED; } else if (entry->wired_count == -1) { /* * Wiring failed on this entry. Thus, unwiring is * unnecessary. */ entry->wired_count = 0; } else { if (!user_wire || (entry->eflags & MAP_ENTRY_USER_WIRED) == 0) entry->wired_count--; if (entry->wired_count == 0) { /* * Retain the map lock. */ vm_fault_unwire(map, entry->start, entry->end, entry->object.vm_object != NULL && entry->object.vm_object->type == OBJT_DEVICE); } } KASSERT(entry->eflags & MAP_ENTRY_IN_TRANSITION, ("vm_map_wire: in-transition flag missing")); entry->eflags &= ~MAP_ENTRY_IN_TRANSITION; if (entry->eflags & MAP_ENTRY_NEEDS_WAKEUP) { entry->eflags &= ~MAP_ENTRY_NEEDS_WAKEUP; need_wakeup = TRUE; } vm_map_simplify_entry(map, entry); entry = entry->next; } vm_map_unlock(map); if (need_wakeup) vm_map_wakeup(map); return (rv); } /* * vm_map_sync * * Push any dirty cached pages in the address range to their pager. * If syncio is TRUE, dirty pages are written synchronously. * If invalidate is TRUE, any cached pages are freed as well. * * If the size of the region from start to end is zero, we are * supposed to flush all modified pages within the region containing * start. Unfortunately, a region can be split or coalesced with * neighboring regions, making it difficult to determine what the * original region was. Therefore, we approximate this requirement by * flushing the current region containing start. * * Returns an error if any part of the specified range is not mapped. */ int vm_map_sync( vm_map_t map, vm_offset_t start, vm_offset_t end, boolean_t syncio, boolean_t invalidate) { vm_map_entry_t current; vm_map_entry_t entry; vm_size_t size; vm_object_t object; vm_ooffset_t offset; vm_map_lock_read(map); VM_MAP_RANGE_CHECK(map, start, end); if (!vm_map_lookup_entry(map, start, &entry)) { vm_map_unlock_read(map); return (KERN_INVALID_ADDRESS); } else if (start == end) { start = entry->start; end = entry->end; } /* * Make a first pass to check for user-wired memory and holes. */ for (current = entry; current != &map->header && current->start < end; current = current->next) { if (invalidate && (current->eflags & MAP_ENTRY_USER_WIRED)) { vm_map_unlock_read(map); return (KERN_INVALID_ARGUMENT); } if (end > current->end && (current->next == &map->header || current->end != current->next->start)) { vm_map_unlock_read(map); return (KERN_INVALID_ADDRESS); } } if (invalidate) pmap_remove(map->pmap, start, end); /* * Make a second pass, cleaning/uncaching pages from the indicated * objects as we go. */ for (current = entry; current != &map->header && current->start < end; current = current->next) { offset = current->offset + (start - current->start); size = (end <= current->end ? end : current->end) - start; if (current->eflags & MAP_ENTRY_IS_SUB_MAP) { vm_map_t smap; vm_map_entry_t tentry; vm_size_t tsize; smap = current->object.sub_map; vm_map_lock_read(smap); (void) vm_map_lookup_entry(smap, offset, &tentry); tsize = tentry->end - offset; if (tsize < size) size = tsize; object = tentry->object.vm_object; offset = tentry->offset + (offset - tentry->start); vm_map_unlock_read(smap); } else { object = current->object.vm_object; } vm_object_sync(object, offset, size, syncio, invalidate); start += size; } vm_map_unlock_read(map); return (KERN_SUCCESS); } /* * vm_map_entry_unwire: [ internal use only ] * * Make the region specified by this entry pageable. * * The map in question should be locked. * [This is the reason for this routine's existence.] */ static void vm_map_entry_unwire(vm_map_t map, vm_map_entry_t entry) { vm_fault_unwire(map, entry->start, entry->end, entry->object.vm_object != NULL && entry->object.vm_object->type == OBJT_DEVICE); entry->wired_count = 0; } /* * vm_map_entry_delete: [ internal use only ] * * Deallocate the given entry from the target map. */ static void vm_map_entry_delete(vm_map_t map, vm_map_entry_t entry) { vm_object_t object; vm_pindex_t offidxstart, offidxend, count; vm_map_entry_unlink(map, entry); map->size -= entry->end - entry->start; if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0 && (object = entry->object.vm_object) != NULL) { count = OFF_TO_IDX(entry->end - entry->start); offidxstart = OFF_TO_IDX(entry->offset); offidxend = offidxstart + count; VM_OBJECT_LOCK(object); if (object->ref_count != 1 && ((object->flags & (OBJ_NOSPLIT|OBJ_ONEMAPPING)) == OBJ_ONEMAPPING || object == kernel_object || object == kmem_object)) { vm_object_collapse(object); vm_object_page_remove(object, offidxstart, offidxend, FALSE); if (object->type == OBJT_SWAP) swap_pager_freespace(object, offidxstart, count); if (offidxend >= object->size && offidxstart < object->size) object->size = offidxstart; } VM_OBJECT_UNLOCK(object); vm_object_deallocate(object); } vm_map_entry_dispose(map, entry); } /* * vm_map_delete: [ internal use only ] * * Deallocates the given address range from the target * map. */ int vm_map_delete(vm_map_t map, vm_offset_t start, vm_offset_t end) { vm_map_entry_t entry; vm_map_entry_t first_entry; /* * Find the start of the region, and clip it */ if (!vm_map_lookup_entry(map, start, &first_entry)) entry = first_entry->next; else { entry = first_entry; vm_map_clip_start(map, entry, start); } /* * Step through all entries in this region */ while ((entry != &map->header) && (entry->start < end)) { vm_map_entry_t next; /* * Wait for wiring or unwiring of an entry to complete. * Also wait for any system wirings to disappear on * user maps. */ if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0 || (vm_map_pmap(map) != kernel_pmap && vm_map_entry_system_wired_count(entry) != 0)) { unsigned int last_timestamp; vm_offset_t saved_start; vm_map_entry_t tmp_entry; saved_start = entry->start; entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP; last_timestamp = map->timestamp; (void) vm_map_unlock_and_wait(map, FALSE); vm_map_lock(map); if (last_timestamp + 1 != map->timestamp) { /* * Look again for the entry because the map was * modified while it was unlocked. * Specifically, the entry may have been * clipped, merged, or deleted. */ if (!vm_map_lookup_entry(map, saved_start, &tmp_entry)) entry = tmp_entry->next; else { entry = tmp_entry; vm_map_clip_start(map, entry, saved_start); } } continue; } vm_map_clip_end(map, entry, end); next = entry->next; /* * Unwire before removing addresses from the pmap; otherwise, * unwiring will put the entries back in the pmap. */ if (entry->wired_count != 0) { vm_map_entry_unwire(map, entry); } pmap_remove(map->pmap, entry->start, entry->end); /* * Delete the entry (which may delete the object) only after * removing all pmap entries pointing to its pages. * (Otherwise, its page frames may be reallocated, and any * modify bits will be set in the wrong object!) */ vm_map_entry_delete(map, entry); entry = next; } return (KERN_SUCCESS); } /* * vm_map_remove: * * Remove the given address range from the target map. * This is the exported form of vm_map_delete. */ int vm_map_remove(vm_map_t map, vm_offset_t start, vm_offset_t end) { int result; vm_map_lock(map); VM_MAP_RANGE_CHECK(map, start, end); result = vm_map_delete(map, start, end); vm_map_unlock(map); return (result); } /* * vm_map_check_protection: * * Assert that the target map allows the specified privilege on the * entire address region given. The entire region must be allocated. * * WARNING! This code does not and should not check whether the * contents of the region is accessible. For example a smaller file * might be mapped into a larger address space. * * NOTE! This code is also called by munmap(). * * The map must be locked. A read lock is sufficient. */ boolean_t vm_map_check_protection(vm_map_t map, vm_offset_t start, vm_offset_t end, vm_prot_t protection) { vm_map_entry_t entry; vm_map_entry_t tmp_entry; if (!vm_map_lookup_entry(map, start, &tmp_entry)) return (FALSE); entry = tmp_entry; while (start < end) { if (entry == &map->header) return (FALSE); /* * No holes allowed! */ if (start < entry->start) return (FALSE); /* * Check protection associated with entry. */ if ((entry->protection & protection) != protection) return (FALSE); /* go to next entry */ start = entry->end; entry = entry->next; } return (TRUE); } /* * vm_map_copy_entry: * * Copies the contents of the source entry to the destination * entry. The entries *must* be aligned properly. */ static void vm_map_copy_entry( vm_map_t src_map, vm_map_t dst_map, vm_map_entry_t src_entry, vm_map_entry_t dst_entry) { vm_object_t src_object; if ((dst_entry->eflags|src_entry->eflags) & MAP_ENTRY_IS_SUB_MAP) return; if (src_entry->wired_count == 0) { /* * If the source entry is marked needs_copy, it is already * write-protected. */ if ((src_entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0) { pmap_protect(src_map->pmap, src_entry->start, src_entry->end, src_entry->protection & ~VM_PROT_WRITE); } /* * Make a copy of the object. */ if ((src_object = src_entry->object.vm_object) != NULL) { VM_OBJECT_LOCK(src_object); if ((src_object->handle == NULL) && (src_object->type == OBJT_DEFAULT || src_object->type == OBJT_SWAP)) { vm_object_collapse(src_object); if ((src_object->flags & (OBJ_NOSPLIT|OBJ_ONEMAPPING)) == OBJ_ONEMAPPING) { vm_object_split(src_entry); src_object = src_entry->object.vm_object; } } vm_object_reference_locked(src_object); vm_object_clear_flag(src_object, OBJ_ONEMAPPING); VM_OBJECT_UNLOCK(src_object); dst_entry->object.vm_object = src_object; src_entry->eflags |= (MAP_ENTRY_COW|MAP_ENTRY_NEEDS_COPY); dst_entry->eflags |= (MAP_ENTRY_COW|MAP_ENTRY_NEEDS_COPY); dst_entry->offset = src_entry->offset; } else { dst_entry->object.vm_object = NULL; dst_entry->offset = 0; } pmap_copy(dst_map->pmap, src_map->pmap, dst_entry->start, dst_entry->end - dst_entry->start, src_entry->start); } else { /* * Of course, wired down pages can't be set copy-on-write. * Cause wired pages to be copied into the new map by * simulating faults (the new pages are pageable) */ vm_fault_copy_entry(dst_map, src_map, dst_entry, src_entry); } } /* * vmspace_map_entry_forked: * Update the newly-forked vmspace each time a map entry is inherited * or copied. The values for vm_dsize and vm_tsize are approximate * (and mostly-obsolete ideas in the face of mmap(2) et al.) */ static void vmspace_map_entry_forked(const struct vmspace *vm1, struct vmspace *vm2, vm_map_entry_t entry) { vm_size_t entrysize; vm_offset_t newend; entrysize = entry->end - entry->start; vm2->vm_map.size += entrysize; if (entry->eflags & (MAP_ENTRY_GROWS_DOWN | MAP_ENTRY_GROWS_UP)) { vm2->vm_ssize += btoc(entrysize); } else if (entry->start >= (vm_offset_t)vm1->vm_daddr && entry->start < (vm_offset_t)vm1->vm_daddr + ctob(vm1->vm_dsize)) { newend = MIN(entry->end, (vm_offset_t)vm1->vm_daddr + ctob(vm1->vm_dsize)); vm2->vm_dsize += btoc(newend - entry->start); } else if (entry->start >= (vm_offset_t)vm1->vm_taddr && entry->start < (vm_offset_t)vm1->vm_taddr + ctob(vm1->vm_tsize)) { newend = MIN(entry->end, (vm_offset_t)vm1->vm_taddr + ctob(vm1->vm_tsize)); vm2->vm_tsize += btoc(newend - entry->start); } } /* * vmspace_fork: * Create a new process vmspace structure and vm_map * based on those of an existing process. The new map * is based on the old map, according to the inheritance * values on the regions in that map. * * XXX It might be worth coalescing the entries added to the new vmspace. * * The source map must not be locked. */ struct vmspace * vmspace_fork(struct vmspace *vm1) { struct vmspace *vm2; vm_map_t old_map = &vm1->vm_map; vm_map_t new_map; vm_map_entry_t old_entry; vm_map_entry_t new_entry; vm_object_t object; vm_map_lock(old_map); vm2 = vmspace_alloc(old_map->min_offset, old_map->max_offset); + if (vm2 == NULL) + goto unlock_and_return; vm2->vm_taddr = vm1->vm_taddr; vm2->vm_daddr = vm1->vm_daddr; vm2->vm_maxsaddr = vm1->vm_maxsaddr; new_map = &vm2->vm_map; /* XXX */ new_map->timestamp = 1; old_entry = old_map->header.next; while (old_entry != &old_map->header) { if (old_entry->eflags & MAP_ENTRY_IS_SUB_MAP) panic("vm_map_fork: encountered a submap"); switch (old_entry->inheritance) { case VM_INHERIT_NONE: break; case VM_INHERIT_SHARE: /* * Clone the entry, creating the shared object if necessary. */ object = old_entry->object.vm_object; if (object == NULL) { object = vm_object_allocate(OBJT_DEFAULT, atop(old_entry->end - old_entry->start)); old_entry->object.vm_object = object; old_entry->offset = 0; } /* * Add the reference before calling vm_object_shadow * to insure that a shadow object is created. */ vm_object_reference(object); if (old_entry->eflags & MAP_ENTRY_NEEDS_COPY) { vm_object_shadow(&old_entry->object.vm_object, &old_entry->offset, atop(old_entry->end - old_entry->start)); old_entry->eflags &= ~MAP_ENTRY_NEEDS_COPY; /* Transfer the second reference too. */ vm_object_reference( old_entry->object.vm_object); vm_object_deallocate(object); object = old_entry->object.vm_object; } VM_OBJECT_LOCK(object); vm_object_clear_flag(object, OBJ_ONEMAPPING); VM_OBJECT_UNLOCK(object); /* * Clone the entry, referencing the shared object. */ new_entry = vm_map_entry_create(new_map); *new_entry = *old_entry; new_entry->eflags &= ~MAP_ENTRY_USER_WIRED; new_entry->wired_count = 0; /* * Insert the entry into the new map -- we know we're * inserting at the end of the new map. */ vm_map_entry_link(new_map, new_map->header.prev, new_entry); vmspace_map_entry_forked(vm1, vm2, new_entry); /* * Update the physical map */ pmap_copy(new_map->pmap, old_map->pmap, new_entry->start, (old_entry->end - old_entry->start), old_entry->start); break; case VM_INHERIT_COPY: /* * Clone the entry and link into the map. */ new_entry = vm_map_entry_create(new_map); *new_entry = *old_entry; new_entry->eflags &= ~MAP_ENTRY_USER_WIRED; new_entry->wired_count = 0; new_entry->object.vm_object = NULL; vm_map_entry_link(new_map, new_map->header.prev, new_entry); vmspace_map_entry_forked(vm1, vm2, new_entry); vm_map_copy_entry(old_map, new_map, old_entry, new_entry); break; } old_entry = old_entry->next; } - +unlock_and_return: vm_map_unlock(old_map); return (vm2); } int vm_map_stack(vm_map_t map, vm_offset_t addrbos, vm_size_t max_ssize, vm_prot_t prot, vm_prot_t max, int cow) { vm_map_entry_t new_entry, prev_entry; vm_offset_t bot, top; vm_size_t init_ssize; int orient, rv; rlim_t vmemlim; /* * The stack orientation is piggybacked with the cow argument. * Extract it into orient and mask the cow argument so that we * don't pass it around further. * NOTE: We explicitly allow bi-directional stacks. */ orient = cow & (MAP_STACK_GROWS_DOWN|MAP_STACK_GROWS_UP); cow &= ~orient; KASSERT(orient != 0, ("No stack grow direction")); if (addrbos < vm_map_min(map) || addrbos > map->max_offset) return (KERN_NO_SPACE); init_ssize = (max_ssize < sgrowsiz) ? max_ssize : sgrowsiz; PROC_LOCK(curthread->td_proc); vmemlim = lim_cur(curthread->td_proc, RLIMIT_VMEM); PROC_UNLOCK(curthread->td_proc); vm_map_lock(map); /* If addr is already mapped, no go */ if (vm_map_lookup_entry(map, addrbos, &prev_entry)) { vm_map_unlock(map); return (KERN_NO_SPACE); } /* If we would blow our VMEM resource limit, no go */ if (map->size + init_ssize > vmemlim) { vm_map_unlock(map); return (KERN_NO_SPACE); } /* * If we can't accomodate max_ssize in the current mapping, no go. * However, we need to be aware that subsequent user mappings might * map into the space we have reserved for stack, and currently this * space is not protected. * * Hopefully we will at least detect this condition when we try to * grow the stack. */ if ((prev_entry->next != &map->header) && (prev_entry->next->start < addrbos + max_ssize)) { vm_map_unlock(map); return (KERN_NO_SPACE); } /* * We initially map a stack of only init_ssize. We will grow as * needed later. Depending on the orientation of the stack (i.e. * the grow direction) we either map at the top of the range, the * bottom of the range or in the middle. * * Note: we would normally expect prot and max to be VM_PROT_ALL, * and cow to be 0. Possibly we should eliminate these as input * parameters, and just pass these values here in the insert call. */ if (orient == MAP_STACK_GROWS_DOWN) bot = addrbos + max_ssize - init_ssize; else if (orient == MAP_STACK_GROWS_UP) bot = addrbos; else bot = round_page(addrbos + max_ssize/2 - init_ssize/2); top = bot + init_ssize; rv = vm_map_insert(map, NULL, 0, bot, top, prot, max, cow); /* Now set the avail_ssize amount. */ if (rv == KERN_SUCCESS) { if (prev_entry != &map->header) vm_map_clip_end(map, prev_entry, bot); new_entry = prev_entry->next; if (new_entry->end != top || new_entry->start != bot) panic("Bad entry start/end for new stack entry"); new_entry->avail_ssize = max_ssize - init_ssize; if (orient & MAP_STACK_GROWS_DOWN) new_entry->eflags |= MAP_ENTRY_GROWS_DOWN; if (orient & MAP_STACK_GROWS_UP) new_entry->eflags |= MAP_ENTRY_GROWS_UP; } vm_map_unlock(map); return (rv); } /* Attempts to grow a vm stack entry. Returns KERN_SUCCESS if the * desired address is already mapped, or if we successfully grow * the stack. Also returns KERN_SUCCESS if addr is outside the * stack range (this is strange, but preserves compatibility with * the grow function in vm_machdep.c). */ int vm_map_growstack(struct proc *p, vm_offset_t addr) { vm_map_entry_t next_entry, prev_entry; vm_map_entry_t new_entry, stack_entry; struct vmspace *vm = p->p_vmspace; vm_map_t map = &vm->vm_map; vm_offset_t end; size_t grow_amount, max_grow; rlim_t stacklim, vmemlim; int is_procstack, rv; Retry: PROC_LOCK(p); stacklim = lim_cur(p, RLIMIT_STACK); vmemlim = lim_cur(p, RLIMIT_VMEM); PROC_UNLOCK(p); vm_map_lock_read(map); /* If addr is already in the entry range, no need to grow.*/ if (vm_map_lookup_entry(map, addr, &prev_entry)) { vm_map_unlock_read(map); return (KERN_SUCCESS); } next_entry = prev_entry->next; if (!(prev_entry->eflags & MAP_ENTRY_GROWS_UP)) { /* * This entry does not grow upwards. Since the address lies * beyond this entry, the next entry (if one exists) has to * be a downward growable entry. The entry list header is * never a growable entry, so it suffices to check the flags. */ if (!(next_entry->eflags & MAP_ENTRY_GROWS_DOWN)) { vm_map_unlock_read(map); return (KERN_SUCCESS); } stack_entry = next_entry; } else { /* * This entry grows upward. If the next entry does not at * least grow downwards, this is the entry we need to grow. * otherwise we have two possible choices and we have to * select one. */ if (next_entry->eflags & MAP_ENTRY_GROWS_DOWN) { /* * We have two choices; grow the entry closest to * the address to minimize the amount of growth. */ if (addr - prev_entry->end <= next_entry->start - addr) stack_entry = prev_entry; else stack_entry = next_entry; } else stack_entry = prev_entry; } if (stack_entry == next_entry) { KASSERT(stack_entry->eflags & MAP_ENTRY_GROWS_DOWN, ("foo")); KASSERT(addr < stack_entry->start, ("foo")); end = (prev_entry != &map->header) ? prev_entry->end : stack_entry->start - stack_entry->avail_ssize; grow_amount = roundup(stack_entry->start - addr, PAGE_SIZE); max_grow = stack_entry->start - end; } else { KASSERT(stack_entry->eflags & MAP_ENTRY_GROWS_UP, ("foo")); KASSERT(addr >= stack_entry->end, ("foo")); end = (next_entry != &map->header) ? next_entry->start : stack_entry->end + stack_entry->avail_ssize; grow_amount = roundup(addr + 1 - stack_entry->end, PAGE_SIZE); max_grow = end - stack_entry->end; } if (grow_amount > stack_entry->avail_ssize) { vm_map_unlock_read(map); return (KERN_NO_SPACE); } /* * If there is no longer enough space between the entries nogo, and * adjust the available space. Note: this should only happen if the * user has mapped into the stack area after the stack was created, * and is probably an error. * * This also effectively destroys any guard page the user might have * intended by limiting the stack size. */ if (grow_amount > max_grow) { if (vm_map_lock_upgrade(map)) goto Retry; stack_entry->avail_ssize = max_grow; vm_map_unlock(map); return (KERN_NO_SPACE); } is_procstack = (addr >= (vm_offset_t)vm->vm_maxsaddr) ? 1 : 0; /* * If this is the main process stack, see if we're over the stack * limit. */ if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > stacklim)) { vm_map_unlock_read(map); return (KERN_NO_SPACE); } /* Round up the grow amount modulo SGROWSIZ */ grow_amount = roundup (grow_amount, sgrowsiz); if (grow_amount > stack_entry->avail_ssize) grow_amount = stack_entry->avail_ssize; if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > stacklim)) { grow_amount = stacklim - ctob(vm->vm_ssize); } /* If we would blow our VMEM resource limit, no go */ if (map->size + grow_amount > vmemlim) { vm_map_unlock_read(map); return (KERN_NO_SPACE); } if (vm_map_lock_upgrade(map)) goto Retry; if (stack_entry == next_entry) { /* * Growing downward. */ /* Get the preliminary new entry start value */ addr = stack_entry->start - grow_amount; /* * If this puts us into the previous entry, cut back our * growth to the available space. Also, see the note above. */ if (addr < end) { stack_entry->avail_ssize = max_grow; addr = end; } rv = vm_map_insert(map, NULL, 0, addr, stack_entry->start, p->p_sysent->sv_stackprot, VM_PROT_ALL, 0); /* Adjust the available stack space by the amount we grew. */ if (rv == KERN_SUCCESS) { if (prev_entry != &map->header) vm_map_clip_end(map, prev_entry, addr); new_entry = prev_entry->next; KASSERT(new_entry == stack_entry->prev, ("foo")); KASSERT(new_entry->end == stack_entry->start, ("foo")); KASSERT(new_entry->start == addr, ("foo")); grow_amount = new_entry->end - new_entry->start; new_entry->avail_ssize = stack_entry->avail_ssize - grow_amount; stack_entry->eflags &= ~MAP_ENTRY_GROWS_DOWN; new_entry->eflags |= MAP_ENTRY_GROWS_DOWN; } } else { /* * Growing upward. */ addr = stack_entry->end + grow_amount; /* * If this puts us into the next entry, cut back our growth * to the available space. Also, see the note above. */ if (addr > end) { stack_entry->avail_ssize = end - stack_entry->end; addr = end; } grow_amount = addr - stack_entry->end; /* Grow the underlying object if applicable. */ if (stack_entry->object.vm_object == NULL || vm_object_coalesce(stack_entry->object.vm_object, stack_entry->offset, (vm_size_t)(stack_entry->end - stack_entry->start), (vm_size_t)grow_amount)) { map->size += (addr - stack_entry->end); /* Update the current entry. */ stack_entry->end = addr; stack_entry->avail_ssize -= grow_amount; vm_map_entry_resize_free(map, stack_entry); rv = KERN_SUCCESS; if (next_entry != &map->header) vm_map_clip_start(map, next_entry, addr); } else rv = KERN_FAILURE; } if (rv == KERN_SUCCESS && is_procstack) vm->vm_ssize += btoc(grow_amount); vm_map_unlock(map); /* * Heed the MAP_WIREFUTURE flag if it was set for this process. */ if (rv == KERN_SUCCESS && (map->flags & MAP_WIREFUTURE)) { vm_map_wire(map, (stack_entry == next_entry) ? addr : addr - grow_amount, (stack_entry == next_entry) ? stack_entry->start : addr, (p->p_flag & P_SYSTEM) ? VM_MAP_WIRE_SYSTEM|VM_MAP_WIRE_NOHOLES : VM_MAP_WIRE_USER|VM_MAP_WIRE_NOHOLES); } return (rv); } /* * Unshare the specified VM space for exec. If other processes are * mapped to it, then create a new one. The new vmspace is null. */ -void +int vmspace_exec(struct proc *p, vm_offset_t minuser, vm_offset_t maxuser) { struct vmspace *oldvmspace = p->p_vmspace; struct vmspace *newvmspace; newvmspace = vmspace_alloc(minuser, maxuser); + if (newvmspace == NULL) + return (ENOMEM); newvmspace->vm_swrss = oldvmspace->vm_swrss; /* * This code is written like this for prototype purposes. The * goal is to avoid running down the vmspace here, but let the * other process's that are still using the vmspace to finally * run it down. Even though there is little or no chance of blocking * here, it is a good idea to keep this form for future mods. */ PROC_VMSPACE_LOCK(p); p->p_vmspace = newvmspace; PROC_VMSPACE_UNLOCK(p); if (p == curthread->td_proc) /* XXXKSE ? */ pmap_activate(curthread); vmspace_free(oldvmspace); + return (0); } /* * Unshare the specified VM space for forcing COW. This * is called by rfork, for the (RFMEM|RFPROC) == 0 case. */ -void +int vmspace_unshare(struct proc *p) { struct vmspace *oldvmspace = p->p_vmspace; struct vmspace *newvmspace; if (oldvmspace->vm_refcnt == 1) - return; + return (0); newvmspace = vmspace_fork(oldvmspace); + if (newvmspace == NULL) + return (ENOMEM); PROC_VMSPACE_LOCK(p); p->p_vmspace = newvmspace; PROC_VMSPACE_UNLOCK(p); if (p == curthread->td_proc) /* XXXKSE ? */ pmap_activate(curthread); vmspace_free(oldvmspace); + return (0); } /* * vm_map_lookup: * * Finds the VM object, offset, and * protection for a given virtual address in the * specified map, assuming a page fault of the * type specified. * * Leaves the map in question locked for read; return * values are guaranteed until a vm_map_lookup_done * call is performed. Note that the map argument * is in/out; the returned map must be used in * the call to vm_map_lookup_done. * * A handle (out_entry) is returned for use in * vm_map_lookup_done, to make that fast. * * If a lookup is requested with "write protection" * specified, the map may be changed to perform virtual * copying operations, although the data referenced will * remain the same. */ int vm_map_lookup(vm_map_t *var_map, /* IN/OUT */ vm_offset_t vaddr, vm_prot_t fault_typea, vm_map_entry_t *out_entry, /* OUT */ vm_object_t *object, /* OUT */ vm_pindex_t *pindex, /* OUT */ vm_prot_t *out_prot, /* OUT */ boolean_t *wired) /* OUT */ { vm_map_entry_t entry; vm_map_t map = *var_map; vm_prot_t prot; vm_prot_t fault_type = fault_typea; RetryLookup:; /* * Lookup the faulting address. */ vm_map_lock_read(map); #define RETURN(why) \ { \ vm_map_unlock_read(map); \ return (why); \ } /* * If the map has an interesting hint, try it before calling full * blown lookup routine. */ entry = map->root; *out_entry = entry; if (entry == NULL || (vaddr < entry->start) || (vaddr >= entry->end)) { /* * Entry was either not a valid hint, or the vaddr was not * contained in the entry, so do a full lookup. */ if (!vm_map_lookup_entry(map, vaddr, out_entry)) RETURN(KERN_INVALID_ADDRESS); entry = *out_entry; } /* * Handle submaps. */ if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) { vm_map_t old_map = map; *var_map = map = entry->object.sub_map; vm_map_unlock_read(old_map); goto RetryLookup; } /* * Check whether this task is allowed to have this page. * Note the special case for MAP_ENTRY_COW * pages with an override. This is to implement a forced * COW for debuggers. */ if (fault_type & VM_PROT_OVERRIDE_WRITE) prot = entry->max_protection; else prot = entry->protection; fault_type &= (VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE); if ((fault_type & prot) != fault_type) { RETURN(KERN_PROTECTION_FAILURE); } if ((entry->eflags & MAP_ENTRY_USER_WIRED) && (entry->eflags & MAP_ENTRY_COW) && (fault_type & VM_PROT_WRITE) && (fault_typea & VM_PROT_OVERRIDE_WRITE) == 0) { RETURN(KERN_PROTECTION_FAILURE); } /* * If this page is not pageable, we have to get it for all possible * accesses. */ *wired = (entry->wired_count != 0); if (*wired) prot = fault_type = entry->protection; /* * If the entry was copy-on-write, we either ... */ if (entry->eflags & MAP_ENTRY_NEEDS_COPY) { /* * If we want to write the page, we may as well handle that * now since we've got the map locked. * * If we don't need to write the page, we just demote the * permissions allowed. */ if (fault_type & VM_PROT_WRITE) { /* * Make a new object, and place it in the object * chain. Note that no new references have appeared * -- one just moved from the map to the new * object. */ if (vm_map_lock_upgrade(map)) goto RetryLookup; vm_object_shadow( &entry->object.vm_object, &entry->offset, atop(entry->end - entry->start)); entry->eflags &= ~MAP_ENTRY_NEEDS_COPY; vm_map_lock_downgrade(map); } else { /* * We're attempting to read a copy-on-write page -- * don't allow writes. */ prot &= ~VM_PROT_WRITE; } } /* * Create an object if necessary. */ if (entry->object.vm_object == NULL && !map->system_map) { if (vm_map_lock_upgrade(map)) goto RetryLookup; entry->object.vm_object = vm_object_allocate(OBJT_DEFAULT, atop(entry->end - entry->start)); entry->offset = 0; vm_map_lock_downgrade(map); } /* * Return the object/offset from this entry. If the entry was * copy-on-write or empty, it has been fixed up. */ *pindex = OFF_TO_IDX((vaddr - entry->start) + entry->offset); *object = entry->object.vm_object; *out_prot = prot; return (KERN_SUCCESS); #undef RETURN } /* * vm_map_lookup_locked: * * Lookup the faulting address. A version of vm_map_lookup that returns * KERN_FAILURE instead of blocking on map lock or memory allocation. */ int vm_map_lookup_locked(vm_map_t *var_map, /* IN/OUT */ vm_offset_t vaddr, vm_prot_t fault_typea, vm_map_entry_t *out_entry, /* OUT */ vm_object_t *object, /* OUT */ vm_pindex_t *pindex, /* OUT */ vm_prot_t *out_prot, /* OUT */ boolean_t *wired) /* OUT */ { vm_map_entry_t entry; vm_map_t map = *var_map; vm_prot_t prot; vm_prot_t fault_type = fault_typea; /* * If the map has an interesting hint, try it before calling full * blown lookup routine. */ entry = map->root; *out_entry = entry; if (entry == NULL || (vaddr < entry->start) || (vaddr >= entry->end)) { /* * Entry was either not a valid hint, or the vaddr was not * contained in the entry, so do a full lookup. */ if (!vm_map_lookup_entry(map, vaddr, out_entry)) return (KERN_INVALID_ADDRESS); entry = *out_entry; } /* * Fail if the entry refers to a submap. */ if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) return (KERN_FAILURE); /* * Check whether this task is allowed to have this page. * Note the special case for MAP_ENTRY_COW * pages with an override. This is to implement a forced * COW for debuggers. */ if (fault_type & VM_PROT_OVERRIDE_WRITE) prot = entry->max_protection; else prot = entry->protection; fault_type &= VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE; if ((fault_type & prot) != fault_type) return (KERN_PROTECTION_FAILURE); if ((entry->eflags & MAP_ENTRY_USER_WIRED) && (entry->eflags & MAP_ENTRY_COW) && (fault_type & VM_PROT_WRITE) && (fault_typea & VM_PROT_OVERRIDE_WRITE) == 0) return (KERN_PROTECTION_FAILURE); /* * If this page is not pageable, we have to get it for all possible * accesses. */ *wired = (entry->wired_count != 0); if (*wired) prot = fault_type = entry->protection; if (entry->eflags & MAP_ENTRY_NEEDS_COPY) { /* * Fail if the entry was copy-on-write for a write fault. */ if (fault_type & VM_PROT_WRITE) return (KERN_FAILURE); /* * We're attempting to read a copy-on-write page -- * don't allow writes. */ prot &= ~VM_PROT_WRITE; } /* * Fail if an object should be created. */ if (entry->object.vm_object == NULL && !map->system_map) return (KERN_FAILURE); /* * Return the object/offset from this entry. If the entry was * copy-on-write or empty, it has been fixed up. */ *pindex = OFF_TO_IDX((vaddr - entry->start) + entry->offset); *object = entry->object.vm_object; *out_prot = prot; return (KERN_SUCCESS); } /* * vm_map_lookup_done: * * Releases locks acquired by a vm_map_lookup * (according to the handle returned by that lookup). */ void vm_map_lookup_done(vm_map_t map, vm_map_entry_t entry) { /* * Unlock the main-level map */ vm_map_unlock_read(map); } #include "opt_ddb.h" #ifdef DDB #include #include /* * vm_map_print: [ debug ] */ DB_SHOW_COMMAND(map, vm_map_print) { static int nlines; /* XXX convert args. */ vm_map_t map = (vm_map_t)addr; boolean_t full = have_addr; vm_map_entry_t entry; db_iprintf("Task map %p: pmap=%p, nentries=%d, version=%u\n", (void *)map, (void *)map->pmap, map->nentries, map->timestamp); nlines++; if (!full && db_indent) return; db_indent += 2; for (entry = map->header.next; entry != &map->header; entry = entry->next) { db_iprintf("map entry %p: start=%p, end=%p\n", (void *)entry, (void *)entry->start, (void *)entry->end); nlines++; { static char *inheritance_name[4] = {"share", "copy", "none", "donate_copy"}; db_iprintf(" prot=%x/%x/%s", entry->protection, entry->max_protection, inheritance_name[(int)(unsigned char)entry->inheritance]); if (entry->wired_count != 0) db_printf(", wired"); } if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) { db_printf(", share=%p, offset=0x%jx\n", (void *)entry->object.sub_map, (uintmax_t)entry->offset); nlines++; if ((entry->prev == &map->header) || (entry->prev->object.sub_map != entry->object.sub_map)) { db_indent += 2; vm_map_print((db_expr_t)(intptr_t) entry->object.sub_map, full, 0, (char *)0); db_indent -= 2; } } else { db_printf(", object=%p, offset=0x%jx", (void *)entry->object.vm_object, (uintmax_t)entry->offset); if (entry->eflags & MAP_ENTRY_COW) db_printf(", copy (%s)", (entry->eflags & MAP_ENTRY_NEEDS_COPY) ? "needed" : "done"); db_printf("\n"); nlines++; if ((entry->prev == &map->header) || (entry->prev->object.vm_object != entry->object.vm_object)) { db_indent += 2; vm_object_print((db_expr_t)(intptr_t) entry->object.vm_object, full, 0, (char *)0); nlines += 4; db_indent -= 2; } } } db_indent -= 2; if (db_indent == 0) nlines = 0; } DB_SHOW_COMMAND(procvm, procvm) { struct proc *p; if (have_addr) { p = (struct proc *) addr; } else { p = curproc; } db_printf("p = %p, vmspace = %p, map = %p, pmap = %p\n", (void *)p, (void *)p->p_vmspace, (void *)&p->p_vmspace->vm_map, (void *)vmspace_pmap(p->p_vmspace)); vm_map_print((db_expr_t)(intptr_t)&p->p_vmspace->vm_map, 1, 0, NULL); } #endif /* DDB */